### imports

In [1]:
import pandas as pd
import spacy
from collections import Counter
import seaborn as sns
import matplotlib.pyplot as plt



In [3]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters

# Explicitly load the correct Punkt model from the 'punkt' package
nltk.download('punkt')

# Manually instantiate the tokenizer
tokenizer = PunktSentenceTokenizer()

# Use it directly if needed
text = "This is a test sentence. Let's see if it works!"
sentences = tokenizer.tokenize(text)
print("Sentences:", sentences)

# Then use word_tokenize as usual
print("Words:", word_tokenize(text))

Sentences: ['This is a test sentence.', "Let's see if it works!"]


[nltk_data] Downloading package punkt to /Users/Devyani/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/Users/Devyani/nltk_data'
**********************************************************************


In [2]:
from textstat import flesch_reading_ease
from collections import Counter
import numpy as np
from nltk.util import ngrams
from nltk.corpus import stopwords
nltk.download('stopwords')
import string

In [10]:
from nltk.tokenize import word_tokenize
from textstat import flesch_reading_ease

In [3]:
import json

### taking a closer look at cleaned data

In [4]:
df = pd.read_csv('coauthor_combined_cleaned.csv')

In [5]:
df.head()

Unnamed: 0,eventName,eventSource,eventTimestamp,textDelta,cursorRange,currentDoc,currentCursor,currentSuggestions,currentSuggestionIndex,currentHoverIndex,currentN,currentMaxToken,currentTemperature,currentTopP,currentPresencePenalty,currentFrequencyPenalty,eventNum,session_id
0,system-initialize,api,2021-08-16 07:00:41.033,,,"A woman has been dating guy after guy, but it ...",244,[],0,,5,30,0.3,1,0,0.0,0,e0435f4cf6fc435c872ffc5b66b66b0c
1,text-insert,user,2021-08-16 07:00:46.487,"{'ops': [{'retain': 244}, {'insert': ' '}]}",,,245,[],0,,5,30,0.3,1,0,0.0,1,e0435f4cf6fc435c872ffc5b66b66b0c
2,text-insert,user,2021-08-16 07:00:46.731,"{'ops': [{'retain': 245}, {'insert': ' '}]}",,,246,[],0,,5,30,0.3,1,0,0.0,2,e0435f4cf6fc435c872ffc5b66b66b0c
3,text-insert,user,2021-08-16 07:00:46.897,"{'ops': [{'retain': 246}, {'insert': ' '}]}",,,247,[],0,,5,30,0.3,1,0,0.0,3,e0435f4cf6fc435c872ffc5b66b66b0c
4,text-delete,user,2021-08-16 07:00:47.247,"{'ops': [{'retain': 246}, {'delete': 1}]}",,,246,[],0,,5,30,0.3,1,0,0.0,4,e0435f4cf6fc435c872ffc5b66b66b0c


In [6]:
df.describe()

Unnamed: 0,currentCursor,currentSuggestionIndex,currentHoverIndex,currentN,currentMaxToken,currentTemperature,currentTopP,currentPresencePenalty,currentFrequencyPenalty,eventNum
count,2701458.0,2701458.0,2299216.0,2701458.0,2701458.0,2701458.0,2701458.0,2701458.0,2701458.0,2701458.0
mean,1413.145,0.7657195,1.691181,5.0,30.0,0.5214587,1.0,0.0,0.4902231,1065.809
std,779.2813,1.238773,1.4321,0.0,0.0,0.2810858,0.0,0.0,0.3878369,763.014
min,0.0,0.0,0.0,5.0,30.0,0.2,1.0,0.0,0.0,0.0
25%,805.0,0.0,0.0,5.0,30.0,0.3,1.0,0.0,0.0,466.0
50%,1314.0,0.0,2.0,5.0,30.0,0.3,1.0,0.0,0.5,944.0
75%,1880.0,1.0,3.0,5.0,30.0,0.75,1.0,0.0,1.0,1512.0
max,5679.0,4.0,4.0,5.0,30.0,0.9,1.0,0.0,1.0,6621.0


In [7]:
print(df.columns.tolist())

['eventName', 'eventSource', 'eventTimestamp', 'textDelta', 'cursorRange', 'currentDoc', 'currentCursor', 'currentSuggestions', 'currentSuggestionIndex', 'currentHoverIndex', 'currentN', 'currentMaxToken', 'currentTemperature', 'currentTopP', 'currentPresencePenalty', 'currentFrequencyPenalty', 'eventNum', 'session_id']


### filter 

In [12]:
df = df[
    (df['eventName'] == 'text-insert') &
    (df['eventSource'].isin(['user', 'model'])) &
    (df['textDelta'].notna()) &
    (df['textDelta'].str.strip() != "")
].copy()

In [9]:
print(df.columns.tolist())


['eventName', 'eventSource', 'eventTimestamp', 'textDelta', 'cursorRange', 'currentDoc', 'currentCursor', 'currentSuggestions', 'currentSuggestionIndex', 'currentHoverIndex', 'currentN', 'currentMaxToken', 'currentTemperature', 'currentTopP', 'currentPresencePenalty', 'currentFrequencyPenalty', 'eventNum', 'session_id']


### analysis

In [13]:
def compute_stylometric_features(text):
    tokens = word_tokenize(text)
    words = [word for word in tokens if word.isalpha()]
    stop_words = set(stopwords.words('english'))

    return {
        'readability': flesch_reading_ease(text),
        'avg_word_length': np.mean([len(w) for w in words]) if words else 0,
        'lexical_diversity': len(set(words)) / len(words) if words else 0,
        'punctuation_freq': sum(1 for c in text if c in string.punctuation) / len(text) if text else 0,
        'stopword_ratio': sum(1 for w in words if w.lower() in stop_words) / len(words) if words else 0
    }

In [15]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/Devyani/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [17]:
import nltk
nltk.data.path.append('/Users/Devyani/nltk_data')

In [18]:
features_list = []
for _, row in df.iterrows():
    text = str(row['textDelta']) 
    source = row['eventSource']
    features = compute_stylometric_features(text)
    features['source'] = source
    features_list.append(features)

LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/Users/Devyani/nltk_data'
    - '/Applications/anaconda3/envs/mina/nltk_data'
    - '/Applications/anaconda3/envs/mina/share/nltk_data'
    - '/Applications/anaconda3/envs/mina/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
    - '/Users/Devyani/nltk_data'
**********************************************************************
