## Setup

In [None]:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.cluster import KMeans
import heapq
import scipy.stats as stats

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import chi2
import string
from nltk.util import ngrams
from nltk.corpus import stopwords
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
from itertools import combinations

from statsmodels.stats.multitest import multipletests

# English stopword list, read by tokenize_with_ngrams when filtering tokens.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be
# available locally (e.g. via nltk.download('stopwords')) -- confirm.
english_stopwords = stopwords.words('english')

## Read in processed data from R

In [None]:
# Load the analysis table from 'd_analysis.csv' (path is relative to the
# working directory). Later cells read its 'outcome' and
# 'utterance_combined' columns.
df = pd.read_csv('d_analysis.csv')

In [None]:
def _outcome_to_binary(s):
    """Return 1 for the literal outcome 'CORRECT', otherwise 0."""
    # Missing outcomes are checked first and count as 0.
    if pd.isna(s):
        return 0
    if s == 'CORRECT':
        return 1
    return 0


# Binary outcome indicator derived from the 'outcome' column.
df['outcome_next_bin'] = df['outcome'].map(_outcome_to_binary)

In [None]:
# Keep only the utterance text and the binary outcome label.
analysis_columns = ['utterance_combined', 'outcome_next_bin']
df = df[analysis_columns]

In [None]:
def tokenize_with_ngrams(s, n=2):
    """Tokenize an utterance into filtered unigrams and their n-grams.

    The text is lowercased, '/' is turned into a space, ASCII punctuation is
    removed, and the result is split on whitespace. Stopwords and purely
    numeric tokens are then dropped.

    Args:
        s: The utterance text, or a missing value (NaN / None).
        n: Size of the n-grams built from the filtered tokens.

    Returns:
        A ``(filtered_tokens, ngram_list)`` tuple. Missing input yields
        ``([], [])`` so callers can always unpack or index the result.
    """
    if pd.isna(s):
        # Keep the return shape consistent: a bare NaN cannot be indexed or
        # iterated by callers that expect the (tokens, ngrams) pair.
        return [], []

    punctuation_table = str.maketrans('', '', string.punctuation)
    cleaned = s.lower().replace('/', ' ').translate(punctuation_table)

    # Tokens have already lost their punctuation, so strip it from the
    # stopwords as well; otherwise contractions such as "don't" (tokenized as
    # "dont") would never match. A set also makes each lookup O(1).
    stopword_set = {word.translate(punctuation_table) for word in english_stopwords}

    filtered_tokens = [
        word for word in cleaned.split()
        if word not in stopword_set and not word.isnumeric()
    ]

    ngram_list = list(ngrams(filtered_tokens, n))

    return filtered_tokens, ngram_list

In [None]:
# tokenize_with_ngrams returns (unigrams, ngrams); only the unigrams are kept.
# Missing utterances are mapped to an empty token list here so that the
# result is never subscripted on a NaN and later flattening can iterate
# every row.
df['unigrams'] = df.utterance_combined.map(
    lambda s: [] if pd.isna(s) else tokenize_with_ngrams(s, n=2)[0]
)

In [None]:
# Flatten the per-row token lists into one flat token list per outcome class.
all_words_correct = []
for row_tokens in df.loc[df['outcome_next_bin'] == 1, 'unigrams']:
    all_words_correct.extend(row_tokens)

all_words_incorrect = []
for row_tokens in df.loc[df['outcome_next_bin'] == 0, 'unigrams']:
    all_words_incorrect.extend(row_tokens)

## Perform ChiSq filtering

In [None]:
def identity_tokenizer(text):
    """Return *text* wrapped in a one-element list, i.e. as a single token."""
    single_token = [text]
    return single_token

In [None]:
# Pool every token into one corpus. Incorrect-outcome tokens come first so
# that the label vector built below lines up with the rows of X.
corpus = all_words_incorrect + all_words_correct

# Each corpus entry is already a single token, so the vectorizer is given a
# tokenizer that returns the entry unchanged, with lowercasing and
# preprocessing switched off.
vectorizer = CountVectorizer(tokenizer=identity_tokenizer, lowercase=False, preprocessor=None)

# Rows of X are individual tokens; columns are vocabulary entries.
X = vectorizer.fit_transform(corpus)

# Number of tokens contributed by each outcome class
n_incorrect = len(all_words_incorrect)
n_correct = len(all_words_correct)

# Class label per corpus row: 0 = incorrect outcome, 1 = correct outcome
labels = np.array([0] * n_incorrect + [1] * n_correct)

# Chi-square statistic and p-value for every vocabulary entry against the label
chi2_scores, p_values = chi2(X, labels)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back for older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# (word, chi-square score, p-value) tuples, one per vocabulary entry
word_scores = list(zip(feature_names, chi2_scores, p_values))

# Sort by chi-square score, largest first
word_scores = sorted(word_scores, key=lambda x: abs(x[1]), reverse=True)

# Keep the full ranked list (no top-k cut-off is applied here)
top_words_chi2 = word_scores

In [None]:
# Token frequencies within each outcome class
count_correct = Counter(all_words_correct)
count_incorrect = Counter(all_words_incorrect)

# Counter indexing returns 0 for a word that never occurs in a class, so the
# count columns stay integer-typed and need no fill. A blanket fillna(0)
# is deliberately not applied: it would also rewrite a NaN chi-square score or
# p-value as 0, making an undefined test look maximally significant.
dat = [
    (word, count_correct[word], count_incorrect[word], chi, p)
    for word, chi, p in top_words_chi2
]

# Define column names for the DataFrame
columns = ['word', 'count_correct', 'count_incorrect', 'chisq', 'p']

# One row per word, in the same order as top_words_chi2
ans = pd.DataFrame(dat, columns=columns)

## Top unigrams

In [None]:
# First 10 rows of ans (row order preserved) where the word occurs at least
# as often in the correct class as in the incorrect class; ties are included.
at_least_as_often_correct = ans['count_correct'].ge(ans['count_incorrect'])
ans[at_least_as_often_correct].head(10)

In [None]:
# First 10 rows of ans (row order preserved) where the word occurs at most
# as often in the correct class as in the incorrect class; ties are included.
at_most_as_often_correct = ans['count_correct'].le(ans['count_incorrect'])
ans[at_most_as_often_correct].head(10)

## P-Adjustment

In [None]:
# Adjust the p-values for multiple comparisons (method 'fdr_bh', alpha 0.1).
# The first two entries of the result are the reject flags and the adjusted
# p-values, in that order.
reject_flags, adjusted_p = multipletests(ans['p'], alpha=0.1, method='fdr_bh')[:2]
ans['p_corrected'] = adjusted_p
ans['significant'] = reject_flags

In [None]:
# Display the rows flagged as significant after p-value adjustment
is_significant = ans['significant']
ans[is_significant]

In [None]:
# Export the full results table to 'ans_sorted.csv' in the working directory;
# index=False leaves the DataFrame index out of the file.
ans.to_csv('ans_sorted.csv', index=False)