In [2]:
from collections import Counter
import math, re, random
import pandas as pd

# A token is either a single word character, or a run that starts and ends with a
# word character and may contain word characters, hyphens, or apostrophes in between.
# Written as a raw string so the regex escapes are not parsed as (invalid) string escapes.
word_pattern = re.compile(r"\w[\w\-\']*\w|\w")


def load_tokens(path):
    """Return every word_pattern match found in the UTF-8 text file at `path`, in file order."""
    tokens = []
    with open(path, encoding="utf-8") as handle:
        for line in handle:
            tokens.extend(word_pattern.findall(line))
    return tokens


# gpt
gptboys_tokens = load_tokens("gpt_boys.txt")
gptgirls_tokens = load_tokens("gpt_girls.txt")
gpt_tokens = load_tokens("gpt_combined.txt")
# tv
pen15_tokens = load_tokens("pen15_all.txt")
bigmouth_tokens = load_tokens("bigmouth_all.txt")
tv_tokens = load_tokens("tvcombined.txt")
# youth
girls_tokens = load_tokens("youth_girls.txt")
boys_tokens = load_tokens("youthboys.txt")
youth_tokens = load_tokens("youth_all.txt")

In [3]:
def dunning_score(x1, n1, x2, n2):
    """Evaluate the "surprise factor" of two proportions that are expressed as counts.

    For example: x1 "heads" out of n1 flips versus x2 "heads" out of n2 flips.
    The result is the log-likelihood-ratio (G) statistic
    ``-2 * sum(count * log(pooled_rate / observed_rate))`` summed over the four
    cells (occurrences and non-occurrences in each sample).

    Args:
        x1: Number of occurrences in the first sample.
        n1: Size of the first sample.
        x2: Number of occurrences in the second sample.
        n2: Size of the second sample.

    Returns:
        The score as a float; it is 0 when both observed proportions are equal.

    Raises:
        ZeroDivisionError: If n1 or n2 is 0.
    """
    p1 = float(x1) / n1
    p2 = float(x2) / n2
    p = float(x1 + x2) / (n1 + n2)

    def cell_term(count, pooled_rate, observed_rate):
        # An empty cell contributes nothing (0 * log(...) is taken as 0). Skipping it
        # also avoids dividing by an observed rate of 0 or taking log(0).
        if count == 0:
            return 0.0
        return count * math.log(pooled_rate / observed_rate)

    return -2 * ( cell_term(x1, p, p1) + cell_term(n1 - x1, 1 - p, 1 - p1) +
                  cell_term(x2, p, p2) + cell_term(n2 - x2, 1 - p, 1 - p2) )

def score_differences(a, b):
    """Score how differently each shared word is used in two token lists.

    Args:
        a: First list of tokens.
        b: Second list of tokens.

    Returns:
        A list of ``(g_score, count_in_a, count_in_b, word)`` tuples, one per word
        that occurs in both lists, sorted in descending order.
    """
    a_counter = Counter(a)
    b_counter = Counter(b)

    a_length = len(a)
    b_length = len(b)

    # Only words present in BOTH lists are scored; words unique to one list are dropped.
    vocabulary = a_counter.keys() & b_counter.keys()

    scored_words = []

    for w in vocabulary:
        a_n = a_counter[w]
        b_n = b_counter[w]

        ## Create a tuple containing information about each word
        g_score = dunning_score(a_n, a_length, b_n, b_length)
        scored_words.append( (g_score, a_n, b_n, w) )

    # Sort once after all words are scored rather than re-sorting on every iteration.
    scored_words.sort(reverse = True)

    return scored_words

def shuffle_lists(a, b, rng=None):
    """Pool two sequences, shuffle the pool, and split it back into the original sizes.

    The inputs are not modified.

    Args:
        a: First sequence of items.
        b: Second sequence of items.
        rng: Optional object with a ``shuffle(list)`` method, e.g. a seeded
            ``random.Random`` instance, for reproducible shuffles. When omitted,
            the module-level ``random`` generator is used.

    Returns:
        A tuple ``(new_a, new_b)`` of lists with ``len(new_a) == len(a)`` and
        ``len(new_b) == len(b)``, together holding every item of ``a`` and ``b``.
    """
    a_length = len(a)

    merged = list(a)
    merged.extend(b)

    shuffler = random if rng is None else rng
    shuffler.shuffle(merged)

    return (merged[:a_length], merged[a_length:])

## Low-res bar plot
# Glyphs indexed by fill level: index 0 is blank (two spaces), indices 1-8 are the
# Unicode lower block elements U+2581 through U+2588.
bars = [
    "  ",
    "\u2581",
    "\u2582",
    "\u2583",
    "\u2584",
    "\u2585",
    "\u2586",
    "\u2587",
    "\u2588",
]

def unicode_barplot(x, y):
    """Return a two-glyph bar pair showing x's share of x + y on a 0-7 scale.

    The first glyph's level is x's share rounded to the nearest of 8 levels;
    the second glyph gets the complementary level, so the two always sum to 7.
    """
    left_level = round(7 * (x / (x+y)))
    right_level = 7 - left_level
    return "".join([bars[left_level], bars[right_level]])

def print_nicely(scores):
    """Print one tab-separated row per entry: score (3 decimals), bar pair, both counts, word.

    Each entry is indexed as (score, first_count, second_count, word).
    """
    row_template = "{:.3f}\t{}\t{}\t{}\t{}"
    for word_info in scores:
        g_score = word_info[0]
        first_count = word_info[1]
        second_count = word_info[2]
        word = word_info[3]
        balance = unicode_barplot(first_count, second_count)
        print(row_template.format(g_score, balance, first_count, second_count, word))

In [26]:
# Show the glyphs used by the bar plot (Unicode output is used to visualize the data).
print(*bars)

   ▁ ▂ ▃ ▄ ▅ ▆ ▇ █


### Compare GPT boys and girls

In [4]:
# In the bar column, a taller left bar means more frequent for GPT boys,
# a taller right bar means more frequent for GPT girls.
word_scores_gptboysgirls = score_differences(
    gptboys_tokens,
    gptgirls_tokens,
)
# Display the 30 highest-scoring words.
print_nicely(word_scores_gptboysgirls[:30])

134.237	▇  	32	1	man
126.563	▅▂	41	13	voice
29.276	  ▇	74	1110	like
26.162	▆▁	8	2	deeper
21.427	▂▅	20	41	taller
21.353	▄▃	9	6	deep
21.271	▆▁	6	1	cracks
16.595	  ▇	1	103	important
16.171	▃▄	13	23	keep
14.399	▅▂	5	2	mostly
14.399	▅▂	5	2	Alright
14.053	  ▇	2	111	little
13.783	▂▅	28	98	weird
13.647	▂▅	26	88	guess
13.540	  ▇	5	158	way
13.431	▄▃	7	7	roll
13.079	▆▁	4	1	jokes
13.079	▆▁	4	1	huh
13.079	▆▁	4	1	guys
12.883	  ▇	1	85	kind
12.762	▄▃	6	5	forget
12.762	▄▃	6	5	Yeah
12.097	  ▇	4	134	we're
12.001	▂▅	18	53	much
11.676	  ▇	7	175	our
11.583	  ▇	26	404	feel
11.536	▂▅	24	85	yeah
11.511	▄▃	6	6	squeaky
11.304	▂▅	18	55	getting
11.054	▁▆	35	149	some


In [5]:
# Show only the 30 highest-scoring tuples; printing the whole list floods the cell output.
print(word_scores_gptboysgirls[:30])

[(134.23676254911769, 32, 1, 'man'), (126.56263380919341, 41, 13, 'voice'), (29.275805244884168, 74, 1110, 'like'), (26.161787966316687, 8, 2, 'deeper'), (21.427097693748703, 20, 41, 'taller'), (21.35250421457858, 9, 6, 'deep'), (21.270698067058902, 6, 1, 'cracks'), (16.595226563165827, 1, 103, 'important'), (16.1705058884868, 13, 23, 'keep'), (14.398594650168718, 5, 2, 'mostly'), (14.398594650168718, 5, 2, 'Alright'), (14.05318483160363, 2, 111, 'little'), (13.783222152122477, 28, 98, 'weird'), (13.646750786948513, 26, 88, 'guess'), (13.540388253733063, 5, 158, 'way'), (13.43074801295168, 7, 7, 'roll'), (13.078769735263748, 4, 1, 'jokes'), (13.078769735263748, 4, 1, 'huh'), (13.078769735263748, 4, 1, 'guys'), (12.883000585677713, 1, 85, 'kind'), (12.761811192078113, 6, 5, 'forget'), (12.761811192078113, 6, 5, 'Yeah'), (12.0968722642395, 4, 134, "we're"), (12.000889733760669, 18, 53, 'much'), (11.675831942529346, 7, 175, 'our'), (11.583223966975211, 26, 404, 'feel'), (11.53626776071521

In [32]:
# Convert the scored tuples to a DataFrame and save it as CSV.
# Column names go to the constructor so an empty score list still yields a valid frame.
gpt_boysgirls = pd.DataFrame(
    word_scores_gptboysgirls,
    columns=['G_Score', 'Count_Boys', 'Count_Girls', 'Word'],
)
gpt_boysgirls.to_csv('gpt_boysgirls.csv', index=False)

shuffle text files

In [28]:
# Pool both token lists and deal the tokens back out at random into two lists of the
# original sizes. Overall word frequencies are unchanged; only the link between a
# token and its source list is broken. Scores computed on these lists show what
# Dunning G-scores look like when there is no real difference between the groups.
fake_gptboys, fake_gptgirls = shuffle_lists(gptboys_tokens, gptgirls_tokens)

In [31]:
# Score the shuffled lists as a no-difference baseline.
fake_word_scores_gptboysgirls = score_differences(
    fake_gptboys,
    fake_gptgirls,
)
# Optional inspection: print_nicely(fake_word_scores_gptboysgirls[:30])

In [44]:
# Convert the shuffled-baseline scores to a DataFrame and save it as CSV.
# Column names go to the constructor so an empty score list still yields a valid frame.
fake_gpt_boysgirls = pd.DataFrame(
    fake_word_scores_gptboysgirls,
    columns=['G_Score_shuffled', 'Count_Boys_shuffled', 'Count_Girls_shuffled', 'Word'],
)
fake_gpt_boysgirls.to_csv('shuffled_gpt_boysgirls.csv', index=False)

### Compare girls and boys

In [34]:
# First count column = youth boys, second = youth girls
# (in the bar plot: left is more boys, right is more girls).
word_scores_boysgirls = score_differences(
    boys_tokens,
    girls_tokens,
)
# Optional inspection: print_nicely(word_scores_boysgirls[:30])

In [35]:
# Convert the scored tuples to a DataFrame and save it as CSV.
# Column names go to the constructor so an empty score list still yields a valid frame.
boysgirls = pd.DataFrame(
    word_scores_boysgirls,
    columns=['G_Score', 'Count_Boys', 'Count_Girls', 'Word'],
)
boysgirls.to_csv('boysgirls.csv', index=False)

Shuffle text files

In [36]:
# Pool the boys' and girls' tokens and deal them back out at random into two lists of
# the original sizes. Overall word frequencies are unchanged; only the link between a
# token and its source list is broken. Scores computed on these lists show what
# Dunning G-scores look like when there is no real difference between the groups.
fake_boys, fake_girls = shuffle_lists(boys_tokens, girls_tokens)

In [37]:
# Score the shuffled lists as a no-difference baseline.
fake_word_scores_boysgirls = score_differences(
    fake_boys,
    fake_girls,
)
# Optional inspection: print_nicely(fake_word_scores_boysgirls[:30])

In [45]:
# Convert the shuffled-baseline scores to a DataFrame and save it as CSV.
# Column names go to the constructor so an empty score list still yields a valid frame.
fake_boysgirls = pd.DataFrame(
    fake_word_scores_boysgirls,
    columns=['G_Score_shuffled', 'Count_Boys_shuffled', 'Count_Girls_shuffled', 'Word'],
)
fake_boysgirls.to_csv('shuffled_boysgirls.csv', index=False)

### Compare pen15 and big mouth

In [39]:
# First count column = Big Mouth, second = PEN15
# (in the bar plot: left is more Big Mouth, right is more PEN15).
word_scores_tv = score_differences(
    bigmouth_tokens,
    pen15_tokens,
)
# Optional inspection: print_nicely(word_scores_tv)

In [40]:
# Convert the scored tuples to a DataFrame and save it as CSV.
# Column names go to the constructor so an empty score list still yields a valid frame.
tv_boysgirls = pd.DataFrame(
    word_scores_tv,
    columns=['G_Score', 'Count_BigMouth', 'Count_Pen15', 'Word'],
)
tv_boysgirls.to_csv('tv_boysgirls.csv', index=False)

shuffle text files

In [41]:
# Randomly redistribute the pooled Big Mouth and PEN15 tokens into two lists of the original sizes.
fake_bigmouth, fake_pen15 = shuffle_lists(bigmouth_tokens, pen15_tokens)

In [43]:
# Score the shuffled lists as a no-difference baseline.
fake_word_scores_tv = score_differences(
    fake_bigmouth,
    fake_pen15,
)

In [46]:
# Convert the shuffled-baseline scores to a DataFrame and save it as CSV.
# Column names go to the constructor so an empty score list still yields a valid frame.
fake_tv = pd.DataFrame(
    fake_word_scores_tv,
    columns=['G_Score_shuffled', 'Count_BigMouth_shuffled', 'Count_Pen15_shuffled', 'Word'],
)
fake_tv.to_csv('tv_boysgirls_shuffled.csv', index=False)

### Compare collapsed gpt to collapsed youth

In [48]:
# Score words shared by the combined GPT tokens and the combined youth tokens.
word_scores_gpt_youth = score_differences(
    gpt_tokens,
    youth_tokens,
)

In [53]:
# Convert the scored tuples to a DataFrame and save it as CSV.
# Column names go to the constructor so an empty score list still yields a valid frame.
gpt_v_youth = pd.DataFrame(
    word_scores_gpt_youth,
    columns=['G_Score', 'Count_GPT', 'Count_Youth', 'Word'],
)
gpt_v_youth.to_csv('gpt_v_youth.csv', index=False)

shuffle text files

In [54]:
# Randomly redistribute the pooled GPT and youth tokens into two lists of the original sizes.
fake_gpt, fake_youth = shuffle_lists(gpt_tokens, youth_tokens)

In [55]:
# Score the shuffled lists as a no-difference baseline.
fake_word_scores_gpt_youth = score_differences(
    fake_gpt,
    fake_youth,
)

In [56]:
# Convert the shuffled-baseline scores to a DataFrame and save it as CSV.
# Column names go to the constructor so an empty score list still yields a valid frame.
fake_gpt_youth = pd.DataFrame(
    fake_word_scores_gpt_youth,
    columns=['G_Score_shuffled', 'Count_GPT_shuffled', 'Count_Youth_shuffled', 'Word'],
)
fake_gpt_youth.to_csv('gpt_v_youth_shuffled.csv', index=False)

### Compare gpt girls to youth girls

In [57]:
# Score words shared by the GPT girls tokens and the youth girls tokens.
word_scores_gpt_girls = score_differences(
    gptgirls_tokens,
    girls_tokens,
)

In [58]:
# Convert the scored tuples to a DataFrame and save it as CSV.
# Column names go to the constructor so an empty score list still yields a valid frame.
gpt_girls = pd.DataFrame(
    word_scores_gpt_girls,
    columns=['G_Score', 'Count_GPT_Girls', 'Count_Youth_Girls', 'Word'],
)
gpt_girls.to_csv('gpt_girls.csv', index=False)

shuffle text files

In [59]:
# Randomly redistribute the pooled GPT girls and youth girls tokens into two lists of the original sizes.
fake_gptgirls, fake_girls = shuffle_lists(gptgirls_tokens, girls_tokens)

In [60]:
# Score the shuffled lists as a no-difference baseline.
fake_word_scores_gpt_girls = score_differences(
    fake_gptgirls,
    fake_girls,
)

In [62]:
# Convert the shuffled-baseline scores to a DataFrame and save it as CSV.
# Column names go to the constructor so an empty score list still yields a valid frame.
fake_gpt_girls = pd.DataFrame(
    fake_word_scores_gpt_girls,
    columns=['G_Score_shuffled', 'Count_GPT_Girls_shuffled', 'Count_Youth_Girls_shuffled', 'Word'],
)
fake_gpt_girls.to_csv('gpt_girls_shuffled.csv', index=False)

### Compare gpt boys to youth boys

In [63]:
# Score words shared by the GPT boys tokens and the youth boys tokens.
word_scores_gpt_boys = score_differences(
    gptboys_tokens,
    boys_tokens,
)

In [67]:
# Convert the scored tuples to a DataFrame and save it as CSV.
# Column names go to the constructor so an empty score list still yields a valid frame.
gpt_boys = pd.DataFrame(
    word_scores_gpt_boys,
    columns=['G_Score', 'Count_GPT_Boys', 'Count_Youth_Boys', 'Word'],
)
gpt_boys.to_csv('gpt_boys.csv', index=False)

shuffle text files

In [65]:
# Randomly redistribute the pooled GPT boys and youth boys tokens into two lists of the original sizes.
fake_gpt_boys, fake_boys = shuffle_lists(gptboys_tokens, boys_tokens)

In [66]:
# Score the shuffled lists as a no-difference baseline.
fake_word_scores_gpt_boys = score_differences(
    fake_gpt_boys,
    fake_boys,
)

In [73]:
# Convert the shuffled-baseline scores to a DataFrame and save it as CSV.
# The frame gets its own name so it does not overwrite the shuffled token list
# `fake_gpt_boys`, which the scoring cell needs if it is re-run.
# Column names go to the constructor so an empty score list still yields a valid frame.
fake_gpt_boys_df = pd.DataFrame(
    fake_word_scores_gpt_boys,
    columns=['G_Score_Shuffled', 'Count_GPT_Boys_Shuffled', 'Count_Youth_Boys_Shuffled', 'Word'],
)
fake_gpt_boys_df.to_csv('gpt_boys_shuffled.csv', index=False)

### Compare collapsed tv to collapsed youth

In [69]:
# Score words shared by the combined TV tokens and the combined youth tokens.
word_scores_tv_youth = score_differences(
    tv_tokens,
    youth_tokens,
)

In [70]:
# Convert the scored tuples to a DataFrame and save it as CSV.
# Column names go to the constructor so an empty score list still yields a valid frame.
tv_youth = pd.DataFrame(
    word_scores_tv_youth,
    columns=['G_Score', 'Count_TV', 'Count_Youth', 'Word'],
)
tv_youth.to_csv('tv_youth.csv', index=False)

shuffle text files

In [71]:
# Randomly redistribute the pooled TV and youth tokens into two lists of the original sizes.
fake_tv, fake_youth = shuffle_lists(tv_tokens, youth_tokens)

In [72]:
# Score the shuffled lists as a no-difference baseline.
fake_word_scores_tv_youth = score_differences(
    fake_tv,
    fake_youth,
)

In [74]:
# Convert the shuffled-baseline scores to a DataFrame and save it as CSV.
# Column names go to the constructor so an empty score list still yields a valid frame.
fake_tv_youth = pd.DataFrame(
    fake_word_scores_tv_youth,
    columns=['G_Score_Shuffled', 'Count_TV_Shuffled', 'Count_Youth_Shuffled', 'Word'],
)
fake_tv_youth.to_csv('tv_youth_shuffled.csv', index=False)

### Compare pen15 to youth girls

In [75]:
# Score words shared by the PEN15 tokens and the youth girls tokens.
word_scores_pen15_girls = score_differences(
    pen15_tokens,
    girls_tokens,
)

In [76]:
# Convert the scored tuples to a DataFrame and save it as CSV.
# Column names go to the constructor so an empty score list still yields a valid frame.
pen15_girls = pd.DataFrame(
    word_scores_pen15_girls,
    columns=['G_Score', 'Count_Pen15', 'Count_Girls', 'Word'],
)
pen15_girls.to_csv('pen15_girls.csv', index=False)

shuffle text files

In [77]:
# Randomly redistribute the pooled PEN15 and youth girls tokens into two lists of the original sizes.
fake_pen15, fake_girls = shuffle_lists(pen15_tokens, girls_tokens)

In [78]:
# Score the shuffled PEN15 / youth girls lists as a no-difference baseline.
# (This previously scored fake_tv and fake_youth, left over from the TV-vs-youth section.)
fake_word_scores_pen15_girls = score_differences(fake_pen15, fake_girls)

In [79]:
# Convert the shuffled-baseline scores to a DataFrame and save it as CSV.
# Column names go to the constructor so an empty score list still yields a valid frame.
fake_pen15_girls = pd.DataFrame(
    fake_word_scores_pen15_girls,
    columns=['G_Score_Shuffled', 'Count_Pen15_Shuffled', 'Count_Girls_Shuffled', 'Word'],
)
fake_pen15_girls.to_csv('pen15_girls_shuffled.csv', index=False)

### Compare bigmouth to youth boys

In [80]:
# Score words shared by the Big Mouth tokens and the youth boys tokens.
word_scores_bigmouth_boys = score_differences(
    bigmouth_tokens,
    boys_tokens,
)

In [81]:
# Convert the scored tuples to a DataFrame and save it as CSV.
# Column names go to the constructor so an empty score list still yields a valid frame.
bigmouth_boys = pd.DataFrame(
    word_scores_bigmouth_boys,
    columns=['G_Score', 'Count_BigMouth', 'Count_Boys', 'Word'],
)
bigmouth_boys.to_csv('bigmouth_boys.csv', index=False)

shuffle text files

In [82]:
# Randomly redistribute the pooled Big Mouth and youth boys tokens into two lists of the original sizes.
fake_bigmouth, fake_boys = shuffle_lists(bigmouth_tokens, boys_tokens)

In [83]:
# Score the shuffled lists as a no-difference baseline.
fake_word_scores_bigmouth_boys = score_differences(
    fake_bigmouth,
    fake_boys,
)

In [84]:
# Convert the shuffled-baseline scores to a DataFrame and save it as CSV.
# Column names go to the constructor so an empty score list still yields a valid frame.
fake_bigmouth_boys = pd.DataFrame(
    fake_word_scores_bigmouth_boys,
    columns=['G_Score_Shuffled', 'Count_BigMouth_Shuffled', 'Count_Boys_Shuffled', 'Word'],
)
fake_bigmouth_boys.to_csv('bigmouth_boys_shuffled.csv', index=False)

### Compare collapsed tv to collapsed gpt

In [4]:
# Score words shared by the combined TV tokens and the combined GPT tokens.
word_scores_tv_gpt = score_differences(
    tv_tokens,
    gpt_tokens,
)

In [5]:
# Convert the scored tuples to a DataFrame and save it as CSV.
# Column names go to the constructor so an empty score list still yields a valid frame.
tv_gpt = pd.DataFrame(
    word_scores_tv_gpt,
    columns=['G_Score', 'Count_TV', 'Count_GPT', 'Word'],
)
tv_gpt.to_csv('tv_gpt.csv', index=False)

shuffle text files

In [6]:
# Randomly redistribute the pooled TV and GPT tokens into two lists of the original sizes.
fake_tv, fake_gpt = shuffle_lists(tv_tokens, gpt_tokens)

In [7]:
# Score the shuffled lists as a no-difference baseline.
fake_word_scores_tv_gpt = score_differences(
    fake_tv,
    fake_gpt,
)

In [9]:
# Convert the shuffled-baseline scores to a DataFrame and save it as CSV.
# Column names go to the constructor so an empty score list still yields a valid frame.
fake_tv_gpt = pd.DataFrame(
    fake_word_scores_tv_gpt,
    columns=['G_Score_Shuffled', 'Count_TV_Shuffled', 'Count_GPT_Shuffled', 'Word'],
)
fake_tv_gpt.to_csv('tv_gpt_shuffled.csv', index=False)

### Compare pen15 to gpt girls

In [10]:
# Score words shared by the PEN15 tokens and the GPT girls tokens.
word_scores_pen15_gptgirls = score_differences(
    pen15_tokens,
    gptgirls_tokens,
)

In [11]:
# Convert the scored tuples to a DataFrame and save it as CSV.
# Column names go to the constructor so an empty score list still yields a valid frame.
pen15_gptgirls = pd.DataFrame(
    word_scores_pen15_gptgirls,
    columns=['G_Score', 'Count_Pen15', 'Count_GPT_Girls', 'Word'],
)
pen15_gptgirls.to_csv('pen15_gptgirls.csv', index=False)

shuffle text files

In [13]:
# Randomly redistribute the pooled PEN15 and GPT girls tokens into two lists of the original sizes.
# (This previously shuffled against girls_tokens, the youth girls corpus, which is not
# the corpus compared in this section.)
(fake_pen15, fake_gptgirls) = shuffle_lists(pen15_tokens, gptgirls_tokens)

In [14]:
# Score the shuffled lists as a no-difference baseline.
fake_word_scores_pen15_gptgirls = score_differences(
    fake_pen15,
    fake_gptgirls,
)

In [15]:
# Convert the shuffled-baseline scores to a DataFrame and save it as CSV.
# The score column is named 'G_Score_Shuffled' (was 'G_Score') so it matches the
# other shuffled tables and cannot be confused with the real scores.
# Column names go to the constructor so an empty score list still yields a valid frame.
fake_pen15_gptgirls = pd.DataFrame(
    fake_word_scores_pen15_gptgirls,
    columns=['G_Score_Shuffled', 'Count_Pen15_Shuffled', 'Count_GPT_Girls_Shuffled', 'Word'],
)
fake_pen15_gptgirls.to_csv('pen15_gptgirls_shuffled.csv', index=False)

### Compare bigmouth to gpt boys

In [16]:
# Score words shared by the Big Mouth tokens and the GPT boys tokens.
word_scores_bigmouth_gptboys = score_differences(
    bigmouth_tokens,
    gptboys_tokens,
)

In [18]:
# Convert the scored tuples to a DataFrame and save it as CSV.
# Column names go to the constructor so an empty score list still yields a valid frame.
bigmouth_gptboys = pd.DataFrame(
    word_scores_bigmouth_gptboys,
    columns=['G_Score', 'Count_BigMouth', 'Count_GPT_Boys', 'Word'],
)
bigmouth_gptboys.to_csv('bigmouth_gptboys.csv', index=False)

shuffle text files

In [19]:
# Randomly redistribute the pooled Big Mouth and GPT boys tokens into two lists of the original sizes.
fake_bigmouth, fake_gptboys = shuffle_lists(bigmouth_tokens, gptboys_tokens)

In [20]:
# Score the shuffled lists as a no-difference baseline.
fake_word_scores_bigmouth_gptboys = score_differences(
    fake_bigmouth,
    fake_gptboys,
)

In [21]:
# Convert the shuffled-baseline scores to a DataFrame and save it as CSV.
# The score column is named 'G_Score_Shuffled' (was 'G_Score') so it matches the
# other shuffled tables and cannot be confused with the real scores.
# Column names go to the constructor so an empty score list still yields a valid frame.
fake_bigmouth_gptboys = pd.DataFrame(
    fake_word_scores_bigmouth_gptboys,
    columns=['G_Score_Shuffled', 'Count_BigMouth_Shuffled', 'Count_GPT_Boys_Shuffled', 'Word'],
)
fake_bigmouth_gptboys.to_csv('bigmouth_gptboys_shuffled.csv', index=False)