In [1]:
import sys
import json
import nltk
import operator
from collections import Counter

In [2]:
def read_tweets_from_json(filename):
    """Return the "text" field of every tweet stored in a JSON file.

    filename: path to a UTF-8 encoded JSON file whose top-level value is an
        iterable of objects, each carrying a "text" key.

    Returns a list with one text value per object, in file order.
    """
    with open(filename, encoding="utf-8") as handle:
        records = json.load(handle)
    return [record["text"] for record in records]

In [3]:
# Load the "text" of every tweet in the Trump JSON dump (path is relative to the notebook).
trump_tweets=read_tweets_from_json("../data/trump_tweets.json")

In [4]:
# Load the "text" of every tweet in the AOC JSON dump (path is relative to the notebook).
aoc_tweets=read_tweets_from_json("../data/aoc_tweets.json")

In [6]:
# Number of tweets loaded per account, one per line (Trump first, then AOC).
print(len(trump_tweets), len(aoc_tweets), sep="\n")

36583
3195


Explore your assumptions about the words you think will most distinguish the tweets of Donald Trump from those of Alexandria Ocasio-Cortez.  Before looking at the data, what words do you think will be comparatively distinctive of each?  (If you're not familiar with either, see http://twitter.com/realDonaldTrump and http://twitter.com/AOC).

In [None]:
## Hypothesis:
## AOC: women, climate change, democracy
## Trump: wall, immigrants, America 

In [7]:
def convert_tweets_to_tokens(tweets):
    """Tokenize every tweet and return all tokens as a single flat list.

    tweets: iterable of tweet strings.

    Each tweet is split with nltk.casual_tokenize; the per-tweet token lists
    are concatenated in the order the tweets are given.
    """
    return [
        token
        for text in tweets
        for token in nltk.casual_tokenize(text)
    ]

In [8]:
def get_counts(tokens):
    """Count how many times each token occurs.

    tokens: iterable of hashable tokens.

    Returns a collections.Counter mapping each token to its number of
    occurrences.
    """
    # A generator is passed (rather than `tokens` itself) so every element
    # yielded by iteration is tallied exactly once, whatever the container.
    return Counter(token for token in tokens)

The $\chi^2$ test, as used in the comparison of different texts, is designed to measure how statistically significant the distribution of counts in a 2x2 contingency table is.  Use the following function to analyze the difference between these accounts.  How do the most distinct terms comport with your assumptions?

In [15]:
def chi_square(one_counts, two_counts, one_label="@realdonaldtrump",
               two_label="@AOC", top_n=20):
    """Print the terms that most distinguish two corpora by chi-square score.

    For every word in the union of both vocabularies a 2x2 contingency table
    (word / not-word  x  corpus one / corpus two) is scored with the
    closed-form chi-square of Manning and Schuetze (1999), equation 5.7:
    https://nlp.stanford.edu/fsnlp/promo/colloc.pdf

    one_counts, two_counts: mappings from word to count (a Counter or a plain
        dict); a word missing from one mapping is treated as count 0.
    one_label, two_label: headings printed above each ranked list.
    top_n: how many words to print per corpus (None prints all of them).

    Words are ranked by descending score and assigned to the corpus in which
    their relative frequency is strictly higher; ties go to the second corpus.
    When the chi-square denominator is zero (an empty corpus, or a word that
    makes up an entire margin) the statistic is undefined and is reported as
    0.0 instead of raising ZeroDivisionError.

    Returns None; the result is written to stdout.
    """

    ## number of words in one / in two (kept as floats, as the formula expects)
    one_sum = sum(one_counts.values(), 0.)
    two_sum = sum(two_counts.values(), 0.)
    N = one_sum + two_sum

    # Ordered union of the vocabularies: words of `one` first, then words only
    # in `two`. The order decides how ties come out of the stable sort below.
    vocab = dict.fromkeys(one_counts)
    vocab.update(dict.fromkeys(two_counts))

    vals = {}
    for word in vocab:
        # .get() keeps plain dicts from raising KeyError on unshared words.
        O11 = one_counts.get(word, 0)
        O12 = two_counts.get(word, 0)
        O21 = one_sum - O11
        O22 = two_sum - O12

        denominator = (O11 + O12)*(O11 + O21)*(O12 + O22)*(O21 + O22)
        if denominator:
            vals[word] = (N*(O11*O22 - O12*O21)**2)/denominator
        else:
            # Degenerate table: no evidence of a difference can be computed.
            vals[word] = 0.0

    def relative_frequency(count, total):
        # An empty corpus contributes a relative frequency of 0 for any word.
        return count/total if total else 0.0

    sorted_chi = sorted(vals.items(), key=operator.itemgetter(1), reverse=True)
    one = []
    two = []
    for k, v in sorted_chi:
        one_freq = relative_frequency(one_counts.get(k, 0), one_sum)
        two_freq = relative_frequency(two_counts.get(k, 0), two_sum)
        if one_freq > two_freq:
            one.append(k)
        else:
            two.append(k)

    print("%s:\n" % one_label)
    for k in one[:top_n]:
        print("%s\t%s" % (k, vals[k]))

    print("\n\n%s:\n" % two_label)
    for k in two[:top_n]:
        print("%s\t%s" % (k, vals[k]))

In [9]:
# Flatten all Trump tweets into one token list, then tally each token.
trump_tokens=convert_tweets_to_tokens(trump_tweets)
trump_counts=get_counts(trump_tokens)

In [10]:
# Flatten all AOC tweets into one token list, then tally each token.
aoc_tokens=convert_tweets_to_tokens(aoc_tweets)
aoc_counts=get_counts(aoc_tokens)

In [14]:
# Number of distinct tokens seen in the Trump tweets.
len(trump_counts)

55379

In [13]:
# Number of distinct tokens seen in the AOC tweets.
len(aoc_counts)

11885

In [16]:
# Print the 20 highest-scoring chi-square terms for each account (Trump is corpus one, AOC corpus two).
chi_square(trump_counts, aoc_counts)

@realdonaldtrump:

"	1846.0471326639422
@realDonaldTrump	768.6947014444459
!	737.1138026377167
.	393.1171330535697
Trump	308.693399407266
will	226.67776556672456
great	206.63912488977334
Donald	139.52629445728704
Obama	122.59656855049468
Thanks	118.86880413791343
be	108.4550504242881
...	106.49875120719744
Great	103.66802373829586
he	101.60223124851736
President	79.99338961443827
#Trump2016	74.30069252201419
president	71.84224339299604
?	71.53755970078453
his	69.52428837154493
U	68.89137238692489


@AOC:

…	15775.033127020248
@Ocasio2018	6510.979769421301
RT	5528.185337780998
💜	2088.561687592688
’	1629.2190140789667
🏽	1458.0959881309388
*	988.0513935775591
Queens	945.9898232954332
Bronx	923.9200653261158
+	791.8713283128353
Ocasio-Cortez	746.9189836450635
Alexandria	711.1734096344572
@AOC	667.695041657371
💪	607.1218070676405
Ocasio	599.7897995008764
s	522.0862730914129
re	521.2344195370262
progressive	507.5371612862981
Crowley	496.2260907784563
️	472.12741963295275


We saw earlier that $\chi^2$ is not a perfect estimator since it doesn't account for the burstiness of language (the tendency of mentions of the same word to clump together in a text).  Do you expect this to still hold on Twitter?  Why or why not?  How are the differences identified by $\chi^2$ similar to those identified by Mann-Whitney?

In [None]:
def 