In [1]:
import sys
import json
import nltk
import operator
from collections import Counter

In [2]:
def read_tweets_from_json(filename):
    """Load a JSON file of tweet objects and return their text fields.

    The file is opened as UTF-8 and must contain a JSON array (or other
    iterable) of objects that each have a "text" key.

    Args:
        filename: path of the JSON file to read.

    Returns:
        A list with the "text" value of every tweet, in file order.
    """
    with open(filename, encoding="utf-8") as handle:
        return [entry["text"] for entry in json.load(handle)]

In [3]:
# Load the text of every tweet in the Trump data file.
trump_tweets = read_tweets_from_json("../data/trump_tweets.json")

In [4]:
# Load the text of every tweet in the AOC data file.
aoc_tweets = read_tweets_from_json("../data/aoc_tweets.json")

Explore your assumptions about which words will most distinguish the tweets of Donald Trump from those of Alexandria Ocasio-Cortez.  Before looking at the data, what words do you think will be comparatively distinctive of each?  (If you're not familiar with either, see http://twitter.com/realDonaldTrump and http://twitter.com/AOC).

In [5]:
def convert_tweets_to_tokens(tweets):
    """Flatten a collection of tweets into a single list of tokens.

    Each tweet is split with ``nltk.casual_tokenize`` and the resulting
    tokens are concatenated in tweet order.

    Args:
        tweets: iterable of tweet strings.

    Returns:
        One flat list containing the tokens of every tweet.
    """
    return [
        token
        for text in tweets
        for token in nltk.casual_tokenize(text)
    ]

In [6]:
def get_counts(tokens):
    """Tally how many times each token occurs.

    Args:
        tokens: iterable of hashable tokens.

    Returns:
        A ``collections.Counter`` mapping each distinct token to its
        number of occurrences.
    """
    # Feeding a generator (rather than `tokens` itself) makes Counter count
    # the iterated elements one by one, whatever container type was passed.
    return Counter(token for token in tokens)

The $\chi^2$ test, as used to compare different texts, measures how statistically significant the distribution of counts in a 2x2 contingency table is.  Use the following function to analyze the difference between these accounts.  How do the most distinct terms comport with your assumptions?

In [7]:
def _chi_square_scores(one_counts, two_counts):
    """Score every word seen in either count table with a 2x2 chi-square.

    Returns a tuple ``(scores, one_sum, two_sum)``: ``scores`` maps each word
    to its chi-square value, and the sums are the total counts of the two
    tables as floats.
    """
    # Start from 0.0 so every later quantity is a float, as before.
    one_sum = sum(one_counts.values(), 0.0)
    two_sum = sum(two_counts.values(), 0.0)
    N = one_sum + two_sum

    # Insertion-ordered vocabulary: the first table's words, then any words
    # that only occur in the second.  Keeping this order makes the ranking
    # of tied scores deterministic (the sort below is stable).
    vocab = dict.fromkeys(one_counts)
    vocab.update(dict.fromkeys(two_counts))

    scores = {}
    for word in vocab:
        # .get() lets plain dicts work as well as Counters.
        O11 = one_counts.get(word, 0)
        O12 = two_counts.get(word, 0)
        O21 = one_sum - O11
        O22 = two_sum - O12

        # We'll use the simpler form given in Manning and Schuetze (1999)
        # for 2x2 contingency tables:
        # https://nlp.stanford.edu/fsnlp/promo/colloc.pdf, equation 5.7
        denominator = (O11 + O12) * (O11 + O21) * (O12 + O22) * (O21 + O22)
        if denominator == 0:
            # A zero row or column total (an empty table, a one-word
            # vocabulary, or a zero-count entry) also makes the numerator 0;
            # score it as "no evidence" instead of dividing by zero.
            scores[word] = 0.0
        else:
            scores[word] = (N * (O11 * O22 - O12 * O21) ** 2) / denominator

    return scores, one_sum, two_sum


def chi_square(one_counts, two_counts, one_label="@realdonaldtrump",
               two_label="@AOC", top_n=20):
    """Print the words that most distinguish two token-count tables.

    Every word occurring in either table gets a chi-square score from its
    2x2 contingency table (this word vs. all other words, table one vs.
    table two).  Words are ranked by descending score and assigned to the
    table in which their relative frequency is strictly higher; ties go to
    the second table.  The ``top_n`` highest-scoring words for each table
    are printed as ``word<TAB>score`` under that table's label.

    Args:
        one_counts: mapping of word -> count for the first source
            (a ``Counter`` or a plain ``dict``).
        two_counts: mapping of word -> count for the second source.
        one_label: heading printed above the first source's words.
        two_label: heading printed above the second source's words.
        top_n: maximum number of words printed per source.

    Returns:
        None; the report is written to standard output.
    """
    vals, one_sum, two_sum = _chi_square_scores(one_counts, two_counts)

    one = []
    two = []
    # sorted() is stable, so equal scores keep vocabulary order.
    for k in sorted(vals, key=vals.get, reverse=True):
        # Guard the relative frequencies so an empty table cannot raise.
        one_rate = one_counts.get(k, 0) / one_sum if one_sum else 0.0
        two_rate = two_counts.get(k, 0) / two_sum if two_sum else 0.0
        if one_rate > two_rate:
            one.append(k)
        else:
            two.append(k)

    print("%s:\n" % (one_label,))
    for k in one[:top_n]:
        print("%s\t%s" % (k, vals[k]))

    print("\n\n%s:\n" % (two_label,))
    for k in two[:top_n]:
        print("%s\t%s" % (k, vals[k]))

In [8]:
# Tokenize the Trump tweets, then tally how often each token occurs.
trump_tokens = convert_tweets_to_tokens(trump_tweets)
trump_counts = get_counts(trump_tokens)

In [9]:
# Tokenize the AOC tweets, then tally how often each token occurs.
aoc_tokens = convert_tweets_to_tokens(aoc_tweets)
aoc_counts = get_counts(aoc_tokens)

In [10]:
# Print the highest-scoring chi-square terms for each account
# (Trump's counts are the first table, AOC's the second).
chi_square(trump_counts, aoc_counts)

@realdonaldtrump:

"	1836.5978163266286
@realDonaldTrump	765.3957802230726
!	730.2198468922954
.	385.17047434426564
Trump	307.07605091108377
will	225.080703561608
great	205.48410043237178
Donald	138.89657461273904
Obama	122.04741592424662
Thanks	118.23258945517674
be	107.05307572399745
...	105.73645950394442
Great	103.12559394526428
he	100.91439311458447
President	79.4996887480999
#Trump2016	73.98301679890264
president	71.50269968536783
?	70.64265522231581
his	68.99794812850777
U	68.52167356066273


@AOC:

…	15848.594909887808
@Ocasio2018	6538.9447503114425
RT	5556.5061575955115
💜	2097.5568448843
’	1641.361057546218
*	992.541514459823
Queens	950.0773593348507
Bronx	927.9916062679271
+	795.6276763073928
Ocasio-Cortez	750.126729044148
Alexandria	714.2360862065979
@AOC	670.5625452449386
️	617.2331111090801
Ocasio	602.365672234485
💪🏽	568.2675814675245
s	526.7096314218563
re	524.0649876632918
progressive	509.7252317377511
Crowley	498.3655797345922
NY	456.801919843675


We saw earlier that $\chi^2$ is not a perfect estimator since it doesn't account for the burstiness of language (the tendency of mentions of the same word to clump together in a text).  Do you expect this to still hold on Twitter?  Why or why not?  How are the differences identified by the $\chi^2$ test similar to those identified by the Mann-Whitney test?