In [1]:
import matplotlib as mpl
import matplotlib.pyplot as plt

%matplotlib inline
mpl.style.use('bmh')

In [2]:
import pandas as pd
import us

from collections import OrderedDict
from wordfreq import top_n_list
from wordcloud import WordCloud

from twitter_geo.state_word_counts.db import StateWordCount

In [3]:
states = [s.name.lower() for s in us.states.STATES]

In [4]:
cities = [
    'new york',
    'los angeles',
    'chicago',
    'houston',
    'phoenix',
    'philadelphia',
    'san antonio',
    'san diego',
    'dallas',
    'san jose',
    'austin',
    'jacksonville',
    'san francisco',
    'columbus',
    'indianapolis',
    'fort worth',
    'charlotte',
    'seattle',
    'denver',
    'el paso',
    'washington',
    'boston',
    'detroit',
    'nashville',
    'memphis',
    'portland',
    'oklahoma city',
    'las vegas',
    'louisville',
    'baltimore',
    'milwaukee',
    'albuquerque',
    'tuscon',
    'fresno',
    'sacramento',
    'mesa',
    'kansas city',
    'atlanta',
    'long beach',
    'colorado springs',
    'raleigh',
    'miami',
    'virginia beach',
    'omaha',
    'oakland',
    'minneapolis',
    'tulsa',
    'arlington',
    'new orleans',
    'wichita',
]

In [5]:
def make_geo_set(names):
    geo = set()
    for name in names:
        geo.update(name.split())
    return geo

In [6]:
city_tokens = make_geo_set(cities)

In [7]:
state_tokens = make_geo_set(states)

In [8]:
stopwords = set(top_n_list('en', 50))

In [9]:
blacklist = set.union(stopwords, city_tokens, state_tokens)

In [10]:
state_counts = StateWordCount.state_counts()

In [15]:
token_counts = StateWordCount.token_counts(10000)

In [16]:
import math

def ttest(key1, key2):

    data = []
    for token in token_counts.keys():

        if token in blacklist or len(token) < 3:
            continue
            
        # Total count of key1 + key2.
        n = state_counts[key1] + state_counts[key2]
        
        p_key1 = state_counts[key1] / n
        
        key1_token_count = StateWordCount.state_token_count(key1, token) or 0
        key2_token_count = StateWordCount.state_token_count(key2, token) or 0
        p_token = (key1_token_count + key2_token_count) / n
        
        p_key1_token = key1_token_count
            
        
            
            
            
        # Total count of key1 + key2.
        n = state_counts[key1] + state_counts[key2]
        
        key1_token_count = StateWordCount.state_token_count(key1, token) or 0
        key2_token_count = StateWordCount.state_token_count(key2, token) or 0
        
        token_count = key1_token_count + key2_token_count
        
        p_key1 = state_counts[key1] / n
        
        key1_token_count = StateWordCount.state_token_count(key1, token) or 0
        key2_token_count = StateWordCount.state_token_count(key2, token) or 0
        
        p_token = (key1_token_count + key2_token_count) / n
        
        p = key1_token_count / state_counts[key1]
        
        h0 = p_key1 * p_token
        
        var = p * (1-p)
        
        t = (p - h0) / (math.sqrt(var / n))
        
        data.append((token, t))
    
    df = pd.DataFrame(data, columns=('token', 't'))
    
    return df.sort_values('t', ascending=False)

In [20]:
ttest('MA', 'WI')

Unnamed: 0,token,t
5,now,163.926507
0,amp,157.945245
4,how,147.525106
3,more,146.332733
16,via,136.442013
1,get,134.368833
11,our,130.184352
2,don,128.634727
7,people,121.574140
6,love,118.539899
