In [36]:
import shelve
import math
import pandas as pd
import numpy as np
from nltk.tokenize import TweetTokenizer
from nltk.util import ngrams
from read_tweets import parse_raw_tweets
from tokenizer import tokenize

In [37]:
# Load Tweet Data
# parse_raw_tweets is called before the shelf is read; presumably it
# (re)builds the shelf from the raw batch file -- TODO confirm in read_tweets.
batch_path = '../private_data/batch.txt'
parse_raw_tweets(batch_path)
# Use the shelf as a context manager so it is closed even if a key is missing.
# NOTE(review): shelve unpickles its contents; only open shelves you trust.
with shelve.open('../private_data/tweet_processing') as tweet_shelf:
    users_df = tweet_shelf['users_df']
    tweets_df = tweet_shelf['tweets_df']

In [38]:
# Load location data
# Use the shelf as a context manager so it is closed even if a key is missing.
# NOTE(review): shelve unpickles its contents; only open shelves you trust.
with shelve.open('../public_data/location_data') as location_shelf:
    state_strings = location_shelf['state_strings']
    states_df = location_shelf['states_df']
    states_dict = location_shelf['states_dict']
    all_entities = location_shelf['all_entities']

### Combining with Location data

In [39]:
def get_ngrams(text, n):
    """Return the n-grams of ``text`` as space-joined strings.

    The text is split with nltk's ``TweetTokenizer`` and every run of ``n``
    consecutive tokens is joined with a single space.

    Args:
        text: String to tokenize.
        n: Number of tokens per n-gram.

    Returns:
        A list of strings, one per n-gram, in order of appearance.
    """
    words = TweetTokenizer().tokenize(text)
    joined = []
    for gram in ngrams(words, n):
        joined.append(' '.join(gram))
    return joined
def state_from_loc_str(loc_s):
    """Extract the US states mentioned in a free-text location string.

    The string is tokenized with ``TweetTokenizer`` and scanned left to right.
    At each position a two-token match against ``state_strings`` is preferred
    over a one-token match, so "West Virginia" is not also counted as
    "Virginia". Matches are translated through ``states_dict``.

    Args:
        loc_s: Location string. Non-string values (e.g. None/NaN for a
            missing location) are treated as containing no state.

    Returns:
        A list of distinct state names in order of first appearance.
        If 'District of Columbia' is among them, only
        ``['District of Columbia']`` is returned (so "Washington, DC" is not
        also attributed to another matched state).
    """
    # A missing location cannot be tokenized; report no match instead of raising.
    if not isinstance(loc_s, str):
        return []

    tokens = TweetTokenizer().tokenize(loc_s)
    states = []
    i = 0
    while i < len(tokens):
        bigram = ' '.join(tokens[i:i + 2])
        if i + 1 < len(tokens) and bigram in state_strings:
            # Consume both tokens so the second word is not re-matched alone.
            match, step = bigram, 2
        elif tokens[i] in state_strings:
            match, step = tokens[i], 1
        else:
            match, step = None, 1
        if match is not None:
            state = states_dict[match]
            # De-duplicate: the caller explodes this list into rows, so a
            # repeated state would duplicate the tweet.
            if state not in states:
                states.append(state)
        i += step

    if 'District of Columbia' in states:
        return ['District of Columbia']
    return states

In [40]:
def city_search(loc_s, entities=None):
    """Guess a state for a location string by looking for known place names.

    Every row of ``entities`` whose ``name`` occurs as a substring of
    ``loc_s`` is a candidate; the candidate with the largest ``pop`` wins.

    Args:
        loc_s: Location string. Non-string or empty values are treated as
            having no match.
        entities: DataFrame with ``name``, ``pop``, ``state`` and
            ``is_foreign`` columns. Defaults to the notebook-level
            ``all_entities`` table.

    Returns:
        The winning row's ``state`` if it is set, ``'foreign'`` if the row has
        no state but is flagged ``is_foreign``, and ``'no match'`` otherwise
        (including when nothing matched).
    """
    if entities is None:
        entities = all_entities
    # A missing location (None/NaN) cannot be substring-searched.
    if not isinstance(loc_s, str) or not loc_s:
        return 'no match'

    is_candidate = entities['name'].apply(
        lambda name: isinstance(name, str) and name in loc_s
    )
    candidates = entities[is_candidate]
    if candidates.empty:
        return 'no match'

    # Only the most populous candidate is used, so no de-duplication is needed.
    top = candidates.sort_values('pop', ascending=False).iloc[0]
    state = top['state']
    # pd.notna treats both None and NaN as "no state".
    if pd.notna(state):
        return state
    is_foreign = top['is_foreign']
    if pd.notna(is_foreign) and is_foreign:
        return 'foreign'
    # Matched an entity with neither a state nor a foreign flag: return the
    # explicit sentinel rather than falling through to an implicit None.
    return 'no match'

In [41]:
# Join each tweet to its author, derive both location guesses, then give every
# directly matched state its own row.
merged = (
    tweets_df
    .merge(users_df, on='author_id')
    .assign(
        states=lambda df: df['location'].apply(state_from_loc_str),
        state_from_city=lambda df: df['location'].apply(city_search),
    )
    .explode('states')
)
# Precedence rules for final_state:
# - A state extracted straight from the location string wins; the city search
#   is only the fallback, so state matching beats foreign city matching.
# - A location matching several US states produces one row per matched state.
# - When a name matches both a US city and a foreign city, the entry with the
#   highest population is used as the estimate.
merged['final_state'] = np.where(merged['states'].notna(), merged['states'], merged['state_from_city'])
merged.head()

Unnamed: 0,author_id,tweet_id,tweet_text,location,states,state_from_city,final_state
0,197715581,1271114681168658435,RT @keithboykin: General Mark Milley apologize...,Nederland,,Texas,Texas
1,3411140663,1271114680996630530,RT @charliekirk11: Do you think if Trump suppo...,"Ponte Vedra Beach, FL",Florida,foreign,Florida
2,882582543400402944,1271114680849793024,RT @charliekirk11: Do you think if Trump suppo...,USA,,no match,no match
3,362306960,1271114680753414145,RT @AprilDRyan: Pastor Darrell Scott is the ar...,where I'm at,,no match,no match
4,550561121,1271114680463998977,RT @bessbell: I say this as a Jewish person wh...,"Virginia, USA",Virginia,Virginia,Virginia
