In [316]:
import shelve
import pandas as pd
from nltk.tokenize import TweetTokenizer
from nltk.util import ngrams
pd.set_option('display.max_rows', 500)
from read_tweets import parse_raw_tweets
from tokenizer import tokenize

In [317]:
batch_path = '../private_data/batch.txt'
# NOTE(review): parse_raw_tweets presumably writes the 'users_df' and
# 'tweets_df' entries read from the shelf below -- confirm in read_tweets.
parse_raw_tweets(batch_path)

# shelve unpickles whatever is stored in the file, so only open shelves that
# were produced by trusted code. The context manager guarantees the shelf is
# closed even when one of the key lookups raises.
with shelve.open('../private_data/tweet_processing') as shelf:
    users_df = shelf['users_df']
    tweets_df = shelf['tweets_df']
tweets_df.head()

Unnamed: 0,author_id,tweet_id,tweet_text
0,197715581,1271114681168658435,RT @keithboykin: General Mark Milley apologize...
1,3411140663,1271114680996630530,RT @charliekirk11: Do you think if Trump suppo...
2,882582543400402944,1271114680849793024,RT @charliekirk11: Do you think if Trump suppo...
3,362306960,1271114680753414145,RT @AprilDRyan: Pastor Darrell Scott is the ar...
4,949811328121794561,1271114680610820098,RT @thehill: JUST IN: Trump admin proposes cra...


In [318]:
# Attach each author's profile columns to their tweets. The join is pandas'
# default inner join on author_id, so tweets without a matching user row are
# not carried into `merged`.
merged = tweets_df.merge(users_df, on='author_id')
merged.head()

Unnamed: 0,author_id,tweet_id,tweet_text,location
0,197715581,1271114681168658435,RT @keithboykin: General Mark Milley apologize...,Nederland
1,3411140663,1271114680996630530,RT @charliekirk11: Do you think if Trump suppo...,"Ponte Vedra Beach, FL"
2,882582543400402944,1271114680849793024,RT @charliekirk11: Do you think if Trump suppo...,USA
3,362306960,1271114680753414145,RT @AprilDRyan: Pastor Darrell Scott is the ar...,where I'm at
4,550561121,1271114680463998977,RT @bessbell: I say this as a Jewish person wh...,"Virginia, USA"


### Combining with Location data

In [319]:
import shelve
# shelve unpickles whatever is stored in the file, so only open shelves that
# were produced by trusted code. The context manager guarantees the shelf is
# closed even when one of the key lookups raises.
with shelve.open('../public_data/location_data') as shelf:
    state_strings = shelf['state_strings']
    foreign_entities = shelf['foreign_entities_df']
    states_df = shelf['states_df']
    states_dict = shelf['states_dict']
    all_entities = shelf['all_entities']
foreign_entities.head()

Unnamed: 0,name,pop,is_foreign,state,raw_name_string
4,Adrar,200834,True,,
5,Ain Defla,450280,True,,
6,Ain Temouchent,299341,True,,
7,ALGIERS (EL DJAZAIR),2712944,True,,
8,Annaba,442230,True,,


In [320]:
def is_foreign(s, names=None):
    """Return True when any known foreign entity name occurs inside ``s``.

    Matching is a plain, case-sensitive substring test (``name in s``); it is
    not anchored to word boundaries.
    NOTE(review): the notebook output flags locations such as "Portland, OR"
    as foreign, which looks like a side effect of substring matching --
    confirm whether that is intended.

    Parameters
    ----------
    s : str
        Free-text location string. Any non-string value (for example the
        NaN/None of a missing location) is treated as "not foreign".
    names : iterable of str, optional
        Candidate entity names. Defaults to ``foreign_entities['name']``.
        Non-string entries are skipped.

    Returns
    -------
    bool
        True if at least one name is a substring of ``s``.
    """
    # A missing location cannot match anything; without this guard
    # ``name in s`` raises TypeError on a float NaN.
    if not isinstance(s, str):
        return False
    if names is None:
        names = foreign_entities['name']
    # any() stops at the first hit instead of collecting every match.
    return any(isinstance(n, str) and n in s for n in names)

# Flag every row whose location text contains a known foreign entity name,
# then look at the last 100 flagged rows to eyeball the matches.
merged['is_foreign'] = merged['location'].map(is_foreign)
merged.loc[merged['is_foreign']].tail(100)

Unnamed: 0,author_id,tweet_id,tweet_text,location,is_foreign
1,3411140663,1271114680996630530,RT @charliekirk11: Do you think if Trump suppo...,"Ponte Vedra Beach, FL",True
8,2259766448,1271114679432040448,RT @FriendEden100: Health officials warn that ...,YYC That's in Canada eh !,True
9,4748941452,1271114679046123522,It feels like Biden and trump are both activel...,"Portland, OR",True
13,382662642,1271114678333247490,RT @politvidchannel: BREAKING: the mayor of Se...,"New York, USA",True
16,457610431,1271114677586743297,"RT @Lexual__: No but seriously, the whole thin...","Cleveland, OH",True
17,1426563144,1271114677536219138,"RT @Lexual__: No but seriously, the whole thin...","Los Angeles, CA",True
23,1320758359,1271114677120974848,"RT @Lexual__: No but seriously, the whole thin...","California, USA",True
27,1238467556962381824,1271114676160598021,@CameronRidle That “Unplug” will occur Novembe...,"Indianapolis, IN",True
33,30706941,1271114674130608130,RT @keithboykin: General Mark Milley apologize...,New York,True
34,102532798,1271114674004561920,"RT @TeaPainUSA: Trump calls for ""Law and Order...","Hamilton, Ontario, Canada",True


In [321]:
# Independent copy of just the tweet columns, so later changes to `to_append`
# do not write through to `merged`.
to_append = merged.loc[:, ['author_id', 'tweet_id', 'tweet_text']].copy()
to_append.head(3)

Unnamed: 0,author_id,tweet_id,tweet_text
0,197715581,1271114681168658435,RT @keithboykin: General Mark Milley apologize...
1,3411140663,1271114680996630530,RT @charliekirk11: Do you think if Trump suppo...
2,882582543400402944,1271114680849793024,RT @charliekirk11: Do you think if Trump suppo...


In [322]:
def get_ngrams(text, n):
    """Tokenize ``text`` with TweetTokenizer and return its n-grams as strings.

    Each n-gram is the run of ``n`` consecutive tokens joined by single
    spaces, returned in the order the tokens appear.
    """
    tokens = TweetTokenizer().tokenize(text)
    joined = []
    for gram in ngrams(tokens, n):
        joined.append(' '.join(gram))
    return joined
def get_state(loc_s):
    """Return the states mentioned in a free-text location string.

    The location is split into unigrams and bigrams; every n-gram that is a
    member of ``state_strings`` is translated through ``states_dict``.

    Parameters
    ----------
    loc_s : str
        Free-text location. Any non-string value (for example the NaN/None of
        a missing location) yields an empty list instead of crashing the
        tokenizer.

    Returns
    -------
    list
        ``states_dict`` values for each matching n-gram, unigram matches
        first, then bigram matches. Duplicates are kept.
    """
    if not isinstance(loc_s, str):
        return []
    candidates = get_ngrams(loc_s, 1) + get_ngrams(loc_s, 2)
    # NOTE(review): assumes every entry of state_strings is also a key of
    # states_dict; otherwise this lookup raises KeyError -- confirm.
    return [states_dict[token] for token in candidates if token in state_strings]

In [323]:
# Extract the list of matching states for every location, then spot-check the
# row labelled 37.
merged['states'] = merged['location'].map(get_state)
merged.loc[37, :]

author_id                                             247138283
tweet_id                                    1271114673404903431
tweet_text    RT @Lexual__: No but seriously, the whole thin...
location                                                Jamaica
is_foreign                                                 True
states                                                       []
Name: 37, dtype: object

In [330]:
def is_foreign_lookup(loc_s, entities=None):
    """Guess where a free-text location is by matching known entity names.

    Every row of the entity table whose ``name`` occurs as a (case-sensitive)
    substring of ``loc_s`` is a candidate; the candidate with the largest
    ``pop`` is taken as the best guess.

    Parameters
    ----------
    loc_s : str
        Free-text location. Non-string values (e.g. NaN/None for a missing
        location) are reported as ``'no match'``.
    entities : pandas.DataFrame, optional
        Table with ``name``, ``pop``, ``state`` and ``is_foreign`` columns.
        Defaults to ``all_entities``.

    Returns
    -------
    str or None
        ``'no match'`` when no entity name occurs in ``loc_s``; the best
        guess's ``state`` when it has one; ``'foreign'`` when it has no state
        and is flagged foreign; ``None`` when it has neither.
    """
    if entities is None:
        entities = all_entities
    if not isinstance(loc_s, str):
        return 'no match'

    # Non-string names (NaN) can never match and would make ``in`` raise.
    hits = entities['name'].apply(
        lambda name: isinstance(name, str) and name in loc_s
    ).astype(bool)
    candidates = entities[hits]
    if candidates.empty:
        return 'no match'

    # Only the top row is used, so de-duplicating names first is unnecessary.
    best_guess = candidates.sort_values('pop', ascending=False).iloc[0]

    # A missing state may be stored as None, NaN or an empty string; checking
    # only ``is not None`` would return NaN/'' and never reach 'foreign'.
    state = best_guess['state']
    if pd.notna(state) and state != '':
        return state

    foreign_flag = best_guess['is_foreign']
    if pd.notna(foreign_flag) and bool(foreign_flag):
        return 'foreign'
    # Matched an entity that has no state and is not foreign; kept as None
    # for backward compatibility with the original implicit fall-through.
    return None
# One lookup per row: each call scans the whole entity table for name matches.
merged['lookup'] = merged['location'].map(is_foreign_lookup)