In [1]:
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import time
import re
import us
from city_to_state import city_to_state_dict

In [2]:
with open('tweets.json') as f:
    data = json.load(f)

In [3]:
type(data)

list

In [4]:
total = 0
stripped_data = []
for i in range(len(data)):
    if data[i] != None:
        total += len(data[i])
        for j in range(len(data[i])):
            batch = {}
            batch['created'] = data[i][j].get("created_at", np.nan)
            batch['id'] = data[i][j].get("id", np.nan)
            batch['text'] = data[i][j].get("text", np.nan)
            batch['location'] = data[i][j]['user'].get("location", np.nan)
            if data[i][j]['place'] != None:
                batch['city'] = data[i][j]['place'].get("full_name", np.nan)
                batch['country_code'] = data[i][j]['place'].get("country_code", np.nan)
                batch['country'] = data[i][j]['place'].get("country", np.nan)
            stripped_data.append(batch)
print(total)

16100


In [5]:
df = pd.DataFrame(stripped_data)

In [6]:
df.head()

Unnamed: 0,created,id,text,location,city,country_code,country
0,Tue Mar 17 23:59:59 +0000 2020,1240065376433647616,"Hi Twitter, Hi #COVIDChat! ICYMI check out @ma...","New York, New York",,,
1,Tue Mar 17 23:59:53 +0000 2020,1240065348705058819,While necessary measures are being taken to ke...,"Syracuse, NY",,,
2,Tue Mar 17 23:59:51 +0000 2020,1240065343915208706,The latest numbers in Ontario @Canada Note: t...,"Cleveland, OH",,,
3,Tue Mar 17 23:59:51 +0000 2020,1240065342212317185,That’s why we all have to act like we have it ...,"New York, USA",,,
4,Tue Mar 17 23:59:51 +0000 2020,1240065340773711873,Please stay home! The phrase #SocialDistancing...,"New York, USA",,,


In [7]:
df.location.sample(20)

7770                                   
14222                             Texas
11988                     United States
15288                 Oklahoma City, OK
12426    Istanbul / Fort Pierce / Dubai
8472                         Peoria, AZ
13503                    Torfaen, Wales
3459                          Tampa, FL
2191                      United States
12862              36.0609°N, 95.7975°W
4429                Tasmania, Australia
14085                  Orlando, Florida
2145                                   
3948                          Omaha, NE
10155                    Tennessee, USA
332                                  AZ
11436                        Austin, TX
2692                    Worthington, MA
1881                                   
3942                     Washington, DC
Name: location, dtype: object

In [8]:
len(df[(df.city.notna())])

1667

In [9]:
len(df[(df.location.str.contains('\w+', regex=True))])

13196

In [10]:
two_word_states = [
    'new york',
    'new hampshire',
    'new jersey',
    'new mexico',
    'north dakota',
    'south dakota',
    'north carolina',
    'south carolina',
    'rhode island',
    'west virginia'
]

In [11]:
def get_state_abbr(x):
    if re.match('({})'.format("|".join(two_word_states)), x.lower()):
        tokens = [re.match('({})'.format("|".join(two_word_states)), x.lower()).group(0)]
    elif re.match('({})'.format("|".join(city_to_state_dict.keys()).lower()), x.lower()):
        k = re.match('({})'.format("|".join(city_to_state_dict.keys()).lower()), x.lower()).group(0)
        tokens = [city_to_state_dict.get(k.title(), np.nan)]
    else:
        tokens = [j for j in re.split("\s|,", x) if j not in ['in', 'la', 'me', 'oh', 'or']]
    for i in tokens:
        if re.match('\w+', str(i)):
            if us.states.lookup(str(i)):
                return us.states.lookup(str(i)).abbr

In [12]:
split_df = lambda x: (x[(x.city.notna())], x[x.city.isna()])

In [13]:
df_city, df_location = split_df(df)

In [14]:
df_city.head()

Unnamed: 0,created,id,text,location,city,country_code,country
10,Tue Mar 17 23:59:43 +0000 2020,1240065310356635653,#SocialDistancing is necessary. But so is stay...,"San Antonio, TX","San Antonio, TX",US,United States
20,Tue Mar 17 23:59:30 +0000 2020,1240065252169039872,Don’t be those people in the line! Stay home. ...,"Halifax, Nova Scotia","Halifax, Nova Scotia",CA,Canada
32,Tue Mar 17 23:59:00 +0000 2020,1240065130156580864,Naps are an important part of #SocialDistancin...,"Colorado Springs, CO","Colorado, USA",US,United States
36,Tue Mar 17 23:58:55 +0000 2020,1240065108518322180,Just cause it’s true ☺️ 🤲🏼🧘🏽❤️🙏🏼🌷 #thanksnatur...,,"Penticton, British Columbia",CA,Canada
37,Tue Mar 17 23:58:53 +0000 2020,1240065097336336385,Day one of #COVID19 quarantine...This is going...,"Pittsburgh, PA","Whitehall, PA",US,United States


In [15]:
df_location.head()

Unnamed: 0,created,id,text,location,city,country_code,country
0,Tue Mar 17 23:59:59 +0000 2020,1240065376433647616,"Hi Twitter, Hi #COVIDChat! ICYMI check out @ma...","New York, New York",,,
1,Tue Mar 17 23:59:53 +0000 2020,1240065348705058819,While necessary measures are being taken to ke...,"Syracuse, NY",,,
2,Tue Mar 17 23:59:51 +0000 2020,1240065343915208706,The latest numbers in Ontario @Canada Note: t...,"Cleveland, OH",,,
3,Tue Mar 17 23:59:51 +0000 2020,1240065342212317185,That’s why we all have to act like we have it ...,"New York, USA",,,
4,Tue Mar 17 23:59:51 +0000 2020,1240065340773711873,Please stay home! The phrase #SocialDistancing...,"New York, USA",,,


In [16]:
def apply_state(df, col):
    df['us_state'] = df[col].apply(get_state_abbr)
    return df

In [17]:
df_updated = pd.concat(
    [
        apply_state(df_city, 'city'),
        apply_state(df_location, 'location')
    ]
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [18]:
df_updated.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16100 entries, 10 to 16099
Data columns (total 8 columns):
created         16100 non-null object
id              16100 non-null int64
text            16100 non-null object
location        16100 non-null object
city            1667 non-null object
country_code    1667 non-null object
country         1667 non-null object
us_state        8511 non-null object
dtypes: int64(1), object(7)
memory usage: 1.1+ MB


In [19]:
df_us = df_updated[(df_updated.us_state.notna())]
df_us.reset_index(inplace=True, drop=True)

In [20]:
len(df_us)

8511

In [42]:
df_us[['location', 'city', 'us_state']].sample(15)

Unnamed: 0,location,city,us_state
2517,"Wimberley, TX",,TX
3713,"Guthrie, OK",,OK
1392,"Arkansas, USA",,AR
4481,"Ontario, Canada",,CA
3151,"New Mexico, USA",,NM
3286,"Los Angeles, CA",,CA
4964,"Hawaii, USA",,HI
8119,Brighton UK,,CO
114,Yosemite National Park,"California, USA",CA
3682,Atlanta,,GA
