# 2. Data Cleaning
*Author: Boom*

### Import Packages and Data

In [1]:
# Import Packages
import numpy as np
import pandas as pd
import scipy.stats as stats

In [2]:
# Load the two raw inputs produced by the data-pull step:
#   - city_info: geo-data for New England cities
#   - tweets_df: the scraped, not-yet-cleaned Tweets for 2018
city_info = pd.read_csv(
    filepath_or_buffer="./datasets/new_england_cities_geo-data.csv",
)
tweets_df = pd.read_csv(
    filepath_or_buffer="./datasets/dirty_tweets_20180101-20181231.csv",
)

In [3]:
# Remove the leftover "Unnamed: 0" column from both frames
# (presumably a row index written out by an earlier to_csv -- TODO confirm).
#
# The frames are re-bound instead of mutated in place, and errors="ignore"
# makes the drop a no-op when the column is already gone, so this cell can be
# re-run on a live kernel without raising KeyError.
city_info = city_info.drop(columns=["Unnamed: 0"], errors="ignore")
tweets_df = tweets_df.drop(columns=["Unnamed: 0"], errors="ignore")

### Simulate City Location for each Tweet
- At this stage, the purpose of this project is to provide proof-of-concept.
- Because of our numerous attempts to find the best way to pull relevant Tweets with location data, Twitter's API (via `twitterscrape`) has locked us out, preventing us from pulling any more Tweets.
- The Tweets we currently have are relevant to power outages, but they do not carry informative location data.
- Therefore, in order to proceed with modeling and geospatial visualization, we need to simulate the locations by randomly sampling from the unique list of cities.

In [4]:
# Collect every distinct city name once, in order of first appearance
city_list = city_info["city"].unique().tolist()

In [5]:
# Simulate a location for every tweet: draw one city name per row, with
# replacement, from city_list. The global NumPy seed is fixed first so the
# same assignment is reproduced on every run.
np.random.seed(42)
tweets_df = tweets_df.assign(
    city=np.random.choice(city_list, size=len(tweets_df), replace=True)
)

In [6]:
# Attach the remaining city_info columns to each tweet and index by Tweet ID.
#
# The join key is the city name only. If a name occurs more than once in
# city_info (e.g. the same name in two states -- NOTE(review): confirm against
# the data), a left merge on `city` would emit one row per match and duplicate
# the tweet and its id. Collapsing the lookup to one row per city name first
# keeps the merge strictly many-to-one; keep="last" selects the same geo row
# that the previous merge-then-dedupe-on-text sequence ended up keeping.
city_lookup = city_info.drop_duplicates(subset="city", keep="last")

tweets_location_df = (
    tweets_df
    # Get rid of duplicate Tweets (same text), keeping the last occurrence
    .drop_duplicates(subset="text", keep="last")
    # validate= raises MergeError if the lookup ever stops being unique per city
    .merge(city_lookup, how="left", on="city", validate="many_to_one")
    # Use the Tweet ID as the index; set_index also removes the `id` column
    .set_index("id")
)

# Peek
tweets_location_df.head(5)

Unnamed: 0_level_0,text,timestamp,city,state_id,state_name,county_name,lat,lng
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1079527340617723904,I bought a portable cell charger. It stays cha...,2018-12-30 23:59:09,Arlington,MA,Massachusetts,Middlesex,42.4186,-71.1638
1079527053555392513,The filthy scum media @cnn didn’t show the REA...,2018-12-30 23:58:00,Blaine,ME,Maine,Aroostook,46.499,-67.8688
1079523909312098305,Massive power outage hits southern Zim http://...,2018-12-30 23:45:31,Groton Long Point,CT,Connecticut,New London,41.3145,-72.0087
1079522345134538752,Massive power outage hits southern Zim https:/...,2018-12-30 23:39:18,Shelburne,VT,Vermont,Chittenden,44.3759,-73.2265
1079522317284237312,A major power outage this afternoon is impacti...,2018-12-30 23:39:11,North Woodstock,NH,New Hampshire,Grafton,44.0364,-71.6895


In [7]:
# Report how many de-duplicated Tweets are left after cleaning
print("We now have", len(tweets_location_df), "unique Tweets remaining")

# Persist the cleaned, location-tagged Tweets as .csv
# (the index is written out as the first column by default)
tweets_location_df.to_csv(
    path_or_buf="./datasets/clean_tweets_20180101-20181231.csv",
)

We now have 954 unique Tweets remaining
