# **Installations, Imports, Mounting: run me!**

In [None]:
# ------------------------------------------------------------------------------
# INSTALLATIONS
# ------------------------------------------------------------------------------
! pip install tweepy --upgrade # for version with 'tweet_mode' param
! pip install emoji
! pip install jsonlines
! pip install vaderSentiment

In [None]:
# ------------------------------------------------------------------------------
# IMPORTS
# ------------------------------------------------------------------------------
import ast
from csv import reader
from datetime import datetime
import glob
import io
import json
import jsonlines
import math
import matplotlib.pyplot as plt 
import numpy as np
import pandas as pd
from random import randint
import re
import requests
import seaborn as sns
import string
import subprocess
from time import sleep
import torch
import torch.nn.functional as F
from tqdm.notebook import tqdm
import tweepy
import vaderSentiment

In [None]:
# ------------------------------------------------------------------------------
# MOUNT TO GOOGLE DRIVE (use princeton account)
# ------------------------------------------------------------------------------
from google.colab import drive
drive.mount('/content/drive')

# **State Setup: run me!**

In [None]:
# ------------------------------------------------------------------------------
# SAVE STATE ABBREVIATIONS FOR REFERENCE
# ------------------------------------------------------------------------------
# Lowercase postal abbreviations of the 50 states, in alphabetical order of the
# full state name (alabama, alaska, arizona, ...).
# NOTE(review): the list position is only that alphabetical rank. An earlier
# comment called it the FIPS code; FIPS state numbering has gaps, so confirm
# before using the index as a FIPS code.
states_lower = (
    'al ak az ar ca co ct de fl ga '
    'hi id il in ia ks ky la me md '
    'ma mi mn ms mo mt ne nv nh nj '
    'nm ny nc nd oh ok or pa ri sc '
    'sd tn tx ut vt va wa wv wi wy'
).split()
states_upper = list(map(str.upper, states_lower)) # uppercase

# territories (not used at the moment; kept in case they are needed later)
terrs_lower = 'as dc fm gu mh mp pw pr vi'.split()
terrs_upper = list(map(str.upper, terrs_lower)) # uppercase

In [None]:
# Lookup from full state name to postal abbreviation (everything lowercase),
# listed alphabetically by state name.
states_abbrev = dict([
    ('alabama', 'al'),
    ('alaska', 'ak'),
    ('arizona', 'az'),
    ('arkansas', 'ar'),
    ('california', 'ca'),
    ('colorado', 'co'),
    ('connecticut', 'ct'),
    ('delaware', 'de'),
    ('florida', 'fl'),
    ('georgia', 'ga'),
    ('hawaii', 'hi'),
    ('idaho', 'id'),
    ('illinois', 'il'),
    ('indiana', 'in'),
    ('iowa', 'ia'),
    ('kansas', 'ks'),
    ('kentucky', 'ky'),
    ('louisiana', 'la'),
    ('maine', 'me'),
    ('maryland', 'md'),
    ('massachusetts', 'ma'),
    ('michigan', 'mi'),
    ('minnesota', 'mn'),
    ('mississippi', 'ms'),
    ('missouri', 'mo'),
    ('montana', 'mt'),
    ('nebraska', 'ne'),
    ('nevada', 'nv'),
    ('new hampshire', 'nh'),
    ('new jersey', 'nj'),
    ('new mexico', 'nm'),
    ('new york', 'ny'),
    ('north carolina', 'nc'),
    ('north dakota', 'nd'),
    ('ohio', 'oh'),
    ('oklahoma', 'ok'),
    ('oregon', 'or'),
    ('pennsylvania', 'pa'),
    ('rhode island', 'ri'),
    ('south carolina', 'sc'),
    ('south dakota', 'sd'),
    ('tennessee', 'tn'),
    ('texas', 'tx'),
    ('utah', 'ut'),
    ('vermont', 'vt'),
    ('virginia', 'va'),
    ('washington', 'wa'),
    ('west virginia', 'wv'),
    ('wisconsin', 'wi'),
    ('wyoming', 'wy'),
])

# **COVID Tracking Project: Data Collection**
This section accumulates COVID-19 data for all 50 states including official Twitter handles for each state and historical case data.

API Information: https://covidtracking.com/data/api

## Official COVID-19 Twitter handles for each state
(e.g. state health departments). This can be used later to query the Twitter API.

In [None]:
# Fetch per-state metadata from the COVID Tracking Project API.
query_url = ('https://api.covidtracking.com/v1/states/info.json')
response = requests.get(query_url)
# fail loudly on an HTTP error instead of parsing an error body as if it were data
response.raise_for_status()

# create dict {state_name: handle}
# key: the 'state' field of each record (upper case abbreviation)
# value: the 'twitter' field (None if no handle exists)
# original note: includes the 50 states + 6 territories (not re-verified here)
payload = response.json()
df = pd.DataFrame(payload)
state_twitters = dict(zip(df.state, df.twitter))

# print(state_twitters)

In [None]:
state_twitters

## Historical COVID-19 Case Data for States
Note: there is no need to run this unless you want data past 2020. Code to load the saved data from GDrive is below.

### Individual State: Example and Visualization
Used for getting a sense for the type, form, and trends of data for each state.

In [None]:
# ------------------------------------------------------------------------------
# HISTORICAL STATE DATA EXAMPLE (start date varies, end date yesterday)
# ------------------------------------------------------------------------------
query_url = ('https://api.covidtracking.com/v2/states/ct/daily.json')
response = requests.get(query_url)
payload = response.json()

# df = pd.DataFrame(payload) # create dataframe of info
# pd.set_option('display.max_rows', None) # Change to None to see all rows
# df # display dataframe

In [None]:
payload['data'][0]

In [None]:
# One line per entry of the example payload: date, data quality grade, total cases value.
for entry in payload['data']:
  entry_date = entry['date']
  quality_grade = entry['meta']['data_quality_grade']
  total_cases = entry['cases']['total']['value']
  print(entry_date, quality_grade, total_cases)

In [None]:
# ------------------------------------------------------------------------------
# DATA VISUALIZATION EXAMPLE
# ------------------------------------------------------------------------------
# Fetch the state's daily history here so the cell does not depend on whatever
# `df` holds from an earlier cell: the last `df` built above is the states-info
# table, which is not daily data. 'positiveIncrease' is a column of the v1
# daily endpoint, the same endpoint the historical query in this notebook uses.
viz_state = 'nj' # lowercase abbreviation of the state to plot
viz_category = 'positiveIncrease' # can change what category to visualize here
viz_response = requests.get('https://api.covidtracking.com/v1/states/' + viz_state + '/daily.json')
viz_response.raise_for_status() # surface HTTP errors before building the dataframe
viz_df = pd.DataFrame(viz_response.json())

# The series keeps its default positional index, so the x axis is the row number.
# NOTE(review): labelled 'days ago' on the assumption that rows come newest-first
# (the historical-data cell describes them as going backwards in date) — confirm.
data = viz_df[viz_category]
graph = data.plot(title="{} Daily Increase in Positive COVID-19 Cases".format(viz_state.upper()))
graph.set_xlabel('days ago')
graph.set_ylabel('positive case count')
plt.gca().invert_xaxis() # largest 'days ago' on the left
plt.show()

### Query Historical Data and save to CSV files by state in GDrive

In [None]:
# ------------------------------------------------------------------------------
# QUERY HISTORICAL DATA, SAVE INTO DICT
# ------------------------------------------------------------------------------
# Key: lowercase state abbreviation
# Value: dataframe restricted to the columns below (original note: rows go
# backwards in date)
# Note: 'negativeIncrease' category is deprecated
historical_columns = ['date', 'positive', 'positiveIncrease', 'negative', 'hospitalizedIncrease', 'deathIncrease', 'dataQualityGrade']
state_historical = {}
for state in states_lower:
  url = ('https://api.covidtracking.com/v1/states/' + state + '/daily.json')
  response = requests.get(url)
  # stop at the first failed request rather than building a dataframe from an
  # error body; the exception message includes the URL, and so the state
  response.raise_for_status()

  payload = response.json()
  df = pd.DataFrame(payload)
  state_historical[state] = df[historical_columns]

In [None]:
# convert dict to master df: one row per state (index = the dict's lowercase
# abbreviation keys) with a single 'info' column built from the per-state dataframes
# NOTE(review): this relies on pandas storing each per-state dataframe as one
# object cell when given a list of dataframes — confirm on the pandas version in use
state_historical_df = pd.DataFrame(list(state_historical.values()), index=state_historical.keys(), columns = ['info']) 

In [None]:
# example indexing into data for reference
state_historical_df.loc['nj']['info']

In [None]:
# ------------------------------------------------------------------------------
# SAVE HISTORICAL DATA DF TO CSV FILES IN GDRIVE
# ------------------------------------------------------------------------------
# For each state: write '<date>_<state>_historical.csv' in the working directory,
# then copy it into the "drive/My Drive/Thesis/<date>" folder with a shell `cp`.
for state in states_lower:
  # Put the current date here as a string in the format 'YYYY-MM-DD', and create
  # a folder with this name in GDrive before running.
  date = '2021-03-12'
  state_historical_df.loc[state]['info'].to_csv('{}_{}_historical.csv'.format(date, state))
  # $date and $state are expanded by IPython from the Python variables above
  !cp $date"_"$state"_historical.csv" "drive/My Drive/Thesis/"$date

## Query Data Quality Grades from V2 of API and Save to CSVs  
Data collected separately because V1 of the API stopped supporting this field.

In [None]:
# ------------------------------------------------------------------------------
# QUERY DATA QUALITY GRADES, SAVE INTO DICT
# ------------------------------------------------------------------------------
# Key: lowercase state abbreviation
# Value: dataframe with columns 'date' and 'dataQualityGrade', one row per entry
# of the v2 response's 'data' list, in the order the API returns them
state_grades = {}
for state in states_lower:
  url = ('https://api.covidtracking.com/v2/states/' + state + '/daily.json')
  response = requests.get(url)
  # stop at the first failed request rather than indexing into an error body
  response.raise_for_status()
  payload = response.json()

  grades = [[entry['date'], entry['meta']['data_quality_grade']] for entry in payload['data']]

  # naming the columns in the constructor also works when a state has no
  # entries (assigning .columns afterwards fails on an empty frame)
  state_df = pd.DataFrame(grades, columns=['date', 'dataQualityGrade'])
  state_grades[state] = state_df

In [None]:
# ------------------------------------------------------------------------------
# SAVE DATA QUALITY GRADES DF TO CSV FILES IN GDRIVE
# ------------------------------------------------------------------------------
# For each state: write '<state>_dataQualityGrade.csv' in the working directory,
# then copy it into "drive/My Drive/Thesis/dataQualityGrades" with a shell `cp`.
for state in states_lower:
  state_grades[state].to_csv('{}_dataQualityGrade.csv'.format(state))
  # $state is expanded by IPython from the loop variable
  !cp $state"_dataQualityGrade.csv" "drive/My Drive/Thesis/dataQualityGrades"

# **CoronaVis: Twitter Data Collection**

## Authentication, set up
Run before any requests to API

In [None]:
# Twitter Developer Authentication Info
# Fill in your info here
API_key = ''
API_secret_key = ''
bearer_token = ''
access_token = ''
access_token_secret = ''

In [None]:
# Tweepy setup
auth = tweepy.OAuthHandler(API_key, API_secret_key)
auth.set_access_token(access_token, access_token_secret)

api = tweepy.API(auth)

## 1. CoronaVis: Hydration
NOTE: 05-05-2020 done separately

Tweet IDs from https://github.com/mykabir/COVID19. Paper compiled COVID-19 related tweet IDs by date 

### Example Tweet Query

In [None]:
tweet = api.statuses_lookup([1258116423924289536], tweet_mode="extended")
tweet[0]._json

### Check Twitter API Limits
Make sure to run this before calling functions that check API limits

In [None]:
# use to check current limits on api calls (900/session, 15min sessions)
def checkLimit():
  """Return the rate-limit entry for the '/statuses/lookup' endpoint.

  The entry comes straight from `api.rate_limit_status()`; callers in this
  notebook read its 'remaining' field.
  """
  rate_limits = api.rate_limit_status()
  lookup_limits = rate_limits['resources']['statuses']['/statuses/lookup']
  return lookup_limits

In [None]:
checkLimit()

### Obtain and Hydrate Tweets
DONE! Ran for CoronaVis IDs on dates 03-05-2020 to 05-05-2020. Ran twarc locally on dates 05-06-2020 to 08-07-2020 (data folder) and 08-07-2020 to 12-31-2020 (data2 folder). To do this, just `cd` into the applicable folder and run `bash twarc_hydrate.sh`. Then upload to google drive



In [None]:
# ------------------------------------------------------------------------------
# LOAD DEHYDRATED TWEET IDS (GDrive folder, originally from the Git repo)
# coronavis_ids dict: {yyyy-mm-dd: dataframe read from that date's CSV}
# Run this code cell before count analyses in the next section
# ------------------------------------------------------------------------------
coronavis_ids = {}
coronavis_files = glob.glob("drive/My Drive/Thesis/CoronaVis/*.csv")
for f in coronavis_files:
  # the date key is the 10 characters right before the 4-character '.csv' suffix
  date = f[:-4][-10:]
  coronavis_ids[date] = pd.read_csv(f)

In [None]:
# initialize request count variable before calling hydrating function
request_count = 900 - checkLimit()['remaining']
request_count

In [None]:
# ------------------------------------------------------------------------------
# TWEET HYDRATING FUNCTION. Called by next code cell.
# ------------------------------------------------------------------------------
# given csv file of tweet IDs, returns df with index=id and the following cols: 
# tweet full text, user_loc (user defined location on their profile), place
# (tweet location), coords (tweet coords in geoJSON longitude, latitude)
def _extractTweetInfo(tweet_json):
  """Pick the fields kept for analysis out of one hydrated tweet's JSON dict."""
  # for retweets, keep the full text of the original (retweeted) tweet
  if 'retweeted_status' in tweet_json:
    text = tweet_json['retweeted_status']['full_text']
  else:
    text = tweet_json['full_text']
  return {
      'id': tweet_json['id'],
      'text': text,
      'user_loc': tweet_json['user']['location'],
      'place': tweet_json['place'],
      'coords': tweet_json['coordinates'],
  }


def hydrateTweetCSV(file, curr_request_count):
  """Hydrate the tweet IDs in `file` through the Twitter lookup endpoint.

  Args:
    file: dataframe with a 'tweet_id' column (as read from a CoronaVis ID CSV).
    curr_request_count: number of lookup requests already made; used to pace
      progress messages and rate-limit sleeps.

  Returns:
    (tweet_df, request_count): `tweet_df` is indexed by tweet id with columns
    'text', 'user_loc', 'place', 'coords' (empty if nothing could be hydrated);
    `request_count` is the updated request total to pass to the next call.
  """
  tweet_ids = file['tweet_id']
  total_ids = len(tweet_ids)
  request_count = curr_request_count
  # Rows are collected in a list and turned into a dataframe once at the end:
  # per-row DataFrame.append is quadratic and was removed in pandas 2.0.
  tweet_rows = []

  # Step through the ids 100 at a time. Stepping by start index (rather than
  # int(total / 100) batches) also sends the final partial batch.
  for start_index in range(0, total_ids, 100):
    # print to get a sense for timing
    if request_count % 100 == 0 and request_count != 0:
      print(datetime.now().time().strftime('%H:%M:%S'), 'finished 100 requests')
    # 15min session limit of 900 requests
    if request_count % 900 == 0 and request_count != 0:
      print(datetime.now().time().strftime('%H:%M:%S'), 'sleeping')
      # wait until the whole allowance is available again
      while(checkLimit()['remaining'] < 900):
        sleep(60)
      print(datetime.now().time().strftime('%H:%M:%S'), 'awake! :)')

    # arrange up to 100 ids into one list for each request
    end_index = min(total_ids, start_index + 100)
    ids = tweet_ids[start_index:end_index]

    # request extended mode to get full text
    tweets = api.statuses_lookup(ids.to_list(), tweet_mode="extended")

    # only tweets present in the response are recorded, so ids whose tweets
    # were not returned are skipped
    for tweet in tweets:
      tweet_rows.append(_extractTweetInfo(tweet._json))

    # increment request count
    request_count = request_count + 1

  # explicit columns keep set_index('id') valid even when no tweet was hydrated
  tweet_df = pd.DataFrame(tweet_rows, columns=['id', 'text', 'user_loc', 'place', 'coords'])
  return tweet_df.set_index('id'), request_count

In [None]:
# ------------------------------------------------------------------------------
# HYDRATE CORONAVIS TWEET IDS (by calling hydrateTweetCSV() function)
# hydrated_coronavis dict: {yyyy-mm-dd: df}
# Expects `coronavis_ids` and `request_count` to be set by the earlier cells.
# ------------------------------------------------------------------------------
# grab CSV files already hydrated
hydrated_files = glob.glob("drive/My Drive/Thesis/CoronaVis Hydrated/*.csv")
hydrated_dates = []
for f in hydrated_files:
  # files are named '<yyyy-mm-dd>_coronavis_hydrated.csv': the date is the 10
  # characters before the 23-character '_coronavis_hydrated.csv' suffix
  hydrated_dates.append(f[-33:-23])

# hydrate files
hydrated_coronavis = {} # dict of hydrated tweets
for date in sorted(coronavis_ids.keys()):
  if date not in hydrated_dates: # ignore dates already hydrated
    print(datetime.now().time().strftime('%H:%M:%S'), 'currently hydrating: ', date)
    try:
      # request_count is only updated when the call returns; if it raises part
      # way through, the requests made for this date are not counted
      hydrated_coronavis[date], new_request_count = hydrateTweetCSV(coronavis_ids[date], request_count)
      request_count = new_request_count
      # write locally, then copy to GDrive ($date is expanded by IPython)
      hydrated_coronavis[date].to_csv('{}_coronavis_hydrated.csv'.format(date))
      !cp $date"_coronavis_hydrated.csv" "drive/My Drive/Thesis/CoronaVis Hydrated"
      print(datetime.now().time().strftime('%H:%M:%S'), 'finished and saved: ', date)
    except Exception as e:
      # best effort: report the failure and move on to the next date
      print('ERROR', e)

In [None]:
# Example of a given date's data
pd.set_option('display.float_format', '{:.0f}'.format)
hydrated_coronavis['2020-03-05']

### Twarc hydrated jsonl to CSVs of important info

This is code to read the json lines files in order to save the data into CSV files. For efficiency, won't run this because it'll take way too long-- instead, use this as reference when writing code to handle dates after May 5

In [None]:
# DO NOT RUN!!!
jsonl_files = glob.glob("drive/My Drive/Thesis/CoronaVis jsonl/*.jsonl")
for jsonl_file in jsonl_files:
  date = jsonl_file[-35:-25]
  if date == '2020-05-06':
    tweet_df = pd.DataFrame() # data frame to store all important tweet info
    print(datetime.now().time().strftime('%H:%M:%S'), 'start: ', date)

    with open(jsonl_file, 'r', encoding='utf-8') as f:
      for line in f:
        tweet = json.loads(line.rstrip('\n|\r'))
        # get pertinent info and save to dict
        tweet_info = {}
        tweet_info['id'] = tweet['id']
        if ('retweeted_status' in tweet.keys()):
          tweet_info['text'] = tweet['retweeted_status']['full_text']
        else:
          tweet_info['text'] = tweet['full_text']
          tweet_info['user_loc'] = tweet['user']['location']
          tweet_info['place'] = tweet['place']
          tweet_info['coords'] = tweet['coordinates']
        # add tweet info dict to df
        tweet_df = tweet_df.append(tweet_info, ignore_index=True)
    
    tweet_df.set_index('id').to_csv('{}_coronavis_hydrated.csv'.format(date))
    !cp $date"_coronavis_hydrated.csv" "drive/My Drive/Thesis/CoronaVis Hydrated"
    print(datetime.now().time().strftime('%H:%M:%S'), 'finished and saved: ', date)

## CoronaVis: Analysis of Original Number of Tweet IDs and Hydrated Tweets

### Do not run (old versions)

In [None]:
# V1: number of original tweet IDs to hydrate, keyed by date (dates in sorted order)
# only works right after the previous section has been run (needs coronavis_ids)
num_ids = {day: len(coronavis_ids[day]) for day in sorted(coronavis_ids)}

In [None]:
# V2: read the original tweet ID files from GDrive and count the IDs per date
# Results are saved below, so this does not need to be run again
coronvis_files = glob.glob("drive/My Drive/Thesis/0.CoronaVis Original/*.csv")
num_ids = {
    # key: the 10-character date right before the '.csv' extension
    path[-14:-4]: len(pd.read_csv(path, lineterminator='\n'))
    for path in sorted(coronvis_files)
}

### Original Tweet ID Counts

In [None]:
# Original Tweet ID counts between 03/05/2020 and 12/31/2020
num_ids = {'2020-03-05': 4980, '2020-03-06': 40710, '2020-03-07': 46964, '2020-03-08': 66793, '2020-03-09': 72319, '2020-03-10': 91873, '2020-03-11': 160308, '2020-03-12': 363422, '2020-03-13': 207182, '2020-03-14': 255275, '2020-03-15': 170459, '2020-03-16': 118271, '2020-03-17': 163836, '2020-03-18': 177829, '2020-03-19': 145077, '2020-03-20': 177037, '2020-03-21': 118781, '2020-03-22': 138816, '2020-03-23': 130943, '2020-03-24': 114957, '2020-03-25': 96271, '2020-03-26': 57107, '2020-03-27': 83416, '2020-03-28': 77240, '2020-03-29': 81112, '2020-03-30': 62127, '2020-03-31': 65414, '2020-04-01': 71408, '2020-04-02': 19136, '2020-04-03': 49287, '2020-04-04': 35928, '2020-04-05': 40653, '2020-04-06': 52008, '2020-04-07': 44172, '2020-04-08': 50952, '2020-04-09': 60385, '2020-04-10': 207907, '2020-04-11': 215668, '2020-04-12': 197479, '2020-04-13': 222613, '2020-04-14': 138231, '2020-04-15': 133400, '2020-04-16': 177378, '2020-04-17': 196602, '2020-04-18': 234062, '2020-04-19': 214408, '2020-04-20': 193253, '2020-04-21': 234081, '2020-04-22': 242456, '2020-04-23': 136758, '2020-04-24': 224216, '2020-04-25': 198378, '2020-04-26': 181973, '2020-04-27': 150191, '2020-04-28': 146383, '2020-04-29': 227960, '2020-04-30': 196781, '2020-05-01': 199134, '2020-05-02': 196076, '2020-05-03': 202383, '2020-05-04': 218615, '2020-05-05': 205386, '2020-05-06': 227541, '2020-05-07': 92246, '2020-05-08': 208923, '2020-05-09': 205931, '2020-05-10': 190201, '2020-05-11': 225689, '2020-05-12': 213458, '2020-05-13': 124449, '2020-05-14': 209789, '2020-05-15': 199484, '2020-05-16': 170662, '2020-05-17': 186373, '2020-05-18': 184968, '2020-05-19': 175873, '2020-05-20': 210110, '2020-05-21': 133603, '2020-05-22': 196137, '2020-05-23': 197625, '2020-05-24': 192573, '2020-05-25': 196694, '2020-05-26': 187436, '2020-05-27': 116269, '2020-05-28': 195504, '2020-05-29': 176889, '2020-05-30': 174604, '2020-05-31': 268304, '2020-06-01': 171991, '2020-06-02': 163898, '2020-06-03': 138782, 
'2020-06-04': 197251, '2020-06-05': 143701, '2020-06-06': 151006, '2020-06-07': 154414, '2020-06-08': 137120, '2020-06-09': 166432, '2020-06-10': 100393, '2020-06-11': 176063, '2020-06-12': 207997, '2020-06-13': 115008, '2020-06-14': 169947, '2020-06-15': 260678, '2020-06-16': 206380, '2020-06-17': 198091, '2020-06-18': 169317, '2020-06-19': 217111, '2020-06-20': 238231, '2020-06-21': 225608, '2020-06-22': 148628, '2020-06-23': 272976, '2020-06-24': 151913, '2020-06-25': 282053, '2020-06-26': 191681, '2020-06-27': 242723, '2020-06-28': 300385, '2020-06-29': 289477, '2020-06-30': 172250, '2020-07-01': 274578, '2020-07-02': 218013, '2020-07-03': 180861, '2020-07-04': 240717, '2020-07-05': 250761, '2020-07-06': 223667, '2020-07-07': 174113, '2020-07-08': 172032, '2020-07-09': 243205, '2020-07-10': 298545, '2020-07-11': 280284, '2020-07-12': 268579, '2020-07-13': 289371, '2020-07-14': 274713, '2020-07-15': 329245, '2020-07-16': 290532, '2020-07-17': 298164, '2020-07-18': 254959, '2020-07-19': 245136, '2020-07-20': 205134, '2020-07-21': 181486, '2020-07-22': 251168, '2020-07-23': 104637, '2020-07-24': 177565, '2020-07-25': 227131, '2020-07-26': 213809, '2020-07-27': 150773, '2020-07-28': 336992, '2020-07-29': 308633, '2020-07-30': 339237, '2020-07-31': 281021, '2020-08-01': 252411, '2020-08-02': 253018, '2020-08-03': 205264, '2020-08-04': 149669, '2020-08-05': 143159, '2020-08-06': 206359, '2020-08-07': 102994, '2020-08-08': 171702, '2020-08-09': 211208, '2020-08-10': 156467, '2020-08-11': 146740, '2020-08-12': 188518, '2020-08-13': 88712, '2020-08-14': 299641, '2020-08-15': 155255, '2020-08-16': 149400, '2020-08-17': 184192, '2020-08-18': 137470, '2020-08-19': 122625, '2020-08-20': 79571, '2020-08-21': 132712, '2020-08-22': 160069, '2020-08-23': 161538, '2020-08-24': 176354, '2020-08-25': 130600, '2020-08-26': 47408, '2020-08-27': 212956, '2020-08-28': 197051, '2020-08-29': 175569, '2020-08-30': 170935, '2020-08-31': 131129, '2020-09-01': 139931, '2020-09-02': 25702, 
'2020-09-03': 212063, '2020-09-04': 191834, '2020-09-05': 131617, '2020-09-06': 128668, '2020-09-07': 167433, '2020-09-08': 51466, '2020-09-09': 152867, '2020-09-10': 249385, '2020-09-11': 226722, '2020-09-12': 190810, '2020-09-13': 183199, '2020-09-14': 194086, '2020-09-15': 145348, '2020-09-16': 246209, '2020-09-17': 181966, '2020-09-18': 70519, '2020-09-19': 97464, '2020-09-20': 118595, '2020-09-21': 126991, '2020-09-22': 197568, '2020-09-23': 217557, '2020-09-24': 189697, '2020-09-25': 127658, '2020-09-26': 140304, '2020-09-27': 115098, '2020-09-28': 122249, '2020-09-29': 157275, '2020-09-30': 184314, '2020-10-01': 165711, '2020-10-02': 510496, '2020-10-03': 552279, '2020-10-04': 451207, '2020-10-05': 482118, '2020-10-06': 531752, '2020-10-07': 427239, '2020-10-08': 295314, '2020-10-09': 174588, '2020-10-10': 189328, '2020-10-11': 256890, '2020-10-12': 195114, '2020-10-13': 224906, '2020-10-14': 247190, '2020-10-15': 185716, '2020-10-16': 154491, '2020-10-17': 173839, '2020-10-18': 187955, '2020-10-19': 252184, '2020-10-20': 371880, '2020-10-21': 373400, '2020-10-22': 54100, '2020-10-23': 270614, '2020-10-24': 411440, '2020-10-25': 464640, '2020-10-26': 334652, '2020-10-27': 325088, '2020-10-28': 139962, '2020-10-29': 185497, '2020-10-30': 200514, '2020-10-31': 235570, '2020-11-01': 196851, '2020-11-02': 204186, '2020-11-03': 164919, '2020-11-04': 139287, '2020-11-05': 218949, '2020-11-06': 168663, '2020-11-07': 248474, '2020-11-08': 173409, '2020-11-09': 250201, '2020-11-10': 144560, '2020-11-11': 106632, '2020-11-12': 12695, '2020-11-13': 246225, '2020-11-14': 130899, '2020-11-15': 165042, '2020-11-16': 178315, '2020-11-17': 292196, '2020-11-18': 122639, '2020-11-19': 89009, '2020-11-20': 201452, '2020-11-21': 254858, '2020-11-22': 211376, '2020-11-23': 164649, '2020-11-24': 92873, '2020-11-25': 235326, '2020-11-26': 190102, '2020-11-27': 158432, '2020-11-28': 166272, '2020-11-29': 179204, '2020-11-30': 110272, '2020-12-01': 98095, '2020-12-02': 194099, 
'2020-12-03': 248612, '2020-12-04': 235511, '2020-12-05': 207024, '2020-12-06': 197600, '2020-12-07': 117571, '2020-12-08': 20342, '2020-12-09': 246537, '2020-12-10': 239066, '2020-12-11': 216218, '2020-12-12': 172376, '2020-12-13': 172819, '2020-12-14': 162497, '2020-12-16': 50353, '2020-12-17': 153189, '2020-12-18': 270150, '2020-12-19': 231315, '2020-12-20': 234397, '2020-12-21': 302219, '2020-12-22': 289477, '2020-12-23': 224399, '2020-12-24': 190456, '2020-12-25': 114224, '2020-12-26': 36457, '2020-12-27': 149243, '2020-12-28': 257590, '2020-12-29': 235148, '2020-12-30': 286807, '2020-12-31': 223279}

In [None]:
num_ids_df = pd.DataFrame(num_ids.items())
num_ids_df.columns = ['date', 'tweet_count']

In [None]:
num_ids_df[210:240]

In [None]:
# Tick positions for the plots: the position of the first date of each month
# in num_ids, paired with the month labels below.
month_ticks = ['Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

# 'mm' part of every 'yyyy-mm-dd' key, in dict order
date_months = [day[5:7] for day in num_ids]
# a tick goes wherever the month differs from the previous date's month
# (the first date is compared against an empty string)
tick_indices = [
    position
    for position, month in enumerate(date_months)
    if month != (date_months[position - 1] if position else '')
]

In [None]:
# Title: Number of CoronaVis Tweet IDs Available Per Day
# The peak on Oct 3 is likely a reaction to President Trump and First Lady
# Melania Trump testing positive for COVID the day before.
sns.set_style("whitegrid")
sns.set_context("talk", font_scale=3.1)

# line plot of tweet_count against date
g = sns.relplot(data=num_ids_df, x='date', y='tweet_count', kind="line", height=15, aspect=2.5)
g = g.set_axis_labels("Date (Month in 2020)", "Number of Tweet IDs", labelpad=35)

# fixed y range, then one labelled tick at the start of each month
g.set(ylim=(0, 575000))
g.set(xticks=tick_indices)
g.set_xticklabels(month_ticks)

In [None]:
# Plot results
# no longer used; superseded by the seaborn visualization above
# x: the date keys of num_ids, y: the matching ID counts
x,y = zip(*num_ids.items())
rotate, ax = plt.subplots(figsize=(75,8))
ax.set_title('Number of Tweet IDs per day')
plt.xticks(rotation=90)
plt.plot(x, y);
# peak on oct 3 - president trump and first lady melania trump tested positive 
# for COVID the day before. this peak is likely a reaction to it

In [None]:
# Number of tweet ids queried for hydration, split by the tool that queried
# them: dates before 2020-05-06 count towards colab, all later dates towards twarc
twarc_start = '2020-05-06'
total_colab = sum(count for day, count in num_ids.items() if day < twarc_start) # queried using above code
total_twarc = sum(count for day, count in num_ids.items() if day >= twarc_start) # queried using twarc
print('total queried using colab:', total_colab)
print('total queried using twarc: ', total_twarc)
print('total queried:', total_colab + total_twarc)

### Do not run (old code)

In [None]:
# Count the tweets successfully hydrated using colab:
#   coronavis_tweets: {yyyy-mm-dd: dataframe read from that date's hydrated CSV}
#   num_tweets:       {yyyy-mm-dd: number of rows in that dataframe}
coronavis_hydrated_files = glob.glob("drive/My Drive/Thesis/1a.CoronaVis Hydrated/*.csv")
coronavis_tweets, num_tweets = {}, {}
for f in sorted(coronavis_hydrated_files):
  # date = the 10 characters before the 23-character '_coronavis_hydrated.csv' suffix
  date = f[-33:-23]
  hydrated_for_date = pd.read_csv(f, lineterminator='\n')
  coronavis_tweets[date] = hydrated_for_date
  num_tweets[date] = len(hydrated_for_date)

In [None]:
# DO NOT RUN!!! I just ran it in my laptop's terminal and it was significantly faster
# Count number of tweets successfully hydrated using twarc
# coronavis_hydrated_files_2 = glob.glob("drive/My Drive/Thesis/CoronaVis jsonl/*.jsonl")
# print(datetime.now().time().strftime('%H:%M:%S'), 'start')
# for f in sorted(coronavis_hydrated_files_2):
#   date = f[-35:-25] #double check dates before fully running
#   num_tweets[date] = subprocess.check_output(['wc', '-l', f]).decode("utf-8").split()[0]
# print(datetime.now().time().strftime('%H:%M:%S'), 'end')

In [None]:
# Tweet counts for 05/06 to 12/31 (one per date in num_ids from 2020-05-06 on), found using a shell script locally
daily_counts = [194138,  78221, 180063, 175684, 159459, 193267, 181988, 107274, 180345, 169278, 141762, 157660, 159126, 152434, 180419, 113446, 168129, 168755, 162165, 165694, 161429, 100477, 166631, 151054, 148664, 227103, 145312, 141767, 118005, 169786, 120088, 128830, 129913, 118268, 145074,  86053, 152540, 180517,  98611, 148302, 227485, 178975, 172096, 147349, 191960, 208625, 193122, 126727, 238324, 134315, 248471, 167144, 209088, 263783, 249119, 139936, 236912, 185609, 156744, 200357, 215558, 197596, 144650, 150234, 214605, 256758, 246835, 227680, 254347, 231004, 277297, 248688, 254110, 211511, 209360, 182971, 161771, 215823,  92191, 156151, 198849, 188822, 131231, 276892, 271086, 298734, 248691, 223718, 223515, 182846, 131796, 126342, 179134,  84392, 124148, 158325, 117741, 107093, 145354, 66790, 154444, 114484, 103892, 135835, 101112, 95036, 60739, 99370, 117358, 117981, 129937, 98044, 36857, 170240, 159679, 133874, 105210, 84796, 102616, 19631, 162955, 146549, 93257, 89652, 121356, 36922, 117827, 193334, 179803, 152254, 147634, 146124, 110707, 190407, 132233, 52306, 76762, 89633, 99335, 161165, 176233, 148991, 96869, 106853, 87595, 92033, 121104, 148997, 133100, 379026, 436282, 351184, 379817, 421511, 338279, 236816, 142917, 156889, 199151, 158029, 181063, 197712, 151932, 122393, 135731, 145412, 125424, 154964, 149130, 21164, 106903, 163843, 194218, 131788, 185754, 108401, 143916, 154400, 189231, 147902, 159463, 132584, 109027, 172654, 134046, 201783, 137350, 198107, 117073, 87193, 9359, 189160, 103914, 135074, 144441, 233598, 98005, 67618, 163859, 196601, 156792, 126540, 74520, 187570, 154747, 125643, 130822, 142522, 81579, 81744, 159790, 200054, 187236, 160154, 158647, 88053, 16649, 193176, 194171, 176388, 140498, 133113, 132287, 42001, 127344, 214756, 183178, 183070, 225058, 212982, 163172, 150637, 89401, 27306, 108480, 179675, 181734, 223107, 181383]
index = 0
for date in sorted(num_ids):
  if date >= '2020-05-06':
    num_tweets[date] = daily_counts[index]
    index += 1

### Successfully hydrated counts

In [None]:
num_tweets = {'2020-03-05': 3877, '2020-03-06': 31631, '2020-03-07': 35996, '2020-03-08': 51948, '2020-03-09': 52907, '2020-03-10': 65964, '2020-03-11': 119743, '2020-03-12': 267224, '2020-03-13': 153734, '2020-03-14': 195051, '2020-03-15': 127096, '2020-03-16': 85877, '2020-03-17': 126618, '2020-03-18': 129069, '2020-03-19': 107279, '2020-03-20': 120273, '2020-03-21': 88308, '2020-03-22': 107499, '2020-03-23': 102452, '2020-03-24': 84013, '2020-03-25': 71704, '2020-03-26': 44882, '2020-03-27': 65392, '2020-03-28': 56889, '2020-03-29': 60849, '2020-03-30': 46711, '2020-03-31': 47094, '2020-04-01': 53635, '2020-04-02': 13233, '2020-04-03': 37564, '2020-04-04': 27916, '2020-04-05': 31443, '2020-04-06': 39426, '2020-04-07': 34176, '2020-04-08': 37942, '2020-04-09': 47794, '2020-04-10': 172455, '2020-04-11': 177722, '2020-04-12': 162233, '2020-04-13': 188077, '2020-04-14': 117494, '2020-04-15': 113315, '2020-04-16': 148096, '2020-04-17': 167109, '2020-04-18': 186477, '2020-04-19': 175927, '2020-04-20': 162161, '2020-04-21': 198436, '2020-04-22': 203050, '2020-04-23': 114947, '2020-04-24': 188628, '2020-04-25': 163942, '2020-04-26': 152573, '2020-04-27': 124779, '2020-04-28': 125296, '2020-04-29': 191269, '2020-04-30': 166209, '2020-05-01': 170201, '2020-05-02': 164906, '2020-05-03': 167868, '2020-05-04': 186756, '2020-05-05': 153462, '2020-05-06': 194138, '2020-05-07': 78221, '2020-05-08': 180063, '2020-05-09': 175684, '2020-05-10': 159459, '2020-05-11': 193267, '2020-05-12': 181988, '2020-05-13': 107274, '2020-05-14': 180345, '2020-05-15': 169278, '2020-05-16': 141762, '2020-05-17': 157660, '2020-05-18': 159126, '2020-05-19': 152434, '2020-05-20': 180419, '2020-05-21': 113446, '2020-05-22': 168129, '2020-05-23': 168755, '2020-05-24': 162165, '2020-05-25': 165694, '2020-05-26': 161429, '2020-05-27': 100477, '2020-05-28': 166631, '2020-05-29': 151054, '2020-05-30': 148664, '2020-05-31': 227103, '2020-06-01': 145312, '2020-06-02': 141767, '2020-06-03': 118005, 
'2020-06-04': 169786, '2020-06-05': 120088, '2020-06-06': 128830, '2020-06-07': 129913, '2020-06-08': 118268, '2020-06-09': 145074, '2020-06-10': 86053, '2020-06-11': 152540, '2020-06-12': 180517, '2020-06-13': 98611, '2020-06-14': 148302, '2020-06-15': 227485, '2020-06-16': 178975, '2020-06-17': 172096, '2020-06-18': 147349, '2020-06-19': 191960, '2020-06-20': 208625, '2020-06-21': 193122, '2020-06-22': 126727, '2020-06-23': 238324, '2020-06-24': 134315, '2020-06-25': 248471, '2020-06-26': 167144, '2020-06-27': 209088, '2020-06-28': 263783, '2020-06-29': 249119, '2020-06-30': 139936, '2020-07-01': 236912, '2020-07-02': 185609, '2020-07-03': 156744, '2020-07-04': 200357, '2020-07-05': 215558, '2020-07-06': 197596, '2020-07-07': 144650, '2020-07-08': 150234, '2020-07-09': 214605, '2020-07-10': 256758, '2020-07-11': 246835, '2020-07-12': 227680, '2020-07-13': 254347, '2020-07-14': 231004, '2020-07-15': 277297, '2020-07-16': 248688, '2020-07-17': 254110, '2020-07-18': 211511, '2020-07-19': 209360, '2020-07-20': 182971, '2020-07-21': 161771, '2020-07-22': 215823, '2020-07-23': 92191, '2020-07-24': 156151, '2020-07-25': 198849, '2020-07-26': 188822, '2020-07-27': 131231, '2020-07-28': 276892, '2020-07-29': 271086, '2020-07-30': 298734, '2020-07-31': 248691, '2020-08-01': 223718, '2020-08-02': 223515, '2020-08-03': 182846, '2020-08-04': 131796, '2020-08-05': 126342, '2020-08-06': 179134, '2020-08-07': 84392, '2020-08-08': 124148, '2020-08-09': 158325, '2020-08-10': 117741, '2020-08-11': 107093, '2020-08-12': 145354, '2020-08-13': 66790, '2020-08-14': 154444, '2020-08-15': 114484, '2020-08-16': 103892, '2020-08-17': 135835, '2020-08-18': 101112, '2020-08-19': 95036, '2020-08-20': 60739, '2020-08-21': 99370, '2020-08-22': 117358, '2020-08-23': 117981, '2020-08-24': 129937, '2020-08-25': 98044, '2020-08-26': 36857, '2020-08-27': 170240, '2020-08-28': 159679, '2020-08-29': 133874, '2020-08-30': 105210, '2020-08-31': 84796, '2020-09-01': 102616, '2020-09-02': 19631, 
'2020-09-03': 162955, '2020-09-04': 146549, '2020-09-05': 93257, '2020-09-06': 89652, '2020-09-07': 121356, '2020-09-08': 36922, '2020-09-09': 117827, '2020-09-10': 193334, '2020-09-11': 179803, '2020-09-12': 152254, '2020-09-13': 147634, '2020-09-14': 146124, '2020-09-15': 110707, '2020-09-16': 190407, '2020-09-17': 132233, '2020-09-18': 52306, '2020-09-19': 76762, '2020-09-20': 89633, '2020-09-21': 99335, '2020-09-22': 161165, '2020-09-23': 176233, '2020-09-24': 148991, '2020-09-25': 96869, '2020-09-26': 106853, '2020-09-27': 87595, '2020-09-28': 92033, '2020-09-29': 121104, '2020-09-30': 148997, '2020-10-01': 133100, '2020-10-02': 379026, '2020-10-03': 436282, '2020-10-04': 351184, '2020-10-05': 379817, '2020-10-06': 421511, '2020-10-07': 338279, '2020-10-08': 236816, '2020-10-09': 142917, '2020-10-10': 156889, '2020-10-11': 199151, '2020-10-12': 158029, '2020-10-13': 181063, '2020-10-14': 197712, '2020-10-15': 151932, '2020-10-16': 122393, '2020-10-17': 135731, '2020-10-18': 145412, '2020-10-19': 125424, '2020-10-20': 154964, '2020-10-21': 149130, '2020-10-22': 21164, '2020-10-23': 106903, '2020-10-24': 163843, '2020-10-25': 194218, '2020-10-26': 131788, '2020-10-27': 185754, '2020-10-28': 108401, '2020-10-29': 143916, '2020-10-30': 154400, '2020-10-31': 189231, '2020-11-01': 147902, '2020-11-02': 159463, '2020-11-03': 132584, '2020-11-04': 109027, '2020-11-05': 172654, '2020-11-06': 134046, '2020-11-07': 201783, '2020-11-08': 137350, '2020-11-09': 198107, '2020-11-10': 117073, '2020-11-11': 87193, '2020-11-12': 9359, '2020-11-13': 189160, '2020-11-14': 103914, '2020-11-15': 135074, '2020-11-16': 144441, '2020-11-17': 233598, '2020-11-18': 98005, '2020-11-19': 67618, '2020-11-20': 163859, '2020-11-21': 196601, '2020-11-22': 156792, '2020-11-23': 126540, '2020-11-24': 74520, '2020-11-25': 187570, '2020-11-26': 154747, '2020-11-27': 125643, '2020-11-28': 130822, '2020-11-29': 142522, '2020-11-30': 81579, '2020-12-01': 81744, '2020-12-02': 159790, '2020-12-03': 
200054, '2020-12-04': 187236, '2020-12-05': 160154, '2020-12-06': 158647, '2020-12-07': 88053, '2020-12-08': 16649, '2020-12-09': 193176, '2020-12-10': 194171, '2020-12-11': 176388, '2020-12-12': 140498, '2020-12-13': 133113, '2020-12-14': 132287, '2020-12-16': 42001, '2020-12-17': 127344, '2020-12-18': 214756, '2020-12-19': 183178, '2020-12-20': 183070, '2020-12-21': 225058, '2020-12-22': 212982, '2020-12-23': 163172, '2020-12-24': 150637, '2020-12-25': 89401, '2020-12-26': 27306, '2020-12-27': 108480, '2020-12-28': 179675, '2020-12-29': 181734, '2020-12-30': 223107, '2020-12-31': 181383}

In [None]:
# Tabulate the per-day hydrated tweet counts as (date, tweet_count) rows.
num_tweets_df = pd.DataFrame(
    list(num_tweets.items()),
    columns=['date', 'tweet_count'],
)

In [None]:
# Title: Number of CoronaVis Tweet IDs Successfully Hydrated Per Day
# Line plot of the daily hydrated-tweet counts with one x tick per month.
sns.set_style("whitegrid")
sns.set_context("talk", font_scale=3.1)
g = sns.relplot(
    data=num_tweets_df,
    kind="line",
    x='date',
    y='tweet_count',
    height=15,
    aspect=2.5,
)
g.set_axis_labels("Date (Month in 2020)", "Number of Hydrated Tweets", labelpad=35)
g.set(ylim=(0, 575000))
# tick_indices and month_ticks were created in the original tweet ID counts graphic
g.set(xticks=tick_indices)
g.set_xticklabels(month_ticks)

In [None]:
# visualize number of tweets successfully hydrated
# no longer used, see seaborn visualization above
# split the {date: count} dict into parallel tuples of dates (x) and counts (y)
x,y = zip(*num_tweets.items())
# very wide figure so that every date label fits on the x axis
rotate, ax = plt.subplots(figsize=(75,8))
ax.set_title('Number of Tweets Successfully Hydrated per day')
# turn the date labels vertical so they do not overlap
plt.xticks(rotation=90)
plt.plot(x, y);

In [None]:
# Totals of hydrated tweets, split at 2020-05-06: dates before it are counted
# under tweepy, dates from it onward under twarc (ISO date strings compare
# chronologically, so plain string comparison is enough).
total_hydrated_tweepy = sum(
    count for day, count in num_tweets.items() if day < '2020-05-06'
)
total_hydrated_twarc = sum(
    count for day, count in num_tweets.items() if not day < '2020-05-06'
)

print('total queried using tweepy:', total_hydrated_tweepy)
print('total queried using twarc: ', total_hydrated_twarc)
print('total queried:', total_hydrated_tweepy + total_hydrated_twarc)

In [None]:
# Fraction (0-1) of each day's tweet IDs that were successfully hydrated,
# keyed by date in chronological order. num_ids must hold every date that
# appears in num_tweets.
percent_hydrated = {
    day: num_tweets[day] / num_ids[day]
    for day in sorted(num_tweets)
}

In [None]:
# One row per date: 'decimal' is the hydrated fraction (0-1) and 'percent'
# is the same value scaled to 0-100 for plotting.
percent_hydrated_df = pd.DataFrame(
    list(percent_hydrated.items()),
    columns=['date', 'decimal'],
)
# whole-column arithmetic instead of a Python-level apply over every row
percent_hydrated_df['percent'] = percent_hydrated_df['decimal'] * 100

In [None]:
# Title: Percent of CoronaVis Tweet IDs Successfully Hydrated Per Day
# Note dip starting from the second collection of tweets by CoronaVis
# Can't find a real reason for that huge dip in later October?
sns.set_style("whitegrid")
sns.set_context("talk", font_scale=3.1)
g = sns.relplot(
    data=percent_hydrated_df,
    kind="line",
    x='date',
    y='percent',
    height=15,
    aspect=2.5,
)
g.set_axis_labels("Date (Month in 2020)", "Percent (%)", labelpad=35)
g.set(ylim=(30, 100))
# tick_indices and month_ticks were created in the original tweet ID counts graphic
g.set(xticks=tick_indices)
g.set_xticklabels(month_ticks)

In [None]:
# visualize percent of tweet ids successfully hydrated per day
# no longer used, see seaborn visualization above
# split the {date: fraction} dict into parallel tuples of dates (x) and fractions (y)
x,y = zip(*percent_hydrated.items())
# very wide figure so that every date label fits on the x axis
rotate, ax = plt.subplots(figsize=(75,8))
ax.set_title('Percentage of Tweet IDs Successfully Hydrated per day')
# turn the date labels vertical so they do not overlap
plt.xticks(rotation=90)
plt.plot(x, y);

## 2. CoronaVis: State Location Discovery and Sorting

### Part One: 03/05/2020 to 05/05/2020

In [None]:
# ------------------------------------------------------------------------------
# LOAD HYDRATED TWEET CSVs FROM GDRIVE (03-05 to 05-05)
# hydrated_dict maps yyyy-mm-dd -> DataFrame of that day's tweets
# ------------------------------------------------------------------------------
hydrated_files = glob.glob("drive/My Drive/Thesis/CoronaVis Hydrated/*.csv")
# The date key is sliced out of the path by position: the 10 characters that
# start 33 from the end (e.g. ".../2020-04-18_coronavis_hydrated.csv").
hydrated_dict = {
    path[-33:-23]: pd.read_csv(path, lineterminator='\n')
    for path in hydrated_files
}

In [None]:
# ------------------------------------------------------------------------------
# CHECK FOR STATE NAME OR ABBREVIATION IN STRING
# ------------------------------------------------------------------------------
def check_for_state(loc):
  """Find a US state mentioned in a free-text location string.

  Full state names (the keys of states_abbrev) are tried first, then
  standalone two-letter abbreviations from states_lower. Matching is
  case-insensitive.

  Args:
    loc: location text, e.g. a user-entered profile location. Anything that
      is not a string (None, NaN, ...) is treated as "no state found".

  Returns:
    states_abbrev[name] for the first matching state name, otherwise the
    first matching entry of states_lower, otherwise None.
  """
  # missing locations arrive as None / NaN rather than text
  if not isinstance(loc, str):
    return None
  loc = loc.lower()

  # check for state name
  matches = [name for name in states_abbrev if name in loc]
  for name in matches:
    # A name that is only present as part of a longer matching name (e.g.
    # "virginia" inside "west virginia") must not win over the longer one.
    if any(name != other and name in other for other in matches):
      continue
    return states_abbrev[name]

  # check for state abbreviations
  for state in states_lower:
    # can't just look for the two letters, they must stand alone
    if re.search(r'\b[,]*{}[,]*\b'.format(state), loc):
      return state

  # no state found
  return None

In [None]:
# ------------------------------------------------------------------------------
# FUNCTION TO QUERY GEOCODE
# source: https://stackoverflow.com/questions/60083187/python-geopy-nominatim-too-many-requests
# not currently used because takes too long lmao and I have enough tweets
# ------------------------------------------------------------------------------
def geocode_func(geolocator, loc, sleep_sec):
  """Geocode loc, retrying after a random pause whenever the request times out.

  Args:
    geolocator: object exposing a geocode(loc) method.
    loc: location query passed straight to geolocator.geocode.
    sleep_sec: upper bound, in seconds, of the random pause between retries.
      It is fed to randint(100, sleep_sec*100), so it is expected to be an
      integer >= 1.

  Returns:
    Whatever geolocator.geocode(loc) returns, or None if a
    GeocoderServiceError or any other exception is raised.
  """
  # NOTE(review): GeocoderTimedOut and GeocoderServiceError are not imported in
  # the visible import cell (presumably they come from geopy.exc) -- confirm
  # they are in scope before using this function.
  # Retry in a loop instead of recursing, so repeated timeouts do not grow
  # the call stack.
  while True:
    try:
      return geolocator.geocode(loc)
    except GeocoderTimedOut:
      print('TIMED OUT: GeocoderTimedOut: Retrying...')
      # pause between 1 and sleep_sec seconds (in 0.01 s steps) before retrying
      sleep(randint(1*100,sleep_sec*100)/100)
    except GeocoderServiceError:
      print('CONNECTION REFUSED: GeocoderServiceError encountered.')
      return None
    except Exception as e:
      print('ERROR: Terminating due to exception {}'.format(e))
      return None

In [None]:
# ------------------------------------------------------------------------------
# GEOPARSING: separate tweets into DFs by state
# VERSION 1: TAKES IN DF FOR A DATE, WHERE EACH ROW IS A TWEET
# ------------------------------------------------------------------------------
def find_state_locations(tweets, date):
  """Bucket one day's tweets (one DataFrame row per tweet) by US state.

  The 'place' field is used when present: non-US places count as invalid, US
  places are resolved from the end of their full_name. Only when 'place' is
  missing is the 'user_loc' text passed to check_for_state.

  Args:
    tweets: DataFrame with 'id', 'text', 'place' and 'user_loc' columns;
      'place' holds the string form of a dict with 'country_code' and
      'full_name' keys.
    date: label stored on every tweet and shown on the progress bar.

  Returns:
    (state_dict, success, unqueried, invalid): state_dict maps each entry of
    states_lower to a list of {'date', 'id', 'text'} dicts; the three ints
    count tweets assigned to a state, left undetermined, and located outside
    the US.
  """
  state_dict = {abbrev: [] for abbrev in states_lower}

  # counters (for reference of how many tweets were retained)
  success = unqueried = invalid = 0

  for _, row in tqdm(tweets.iterrows(), total=tweets.shape[0], desc=date):
    tweet = {'date': date, 'id': row['id'], 'text': row['text']}

    # PLACE FIELD PRESENT (a missing value is NaN, which never equals itself)
    if row['place'] == row['place']:
      place = ast.literal_eval(row['place'])
      if place['country_code'] != 'US':
        invalid += 1
        continue
      place_name = place['full_name'].lower()
      suffix = place_name[-2:]
      # presumably "<state name>, usa": dropping the last 5 characters leaves the name
      prefix = place_name[:-5]
      if suffix in states_lower:  # full_name ends in a state abbreviation
        target = suffix
      elif prefix in states_abbrev:  # full_name starts with a state name
        target = states_abbrev[prefix]
      else:  # would need a geocode lookup
        unqueried += 1
        continue
      state_dict[target].append(tweet)
      success += 1
      continue

    # NO PLACE AND NO USER LOCATION
    if not (row['user_loc'] == row['user_loc']):
      unqueried += 1
      continue

    # FALL BACK TO THE USER-ENTERED LOCATION TEXT
    state = check_for_state(row['user_loc'])
    if state is None:
      unqueried += 1
    else:
      state_dict[state].append(tweet)
      success += 1

  return state_dict, success, unqueried, invalid

In [None]:
# # for purpose of test runs
# f = "drive/My Drive/Thesis/CoronaVis Hydrated/2020-04-18_coronavis_hydrated.csv"
# test = pd.read_csv(f, lineterminator='\n')
# state_dict, success, unqueried, invalid = find_state_locations(test, '2020-04-18')
# print('success: ', success, 'unqueried: ', unqueried, 'invalid: ', invalid)

In [None]:
# ------------------------------------------------------------------------------
# SET UP FOR ROUND 1 OF LOCATION FINDING: 03/05-05/05
# ------------------------------------------------------------------------------
# master mapping: state abbreviation -> DataFrame of that state's tweets
tweets_by_state = {abbrev: pd.DataFrame() for abbrev in states_lower}

# per-date tallies of how many tweets were retained
counters = pd.DataFrame()

In [None]:
# run just 2020-05-05 to double check the numbers-- not sure why so many are unqueried here?
# re-reads that single day's hydrated CSV and runs the state sorter on it alone
may5_file = "drive/My Drive/Thesis/1a.CoronaVis Hydrated/2020-05-05_coronavis_hydrated.csv"
may5 = pd.read_csv(may5_file, lineterminator='\n')

# success / unqueried / invalid are the tallies to compare against the saved counters
new_state_dict, success, unqueried, invalid = find_state_locations(may5, '2020-05-05')

In [None]:
# ------------------------------------------------------------------------------
# ROUND 1 OF LOCATION FINDING: 03/05-05/05
# ------------------------------------------------------------------------------
# Run for all dates in range
for date in hydrated_dict:
  new_state_dict, success, unqueried, invalid = find_state_locations(hydrated_dict[date], date)
  # update state df
  for state in states_lower:
    new_df = pd.DataFrame(new_state_dict[state], columns=['date', 'id', 'text'])
    tweets_by_state[state] = pd.concat([tweets_by_state[state], new_df])
  # update counters df
  total = success + unqueried + invalid
  count = {'date': date, 'success': success, 'unqueried': unqueried, 'invalid': invalid, 'total': total}
  counters = counters.append(count, ignore_index=True)

# Update tweets in GDrive
for state in states_lower:
  state_df = tweets_by_state[state]
  state_df.to_csv('{}_tweets.csv'.format(state))
  !cp $state"_tweets.csv" "drive/My Drive/Thesis/By Location"

# Update counters in GDrive
counters = counters.set_index('date')
counters.to_csv('tweet_counters.csv')
!cp "tweet_counters.csv" "drive/My Drive/Thesis/By Location"

### Part Two: 05/06/2020 to 08/07/2020

In [None]:
# ------------------------------------------------------------------------------
# GEOPARSING: separate tweets into DFs by state
# VERSION 2: takes in jsonl file for a date, where each json entry is a tweet
# Used by parts 2 and 3
# ------------------------------------------------------------------------------
def find_state_locations_jsonl(jsonl_file, date, total_count):
  """Bucket one day's tweets (one JSON object per line) by US state.

  The tweet's 'place' object is used when present: non-US places count as
  invalid, US places are resolved from the end of their full_name. Only when
  'place' is null is the user's profile location passed to check_for_state.

  Args:
    jsonl_file: path of the jsonl file holding the day's hydrated tweets.
    date: label stored on every tweet and shown on the progress bar.
    total_count: number of tweets expected in the file; only used as the
      progress bar total.

  Returns:
    (state_dict, success, unqueried, invalid): state_dict maps each entry of
    states_lower to a list of {'date', 'id', 'text'} dicts; the three ints
    count tweets assigned to a state, left undetermined, and located outside
    the US.
  """
  # initialize state lists
  state_dict = {state: [] for state in states_lower}

  # counters (for reference of how many tweets were retained)
  success = 0
  unqueried = 0
  invalid = 0

  # for each tweet json entry in the date's jsonl file
  # (named tweet_reader so the imported csv.reader is not shadowed)
  with jsonlines.open(jsonl_file) as tweet_reader:
    for obj in tqdm(tweet_reader, total=total_count, desc=date):
      # get pertinent tweet info and save to dict; for a retweet the text is
      # taken from the embedded original tweet
      if 'retweeted_status' in obj:
        text = obj['retweeted_status']['full_text']
      else:
        text = obj['full_text']
      tweet = {'date': date, 'id': obj['id'], 'text': text}
      # get location info
      user_loc = obj['user']['location']
      place = obj['place']

      # CHECK PLACE FIELD IF IT EXISTS
      if place is not None:
        if place['country_code']=='US': # place in the US
          place_name = place['full_name'].lower()
          if (place_name[-2:] in states_lower): # state abbrev is in place
            state_dict[place_name[-2:]].append(tweet)
            success += 1
          elif place_name[:-5] in states_abbrev: # state name is in place
            abbrev = states_abbrev[place_name[:-5]]
            state_dict[abbrev].append(tweet)
            success += 1
          else: # search geocode for location
            unqueried += 1
        else:
          invalid += 1

      # CHECK USER_LOC FIELD IF IT EXISTS
      # JSON has no NaN, so a missing location is null or an empty string;
      # both are falsy and are counted as unqueried instead of being passed
      # to check_for_state (which would fail on None).
      elif user_loc:
        state = check_for_state(user_loc)
        if state is not None:
          state_dict[state].append(tweet)
          success += 1
        else:
          unqueried += 1

      else:
        unqueried += 1

  # return state_dict
  return state_dict, success, unqueried, invalid

In [None]:
# ------------------------------------------------------------------------------
# SET UP FOR ROUND 2 OF LOCATION FINDING: 05/06-08/07
# Plain lists are used here and converted to DataFrames later, for speed
# ------------------------------------------------------------------------------
# master mapping: state abbreviation -> list of that state's tweets
tweets_by_state_2 = {abbrev: [] for abbrev in states_lower}

# per-date tallies of how many tweets were retained
counters_2 = []

In [None]:
# # single test example
# f = "drive/My Drive/Thesis/CoronaVis jsonl/2020-05-06_coronavis_hydrated.jsonl"
# date = '2020-05-06'
# new_state_dict, success, unqueried, invalid = find_state_locations_jsonl(f, date, 194138)

In [None]:
# ------------------------------------------------------------------------------
# ROUND 2 OF LOCATION FINDING: 05/06-08/07
# ------------------------------------------------------------------------------
# tweet counts for each
daily_counts = [194138,  78221, 180063, 175684, 159459, 193267, 181988, 107274, 180345, 169278, 141762, 157660, 159126, 152434, 180419, 113446, 168129, 168755, 162165, 165694, 161429, 100477, 166631, 151054, 148664, 227103, 145312, 141767, 118005, 169786, 120088, 128830, 129913, 118268, 145074,  86053, 152540, 180517,  98611, 148302, 227485, 178975, 172096, 147349, 191960, 208625, 193122, 126727, 238324, 134315, 248471, 167144, 209088, 263783, 249119, 139936, 236912, 185609, 156744, 200357, 215558, 197596, 144650, 150234, 214605, 256758, 246835, 227680, 254347, 231004, 277297, 248688, 254110, 211511, 209360, 182971, 161771, 215823,  92191, 156151, 198849, 188822, 131231, 276892, 271086, 298734, 248691, 223718, 223515, 182846, 131796, 126342, 179134,  84392]
index = 0

# Run for all dates in range
jsonl_files = glob.glob("drive/My Drive/Thesis/CoronaVis jsonl/*.jsonl")
for jsonl_file in sorted(jsonl_files):
  date = jsonl_file[-35:-25]
  new_state_dict, success, unqueried, invalid = find_state_locations_jsonl(jsonl_file, date, daily_counts[index])
  index += 1
  # update state df
  for state in states_lower:
    tweets_by_state_2[state] = tweets_by_state_2[state] + new_state_dict[state]
  # update counters df
  total = success + unqueried + invalid
  count = {'date': date, 'success': success, 'unqueried': unqueried, 'invalid': invalid, 'total': total}
  counters_2.append(count)

# Update tweets in GDrive
for state in states_lower:
  # get first half of tweets from GDrive
  f = "drive/My Drive/Thesis/By Location/{}_tweets.csv".format(state)
  df_1 = pd.read_csv(f, lineterminator='\n')
  # get new df, concatenate with old
  df_2 = pd.DataFrame(tweets_by_state_2[state], columns=['date', 'id', 'text'])
  joint_df = pd.concat([df_1, df_2])
  joint_df.to_csv('{}_tweets.csv'.format(state))
  !cp $state"_tweets.csv" "drive/My Drive/Thesis/By Location"

# Update counters in GDrive
f = "drive/My Drive/Thesis/By Location/tweet_counters.csv".format(state)
old_counters = pd.read_csv(f, lineterminator='\n')
old_counters = old_counters.set_index('date')
new_counters = pd.DataFrame(counters_2, columns=['date', 'success', 'unqueried', 'invalid'])
new_counters = new_counters.set_index('date')
joint_counters = pd.concat([old_counters, new_counters])
joint_counters.to_csv('tweet_counters.csv')
!cp "tweet_counters.csv" "drive/My Drive/Thesis/By Location"

### Part Three: 08/08/2020 to 12/31/2020

In [None]:
# ------------------------------------------------------------------------------
# SET UP FOR ROUND 3 OF LOCATION FINDING: 08/08-12/31
# Plain lists are used here and converted to DataFrames later, for speed
# ------------------------------------------------------------------------------
# master mapping: state abbreviation -> list of that state's tweets
tweets_by_state_3 = {abbrev: [] for abbrev in states_lower}

# per-date tallies of how many tweets were retained
counters_3 = []

In [None]:
# ------------------------------------------------------------------------------
# ROUND 3 OF LOCATION FINDING: 08/08-12/31
# need to first run cells for find_state_locations_jsonl (in part two) and 
# check_for_state (in part one)
# ------------------------------------------------------------------------------
# tweet counts for each
daily_counts = [124148, 158325, 117741, 107093, 145354, 66790, 154444, 114484, 103892, 135835, 101112, 95036, 60739, 99370, 117358, 117981, 129937, 98044, 36857, 170240, 159679, 133874, 105210, 84796, 102616, 19631, 162955, 146549, 93257, 89652, 121356, 36922, 117827, 193334, 179803, 152254, 147634, 146124, 110707, 190407, 132233, 52306, 76762, 89633, 99335, 161165, 176233, 148991, 96869, 106853, 87595, 92033, 121104, 148997, 133100, 379026, 436282, 351184, 379817, 421511, 338279, 236816, 142917, 156889, 199151, 158029, 181063, 197712, 151932, 122393, 135731, 145412, 125424, 154964, 149130, 21164, 106903, 163843, 194218, 131788, 185754, 108401, 143916, 154400, 189231, 147902, 159463, 132584, 109027, 172654, 134046, 201783, 137350, 198107, 117073, 87193, 9359, 189160, 103914, 135074, 144441, 233598, 98005, 67618, 163859, 196601, 156792, 126540, 74520, 187570, 154747, 125643, 130822, 142522, 81579, 81744, 159790, 200054, 187236, 160154, 158647, 88053, 16649, 193176, 194171, 176388, 140498, 133113, 132287, 42001, 127344, 214756, 183178, 183070, 225058, 212982, 163172, 150637, 89401, 27306, 108480, 179675, 181734, 223107, 181383]
index = 0

# Run for all dates in range
jsonl_files = glob.glob("drive/My Drive/Thesis/1c.CoronaVis jsonl (aug-dec)/*.jsonl")
for jsonl_file in sorted(jsonl_files):
  date = jsonl_file[-35:-25]
  new_state_dict, success, unqueried, invalid = find_state_locations_jsonl(jsonl_file, date, daily_counts[index])
  index += 1
  # update state df
  for state in states_lower:
    tweets_by_state_3[state] = tweets_by_state_3[state] + new_state_dict[state]
  # update counters df
  total = success + unqueried + invalid
  count = {'date': date, 'success': success, 'unqueried': unqueried, 'invalid': invalid, 'total': total}
  counters_3.append(count)

# Update tweets in GDrive
for state in states_lower:
  # get first half of tweets from GDrive
  f = "drive/My Drive/Thesis/2. CoronaVis By Location/{}_tweets.csv".format(state)
  df_1 = pd.read_csv(f, lineterminator='\n')
  # get new df, concatenate with old
  df_2 = pd.DataFrame(tweets_by_state_3[state], columns=['date', 'id', 'text'])
  joint_df = pd.concat([df_1, df_2])
  joint_df.to_csv('{}_tweets.csv'.format(state))
  !cp $state"_tweets.csv" "drive/My Drive/Thesis/2. CoronaVis By Location"

# Update counters in GDrive
f = "drive/My Drive/Thesis/2. CoronaVis By Location/tweet_counters.csv"
old_counters = pd.read_csv(f, lineterminator='\n')
old_counters = old_counters.set_index('date')
new_counters = pd.DataFrame(counters_3, columns=['date', 'success', 'unqueried', 'invalid'])
new_counters = new_counters.set_index('date')
joint_counters = pd.concat([old_counters, new_counters])
joint_counters.to_csv('tweet_counters.csv')
!cp "tweet_counters.csv" "drive/My Drive/Thesis/2. CoronaVis By Location"

### Outlier Check: 05/05/2020's low success rate
yikes

In [None]:
# ------------------------------------------------------------------------------
# Load originally hydrated file
# ------------------------------------------------------------------------------
file = "drive/My Drive/Thesis/1a.CoronaVis Hydrated/2020-05-05_coronavis_hydrated.csv"
hydrated_05_05 = pd.read_csv(file, lineterminator='\n')

In [None]:
state_dict, success, unqueried, invalid = find_state_locations(hydrated_05_05, '2020-05-05')
print('success: ', success, 'unqueried: ', unqueried, 'invalid: ', invalid)

In [None]:
# ------------------------------------------------------------------------------
# Load newly hydrated file (note: this was hydrated at a later date than part 1, so 
# success rate was lower, as more tweets were deleted / made unavailable over time)
# ------------------------------------------------------------------------------
file = "drive/My Drive/Thesis/2020-05-05_coronavis_hydrated.jsonl"

In [None]:
date = '2020-05-05'
new_state_dict, success, unqueried, invalid = find_state_locations_jsonl(file, date, 153462)
print('success: ', success, 'unqueried: ', unqueried, 'invalid: ', invalid)

....dang it lol 

### Outlier Check: 11/12/2020's low-ish success rate
(this one seems fine)

In [None]:
# ------------------------------------------------------------------------------
# Load newly hydrated file (note: this was hydrated at a later date than part 1, so 
# success rate was lower, as more tweets were deleted / made unavailable over time)
# ------------------------------------------------------------------------------
f2_11_12 = "drive/My Drive/Thesis/2020-11-12_coronavis_hydrated.jsonl"

In [None]:
# Re-run the state sorter on the re-hydrated 2020-11-12 file. The date label
# must match the file being checked: it is stored on every returned tweet and
# shown on the progress bar.
date = '2020-11-12'
# 9282 is only used as the progress bar total
new_state_dict, success, unqueried, invalid = find_state_locations_jsonl(f2_11_12, date, 9282)
print('success: ', success, 'unqueried: ', unqueried, 'invalid: ', invalid)

In [None]:
joint_counters.loc['2020-11-12']

### Fix 05/05/2020 data

In [None]:
f = "drive/My Drive/Thesis/2. CoronaVis By Location/tweet_counters.csv"
old_counters = pd.read_csv(f, lineterminator='\n')
old_counters = old_counters.set_index('date')
old_counters.drop('2020-05-05', inplace = True)
new_counters = pd.DataFrame(counters_4, columns=['date', 'success', 'unqueried', 'invalid'])
new_counters = new_counters.set_index('date')
joint_counters = pd.concat([old_counters, new_counters])
joint_counters = joint_counters.sort_index()
joint_counters.to_csv('tweet_counters.csv')
!cp "tweet_counters.csv" 

In [None]:
# ------------------------------------------------------------------------------
# ROUND 4 OF LOCATION FINDING: fix 05/05 data
# ------------------------------------------------------------------------------
# Run state sorting code 
jsonl_file = file = "drive/My Drive/Thesis/2020-05-05_coronavis_hydrated.jsonl"
date = '2020-05-05'
new_state_dict, success, unqueried, invalid = find_state_locations_jsonl(jsonl_file, date, 153462)
# update counters df
total = success + unqueried + invalid
count = {'date': date, 'success': success, 'unqueried': unqueried, 'invalid': invalid, 'total': total}
counters_4 = []
counters_4.append(count)

# Update tweets in GDrive
for state in states_lower:
  # get rest of tweets from GDrive
  f = "drive/My Drive/Thesis/2. CoronaVis By Location/{}_tweets.csv".format(state)
  df_1 = pd.read_csv(f, lineterminator='\n')
  # remove previous 05/05 data and unnecessary index columns
  df_1.drop('Unnamed: 0', inplace=True, axis=1)
  df_1.drop('Unnamed: 0.1', inplace=True, axis=1)
  df_1.drop('Unnamed: 0.1.1', inplace=True, axis=1)
  df_1.drop(df_1[df_1['date'] == '2020-05-05'].index, inplace = True)
  # get new df and concatenate with old
  df_2 = pd.DataFrame.from_dict(new_state_dict[state])
  joint_df = pd.concat([df_1, df_2])
  print(state, ": ", joint_df.shape[0]==(df_1.shape[0] + df_2.shape[0]))
  # reorder dates again
  joint_df = joint_df.sort_values(by='date')
  # save to GDrive
  joint_df.to_csv('{}_tweets.csv'.format(state))
  !cp $state"_tweets.csv" "drive/My Drive/Thesis/2. CoronaVis By Location"

# Update counters in GDrive
f = "drive/My Drive/Thesis/2. CoronaVis By Location/tweet_counters.csv"
old_counters = pd.read_csv(f, lineterminator='\n')
old_counters = old_counters.set_index('date')
old_counters.drop('2020-05-05', inplace = True)
new_counters = pd.DataFrame(counters_4, columns=['date', 'success', 'unqueried', 'invalid'])
new_counters = new_counters.set_index('date')
joint_counters = pd.concat([old_counters, new_counters])
joint_counters = joint_counters.sort_index()
joint_counters.to_csv('tweet_counters.csv')
!cp "tweet_counters.csv" "drive/My Drive/Thesis/2. CoronaVis By Location"

## CoronaVis: Analysis of Location Sorted Tweets

In [None]:
# ------------------------------------------------------------------------------
# COUNT NUMBER OF TWEETS FOR EACH STATE
# sorted_tweets: {state abbreviation: DataFrame of that state's tweets}
# num_sorted_tweets: {state abbreviation: number of rows in that DataFrame}
# ------------------------------------------------------------------------------
location_files = glob.glob("drive/My Drive/Thesis/2. CoronaVis By Location/*.csv")
# every CSV in the folder except the counters file is a "<state>_tweets.csv";
# the state abbreviation is the 2 characters that start 13 from the path's end
sorted_tweets = {
    path[-13:-11]: pd.read_csv(path, lineterminator='\n')
    for path in sorted(location_files)
    if path != "drive/My Drive/Thesis/2. CoronaVis By Location/tweet_counters.csv"
}
num_sorted_tweets = {abbrev: len(frame) for abbrev, frame in sorted_tweets.items()}

In [None]:
# One row per state: uppercase abbreviation and number of sorted tweets.
num_sorted_tweets_df = pd.DataFrame(list(num_sorted_tweets.items()),columns = ['state', 'count'])
# num_sorted_tweets_df = num_sorted_tweets_df.set_index('state')
# uppercase the whole column at once instead of applying a lambda row by row
num_sorted_tweets_df['state'] = num_sorted_tweets_df['state'].str.upper()
num_sorted_tweets_df

In [None]:
# add up the per-state counts
total = sum(num_sorted_tweets.values())
print("Total number of successfully sorted tweets:", total)

In [None]:
# load the saved per-date counters from GDrive, indexed by date
counters_file = "drive/My Drive/Thesis/2. CoronaVis By Location/tweet_counters.csv"
joint_counters = pd.read_csv(counters_file, lineterminator='\n').set_index('date')

In [None]:
joint_counters['total'].sum()

In [None]:
# Title: Total Number of Tweets Available Per State
# Bar chart with one bar per state, state labels turned vertical.
sns.set_context("talk", font_scale=3.1)
g = sns.catplot(
    data=num_sorted_tweets_df,
    kind="bar",
    x="state",
    y="count",
    height=15,
    aspect=2.5,
    palette="Set2",
    edgecolor=".6",
)
g.set_axis_labels("State", "Number of Tweets", labelpad=35)
g.set_xticklabels(rotation=90)

In [None]:
# visualize number of tweets available per state
# no longer used (see seaborn one above)
# split the {state: count} dict into parallel tuples of states (x) and counts (y)
x,y = zip(*num_sorted_tweets.items())
fig, ax = plt.subplots(figsize=(35,5))
ax.set_title('Number of Tweets Available per State')
# turn the state labels vertical so they do not overlap
plt.xticks(rotation=90)
plt.bar(x, y);

In [None]:
# report the states with the most and the fewest sorted tweets, looked up from
# the counts instead of being hard-coded
max_state = max(num_sorted_tweets, key=num_sorted_tweets.get)
min_state = min(num_sorted_tweets, key=num_sorted_tweets.get)
print('max is {} with {} tweets'.format(max_state.upper(), num_sorted_tweets[max_state]))
print('min is {} with {} tweets'.format(min_state.upper(), num_sorted_tweets[min_state]))

In [None]:
# calculate percentages and save into the gdrive
# already done!
# The '%...' columns hold fractions of the daily total (0-1), computed with
# whole-column arithmetic instead of a Python-level apply over every row.
joint_counters['total'] = joint_counters.success + joint_counters.invalid + joint_counters.unqueried
joint_counters['%success'] = joint_counters['success'] / joint_counters['total']
joint_counters['%invalid'] = joint_counters['invalid'] / joint_counters['total']
joint_counters['%unqueried'] = joint_counters['unqueried'] / joint_counters['total']

joint_counters.to_csv('tweet_counters.csv')
# !cp "tweet_counters.csv" "drive/My Drive/Thesis/2. CoronaVis By Location"

In [None]:
# print the mean of each daily rate column across all dates
for rate_name in ('success', 'unqueried', 'invalid'):
  print('mean of {} rate: '.format(rate_name), joint_counters['%' + rate_name].mean())

In [None]:
# Reshape the counters to long format for a multi-line graph: one row per
# (date, outcome) pair, with the three outcomes stacked one after another.
outcome_labels = {
    '%success': 'success',
    '%invalid': 'not_in_USA',
    '%unqueried': 'undetermined',
}
all_dates = joint_counters.index.tolist()
length = len(all_dates)

date, percent, result = [], [], []
for column, label in outcome_labels.items():
  date += all_dates
  percent += joint_counters[column].tolist()
  result += [label] * length

state_classification = pd.DataFrame({'date': date, 'percent': percent, 'State Result': result})

In [None]:
# Tick positions for the date axis: the row index of the first date of each
# month, paired with the month labels below.
month_ticks = ['Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

# characters 5-6 of a yyyy-mm-dd string are the month
months = [day[5:7] for day in joint_counters.index.tolist()]
# a tick goes wherever the month differs from the previous row's month
# (the first row is compared against an empty string)
tick_indices = [
    position
    for position, (previous, current) in enumerate(zip([''] + months, months))
    if current != previous
]

In [None]:
# Title: Tweet State Attribution Results: An Overview
# One line per outcome (success / not_in_USA / undetermined) across all dates.
sns.set_style("whitegrid")
sns.set_context("talk", font_scale=3.1)
g = sns.relplot(
    data=state_classification,
    kind="line",
    x='date',
    y='percent',
    hue='State Result',
    height=15,
    aspect=2.5,
    palette="Set2",
)
g.set_axis_labels("Date (Month in 2020)", "Percent (%)", labelpad=35)
# one tick per month, labelled with the month name
g.set(xticks=tick_indices)
g.set_xticklabels(month_ticks)

In [None]:
# Title: Tweet State Attribution Results: Percent Successful
# Daily '%success' value plotted against the date index.
sns.set_style("whitegrid")
sns.set_context("talk", font_scale=3.1)
g = sns.relplot(
    data=joint_counters,
    kind="line",
    x=joint_counters.index,
    y='%success',
    height=15,
    aspect=2.5,
    palette="Set2",
)
g.set_axis_labels("Date (Month in 2020)", "Percent (%)", labelpad=35)
# one tick per month, labelled with the month name
g.set(xticks=tick_indices)
g.set_xticklabels(month_ticks)

In [None]:
# Title: Tweet State Attribution Results: Percent With Non-USA Country Code
# Daily '%invalid' value plotted against the date index.
sns.set_style("whitegrid")
sns.set_context("talk", font_scale=3.1)
g = sns.relplot(
    data=joint_counters,
    kind="line",
    x=joint_counters.index,
    y='%invalid',
    height=15,
    aspect=2.5,
    palette="Set2",
)
g.set_axis_labels("Date (Month in 2020)", "Percent (%)", labelpad=35)
# one tick per month, labelled with the month name
g.set(xticks=tick_indices)
g.set_xticklabels(month_ticks)

In [None]:
# Title: Tweet State Attribution Results: Percent Undetermined
# Daily '%unqueried' value plotted against the date index.
sns.set_style("whitegrid")
sns.set_context("talk", font_scale=3.1)
g = sns.relplot(
    data=joint_counters,
    kind="line",
    x=joint_counters.index,
    y='%unqueried',
    height=15,
    aspect=2.5,
    palette="Set2",
)
g.set_axis_labels("Date (Month in 2020)", "Percent (%)", labelpad=35)
# one tick per month, labelled with the month name
g.set(xticks=tick_indices)
g.set_xticklabels(month_ticks)

In [None]:
# ignore this, this is an awful graphic LOL
joint_counters.plot.barh(y=['%success', '%invalid', '%unqueried'], stacked=True, color={"%invalid": "red", "%success": "LightGreen", "%unqueried": "LightBlue"}, figsize=(20,75));

## 3. Sentiment Analysis of Tweets by Location
Performed using VADER: https://github.com/cjhutto/vaderSentiment

In [None]:
# ------------------------------------------------------------------------------
# SET UP VADER, INITIALIZE ANALYZER
# ------------------------------------------------------------------------------
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyser = SentimentIntensityAnalyzer()

In [None]:
print(states_lower)

In [None]:
# ------------------------------------------------------------------------------
# HANDLE STATES IN SMALL GROUPS TO AVOID SESSION CRASH AFTER USING ALL AVAILABLE RAM
# ------------------------------------------------------------------------------
set1 = ['al', 'ak', 'az', 'ar', 'co', 'ca', 'ct', 'de', 'ga', 'hi', 'id'] # done
set2 = ['fl', 'il', 'in', 'ia', 'ks', 'ky'] # done
set3 = ['la', 'me', 'md', 'ma', 'mi', 'mn', 'ms', 'mo', 'mt', 'ne', 'nv', 'nh'] # done
set4 = ['nj', 'nm', 'ny', 'nc', 'nd', 'oh', 'ok', 'or', 'pa', 'ri', 'sc', 'sd', 'tn'] # done
set5 = ['tx', 'ut', 'vt', 'va', 'wa', 'wv', 'wi', 'wy']

In [None]:
# ------------------------------------------------------------------------------
# GET HYDRATED, SORTED TWEET CSVs FROM GDRIVE
# tweets: {state abbreviation: DataFrame of that state's tweets}, limited to
# the states in the currently selected set
# ------------------------------------------------------------------------------
location_files = glob.glob("drive/My Drive/Thesis/2. CoronaVis By Location/*.csv")
# the state abbreviation is the 2 characters that start 13 from the path's end
# ("<state>_tweets.csv"); files whose slice is not in the set are skipped
tweets = {
    path[-13:-11]: pd.read_csv(path, lineterminator='\n')
    for path in sorted(location_files)
    if path[-13:-11] in set1  # CHANGE SET HERE
}

In [None]:
# ------------------------------------------------------------------------------
# DEFINE FUNCTIONS TO CLEAN THROUGH TWEETS
# CODE ADAPTED FROM : https://medium.com/python-in-plain-english/twitter-sentiment-analysis-using-vader-tweepy-b2a62fba151e
# ------------------------------------------------------------------------------
def remove_pattern(input_txt, pattern1, pattern2, pattern3):
  """Return input_txt with every match of the three regex patterns removed.

  The patterns are applied in order (pattern1, then pattern2, then pattern3),
  each on the result of the previous removal.

  The patterns themselves are substituted directly. Re-using each *matched
  text* as a new regex (as was done previously) clipped longer tokens that
  merely start with a shorter match (removing '@ab' turned '@abc' into 'c')
  and raised re.error when a match contained regex metacharacters.
  """
  for pattern in (pattern1, pattern2, pattern3):
    input_txt = re.sub(pattern, '', input_txt)

  return input_txt

def clean_tweets(tweets):
  """Apply remove_pattern element-wise to a collection of tweet texts.

  Returns the array produced by np.vectorize(remove_pattern), i.e. one cleaned
  string per input element.
  """
  #remove twitter Return handles (RT @xxx:), twitter handles (@xxx), URL links (httpxxx)
  # raw strings: "\w" in a normal literal is an invalid escape sequence
  tweets = np.vectorize(remove_pattern)(tweets, r"RT @[\w]*:", r"@[\w]*", r"https?://[A-Za-z0-9./]*")
  return tweets

In [None]:
# ------------------------------------------------------------------------------
# CLEAN THROUGH TWEETS
# ------------------------------------------------------------------------------
# print a numbered, timestamped progress line per state, then add the
# 'clean_text' column computed from that state's 'text' column
for index, state in enumerate(sorted(tweets.keys()), start=1):
  started_at = datetime.now().time().strftime('%H:%M:%S')
  print(index, state, started_at)
  tweets[state]['clean_text'] = clean_tweets(tweets[state]['text'])

In [None]:
# ------------------------------------------------------------------------------
# REMOVE UNNECESSARY COLS FROM DFs
# ------------------------------------------------------------------------------
leftover_columns = ('Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1')
for state in tweets.keys():
  # dropped one at a time, in this order, directly on the stored frame
  for column in leftover_columns:
    tweets[state].drop(column, inplace=True, axis=1)

In [None]:
for state in sorted(tweets.keys()):
  scores_list = []
  for index, row in tqdm(tweets[state].iterrows(), total=tweets[state].shape[0], desc=state):
    text = row['clean_text']
    sentiment = analyser.polarity_scores(text)
    com = sentiment["compound"]
    pos = sentiment["pos"]
    neu = sentiment["neu"]
    neg = sentiment["neg"]
      
    scores_list.append({"Compound": com, "Positive": pos, "Negative": neg, "Neutral": neu})

  sentiment_scores = pd.DataFrame.from_dict(scores_list)
  tweets[state] = tweets[state].join(sentiment_scores)

  tweets[state].to_csv('{}_tweets_sentiment.csv'.format(state))
  !cp $state"_tweets_sentiment.csv" "drive/My Drive/Thesis/3. CoronaVis Sentiment"

In [None]:
# list the column names of every loaded state frame
for state in sorted(tweets):
  column_names = tweets[state].columns.tolist()
  print(state, column_names)

### Fix 05/05 Data

In [None]:
# ------------------------------------------------------------------------------
# Fix sentiment analysis of just 05/05 data
# ------------------------------------------------------------------------------
# NOTE: new_state_dict is not defined in this cell; it must already exist in the
# notebook session and is indexed by state abbreviation.
# For every state this rebuilds the 2020-05-05 rows (clean + score), swaps them
# into the previously saved sentiment CSV and overwrites that CSV on GDrive.
for state in states_lower:
  # Perform sentiment analysis on 05/05/2020 data for the state
  new_df = pd.DataFrame.from_dict(new_state_dict[state])
  new_df['clean_text'] = clean_tweets(new_df['text'])

  # score each cleaned tweet, one dict of four scores per row
  scores_list = []
  for index, row in tqdm(new_df.iterrows(), total=new_df.shape[0], desc=state):
    text = row['clean_text']
    sentiment = analyser.polarity_scores(text)
    com = sentiment["compound"]
    pos = sentiment["pos"]
    neu = sentiment["neu"]
    neg = sentiment["neg"]
      
    scores_list.append({"Compound": com, "Positive": pos, "Negative": neg, "Neutral": neu})

  # attach the score columns to the new rows (joined on the row index)
  sentiment_scores = pd.DataFrame.from_dict(scores_list)
  new_df = new_df.join(sentiment_scores)

  # get original sentiment analyzed state file
  f = "drive/My Drive/Thesis/3. CoronaVis Sentiment/{}_tweets_sentiment.csv".format(state)
  og_df = pd.read_csv(f, lineterminator='\n')
  # remove previous 05/05 data and unnecessary index columns
  og_df.drop('Unnamed: 0', inplace=True, axis=1)
  og_df.drop(og_df[og_df['date'] == '2020-05-05'].index, inplace = True)
  # get new df and concatenate with old
  # (row index labels are kept as-is, so labels from og_df and new_df may repeat)
  joint_df = pd.concat([og_df, new_df])
  # reorder dates again
  joint_df = joint_df.sort_values(by='date')
  # print the row count of the merged frame as a sanity check
  print(joint_df.shape[0])
  # save to GDrive (written locally first, then copied over the existing file)
  joint_df.to_csv('{}_tweets_sentiment.csv'.format(state))
  !cp $state"_tweets_sentiment.csv" "drive/My Drive/Thesis/3. CoronaVis Sentiment"

# **Prepare Input/Output Dataframes**

### Load COVID Historical Data from CSV files in GDrive

In [None]:
# function to turn date string '2020-03-05' into int 20200305
def dateStrToInt(row):
  """Convert row['date'] from a 'yyyy-mm-dd' string to the int yyyymmdd.

  `row` is anything indexable by 'date' (e.g. a DataFrame row when used with
  DataFrame.apply(..., axis=1)).
  """
  date = row['date']
  # builtin int: the np.int alias was removed in NumPy 1.24
  return int(date[:4] + date[5:7] + date[8:10])

In [None]:
# ------------------------------------------------------------------------------
# load state COVID case data from GDrive, save into master_history: {state: df}
# data loaded in 2 parts (plus a separate dataQualityGrade table)
# ------------------------------------------------------------------------------
# part 1: every row order is reversed after loading and the index renumbered
master_history_1 = {} # empty dict to fill with dfs
date = '2020-10-27' # date you want to load historical data from
for state in states_lower:
  path = 'drive/My Drive/Thesis/{}/{}_{}_historical.csv'.format(date, date, state)
  raw_history = pd.read_csv(path, index_col=0)
  master_history_1[state] = raw_history.iloc[::-1].reset_index(drop=True)

# part 2: same layout, later download
master_history_2 = {} # empty dict to fill with dfs
date = '2021-03-12' # date you want to load historical data from
for state in states_lower:
  path = 'drive/My Drive/Thesis/{}/{}_{}_historical.csv'.format(date, date, state)
  raw_history = pd.read_csv(path, index_col=0)
  master_history_2[state] = raw_history.iloc[::-1].reset_index(drop=True)

# part 3: dataQualityGrade tables, re-indexed by the integer date (yyyymmdd)
master_history_3 = {} # empty dict to fill with dfs
for state in states_lower:
  path = 'drive/My Drive/Thesis/dataQualityGrades/{}_dataQualityGrade.csv'.format(state)
  grades = pd.read_csv(path).iloc[::-1].reset_index(drop=True)
  grades = grades.drop(columns=['Unnamed: 0'])
  grades['date'] = grades.apply(dateStrToInt, axis=1)
  master_history_3[state] = grades.set_index('date')

In [None]:
# double check final date in all master_history_1 dfs is 1026 -- it is :)
# for state in states_lower:
  # print(master_history_1[state].tail(1)['date'] == 20201026)

In [None]:
# # ------------------------------------------------------------------------------
# # data exploration
# # ------------------------------------------------------------------------------
# master_history['va'].loc[:368]['positive'].isnull().values.any()
# # master_history['nj']
# master_history['va'].loc[365:370]

**Note**: positiveIncrease category has no NaN values. The 'positive' category either has 0 values or NaN values until the first report of cases.

**Categories to use**: positive, positiveIncrease, negative, dataQualityGrade (quantified)

### COVID Historical Data: prep for model

In [None]:
def replaceDataQualityGrade(row):
  """Return the row's dataQualityGrade, back-filling it when it is missing.

  Meant for DataFrame.apply(..., axis=1) inside a `for state in ...` loop: the
  lookup table is master_history_3[state], where `state` is the notebook-level
  loop variable (it is NOT a parameter of this function).

  - A grade that is present is returned unchanged (previously the function
    fell through and returned None, erasing every existing grade).
  - A missing grade (None or NaN; pandas stores missing CSV cells as NaN, which
    `is None` never matched) is looked up by row['date'] in master_history_3.
  - If that date is not in the lookup table, the missing value is returned
    as-is.
  """
  grade = row['dataQualityGrade']
  if not pd.isna(grade):
    return grade

  lookup = master_history_3[state]
  date = row['date']
  if date in lookup.index:
    return lookup.loc[date]['dataQualityGrade']
  return grade

In [None]:
# ------------------------------------------------------------------------------
# combine two sets of data
# ------------------------------------------------------------------------------
master_history = {}
for state in states_lower:
  later = master_history_2[state]
  # keep only 2020-10-27 .. 2020-12-31 from the second download (rows outside
  # that window are removed, exactly the rows the two drops used to remove)
  outside_window = (later.date < 20201027) | (later.date > 20201231)
  master_history_2[state] = later[~outside_window]
  # ignore_index: renumber rows 0..n-1 so that index labels are unique and equal
  # to row positions (later cells feed labels into .iloc)
  master_history[state] = pd.concat([master_history_1[state], master_history_2[state]],
                                    ignore_index=True)

In [None]:
# ------------------------------------------------------------------------------
# fill in dataQualityGrade data from new API (v2)
# ------------------------------------------------------------------------------
# the loop variable must be called `state`: replaceDataQualityGrade reads it
# as a global to pick the lookup table
for state in states_lower:
  state_history = master_history[state]
  state_history['dataQualityGrade'] = state_history.apply(replaceDataQualityGrade, axis=1)

In [None]:
# display the last rows of one state's combined history as a spot check
master_history['tx'].tail()

In [None]:
# ------------------------------------------------------------------------------
# get list of states which have NaN values in the positive category
# ------------------------------------------------------------------------------
nanStates = [
  state for state in states_lower
  if master_history[state]['positive'].isnull().values.any()
]

nanStates # print list

In [None]:
# ------------------------------------------------------------------------------
# remove all rows with NaN / zero values in the positive column so it starts 
# with first positive cases
# (this gets rid of NaN values in the positiveIncrease column too)
# ------------------------------------------------------------------------------
for state in states_lower:
  positive = master_history[state]['positive']
  # True where a real, non-zero case count is reported
  has_cases = (positive.notna() & positive.ne(0)).values
  if has_cases.any():
    # argmax on a boolean array gives the POSITION of the first True, which is
    # what .iloc needs (index labels are not guaranteed to equal positions)
    first_position = int(has_cases.argmax())
    master_history[state] = master_history[state].iloc[first_position:]
  # a state without any non-zero count is left untouched

In [None]:
# double check: print every state that still has a NaN in 'positive'
for state in states_lower:
  positive = master_history[state]['positive']
  if positive.isna().any():
    print(state)

In [None]:
# ------------------------------------------------------------------------------
# zero out all null values in the negative column
# ------------------------------------------------------------------------------
for state in states_lower:
  negative = master_history[state]['negative']
  if not negative.isnull().values.any():
    continue # nothing to fill for this state
  master_history[state]['negative'] = negative.fillna(0)

# ------------------------------------------------------------------------------
# previously: check where the NaN values in the negative column are and zero 
# them out if they're all at the end
# ------------------------------------------------------------------------------
  # if master_history[state]['negative'].isnull().values.any():
    # nullList = np.where(master_history[state]['negative'].isnull())[0]
    # rangeList=np.arange(nullList[0], master_history[state].shape[0])
    # if (rangeList == nullList).all():
    #   master_history[state]['negative'] = master_history[state]['negative'].fillna(0)
    #   print(state, 'nulls at the end, zeroed out! :)')
    # else: 
    #   print(state, ': nulls, not all at end, need to fix :(')

In [None]:
# ------------------------------------------------------------------------------
# replace dataQualityGrade with a numerical score (assign NaN equivalent to a C)
# ------------------------------------------------------------------------------
def numberGrade(row):
  """Map row['dataQualityGrade'] (a letter grade) to its numeric score.

  Any value that is not one of the known letters, including a missing grade,
  gets 75, the same score as a 'C'.
  """
  letter_scores = {'A+': 100, 'A': 95, 'B': 85, 'C': 75, 'D': 65, 'F': 55}
  return letter_scores.get(row['dataQualityGrade'], 75)

# overwrite the letter grades with their numeric scores, state by state
for state in states_lower:
  state_history = master_history[state]
  state_history['dataQualityGrade'] = state_history.apply(numberGrade, axis=1)

In [None]:
# ------------------------------------------------------------------------------
# Continue cleaning up df: reformat date, flip order of rows to be consistent 
# with the twitter df, rename columns, remove hospitalizedIncrease and 
# deathIncrease columns for poor data quality
# ------------------------------------------------------------------------------
def dateFormat(date):
  """Turn a yyyymmdd value (int or str) into the string 'yyyy-mm-dd'."""
  digits = str(date)
  year, month, day = digits[0:4], digits[4:6], digits[6:8]
  return "-".join((year, month, day))

for state in states_lower:
  # reverse the row order, renumber, and drop the columns that are not used
  flipped = master_history[state][::-1].reset_index()
  flipped = flipped.drop(columns=['index', 'hospitalizedIncrease', 'deathIncrease'])
  # yyyymmdd -> 'yyyy-mm-dd'
  flipped['date'] = flipped['date'].map(dateFormat)
  # positional rename: the frame must have exactly these five columns left
  flipped.columns = ['date', 'posCases', 'posIncCases', 'negCases', 'dataQualityGrade']
  master_history[state] = flipped

In [None]:
# example state: display the cleaned history frame for Washington
master_history['wa']

### Tweet Data: Prepare for model (only run once; saved to GDrive after)

In [None]:
# ------------------------------------------------------------------------------
# GET MOST RECENT TWEET CSVs FROM GDRIVE
# master_tweets: {state: tweets}
# ------------------------------------------------------------------------------
location_files = glob.glob("drive/My Drive/Thesis/3. CoronaVis Sentiment/*.csv")
master_tweets = {}
for file in sorted(location_files):
  # characters [-23:-21] of the path are used as the state key
  state = file[-23:-21]
  state_tweets = pd.read_csv(file, lineterminator='\n')
  master_tweets[state] = state_tweets

In [None]:
# ------------------------------------------------------------------------------
# for each state, find daily tweet counts and sentiment averages across tweets
# save to Google Drive (folder: 4. Daily Tweet Summaries)
# ------------------------------------------------------------------------------
for state in states_lower:
  daily_tweets = {}
  date = '2020-03-05' # first date for the tweets
  scores = {'count': 0, 'Compound': 0, 'Positive': 0, 'Negative': 0, 'Neutral': 0}
  for index, row in tqdm(master_tweets[state].iterrows(), total=master_tweets[state].shape[0], desc=state):
    if row['date'] == date:
      scores['count'] += 1
      scores['Compound'] += row['Compound']
      scores['Positive'] += row['Positive']
      scores['Negative'] += row['Negative']
      scores['Neutral'] += row['Neutral']
    else:
      # find average scores add prev date to the dict
      scores['Compound'] = scores['Compound'] / scores['count']
      scores['Positive'] = scores['Positive'] / scores['count']
      scores['Negative'] = scores['Negative'] / scores['count']
      scores['Neutral'] = scores['Neutral'] / scores['count']
      daily_tweets[date] = scores
      # restart dict and update date
      date = row['date']
      scores = {'count': 1, 'Compound': row['Compound'], 'Positive': row['Positive'], 
                'Negative': row['Negative'], 'Neutral': row['Neutral']}
  
  # add final date to dict if it hasn't been added yet
  if date not in daily_tweets:
    scores['Compound'] = scores['Compound'] / scores['count']
    scores['Positive'] = scores['Positive'] / scores['count']
    scores['Negative'] = scores['Negative'] / scores['count']
    scores['Neutral'] = scores['Neutral'] / scores['count']
    daily_tweets[date] = scores

  # save to GDrive
  daily_tweets_df = pd.DataFrame.from_dict(daily_tweets, orient='index')
  daily_tweets_df.to_csv('{}_daily_tweet_summaries.csv'.format(state))
  !cp $state"_daily_tweet_summaries.csv" "drive/My Drive/Thesis/4. Daily Tweet Summaries"

### Tweet Data: Fix 05/05/2020 Data (only run once; saved to GDrive after)

In [None]:
# ------------------------------------------------------------------------------
# Update tweet summaries with new 05/05 data
# ------------------------------------------------------------------------------
# NOTE: this list does not contain 'al' and 'ak' (compare with states_lower)
states = ['az', 'ar', 'ca', 'co', 'ct', 'de', 'fl', 'ga', 'hi', 'id', 'il', 'in', 'ia', 'ks', 'ky', 'la', 'me', 'md', 'ma', 'mi', 'mn', 'ms', 'mo', 'mt', 'ne', 'nv', 'nh', 'nj', 'nm', 'ny', 'nc', 'nd', 'oh', 'ok', 'or', 'pa', 'ri', 'sc', 'sd', 'tn', 'tx', 'ut', 'vt', 'va', 'wa', 'wv', 'wi', 'wy']

for state in states:
  # get full tweet sentiment CSV
  f_sentiment = "drive/My Drive/Thesis/3. CoronaVis Sentiment/{}_tweets_sentiment.csv".format(state)
  sentiment = pd.read_csv(f_sentiment, lineterminator='\n')

  # get current tweet summaries sheet and remove 05/05 data
  f_summary = "drive/My Drive/Thesis/4. Daily Tweet Summaries/{}_daily_tweet_summaries.csv".format(state)
  summary = pd.read_csv(f_summary, lineterminator='\n')
  # the date was saved as the unnamed index column of the summary CSV
  summary.rename(columns = {'Unnamed: 0':'date'}, inplace = True)
  summary.drop(summary[summary['date'] == '2020-05-05'].index, inplace = True)

  # re-aggregate only the 05/05 tweets: running sums, averaged after the loop
  sentiment_05_05 = sentiment.loc[sentiment['date'] == '2020-05-05']
  scores = {'count': 0, 'Compound': 0, 'Positive': 0, 'Negative': 0, 'Neutral': 0}
  # placeholder that never equals a real date, so the first row takes the else
  # branch and starts the sums
  date = 'temp'
  for index, row in tqdm(sentiment_05_05.iterrows(), total=sentiment_05_05.shape[0], desc=state):
    if row['date'] == date:
      scores['count'] += 1
      scores['Compound'] += row['Compound']
      scores['Positive'] += row['Positive']
      scores['Negative'] += row['Negative']
      scores['Neutral'] += row['Neutral']
    else:
      # restart dict and update date
      date = row['date']
      scores = {'count': 1, 'Compound': row['Compound'], 'Positive': row['Positive'], 
                'Negative': row['Negative'], 'Neutral': row['Neutral']}
  
  # turn the sums into means
  # NOTE: count is still 0 here if the state has no 05/05 tweets, in which case
  # these divisions raise ZeroDivisionError
  scores['Compound'] = scores['Compound'] / scores['count']
  scores['Positive'] = scores['Positive'] / scores['count']
  scores['Negative'] = scores['Negative'] / scores['count']
  scores['Neutral'] = scores['Neutral'] / scores['count']
  scores['date'] = '2020-05-05'

  # get new df and concatenate with old
  new_data = pd.DataFrame([scores])
  joint_df = pd.concat([summary, new_data])
  # reorder dates again
  joint_df = joint_df.sort_values(by='date')

  # save to GDrive
  # (the row index is written too, so the saved file has an unnamed index
  # column in front of 'date')
  joint_df.to_csv('{}_daily_tweet_summaries.csv'.format(state))
  !cp $state"_daily_tweet_summaries.csv" "drive/My Drive/Thesis/4. Daily Tweet Summaries"
  print('done with ', state)

### Combine the dfs into one large prepared input/output df

In [None]:
# ------------------------------------------------------------------------------
# Grab daily tweet info from GDrive (assumes the COVID data is already loaded
# on Colab because it runs quickly)
# ------------------------------------------------------------------------------
location_files = glob.glob("drive/My Drive/Thesis/4. Daily Tweet Summaries/*.csv")
daily_tweets = {}
for file in sorted(location_files):
  # characters [-28:-26] of the path are used as the state key
  state = file[-28:-26]
  summary = pd.read_csv(file, lineterminator='\n')
  # discard the unnamed index column written when the summary was saved
  daily_tweets[state] = summary.drop(columns=['Unnamed: 0'])

In [None]:
# ------------------------------------------------------------------------------
# rename columns in df daily_tweets
# ------------------------------------------------------------------------------
# positional rename: every frame must have exactly these six columns
summary_columns = ['date', 'tweetCount', 'compoundSent', 'posSent', 'negSent', 'neutralSent']
for state in states_lower:
  daily_tweets[state].columns = list(summary_columns)

In [None]:
# ------------------------------------------------------------------------------
# fix order of dates
# ------------------------------------------------------------------------------
for state in states_lower:
  unsorted_summary = daily_tweets[state]
  daily_tweets[state] = unsorted_summary.sort_values(by='date')

In [None]:
# ------------------------------------------------------------------------------
# def state populations and densities and a function to access them
# source: https://worldpopulationreview.com/states
# ------------------------------------------------------------------------------
# Both lists are parallel to states_lower: getStateInfo() reads position
# states_lower.index(state) from each of them, so the order must not change.
state_pops = [4934190, 724357, 7520100, 3033950, 39613500, 5893630, 3552820, 990334, 21944600, 10830000, 1406430, 1860120, 12569300, 6805660, 3167970, 2917220, 4480710, 4627000, 1354520, 6065440, 6912240, 9992430, 5706400, 2966410, 6169040, 1085000, 1952000, 3185790, 1372200, 8874520, 2105000, 19300000, 10701000, 770026, 11714600, 3990440, 4289440, 12804100, 1061510, 5277830, 896581, 6944260, 29730300, 3310770, 623251, 8603980, 7796940, 1767860, 5852490, 581075]
state_densities = [97.4270, 1.2694, 66.2016, 58.3059, 254.2929, 56.8653, 733.7505, 508.1242, 409.2233, 188.3053, 218.9678, 22.5079, 226.3964, 189.9643, 56.7157, 35.6807, 113.4759, 107.0966, 43.9166, 624.8522, 886.1846, 176.7352, 71.6641, 63.2187, 89.7419, 7.4547, 25.4087, 29.0195, 153.2671, 1206.7609, 17.3540, 409.5404, 220.1037, 11.1596, 286.6939, 58.1739, 44.6873, 286.1699, 1026.6054, 175.5707, 11.8265, 168.4069, 113.8080, 40.2917, 67.6197, 217.8774, 117.3248, 73.5444, 108.0633, 5.9847]

def getStateInfo(state):
  """Return (population, density) for a lowercase state abbreviation.

  The values are read from state_pops and state_densities at the position the
  state has in states_lower; an unknown abbreviation raises ValueError (from
  list.index).
  """
  position = states_lower.index(state)
  population = state_pops[position]
  density = state_densities[position]
  return population, density

In [None]:
def getColNames(label):
  """Return the nine feature column names prefixed with '<label>_'.

  Order: the five tweet features followed by the four COVID features.
  """
  feature_names = ('tweetCount', 'CompoundSent', 'posSent', 'negSent',
                   'neutralSent', 'posCases', 'posIncCases', 'negCases',
                   'dataQualityGrade')
  return [label + '_' + feature for feature in feature_names]

In [None]:
# ------------------------------------------------------------------------------
# Create the master input/output tables by state {state: I/O df}
# ------------------------------------------------------------------------------
io_dict = {}
for state in tqdm(states_lower, total=50):
  # merge tables for all rows with dates in common
  daily_df = pd.merge(daily_tweets[state], master_history[state], on="date")

  # calculate rolling averages
  avg3_means = daily_df.rolling(3).mean() # 3 day rolling average
  avg3_means.columns = getColNames('mean3')
  avg3_medians = daily_df.rolling(3).median() # 3 day rolling median
  avg3_medians.columns = getColNames('median3')
  avg7_means = daily_df.rolling(7).mean() # 7 day rolling average
  avg7_means.columns = getColNames('mean7')
  avg7_medians = daily_df.rolling(7).median() # 7 day rolling median
  avg7_medians.columns = getColNames('median7')

  # merge rolling average tables
  daily_df.columns = ['date'] + getColNames('today')
  frames = [daily_df, avg3_means, avg3_medians, avg7_means, avg7_medians]
  rolling_df = pd.concat(frames, axis=1)

  # add state population and density to every row
  pop, density = getStateInfo(state)
  rolling_df['population'] = pop
  rolling_df['density'] = density

  # add output columns: shift to stay with the date it refers to
  # col was renamed last for earlier but data still represents that day
  predict3 = rolling_df['today_posCases'].shift(-3)
  predict7 = rolling_df['today_posCases'].shift(-7)
  predict14 = rolling_df['today_posCases'].shift(-14)
  io_frames = [rolling_df, predict3, predict7, predict14]
  state_df = pd.concat(io_frames, axis=1)
  state_df.columns = list(rolling_df.columns) + ['predict3', 'predict7', 'predict14']

  # remove the first 6 rows (don't have 7-day rolling average) and last 14 rows 
  # (don't have predictions 3, 7, and 14 days ahead)
  state_df = state_df[6:-14]

  # save to i/o dict
  io_dict[state] = state_df

  # # save to GDrive
  state_df.to_csv('{}_IO.csv'.format(state))
  !cp $state"_IO.csv" "drive/My Drive/Thesis/5. IO Table"

In [None]:
# ------------------------------------------------------------------------------
# Create a master input/output df with all state data
# ------------------------------------------------------------------------------
df_list = []
for state in states_lower:
  df_list.append(io_dict[state])

# concatenate all the state dfs into a master df
master_df = pd.concat(df_list, ignore_index=True)

# save master df to GDrive
master_df.to_csv('master_IO.csv'.format(state))
!cp "master_IO.csv" "drive/My Drive/Thesis/5. IO Table"

# **Some Additional Analysis**

## COVID Tracking Project Data

### Set up: DF

In [None]:
# define regions of the US
# census bureau designated
# each divN is a list of lowercase state abbreviations; `regions` and
# `region_names` are parallel lists (same position = same division)
div1 = ['ct', 'me', 'ma', 'nh', 'ri', 'vt'] # northeast: new england
# div1 = ['nh', 'ma', 'ct', 'ri', 'vt', 'me']
div2 = ['nj', 'ny', 'pa'] # northeast: mid-atlantic
div3 = ['il', 'in', 'mi', 'oh', 'wi'] # midwest: east north central
div4 = ['ia', 'ks', 'mn', 'mo', 'ne', 'nd', 'sd'] #midwest: west north central
div5 = ['de', 'fl', 'ga', 'md', 'nc', 'sc', 'va', 'wv'] # south atlantic
div6 = ['al', 'ky', 'ms', 'tn'] # south: east south central
div7 = ['ar', 'la', 'ok', 'tx'] # south: west south central
div8 = ['az', 'co', 'id', 'mt', 'nv', 'nm', 'ut', 'wy'] # west: mountain
div9 = ['ak', 'ca', 'hi', 'or', 'wa'] # west: pacific

regions = [div1, div2, div3, div4, div5, div6, div7, div8, div9]
region_names = ['New England', 'Mid-Atlantic', 'East North Central', 
                'West North Central', 'South Atlantic', 'East South Central',
                'West South Central', 'Mountain', 'Pacific']

In [None]:
# compile master_history df into one large df for purpose of graph creation
# history: one frame per region (same order as `regions`), each holding the
# rows of all states of that region with extra 'state' and 'region' columns
history = []

for i, div in enumerate(regions):
  state_frames = []
  for state in div:
    state_df = master_history[state].copy(deep=True)
    state_df['state'] = state
    state_df['region'] = region_names[i]
    # keep 2020-03-05 .. 2020-12-31 ('yyyy-mm-dd' strings compare in date order)
    too_early = state_df.date < '2020-03-05'
    too_late = state_df.date > '2020-12-31'
    state_df = state_df[~(too_early | too_late)]
    state_df = state_df.iloc[::-1]
    state_frames.append(state_df)

  # concatenate once per region instead of growing a frame inside the loop
  # (that copied all rows on every iteration and started from an empty frame)
  region_df = pd.concat(state_frames)
  region_df.reset_index(inplace=True)
  history.append(region_df)

In [None]:
# display the frame of the first region in `history`
history[0]

In [None]:
# find where to place ticks (start of each month) and set their labels
month_ticks = ['Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
tick_indices = []

curr_month = ''
# pull out one state as an example for state_df before running this once
# a tick goes at every position where the month part (chars 5:7) changes
for i, month in enumerate(date[5:7] for date in state_df.date.tolist()):
  if month == curr_month:
    continue
  tick_indices.append(i)
  curr_month = month

### Case Count Trends: Positive

In [None]:
# one line plot per region: cumulative positive cases, one line per state
for div in history:
  sns.set_style("whitegrid")
  sns.set_context("talk", font_scale=3.1)
  plot_options = dict(
      data=div, kind="line", sort=False,
      x='date', y='posCases', hue='state',
      height=15, aspect=2.5, palette="Set2"
  )
  g = sns.relplot(**plot_options)
  g.set_axis_labels("Date (Month in 2020)", "Number of Cases", labelpad=35)
  # month labels at the positions computed in tick_indices
  g.set(xticks=tick_indices)
  g.set_xticklabels(month_ticks)

### Case Count Trends: Negative

In [None]:
# one line plot per region: negative case counts, one line per state
for div in history:
  sns.set_style("whitegrid")
  sns.set_context("talk", font_scale=3.1)
  plot_options = dict(
      data=div, kind="line", sort=False,
      x='date', y='negCases', hue='state',
      height=15, aspect=2.5, palette="Set2"
  )
  g = sns.relplot(**plot_options)
  g.set_axis_labels("Date (Month in 2020)", "Number of Cases", labelpad=35)
  # month labels at the positions computed in tick_indices
  g.set(xticks=tick_indices)
  g.set_xticklabels(month_ticks)

### Data Quality Analysis
Note: N/A=75 for all states after 10/27

In [None]:
# stack the nine regional frames into one frame, ordered by state
regional_frames = [history[position] for position in range(9)]
history_df = pd.concat(regional_frames)
history_df.sort_values(by=['state'], inplace=True)

In [None]:
# Title: Data Quality Distribution Across States
# box plot of the numeric dataQualityGrade, one box per state
sns.set_context("talk", font_scale=3.1)
box_options = dict(
    data=history_df, kind="box",
    x="state", y="dataQualityGrade", 
    height=15, aspect=2.5, palette="Set2"
)
g = sns.catplot(**box_options)
g = g.set_axis_labels("State", "Data Quality Grade", labelpad=35)
g.set_xticklabels(rotation=90)

## Sentiment Analysis

### Daily Average Sentiment Values for a given state

In [None]:
# Find out where to place x ticks and tick labels on graph
month_ticks = ['Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

def xTicks(dates):
  """Return the positions in `dates` where a new month starts.

  `dates` holds 'yyyy-mm-dd' strings; characters 5:7 are the month. Position i
  is returned when its month differs from the month of the previous entry (the
  first entry is compared against the empty string).
  """
  months = [date[5:7] for date in dates]
  previous_months = [''] + months[:-1]
  return [
    position
    for position, (before, current) in enumerate(zip(previous_months, months))
    if current != before
  ]

In [None]:
# given a state, graph its different sentiment values
def graphStateSentiment(state):
  """Draw one line plot with the four daily sentiment series of `state`.

  Reads daily_tweets[state], stacks the compound/positive/negative/neutral
  columns into long form (date, Sentiment, score) and plots score over date
  with one line per sentiment. Tick positions come from xTicks and tick labels
  from month_ticks.
  """
  state_summary = daily_tweets[state]
  dates = state_summary['date'].tolist()
  tick_indices = xTicks(dates)

  # (legend label, source column), stacked in this order
  series = (('Compound', 'compoundSent'), ('Positive', 'posSent'),
            ('Negative', 'negSent'), ('Neutral', 'neutralSent'))
  long_dates, labels, scores = [], [], []
  for label, column in series:
    long_dates.extend(dates)
    labels.extend([label] * len(dates))
    scores.extend(state_summary[column].tolist())

  df = pd.DataFrame.from_dict({'date': long_dates, 'Sentiment': labels, 'score': scores})

  # Title: 
  sns.set_style("whitegrid")
  sns.set_context("talk", font_scale=3.1)
  g = sns.relplot(
      data=df, kind="line", 
      x='date', y='score', hue='Sentiment',
      height=15, aspect=2.5, palette="Set2"
  )
  g.set_axis_labels("Date (Month in 2020)", "Score", labelpad=35)
  g.set(xticks=tick_indices)
  g.set_xticklabels(month_ticks)

In [None]:
# daily sentiment plot for 'hi'
graphStateSentiment('hi')

In [None]:
# daily sentiment plot for 'wv'
graphStateSentiment('wv')

In [None]:
# daily sentiment plot for 'or'
graphStateSentiment('or')

In [None]:
# column means for whichever state the notebook variable `state` currently
# holds; numeric_only skips the string 'date' column, which cannot be averaged
daily_tweets[state].mean(numeric_only=True)

In [None]:
# find states with max average positive/negative sentiment values
# posmax: largest mean compound score above 0; negmax: smallest one below 0.
# A side with no qualifying state keeps 'N/A' and 0.
posmax = 0
negmax = 0
posmax_state = 'N/A'
negmax_state = 'N/A'

for state in states_lower:
  # average only the column that is compared; averaging the whole frame fails
  # on the string 'date' column in recent pandas
  compound_mean = daily_tweets[state]['compoundSent'].mean()
  if compound_mean > posmax:
    posmax = compound_mean
    posmax_state = state
  if compound_mean < negmax:
    negmax = compound_mean
    negmax_state = state
    
print("highest average positive sentiment: ", posmax_state, " with mean ", posmax)
print("highest average negative sentiment: ", negmax_state, " with mean ", negmax)

### Distribution of compound sentiment across all states

In [None]:
# Set up for box plot by combining data for all states' compound values
# compound_df columns: date, state, compoundSent (one row per state and day)
compound_frames = []
for state in states_lower:
  state_compound = daily_tweets[state][['date', 'compoundSent']].reset_index(drop=True)
  # insert the abbreviation as a column; the loop variable `state` itself is no
  # longer overwritten with a list
  state_compound.insert(1, 'state', state)
  compound_frames.append(state_compound)

# single concat instead of growing a frame inside the loop
compound_df = pd.concat(compound_frames)

In [None]:
# Title: Distribution of Compound Sentiment Values by State
# box plot of the daily compound score, one box per state
sns.set_context("talk", font_scale=3.1)
box_options = dict(
    data=compound_df, kind="box",
    x="state", y="compoundSent", 
    height=15, aspect=2.5, palette="Set2"
)
g = sns.catplot(**box_options)
g = g.set_axis_labels("State", "Sentiment Score", labelpad=35)
g.set_xticklabels(rotation=90)

# **The Model**

## **[RUN OPTION 2]** Final I/O Setup
Load I/O table from GDrive, split into train/validate/test sets, shuffle, split input/outputs

### By State (Option 1) - ignore

In [None]:
# ------------------------------------------------------------------------------
# load input/output data from GDrive, save into io_dict: {state: df}
# drop the date column
# ------------------------------------------------------------------------------
io_dict = {} # empty dict to fill with dfs
for state in states_lower:
  path = 'drive/My Drive/Thesis/5. IO Table/{}_IO.csv'.format(state)
  state_io = pd.read_csv(path, index_col=0)
  io_dict[state] = state_io.drop(columns=['date'])

In [None]:
# ------------------------------------------------------------------------------
# shuffle then split data set
# 70:10:20 train:validate:test ratios (rounded)
# ------------------------------------------------------------------------------
train_dict = {}
validate_dict = {}
test_dict = {}

for state in states_lower:
  shuffle = io_dict[state].sample(frac=1) # shuffle (random row order)
  
  # split the SHUFFLED rows; slicing io_dict[state] itself ignored the shuffle
  # and produced a purely positional split
  tenth = shuffle.shape[0] / 10
  train_end = int(tenth*7)
  validate_end = int(tenth*8)
  train_dict[state] = shuffle.iloc[:train_end]
  validate_dict[state] = shuffle.iloc[train_end:validate_end]
  test_dict[state] = shuffle.iloc[validate_end:]

In [None]:
# ------------------------------------------------------------------------------
# separate the input and output into separate dfs
# ------------------------------------------------------------------------------
train_input, validate_input, test_input = {}, {}, {}
train_output, validate_output, test_output = {}, {}, {}

# (source split, inputs dict, outputs dict)
splits = (
  (train_dict, train_input, train_output),
  (validate_dict, validate_input, validate_output),
  (test_dict, test_input, test_output),
)

for state in states_lower:
  for source, inputs, outputs in splits:
    # the last three columns are the outputs, everything before is input
    inputs[state] = source[state].iloc[:, :-3]
    outputs[state] = source[state].iloc[:, -3:]

In [None]:
# ------------------------------------------------------------------------------
# convert to tensors
# ------------------------------------------------------------------------------
all_splits = (train_input, train_output, validate_input, validate_output,
              test_input, test_output)
for state in states_lower:
  for split in all_splits:
    # replace each frame in place with a float tensor of its values
    split[state] = torch.FloatTensor(split[state].values)

### Master DF (Option 2 -- go with this one)

In [None]:
# ------------------------------------------------------------------------------
# load master I/O df from GDrive, drop the date column
# ------------------------------------------------------------------------------
path = 'drive/My Drive/Thesis/5. IO Table/master_IO.csv'
master_df = pd.read_csv(path, index_col=0)
master_df = master_df.drop(columns=['date'])

In [None]:
# ------------------------------------------------------------------------------
# comparative option for evaluation: remove all tweet sentiment data
# ------------------------------------------------------------------------------
# every tweet feature exists once per prefix: 5 prefixes x 5 features
prefixes = ('today', 'mean3', 'median3', 'mean7', 'median7')
tweet_features = ('tweetCount', 'CompoundSent', 'posSent', 'negSent', 'neutralSent')
tweet_columns = [prefix + '_' + feature
                 for prefix in prefixes
                 for feature in tweet_features]
master_df = master_df.drop(tweet_columns, axis=1)

In [None]:
# display the master input/output frame
master_df

In [None]:
# print all predict3 values in ascending order
# (`test` is only a scratch name here; the split cell assigns it again)
test = sorted(master_df['predict3'].tolist())
print(test)

In [None]:
# ------------------------------------------------------------------------------
# shuffle then split data set
# 70:30 train:test ratio (rounded)
# ------------------------------------------------------------------------------
shuffle = master_df.sample(frac=1) # shuffle (random row order)

# split the SHUFFLED rows; slicing master_df itself ignored the shuffle, so the
# test set was simply the last 30% of rows of the concatenated state tables
tenth = shuffle.shape[0] / 10
split_at = int(tenth*7)
train = shuffle.iloc[:split_at]
test = shuffle.iloc[split_at:]

In [None]:
# ------------------------------------------------------------------------------
# separate the input and output into separate dfs
# ------------------------------------------------------------------------------
# the last three columns are the outputs, everything before is input
train_input, train_output = train.iloc[:, :-3], train.iloc[:, -3:]
test_input, test_output = test.iloc[:, :-3], test.iloc[:, -3:]

In [None]:
# ------------------------------------------------------------------------------
# convert to tensors
# ------------------------------------------------------------------------------
train_input, train_output, test_input, test_output = (
  torch.FloatTensor(frame.values)
  for frame in (train_input, train_output, test_input, test_output)
)

## Define and train NN

Enable GPU under "Edit" > "Notebook Settings"

In [None]:
# ------------------------------------------------------------------------------
# move tensor to the GPU if available
# ------------------------------------------------------------------------------
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_input, train_output, test_input, test_output = (
  tensor.to(device)
  for tensor in (train_input, train_output, test_input, test_output)
)

In [None]:
# report shape, dtype and device of the training inputs
tensor_facts = (
  ("Shape of tensor", train_input.shape),
  ("Datatype of tensor", train_input.dtype),
  ("Device tensor is stored on", train_input.device),
)
for description, value in tensor_facts:
  print(f"{description}: {value}")

In [None]:
# ------------------------------------------------------------------------------
# define model constructor
# ------------------------------------------------------------------------------
class Net(torch.nn.Module):
  """Fully connected regression network with one ReLU hidden layer.

  Args:
    input_size: number of input features per sample.
    hidden_size: number of units in the hidden layer.
    output_size: number of values predicted per sample. Defaults to 3, the
      width that used to be hard-coded (the notebook's output frames have three
      columns).
  """
  def __init__(self, input_size, hidden_size, output_size=3):
    super().__init__()

    self.hidden = torch.nn.Linear(input_size, hidden_size) # fc1 layer
    self.predict = torch.nn.Linear(hidden_size, output_size) # fc2 output layer

  def forward(self, x):
    """Return the linear output for a batch x of shape (..., input_size)."""
    # introduce non-linearity to hidden layer's output (any neg# --> 0)
    hidden = F.relu(self.hidden(x)) 

    # linear output
    return self.predict(hidden)

In [None]:
# ------------------------------------------------------------------------------
# initialize model, loss function and optimizer
# ------------------------------------------------------------------------------
input_size = train_input.shape[1] # one input node per feature column
hidden_size = 21 # width of the single hidden layer

# build the network and place it on the same device as the tensors
model = Net(input_size, hidden_size).to(device)
criterion = torch.nn.MSELoss() # mean squared error, averaged over all elements

optimizer = torch.optim.Adam(model.parameters(), lr=1e-4) # adam optimizer

In [None]:
# ------------------------------------------------------------------------------
# check test loss before model training
# ------------------------------------------------------------------------------
# eval mode only switches layer behaviour (e.g. dropout / batch norm); it does
# NOT stop gradient tracking, so the forward pass is wrapped in no_grad to avoid
# building an autograd graph for a pass that is never back-propagated
model.eval()
with torch.no_grad():
  y_pred = model(test_input)
  before_train = criterion(y_pred.squeeze(), test_output.squeeze())
print('Test loss before training' , before_train.item())

In [None]:
# ------------------------------------------------------------------------------
# split up y_pred and test_output tensors for more individual analysis pre-train
# (column 0 = 3 day, column 1 = 7 day, column 2 = 14 day)
# ------------------------------------------------------------------------------
y_pred3, y_pred7, y_pred14 = (y_pred[:, col] for col in range(3))

test_output3, test_output7, test_output14 = (test_output[:, col] for col in range(3))

In [None]:
# ------------------------------------------------------------------------------
# find mean squared error loss values by day (before training)
# note: despite the "accuracy" names these are MSE losses, so lower is better
# ------------------------------------------------------------------------------
before_accuracy3, before_accuracy7, before_accuracy14 = (
    criterion(predicted, actual)
    for predicted, actual in (
        (y_pred3, test_output3),
        (y_pred7, test_output7),
        (y_pred14, test_output14),
    )
)

print('Pre-training MSELoss for 3 day prediction:' , before_accuracy3.item())
print('Pre-training MSELoss for 7 day prediction:' , before_accuracy7.item())
print('Pre-training MSELoss for 14 day prediction:' , before_accuracy14.item())

In [None]:
# ------------------------------------------------------------------------------
# train model
# ------------------------------------------------------------------------------
model.train() # switch back to training mode
epochs = 10000 # number of passes over the full training set (no early stopping)
errors = [] # combined MSE over all three outputs, one entry per epoch
errors3 = [] # errors for 3 day prediction
errors7 = [] # errors for 7 day prediction
errors14 = [] # errors for 14 day prediction

# split up train outputs for comparison during training
train_output3 = train_output[:, 0]
train_output7 = train_output[:, 1]
train_output14 = train_output[:, 2]

for epoch in range(epochs):
  optimizer.zero_grad() # clear gradients before backpropagation
  # Forward pass over the whole training set (one full batch per epoch)
  y_pred = model(train_input)
  # Compute Loss (this is the only value that is back-propagated)
  loss = criterion(y_pred.squeeze(), train_output.squeeze())
  errors.append(loss.item())

  # the per-horizon losses are only logged, so compute them on detached
  # predictions instead of growing the autograd graph every epoch
  detached = y_pred.detach()
  y_pred3 = detached[:, 0]
  y_pred7 = detached[:, 1]
  y_pred14 = detached[:, 2]
  errors3.append(criterion(y_pred3, train_output3).item())
  errors7.append(criterion(y_pred7, train_output7).item())
  errors14.append(criterion(y_pred14, train_output14).item())

  # report progress every 10th epoch (epoch already counts from 0, so no
  # separate counter is needed)
  if epoch % 10 == 0:
    print('Epoch {}: train loss: {}'.format(epoch, loss.item()))

  # Backward pass
  loss.backward() # backpropagation, compute gradients
  optimizer.step() # apply gradients to update weights

## Evaluate Accuracy: Absolute Loss (MSE)

In [None]:
# ------------------------------------------------------------------------------
# evaluate post-training: overall accuracy
# ------------------------------------------------------------------------------
# eval mode alone does not disable gradient tracking; no_grad keeps this
# evaluation pass from building an autograd graph that is never used
model.eval()
with torch.no_grad():
  y_pred = model(test_input)
  after_train = criterion(y_pred.squeeze(), test_output.squeeze())
print('Test loss after training:' , after_train.item())
print('Compare to loss before training: ', before_train.item())
# ratio of the two losses: values above 1 mean the loss shrank during training
print('This is a difference of: ', before_train.item()/after_train.item(), 'times')

In [None]:
# ------------------------------------------------------------------------------
# split up y_pred and test_output tensors for more individual analysis
# (column 0 = 3 day, column 1 = 7 day, column 2 = 14 day)
# ------------------------------------------------------------------------------
y_pred3, y_pred7, y_pred14 = (y_pred[:, col] for col in range(3))

test_output3, test_output7, test_output14 = (test_output[:, col] for col in range(3))

In [None]:
# ------------------------------------------------------------------------------
# find mean squared error loss values by day (after training)
# note: despite the "accuracy" names these are MSE losses, so lower is better
# ------------------------------------------------------------------------------
accuracy3, accuracy7, accuracy14 = (
    criterion(predicted, actual)
    for predicted, actual in (
        (y_pred3, test_output3),
        (y_pred7, test_output7),
        (y_pred14, test_output14),
    )
)

print('MSELoss for 3 day prediction:' , accuracy3.item())
print('MSELoss for 7 day prediction:' , accuracy7.item())
print('MSELoss for 14 day prediction:' , accuracy14.item())

In [None]:
# ------------------------------------------------------------------------------
# find improvement for MSE loss values per day
# (pre-training MSE divided by post-training MSE; a value above 1 means the
# loss went down)
# ------------------------------------------------------------------------------
print('3 day: improvement of: ', (before_accuracy3/accuracy3).item(), 'times')
print('7 day: improvement of: ', (before_accuracy7/accuracy7).item(), 'times')
print('14 day: improvement of: ', (before_accuracy14/accuracy14).item(), 'times')

## Evaluation: Relative Loss

In [None]:
# ------------------------------------------------------------------------------
# sort test output values in increasing order for more intuitive graphing purposes
# (idx* hold the original positions, used later to reorder the predictions)
# ------------------------------------------------------------------------------
test_output3_sort, idx3 = test_output3.sort()
test_output7_sort, idx7 = test_output7.sort()
test_output14_sort, idx14 = test_output14.sort()

In [None]:
# ------------------------------------------------------------------------------
# function that takes in a list and sorts it according to a second list that represents index order
# ------------------------------------------------------------------------------
def sortByIndex(arr, idx):
  """Return the elements of ``arr`` reordered by ``idx`` as a NumPy array.

  Args:
    arr: 1-D tensor (or anything ``torch.as_tensor`` accepts) of values.
    idx: integer positions into ``arr`` giving the desired order, e.g. the
      index tensor returned by ``torch.sort``.

  Returns:
    ``np.ndarray`` where element ``k`` is ``arr[idx[k]]``. Values are converted
    to Python scalars first, so a float tensor yields a float64 array.
  """
  values = torch.as_tensor(arr).detach()
  order = torch.as_tensor(idx, dtype=torch.long, device=values.device)
  # a single indexed gather plus one transfer to the host replaces a Python
  # loop that called .item() (a device sync on CUDA) once per element
  return np.asarray(values[order].tolist())

In [None]:
# ------------------------------------------------------------------------------
# sort predicted values according to new order of test output values
# (so prediction k lines up with the k-th smallest actual value)
# ------------------------------------------------------------------------------
y_pred3_sort, y_pred7_sort, y_pred14_sort = (
    sortByIndex(predicted, order)
    for predicted, order in ((y_pred3, idx3), (y_pred7, idx7), (y_pred14, idx14))
)

In [None]:
# ------------------------------------------------------------------------------
# see where super early numbers are: positions in the sorted 3 day outputs whose
# value is exactly 2 (change the constant to inspect 0 or 1 instead)
# ------------------------------------------------------------------------------
(test_output3_sort == 2).nonzero(as_tuple=True)[0]

In [None]:
# ------------------------------------------------------------------------------
# drop the 5 smallest actual values of every horizon before dividing by them
# (intended to remove output=0 rows from very early on, which would otherwise
# give a division by zero)
# NOTE(review): the original note mentions "just 4 examples" of output=0 in the
# 3 day data, but the slice removes 5 rows from all three horizons -- confirm
# ------------------------------------------------------------------------------
test_output3_cut = test_output3_sort[5:].cpu().numpy()
test_output7_cut = test_output7_sort[5:].cpu().numpy()
test_output14_cut = test_output14_sort[5:].cpu().numpy()
y_pred3_cut = y_pred3_sort[5:]
y_pred7_cut = y_pred7_sort[5:]
y_pred14_cut = y_pred14_sort[5:]

# relative error in percent: |actual - predicted| / actual * 100
relative3 = np.abs(test_output3_cut - y_pred3_cut)/test_output3_cut * 100
relative7 = np.abs(test_output7_cut - y_pred7_cut)/test_output7_cut * 100
relative14 = np.abs(test_output14_cut - y_pred14_cut)/test_output14_cut * 100

In [None]:
# ------------------------------------------------------------------------------
# check largest error value for a general sense / plausibility
# (prints its position plus the actual and predicted 3 day values there)
# ------------------------------------------------------------------------------
idx = np.argmax(relative3)
print(idx, test_output3_cut[idx], y_pred3_cut[idx])

In [None]:
# ------------------------------------------------------------------------------
# find mean relative error (in percent) by day over all retained test rows
# note: despite the "accuracy" names these are errors, so lower is better
# ------------------------------------------------------------------------------
relative_accuracy3 = np.mean(relative3)
relative_accuracy7 = np.mean(relative7)
relative_accuracy14 = np.mean(relative14)

print('Mean relative error for 3 day prediction:' , relative_accuracy3.item(), '%')
print('Mean relative error for 7 day prediction:' , relative_accuracy7.item(), '%')
print('Mean relative error for 14 day prediction:' , relative_accuracy14.item(), '%')

In [None]:
# ------------------------------------------------------------------------------
# find mean relative error (in percent) by day, skipping the first 100 entries
# (the arrays are ordered by increasing actual value, so this ignores the rows
# with the smallest actual values)
# ------------------------------------------------------------------------------
relative_accuracy3 = np.mean(relative3[100:])
relative_accuracy7 = np.mean(relative7[100:])
relative_accuracy14 = np.mean(relative14[100:])

print('Mean relative error for 3 day prediction:' , relative_accuracy3.item(), '%')
print('Mean relative error for 7 day prediction:' , relative_accuracy7.item(), '%')
print('Mean relative error for 14 day prediction:' , relative_accuracy14.item(), '%')

In [None]:
# ------------------------------------------------------------------------------
# find mean relative error (in percent) by day, skipping the first 415 entries
# (the arrays are ordered by increasing actual value, so this ignores the rows
# with the smallest actual values)
# ------------------------------------------------------------------------------
relative_accuracy3 = np.mean(relative3[415:])
relative_accuracy7 = np.mean(relative7[415:])
relative_accuracy14 = np.mean(relative14[415:])

print('Mean relative error for 3 day prediction:' , relative_accuracy3.item(), '%')
print('Mean relative error for 7 day prediction:' , relative_accuracy7.item(), '%')
print('Mean relative error for 14 day prediction:' , relative_accuracy14.item(), '%')

Some notes:
After roughly the first 100-150 predictions (ordered by increasing actual case count), accuracy increases dramatically. This corresponds to the point where the actual case counts pass the low hundreds.

## Evaluation: Find % Correct

In [None]:
# ------------------------------------------------------------------------------
# define function which takes in array of relative errors (in percent) and
# returns how many predictions fall within 1%, 5%, 25%, 50%, and 100%.
# ------------------------------------------------------------------------------
def countThresholds(arr, idx):
  """Bucket relative errors by threshold and summarise them in a one-row frame.

  Args:
    arr: iterable of relative errors, expressed in percent.
    idx: label used as the index of the returned row (e.g. '3-day').

  Returns:
    One-row ``pd.DataFrame``. Columns 1, 5, 25, 50, 100 and 'else' hold the
    number of values whose smallest matching upper bound is that threshold
    ('else' collects everything above 100, and anything that fails every
    comparison such as NaN). The 'within N%' columns hold the cumulative
    fraction of values that are <= N.
  """
  limits = (1, 5, 25, 50, 100)
  labels = ('within 1%', 'within 5%', 'within 25%', 'within 50%', 'within 100%')

  counts = {limit: 0 for limit in limits}
  counts['else'] = 0
  for value in arr:
    # first threshold that bounds the value, or 'else' when none does
    bucket = next((limit for limit in limits if value <= limit), 'else')
    counts[bucket] += 1

  total_count = len(arr)
  running = 0 # number of values at or below the current threshold
  for limit, label in zip(limits, labels):
    running += counts[limit]
    counts[label] = running / total_count
  return pd.DataFrame(counts, index=[idx])

In [None]:
# ------------------------------------------------------------------------------
# find thresholds for predictions: one row per horizon, stacked into one table
# (the bare last line displays the table as the cell output)
# ------------------------------------------------------------------------------
thresholds3 = countThresholds(relative3, '3-day')
thresholds7 = countThresholds(relative7, '7-day')
thresholds14 = countThresholds(relative14, '14-day')

thresholds_df = pd.concat([thresholds3, thresholds7, thresholds14])
thresholds_df

## Evaluation: Plot errors

In [None]:
# ------------------------------------------------------------------------------
# plot total MSE: log10 of the combined training loss recorded for each epoch
# ------------------------------------------------------------------------------
sns.set_style("whitegrid")
sns.set_context("talk", font_scale=3.1)
g = sns.relplot(
    data=np.log10(errors), kind="line",
    height=15, aspect=2.5, palette="Set2"
).set_axis_labels("Epochs", "Log Loss", labelpad=35)

In [None]:
# ------------------------------------------------------------------------------
# prepare df for multi-line plots
# ------------------------------------------------------------------------------
# derive the row count from the recorded losses instead of repeating the
# literal 10000, so this stays correct if the number of epochs changes
n_epochs = len(errors)
# kept in its own variable: reassigning `epochs` to a list would break
# re-running the training cell, which calls range(epochs)
epoch_axis = [*range(n_epochs)] * 3
errors_daily = np.concatenate([np.log10(errors3), np.log10(errors7), np.log10(errors14)])
prediction = ['3 day'] * n_epochs + ['7 day'] * n_epochs + ['14 day'] * n_epochs
errors_df = pd.DataFrame.from_dict({'epochs': epoch_axis, 'errors': errors_daily, 'Prediction': prediction})

In [None]:
# ------------------------------------------------------------------------------
# plot 3, 7, and 14 day MSE values against each other
# (log10 training loss per epoch, one line per prediction horizon)
# ------------------------------------------------------------------------------
sns.set_style("whitegrid")
sns.set_context("talk", font_scale=3.1)
g = sns.relplot(
    data=errors_df, kind="line", 
    x='epochs', y='errors', hue='Prediction',
    height=15, aspect=2.3, palette="Set2"
).set_axis_labels("Epoch", "Log Loss", labelpad=35)

## Evaluation: Plot Relative Errors

In [None]:
# ------------------------------------------------------------------------------
# plot relative loss values against each other
# ------------------------------------------------------------------------------
# use the real array lengths rather than a hard-coded row count, so the frame
# still lines up when the size of the test set changes
n3, n7, n14 = len(relative3), len(relative7), len(relative14)
index = [*range(n3), *range(n7), *range(n14)]
# relative* are percentages; divide by 100 to plot them as fractions
relative_errors = np.concatenate([relative3/100, relative7/100, relative14/100])
prediction = ['3 day'] * n3 + ['7 day'] * n7 + ['14 day'] * n14
errors_df = pd.DataFrame.from_dict({'index': index, 'errors': relative_errors, 'Prediction': prediction})

sns.set_style("whitegrid")
sns.set_context("talk", font_scale=3.1)
g = sns.relplot(
    data=errors_df, kind="line",
    x='index', y='errors', hue='Prediction',
    height=15, aspect=2.3, palette="Set2"
).set_axis_labels("", "Relative Error", labelpad=35)

## Evaluation: Plot Predicted on top of Actual

In [None]:
# ------------------------------------------------------------------------------
# graph 3 day predicted and actual
# (long-form frame: all actual values first, then all predicted values, both
# sharing the same 0..dataCount-1 index)
# ------------------------------------------------------------------------------
dataCount = len(test_output3_cut)
data = [*test_output3_cut.tolist(), *y_pred3_cut.tolist()]
category = ['actual'] * dataCount + ['predicted'] * dataCount
index = [*range(dataCount)] * 2

predict3_df = pd.DataFrame({'index': index, 'data': data, 'category': category})

In [None]:
# ------------------------------------------------------------------------------
# use two specific Set2 colours (entries 5 and 2) for the actual/predicted plots
# ------------------------------------------------------------------------------
current_palette = sns.color_palette("Set2")
first, second = current_palette[5], current_palette[2]
sns.set_palette([first, second])

In [None]:
# ------------------------------------------------------------------------------
# scatter 3 day actual and predicted values on the same axes
# ------------------------------------------------------------------------------
sns.set_style("whitegrid")
sns.set_context("talk", font_scale=3.1)
g = sns.relplot(
    data=predict3_df,
    x='index', y='data', hue='category', style='category',
    height=15, aspect=2.3, linewidth=0, markers=['o', '*'], s=1000
).set_axis_labels("", "Number of Cases", labelpad=35)

# matplotlib 3.7 renamed Legend.legendHandles to legend_handles and later
# removed the old name; prefer the new attribute and fall back for older versions
legend_handles = getattr(g._legend, "legend_handles", None)
if legend_handles is None:
    legend_handles = g._legend.legendHandles

# make the legend markers opaque and smaller than the (very large) plot markers
for lh in legend_handles:
    lh.set_alpha(1)
    lh._sizes = [600]

In [None]:
# ------------------------------------------------------------------------------
# graph 7 day predicted and actual
# (long-form frame: all actual values first, then all predicted values, both
# sharing the same 0..dataCount-1 index)
# ------------------------------------------------------------------------------
dataCount = len(test_output7_cut)
data = [*test_output7_cut.tolist(), *y_pred7_cut.tolist()]
category = ['actual'] * dataCount + ['predicted'] * dataCount
index = [*range(dataCount)] * 2

predict7_df = pd.DataFrame({'index': index, 'data': data, 'category': category})

In [None]:
# ------------------------------------------------------------------------------
# scatter 7 day actual and predicted values on the same axes
# ------------------------------------------------------------------------------
sns.set_style("whitegrid")
sns.set_context("talk", font_scale=3.1)
g = sns.relplot(
    data=predict7_df,
    x='index', y='data', hue='category', style='category',
    height=15, aspect=2.3, linewidth=0, markers=['o', '*'], s=1000
).set_axis_labels("", "Number of Cases", labelpad=35)

# matplotlib 3.7 renamed Legend.legendHandles to legend_handles and later
# removed the old name; prefer the new attribute and fall back for older versions
legend_handles = getattr(g._legend, "legend_handles", None)
if legend_handles is None:
    legend_handles = g._legend.legendHandles

# make the legend markers opaque and smaller than the (very large) plot markers
for lh in legend_handles:
    lh.set_alpha(1)
    lh._sizes = [600]

In [None]:
# ------------------------------------------------------------------------------
# graph 14 day predicted and actual
# (long-form frame: all actual values first, then all predicted values, both
# sharing the same 0..dataCount-1 index)
# ------------------------------------------------------------------------------
dataCount = len(test_output14_cut)
data = [*test_output14_cut.tolist(), *y_pred14_cut.tolist()]
category = ['actual'] * dataCount + ['predicted'] * dataCount
index = [*range(dataCount)] * 2

predict14_df = pd.DataFrame({'index': index, 'data': data, 'category': category})

In [None]:
# ------------------------------------------------------------------------------
# scatter 14 day actual and predicted values on the same axes
# ------------------------------------------------------------------------------
sns.set_style("whitegrid")
sns.set_context("talk", font_scale=3.1)
g = sns.relplot(
    data=predict14_df,
    x='index', y='data', hue='category', style='category',
    height=15, aspect=2.3, linewidth=0, markers=['o', '*'], s=1000
).set_axis_labels("", "Number of Cases", labelpad=35)

# matplotlib 3.7 renamed Legend.legendHandles to legend_handles and later
# removed the old name; prefer the new attribute and fall back for older versions
legend_handles = getattr(g._legend, "legend_handles", None)
if legend_handles is None:
    legend_handles = g._legend.legendHandles

# make the legend markers opaque and smaller than the (very large) plot markers
for lh in legend_handles:
    lh.set_alpha(1)
    lh._sizes = [600]

## Evaluation: Plot Predicted v. Actual

In [None]:
# ------------------------------------------------------------------------------
# reset color palette
# (undoes the two-colour palette set for the actual/predicted plots)
# ------------------------------------------------------------------------------
sns.set_palette("Set2")

In [None]:
# ------------------------------------------------------------------------------
# graph 3 day predicted v. actual
# (scatter with actual values on x and predicted values on y)
# ------------------------------------------------------------------------------
sns.set_style("whitegrid")
sns.set_context("talk", font_scale=3.1)
g = sns.relplot(
    x=test_output3_cut.tolist(), y=y_pred3_cut.tolist(),
    height=15, aspect=2.3
).set_axis_labels("Actual", "Predicted", labelpad=35)

In [None]:
# ------------------------------------------------------------------------------
# graph 7 day predicted v. actual
# (scatter with actual values on x and predicted values on y)
# ------------------------------------------------------------------------------
sns.set_style("whitegrid")
sns.set_context("talk", font_scale=3.1)
g = sns.relplot(
    x=test_output7_cut.tolist(), y=y_pred7_cut.tolist(),
    height=15, aspect=2.3
).set_axis_labels("Actual", "Predicted", labelpad=35)

In [None]:
# ------------------------------------------------------------------------------
# graph 14 day predicted v. actual
# (scatter with actual values on x and predicted values on y)
# ------------------------------------------------------------------------------
sns.set_style("whitegrid")
sns.set_context("talk", font_scale=3.1)
g = sns.relplot(
    x=test_output14_cut.tolist(), y=y_pred14_cut.tolist(),
    height=15, aspect=2.3
).set_axis_labels("Actual", "Predicted", labelpad=35)

## Old Evaluation Plots (no longer used)

In [None]:
# evaluate with some plots (code from Batista medium article)
def plotcharts(errors):
  """Draw the training-loss curve next to the 3 day real/predicted scatter.

  Args:
    errors: sequence of per-epoch loss values, plotted as a line on the left.

  The right-hand panel reads the notebook-level tensors ``test_output3`` and
  ``y_pred3``; they must already be defined when this is called.
  """
  plt.figure(figsize=(12, 5))

  # left panel: loss per epoch
  loss_axes = plt.subplot(1, 2, 1) # nrows, ncols, index
  loss_axes.set_title('Errors')
  plt.plot(np.array(errors), '-')
  plt.xlabel('Epochs')

  # right panel: real values as yellow dots, predictions as blue crosses
  scatter_axes = plt.subplot(1, 2, 2)
  scatter_axes.set_title('3 Day Prediction')
  for series, fmt, label in ((test_output3, 'yo', 'Real'), (y_pred3, 'b+', 'Predicted')):
    markers = plt.plot(series.detach().cpu().numpy(), fmt, label=label)
    plt.setp(markers, markersize=10)
  plt.legend(loc=7)

plotcharts(errors)

In [None]:
# ------------------------------------------------------------------------------
# old-style real vs. predicted scatter for the 7 day (left) and 14 day (right)
# predictions: real values as yellow dots, predictions as blue crosses
# ------------------------------------------------------------------------------
plt.figure(figsize=(12, 5))

predict7_graph = plt.subplot(1, 2, 1)
predict7_graph.set_title('7 Day Prediction')
for series, fmt, label in ((test_output7, 'yo', 'Real'), (y_pred7, 'b+', 'Predicted')):
  a = plt.plot(series.detach().cpu().numpy(), fmt, label=label)
  plt.setp(a, markersize=10)
plt.legend(loc=7)

predict14_graph = plt.subplot(1, 2, 2)
predict14_graph.set_title('14 Day Prediction')
for series, fmt, label in ((test_output14, 'yo', 'Real'), (y_pred14, 'b+', 'Predicted')):
  a = plt.plot(series.detach().cpu().numpy(), fmt, label=label)
  plt.setp(a, markersize=10)
plt.legend(loc=7)
plt.show()

## Notes

Tutorial used for this entire section: https://medium.com/@andreluiz_4916/pytorch-neural-networks-to-predict-matches-results-in-soccer-championships-part-ii-3d02b2ddd538

Params to change to try to improve predictions:
1. Learning rate: float values between 0 and 1.
2. Momentum rate: float values between 0 and 1.
3. Number of hidden layers: change the structure of your model adding more hidden layers.
4. Number of neurons/nodes in the hidden layer: integer values between 1 and ‘your imagination’.
5. Epochs: integer values between 1 and ‘your level of patience’ in waiting for the training process to finish.



# **Miscellaneous**

In [None]:
import os

In [None]:
# ------------------------------------------------------------------------------
# FIX NAMES OF FILES (made copies with princeton account bc storage space issues in the personal account)
# renames every csv in the folder to "<state>_tweets.csv" in the same folder
# ------------------------------------------------------------------------------
location_files = glob.glob("drive/My Drive/Thesis/Scrap/Tweets By Location (first couple dates, old method with geocode)/*.csv")
for file in sorted(location_files):
  # two characters taken at a fixed offset from the end of the path
  # NOTE(review): presumably the state abbreviation; this only holds if every
  # file name ends with the same-length suffix -- confirm against the folder
  state = file[-13:-11]
  new_name = "drive/My Drive/Thesis/Scrap/Tweets By Location (first couple dates, old method with geocode)/{}_tweets.csv".format(state)
  os.rename(file, new_name)