# Get All Tweets with #podrevday

In [None]:
# Create the output directory used by the cells below.  Unlike `!mkdir data`,
# this is a no-op when the directory already exists (so the notebook can be
# re-run) and it does not depend on the host shell.
from pathlib import Path

Path("data").mkdir(exist_ok=True)

In [None]:
# Run the GetOldTweets3 command-line tool: search for tweets matching the
# term "podrevday" within the --since/--until date window and write the
# results to data/jan-july-2020.csv (loaded into `df` further down).
!GetOldTweets3 --querysearch "podrevday" --since 2020-01-01 --until 2020-07-10 --output "data/jan-july-2020.csv"

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from geotext import GeoText



import nest_asyncio
nest_asyncio.apply()


import twint

# Load the scraped tweets, parsing the `date` column into timestamps.
df = pd.read_csv(
    filepath_or_buffer='data/jan-july-2020.csv',
    parse_dates=['date'],
)

# Get User Data

In [None]:
# Look up the Twitter profile of every distinct account in the tweet table.
# Missing usernames are skipped and first-appearance order is kept so that
# repeated runs query the same accounts in the same order.
pod_rev_users = list(df.username.dropna().unique())

c = twint.Config()
c.Store_object = True
c.Pandas = True

# twint publishes lookup results through the module-level frame
# `twint.storage.panda.User_df`.
# NOTE(review): twint's `Pandas_clean` option appears to reset that buffer on
# every run -- confirm.  Snapshotting after each lookup keeps every user
# whether the buffer accumulates or is reset; repeats are dropped below.
lookups = []
for user in pod_rev_users:
    c.Username = user
    twint.run.Lookup(c)
    snapshot = twint.storage.panda.User_df
    if snapshot is not None:
        lookups.append(snapshot.copy())
Users_df = pd.concat(lookups, ignore_index=True)

users_df = Users_df.drop_duplicates().reset_index(drop=True)
users_df.to_csv('data/user_data.csv')

# The cleaning cells that follow operate on `user_df`; bind that name here
# (as an independent copy) so they do not fail with a NameError.
user_df = users_df.copy()

# Clean User Data

In [None]:
# Display the column labels of the user table.
user_df.columns

In [None]:
# Restrict the user table to the selected profile fields, in this order.
profile_columns = [
    'id', 'username', 'name', 'location', 'join_date',
    'followers', 'following', 'likes', 'url', 'verified',
]
user_df = user_df.loc[:, profile_columns]

In [None]:
# Replace missing locations with the placeholder "blank" so that no
# location is left as NaN, then preview the first rows.
location_filled = user_df.loc[:, "location"].fillna("blank")
user_df.loc[:, "location"] = location_filled
user_df.head()

In [None]:
# Build one GeoText object per user from the location value and store it in
# a new `geotext` column.
user_df.loc[:, "geotext"] = user_df.loc[:, "location"].apply(
    lambda place: GeoText(place)
)

In [None]:
# Copy the `cities` and `countries` attributes of each GeoText object into
# their own columns.
parsed_locations = user_df.loc[:, 'geotext']
user_df.loc[:, 'city'] = parsed_locations.apply(lambda parsed: parsed.cities)
user_df.loc[:, 'country'] = parsed_locations.apply(lambda parsed: parsed.countries)

In [None]:
# Build a two-column lookup table (country name, iso3 code) from the country
# data shipped with geonamescache.
from geonamescache import GeonamesCache

gc = GeonamesCache()
countries = gc.get_countries()

# Transpose so that fields such as `geonameid` become columns, then move
# `geonameid` to the front by round-tripping it through the index.
country_info = (
    pd.DataFrame(countries)
    .T
    .set_index('geonameid')
    .reset_index()
)
name_code = country_info[["name", "iso3"]]

In [None]:
# Place-name lists used to assign a country to free-text user locations.
# They are passed to `replacer`, which joins the entries into a
# case-sensitive, substring-matching regular expression.

# US state/district postal codes plus a few country spellings and cities.
us_states = ["AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DC", "DE", "FL", "GA",
             "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD", "MA",
             "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ", "NM", "NY",
             "NC", "ND", "OH", "OK", "OR", "PA", "RI", "SC", "SD", "TN", "TX",
             "UT", "VT", "VA", "WA", "WV", "WI", "WY", "USA", "United States",
             'Seattle', "Los Angeles", "Houston", "Atlanta", "Pittsburgh"]

# Full names of US states, the federal district and territories.
# "District of Columbia" is a single entry: split into "District " and
# "of Columbia" it would match unrelated locations containing either fragment.
us_state_names = ["Alaska", "Alabama", "Arkansas", "American Samoa", "Arizona", "California", "Colorado", "Connecticut",
               "District of Columbia", "Delaware", "Florida", "Georgia", "Guam", "Hawaii", "Iowa", "Idaho", "Illinois",
               "Indiana", "Kansas", "Kentucky", "Louisiana", "Massachusetts", "Maryland", "Maine", "Michigan", "Minnesota",
               "Missouri", "Mississippi", "Montana", "North Carolina", "North Dakota", "Nebraska", "New Hampshire", "New Jersey",
               "New Mexico", "Nevada", "New York", "Ohio", "Oklahoma", "Oregon", "Pennsylvania", "Puerto Rico", "Rhode Island",
               "South Carolina", "South Dakota", "Tennessee", "Texas", "Utah", "Virginia", "Virgin Islands", "Vermont", "Washington",
               "Wisconsin", "West Virginia", "Wyoming"]

# Canadian province/territory name -> postal abbreviation.
can_prov_abbrev = {'Alberta': 'AB','British Columbia': 'BC','Manitoba': 'MB', 'New Brunswick': 'NB',
                       'Newfoundland and Labrador': 'NL', 'Northwest Territories': 'NT','Nova Scotia': 'NS','Nunavut': 'NU',
                       'Ontario': 'ON','Prince Edward Island': 'PE', 'Quebec': 'QC','Saskatchewan': 'SK','Yukon': 'YT'}

# Parallel tuples of the names and the abbreviations, in dict order.
can_prov_names, can_prov_abbr = zip(*can_prov_abbrev.items())

uk = ["England", 'Wales', "Scotland", 'London', "Manchester", "Isle of Wight", "Northern Ireland", "United Kingdom", 'Bailiwick of Guernsey', "UK", "Hoxton", "Jersey"]

india_city = ["Bangalore", "Delhi", "Hyderabad", "Bengaluru"]

# "eisgau" is a name fragment, matched as a substring.
german_city = ["Munich", "Berlin", "eisgau","Hamburg", "Dortmund"]

south_africa = ["South Africa", "Durban", "Johannesburg"]

In [None]:
def replacer(area, name):
    """Set ``country`` to *name* for every user whose location mentions *area*.

    Parameters
    ----------
    area : iterable of str
        Place names or abbreviations.  The entries are joined with ``|`` and
        used as one regular expression, so matching is case-sensitive and
        substring-based, and an entry may itself be a regex fragment.
        Empty entries are ignored.
    name : str
        Value written to the ``country`` column of the matching rows.

    Returns
    -------
    pandas.DataFrame
        The module-level ``user_df``, which is modified in place.
    """
    # An empty entry (or an empty `area`) would produce an empty alternative,
    # which matches every location and would relabel the whole table.
    patterns = [entry for entry in area if entry]
    if not patterns:
        return user_df
    # na=False: rows with a missing location never match (an NA mask cannot
    # be used to index .loc).
    matches = user_df.location.str.contains('|'.join(patterns), na=False)
    user_df.loc[matches, "country"] = name
    return user_df

In [None]:
# Assign countries from the place-name lists.  Calls run in order, so a later
# call overwrites the country set by an earlier one on rows that match both.
user_df = replacer(can_prov_names, "Canada")
# Use the abbreviation tuple here: iterating the `can_prov_abbrev` dict only
# yields the province names again.  replacer treats entries as regex
# fragments, so wrap each code in word boundaries to keep e.g. "ON" from
# matching inside "BOSTON".
user_df = replacer([r"\b" + abbr + r"\b" for abbr in can_prov_abbr], "Canada")
user_df = replacer(us_state_names, "United States")
user_df = replacer(us_states, "United States")
user_df = replacer(uk, "United Kingdom")
# "Jersey" in the UK list also matches these US places; restore them.
user_df = replacer(["New Jersey", "Jersey City"], "United States")
user_df = replacer(german_city, "Germany")
user_df = replacer(south_africa, "South Africa")
user_df = replacer(india_city, "India")
user_df.loc[(user_df.location == "Italia"), "country"] = "Italy"
user_df.loc[(user_df.location == "Belgrade"), "country"] = "Serbia"
# Rows not touched above still hold GeoText's *list* of countries, which never
# compares equal to a string.  Compare against the concatenated form (what the
# later flattening step produces) so the two-country case is actually caught.
flat_country = user_df.country.apply(lambda found: "".join(map(str, found)))
user_df.loc[(flat_country == "PolandSerbia"), "country"] = "Poland"

In [None]:
# Collapse each cell of `city` and `country` into one plain string by
# concatenating its items with no separator (a string cell is unchanged).
for column in ("city", "country"):
    user_df[column] = user_df[column].apply(
        lambda parts: "".join(str(part) for part in parts)
    )

In [None]:
# Attach the iso3 code to each user by matching the user's `country` against
# the `name` column of the lookup table.  Left join: users without a
# matching country are kept.
user_df_2 = user_df.merge(
    name_code,
    how='left',
    left_on='country',
    right_on='name',
)

In [None]:
# Drop `name_y`, the country-name column brought in from the lookup table
# (suffixed because the user table has its own `name` column).
user_df_2 = user_df_2.drop(columns='name_y')

# Merge User's Geographic Data with Tweets

In [None]:
# Left-join the per-user data onto the tweets by `username`, keeping every
# tweet, then save the combined table without the index column.
full_df = df.merge(user_df_2, how='left', on='username')

full_df.to_csv('data/tweets_users.csv', index=False)