Analyze Collected Tweet Metadata

In [10]:
import itertools
import os
import sys
from typing import Dict, List, Optional, Tuple

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.regression.linear_model as lm
from mpl_toolkits.axes_grid1 import AxesGrid
from sklearn import decomposition
from sklearn.ensemble import RandomForestRegressor

# Directory that holds the collected tweet CSV exports.
MYDIR = "./"
# os.listdir() returns entries in arbitrary, platform-dependent order; sort the
# listing so the files are always discovered (and later loaded) in the same order.
ALLFILES = sorted(os.listdir(MYDIR))
# Keep only the entries whose name contains "all_tweets".
MYFILES = [ff for ff in ALLFILES if ("all_tweets" in ff)]
MYFILES

['all_tweets_2020-10-12__01_23_57.csv',
 'all_tweets_2020-10-12__22_35_54.csv',
 'all_tweets_2020-10-12__22_39_58.csv',
 'all_tweets_2020-10-12__22_49_41.csv',
 'all_tweets_2020-10-12__23_00_59.csv',
 'all_tweets_2020-10-13__00_48_10.csv',
 'all_tweets_2020-10-13__00_57_56.csv',
 'all_tweets_2020-10-13__09_22_44.csv',
 'all_tweets_2020-10-13__09_22_54.csv',
 'all_tweets_2020-10-13__19_53_14.csv',
 'all_tweets_2020-10-13__19_53_21.csv',
 'all_tweets_2020-10-13__21_13_06.csv',
 'all_tweets_2020-10-13__21_16_26.csv',
 'all_tweets_2020-10-13__21_35_56.csv',
 'all_tweets_2020-10-15__20_07_50.csv',
 'all_tweets_2020-10-15__21_34_26.csv',
 'all_tweets_2020-10-15__21_38_08.csv']

In [11]:
# Umbrella ("big") topics, each mapped to the keywords that belong to it.
MY_BIG_TOPICS = {
    "race": ["black", "racism", "racial"],
    "politics": ["trump", "biden", "elections", "democrat", "republican"],
    "covid": ["covid", "corona", "pandemic", "virus", "vaccine"],
    "police": ["cops", "police"],
    "wildfires": ["fires", "wildfires", "air quality"],
}

# Flat list of every keyword, in the order the groups are declared above.
TOPICS = [keyword for keywords in MY_BIG_TOPICS.values() for keyword in keywords]
print(TOPICS)

def getBigTopic(topic) -> Optional[str]:
    """Return the umbrella topic that a keyword belongs to.

    Args:
        topic: Keyword to look up in the keyword lists of ``MY_BIG_TOPICS``.

    Returns:
        The key of the first ``MY_BIG_TOPICS`` entry whose keyword list
        contains ``topic``, or ``None`` when no list contains it.
    """
    for big_topic, keywords in MY_BIG_TOPICS.items():
        if topic in keywords:
            return big_topic
    return None

['black', 'racism', 'racial', 'trump', 'biden', 'elections', 'democrat', 'republican', 'covid', 'corona', 'pandemic', 'virus', 'vaccine', 'cops', 'police', 'fires', 'wildfires', 'air quality']


In [12]:
# One DataFrame per collected CSV, keyed by file name (Dict[str, pd.DataFrame]).
# os.path.join keeps the path valid even if MYDIR has no trailing separator.
tweetsHT = {ff: pd.read_csv(os.path.join(MYDIR, ff)) for ff in MYFILES}

# Stack all files, drop rows that are exact duplicates of an earlier row, and
# replace the (file name, row number) index produced by concat with a plain one.
tweetsDF = pd.concat(tweetsHT).drop_duplicates().reset_index(drop=True)
print(f"Gathered the data collected from twitter ({len(tweetsDF)} rows)")

Gathered the data collected from twitter (80453 rows)


In [30]:
# Tag every tweet with the umbrella topic of its keyword (None when the keyword
# is not listed in MY_BIG_TOPICS).
tweetsDF["big_topic"] = tweetsDF["the_topic"].apply(getBigTopic)

# Rows whose `location` value is itself one of the topic keywords are treated as
# having no location. .assign() builds a new frame rather than writing into a
# slice of tweetsDF, which avoids pandas' SettingWithCopyWarning.
location_is_topic = tweetsDF["location"].isin(TOPICS)
location_legit = tweetsDF.loc[~location_is_topic]
location_none = tweetsDF.loc[location_is_topic].assign(location=None)

tweetsDF = pd.concat([location_legit, location_none]).reset_index(drop=True)

group_keys = ["location", "topic", "the_topic", "big_topic"]

# Count of tweets per (location, topic, the_topic, big_topic).
# The column is selected *before* aggregating so only that column is processed;
# aggregating the whole frame first also tries to sum the text columns on
# pandas >= 2.0, where numeric_only defaults to False.
totals = (
    tweetsDF.groupby(group_keys)["id"]
    .count()
    .reset_index()
    .rename(columns={"id": "count"})
)

# Total retweets per (location, topic, the_topic, big_topic).
retweets = tweetsDF.groupby(group_keys)["retweet_count"].sum().reset_index()

summaries = totals.merge(retweets, on=group_keys)
summaries["retweets_per_tweet"] = summaries["retweet_count"] / summaries["count"]

summaries.to_csv("./tweets_for_analysis.csv", index=False)

In [14]:
"""
Get counts of tweets by location and big_topic
"""
# Select the column before aggregating so only that column is counted/summed;
# aggregating the whole frame first also tries to sum the text columns on
# pandas >= 2.0, where numeric_only defaults to False.
totals = (
    tweetsDF.groupby(["location", "big_topic"])["id"]
    .count()
    .reset_index()
    .rename(columns={"id": "count"})
)

# Total retweets by location and big_topic.
retweets = (
    tweetsDF.groupby(["location", "big_topic"])["retweet_count"]
    .sum()
    .reset_index()
)

summaries = totals.merge(retweets, on=["location", "big_topic"])
summaries["retweets_per_tweet"] = summaries["retweet_count"] / summaries["count"]

In [15]:
import plotly.express as px

# One scatter plot per big topic: tweet count vs. retweets per tweet,
# with one coloured marker series per location.
for tt in sorted(summaries["big_topic"].unique()):
    print(tt)

    my_summary = summaries.loc[summaries["big_topic"] == tt].copy()
    fig = px.scatter(
        my_summary,
        x="count",
        y="retweets_per_tweet",
        color="location",
        title=tt,
    )

    # The outline is a property of the marker, so `line` has to be nested inside
    # `marker`; passed at trace level it styles the connecting line, which is not
    # drawn in markers mode.
    fig.update_traces(
        marker=dict(size=12, line=dict(width=2, color="DarkSlateGrey")),
        selector=dict(mode="markers"),
    )

    fig.show()

covid


politics


race


In [16]:
import plotly.express as px

# One bar chart per location (alphabetical order): tweet count on the x axis,
# retweets per tweet on the y axis, bars coloured by big topic.
for tt in sorted(summaries["location"].unique().tolist()):
    print(tt)

    my_summary = summaries[summaries["location"] == tt].copy()
    fig = px.bar(
        my_summary,
        x="count",
        y="retweets_per_tweet",
        color="big_topic",
        title=tt,
    )
    fig.show()

Alameda


Albany


Alum Rock


Ashland


Atherton


Belmont


Berkeley


Brisbane


Broadmoor


Burbank


Burlingame


Campbell


Castro Valley


Cherryland


Colma


Cupertino


Daly City


Dublin


East Foothills


East Palo Alto


Emeryville


Fairview


Foster City


Fremont


Gilroy


Half Moon Bay


Hayward


Hillsborough


La Honda


Ladera


Lexington Hills


Livermore


Los Altos


Los Altos Hills


Los Gatos


Loyola


Menlo Park


Millbrae


Milpitas


Montara


Monte Sereno


Morgan Hill


Moss Beach


Mountain View


Newark


North Fair Oaks


Oakland


Pacifica


Palo Alto


Pescadero


Piedmont


Pleasanton


Portola Valley


Redwood City


San Bruno


San Carlos


San Francisco


San Jose


San Leandro


San Lorenzo


San Martin


San Mateo


Santa Clara


Saratoga


South San Francisco


Stanford


Sunnyvale


Sunol


Union City


Woodside


In [17]:
# Order the summary rows from the largest tweet count to the smallest,
# then draw grouped bars: one group per location, one bar per big topic.
summaries = summaries.sort_values("count", ascending=False)
fig = px.bar(
    summaries,
    x="location",
    y="count",
    color="big_topic",
    barmode="group",
    title="Big Topics by Location",
)
fig.show()

In [25]:
"""
Get counts of tweets by location and the_topic
"""
# Select the column before aggregating so only that column is counted/summed;
# aggregating the whole frame first also tries to sum the text columns on
# pandas >= 2.0, where numeric_only defaults to False.
totals = (
    tweetsDF.groupby(["location", "the_topic"])["id"]
    .count()
    .reset_index()
    .rename(columns={"id": "count"})
)

# Total retweets by location and the_topic.
retweets = (
    tweetsDF.groupby(["location", "the_topic"])["retweet_count"]
    .sum()
    .reset_index()
)

summaries = totals.merge(retweets, on=["location", "the_topic"])
summaries["retweets_per_tweet"] = summaries["retweet_count"] / summaries["count"]
print(len(summaries))
summaries.head()

635


Unnamed: 0,location,the_topic,count,retweet_count,retweets_per_tweet
0,Alameda,Alameda,524,90551.0,172.807252
1,Alameda,biden,44,25.0,0.568182
2,Alameda,black,112,1546.0,13.803571
3,Alameda,corona,12,18.0,1.5
4,Alameda,covid,283,40263.0,142.272085


In [28]:
# Order the summary rows from the highest retweets-per-tweet ratio to the lowest,
# then draw grouped bars of tweet counts: one group per topic keyword,
# one bar per location.
summaries = summaries.sort_values("retweets_per_tweet", ascending=False)
fig = px.bar(
    summaries,
    x="the_topic",
    y="count",
    color="location",
    barmode="group",
    title="Topics by Location",
)
fig.show()