In [1]:
import sys
import pandas as pd
import numpy as np
import plotly.express as px
import math

# Put the parent directory on the import path before importing `cleaning` below.
sys.path.append("../")

import cleaning as c
import importlib
# Re-execute the `cleaning` module so edits made since its first import are picked up.
importlib.reload(c)

import warnings
# Suppress every warning for the rest of the session.
warnings.filterwarnings("ignore")

In [4]:
# Load the sample data, drop three id columns, clean the party/state labels,
# and derive the `year`, `posted` and `age_when_posted` columns.
url = "../Data/sampleData.csv"
df = (
    pd.read_csv(url)
    .drop(columns=["userid_str", "status_id_str", "id"])
    .assign(
        year=lambda frame: frame["birth"].apply(c.get_year),
        term_partisanship=lambda frame: frame["term_partisanship"].apply(c.clean_state),
        term_state=lambda frame: frame["term_state"].apply(c.clean_state),
        # the first four characters of `date` are the year the row was posted
        posted=lambda frame: frame["date"].apply(lambda stamp: int(stamp[0:4])),
        age_when_posted=lambda frame: frame["posted"] - frame["year"],
    )
)

df.head(3)

Unnamed: 0,date,text,name,birth,term_partisanship,term_type,term_state,country,Bucket,SentimentScore,version,year,posted,age_when_posted
0,2015-09-28,I commend all of law enforcement for the secur...,Al Green,1947-09-01,Democrat,rep,TX,China,2 or 3,,2019_batch1,1947.0,2015,68.0
1,2019-02-24,"Socialist tyrant Maduro starves, tortures his ...",Scott DesJarlais,1964-02-21,Republican,rep,TN,China,1,2.0,2019_batch1,1964.0,2019,55.0
2,2019-02-24,"Socialist tyrant Maduro starves, tortures his ...",Scott DesJarlais,1964-02-21,Republican,rep,TN,Iran,2 or 3,,2019_batch1,1964.0,2019,55.0


In [19]:
# Rows without any missing value, restricted to China rows scored below 5.
no_na = df.dropna(axis=0)
china = no_na.loc[(no_na["SentimentScore"] < 5.0) & (no_na["country"] == "China")]

# Size the bins from the plotted frame only: both ends of the age range come
# from `china`, so the bin count matches the data actually shown in the figure.
bin_width = 10
age_span = china["age_when_posted"].max() - china["age_when_posted"].min()
nbins = math.ceil(age_span / bin_width)
px.histogram(china, x="age_when_posted", color="SentimentScore", nbins=nbins)

In [33]:
# Mean `age_when_posted` for every (SentimentScore, country) pair,
# using only rows whose SentimentScore is at most 5.0.
scores = df.loc[df["SentimentScore"].le(5.0)]
scores_by_age = (
    scores.groupby(["SentimentScore", "country"])["age_when_posted"]
    .mean()
    .reset_index()
)
px.scatter(scores_by_age, x="SentimentScore", y="age_when_posted", color="country")

In [40]:
# Show the Iran rows whose sentiment score is exactly 3.5.
df.query('country == "Iran" and SentimentScore == 3.5')

Unnamed: 0,date,text,name,birth,term_partisanship,term_type,term_state,country,Bucket,SentimentScore,version,year,posted,age_when_posted
7794,2020-01-10,"...but through Article I, Section, clause 11 i...",Sheila Jackson Lee,1950-01-12,Democrat,rep,TX,Iran,1,3.5,2020_batch1,1950.0,2020,70.0


In [38]:
# Labels of the ten most frequent `term_state` values.
top_ten = df["term_state"].value_counts().head(10).index
top_ten

Index(['TX', 'FL', 'TN', 'CA', 'OH', 'IN', 'NY', 'AR', 'WI', 'MO'], dtype='object')

In [39]:
# Heatmap of mean SentimentScore per (country, state) for the ten most frequent states.
# The filtered rows get their own name so `top_ten` keeps holding the state labels
# and this cell can be re-run; `dropna()` returns a new frame instead of mutating
# a slice of `df` in place.
top_ten_rows = df.loc[df["term_state"].isin(top_ten)].dropna()
mean_sentiment = pd.pivot_table(
    top_ten_rows,
    index="country",
    columns="term_state",
    values="SentimentScore",
    aggfunc="mean",  # string form; passing np.mean is deprecated in recent pandas
)
px.imshow(mean_sentiment)

In [12]:
# Mean `year` over the Republican rows, truncated to an int.
int(df.loc[df["term_partisanship"] == "Republican", "year"].mean())

1964

In [29]:
# China rows only. `assign` builds a new frame, so the `posted` column is never
# written into a filtered slice of `df` (which is chained assignment in pandas).
china = df.loc[df["country"] == "China"].assign(
    # the first four characters of `date` are the year the row was posted
    posted=lambda frame: frame["date"].apply(lambda stamp: int(stamp[0:4]))
)

# Sentiment score against posting year for the Republican subset.
fig = px.scatter(china[china["term_partisanship"] == "Republican"], x="posted", y="SentimentScore")
fig.show()

In [26]:
# Histogram of `year`, asking for one bin per 10 years of the observed range.
bin_width = 10
year_span = df["year"].max() - df["year"].min()
nbins = math.ceil(year_span / bin_width)
px.histogram(df, x="year", nbins=nbins)

In [30]:
# Re-applies the cleaning helper to the party column (it is also applied when the data is loaded).
# NOTE(review): presumably a no-op on already-cleaned labels — confirm against `cleaning.clean_state`.
df["term_partisanship"] = df["term_partisanship"].apply(c.clean_state)

# Split the rows by party label. The former mid-cell `.unique()` call was removed:
# its result was neither displayed nor stored.
party = df["term_partisanship"]
dems = df[party == "Democrat"]
reps = df[party == "Republican"]
othr = df[party.isin(["Independent", "Unknown"])]

In [34]:
# Bin counts for each party's `year` histogram: one bin per 10 years of range.
bin_width = 10


def _year_bins(frame):
    """Return how many `bin_width`-wide bins span the frame's `year` range."""
    return math.ceil((frame["year"].max() - frame["year"].min()) / bin_width)


nbins_dems = _year_bins(dems)
nbins_reps = _year_bins(reps)
nbins_othr = _year_bins(othr)

px.histogram(reps, x="year", nbins=nbins_reps)

In [39]:
# Row counts per (state, country), plus a per-state total over the three countries.
state_v_country = pd.crosstab(df["term_state"], df["country"])
state_v_country["total"] = state_v_country[["Canada", "China", "Iran"]].sum(axis=1)
# Turn the state index back into an ordinary `term_state` column.
state_v_country = state_v_country.reset_index()
state_v_country.head()

country,term_state,Canada,China,Iran,total
0,AK,40,98,16,154
1,AL,9,142,21,172
2,AR,21,540,90,651
3,AZ,28,249,31,308
4,CA,95,692,161,948


In [40]:
import plotly.graph_objects as go

In [42]:
# One bar trace per country, each in its own offset group (1, 2, 3).
countries = ["Canada", "China", "Iran"]
traces = [
    go.Bar(
        name=country,
        x=state_v_country["term_state"],
        y=state_v_country[country],
        yaxis="y",
        offsetgroup=slot,
    )
    for slot, country in enumerate(countries, start=1)
]
fig = go.Figure(data=traces, layout={"yaxis": {"title": "# of Mentions by State"}})

# Change the bar mode
fig.update_layout(title_text="# of Tweets For Each Country By State", title_x=0.5, barmode="group")
fig.show()