In [10]:
%load_ext lab_black

In [15]:
from glob import glob

import altair as alt
import pandas as pd

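## About

Explores how the tweets in the training split are distributed across subjects and sentiment labels, then keeps only the tweets whose subject is not in the unwanted list.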

## User Inputs

In [None]:
# Display label for each sentiment code. Keys are strings because the sentiment
# column is cast to a string dtype before the codes are replaced.
sentiment_mapper_dict = dict(zip(("0", "1", "2"), ("NEG", "NEU", "POS")))

# Lower-case subject names that are excluded when building the filtered subset.
unwanted_subjects = [
    "misc",
    "images",
    "rover-images",
    "nasa-careers",
    "other-missions",
]

In [8]:
# Column dtypes handed to the Excel reader. pandas nullable extension types are
# used throughout; ids are read as strings rather than numbers.
dtypes = {
    "id": pd.StringDtype(),
    "text": pd.StringDtype(),
    "sentiment": pd.Int32Dtype(),
    "subject": pd.StringDtype(),
}

In [4]:
# Collect the Excel splits under v2/. Sorting makes the unpacking order
# deterministic; it presumably relies on the file names sorting as
# test < train < val -- TODO confirm against the actual file names.
xlsx_paths = sorted(glob("v2/*.xlsx"))
if len(xlsx_paths) != 3:
    # Fail with the offending listing instead of a bare tuple-unpacking error
    # (e.g. when a split is missing or a stray workbook sits in the folder).
    raise ValueError(
        f"Expected exactly 3 .xlsx files in v2/, "
        f"found {len(xlsx_paths)}: {xlsx_paths}"
    )
test_fpath, train_fpath, val_fpath = xlsx_paths

In [16]:
%%time
df = (
    pd.read_excel(
        train_fpath,
        nrows=500,
        parse_dates=['created_at'],
        dtype=dtypes,
    )
    .dropna(subset=['sentiment', 'subject'])
)
df.head()

CPU times: user 79.7 ms, sys: 9.19 ms, total: 88.9 ms
Wall time: 86.8 ms


Unnamed: 0,id,created_at,text,sentiment,subject
0,1477338095515222019,2022-01-01 17:57:04,Q: How do you send a payload to the James Webb...,1,Jwst-mission
1,1479552167988146178,2022-01-07 20:35:00,Why NASA's James Webb telescope Twitter accoun...,0,Jwst-facts
2,1477132631590707200,2022-01-01 04:20:37,"With Webb’s Mid-Booms Extended, Sunshield Take...",2,Jwst-mission
3,1478270532508291079,2022-01-04 07:42:14,The first image from the new James Webb telesc...,2,Jwst-mission
4,1478922877764259840,2022-01-06 02:54:25,Still can't believe we did the space shuttle. ...,2,Other-missions


In [61]:
# Number of tweets per (subject, sentiment) pair, with the "misc" subject removed
# and sentiment codes replaced by their display labels.
data_sents_by_subject = (
    # Compare case-insensitively: the raw subjects are capitalised (e.g.
    # "Jwst-mission"), so a plain `subject != 'misc'` would filter nothing.
    df.loc[lambda frame: frame["subject"].str.lower() != "misc"]
    # Cast to string first so the codes match the string keys of the mapper.
    .astype({"sentiment": pd.StringDtype()})
    .replace({"sentiment": sentiment_mapper_dict})
    .groupby(["subject", "sentiment"], as_index=False)
    .size()
    # Title-case only for display; the parameter is not named `df` so it does
    # not shadow the notebook-level frame.
    .assign(subject=lambda counts: counts["subject"].str.title())
)
data_sents_by_subject

Unnamed: 0,subject,sentiment,size
0,Jwst-Facts,NEG,3
1,Jwst-Facts,NEU,11
2,Jwst-Facts,POS,20
3,Jwst-Mission,NEG,5
4,Jwst-Mission,NEU,3
5,Jwst-Mission,POS,67
6,Nasa-Careers,NEG,1
7,Nasa-Careers,NEU,6
8,Nasa-Careers,POS,6
9,Nasa-Funding,NEG,6


In [18]:
# Tweets per (title-cased) subject as a two-column frame: subject, count.
data_subjects = (
    df["subject"]
    .str.title()
    .value_counts()
    # Name the index and the values explicitly rather than renaming whatever
    # reset_index() produces: the default column names of value_counts() differ
    # between pandas 1.x ("index"/"subject") and 2.x ("subject"/"count"), and the
    # old rename mapping yields two "count" columns on 2.x.
    .rename_axis("subject")
    .reset_index(name="count")
)
data_subjects

Unnamed: 0,subject,count
0,Misc,97
1,Other-Missions,96
2,Jwst-Mission,75
3,Nasa-Science,39
4,Images,36
5,Jwst-Facts,34
6,Nasa-Careers,13
7,Telescopes,10
8,Rover-Images,8
9,Nasa-Funding,7


In [30]:
# Horizontal bar chart of tweet counts per subject, sorted by descending count,
# with the count printed next to each bar.
fig_size = dict(width=400, height=300)
bars = (
    alt.Chart(data_subjects)
    .mark_bar()
    .encode(
        # The x channel is built with alt.X (the original passed alt.Y here),
        # matching how the x channel is declared elsewhere in this notebook.
        x=alt.X("count:Q", title=None, axis=alt.Axis(tickSize=0)),
        y=alt.Y("subject:O", title=None, sort="-x", axis=alt.Axis(tickSize=0)),
    )
)
# Derive the labels from `bars` so they share its x/y encodings; dx nudges the
# left-aligned text 3px to the right of the bar end.
text = bars.mark_text(align="left", baseline="middle", dx=3).encode(text="count:Q")
chart = (
    alt.layer(bars, text)
    .properties(**fig_size)
    # Drop the view border, grid lines and axis domain lines for a cleaner look.
    .configure_view(strokeWidth=0)
    .configure_axis(grid=False, domain=False)
)
chart

In [66]:
# One row of bars per subject, comparing the tweet count of each sentiment label.
fig_size = {"width": 400, "height": 75}
tickless_axis = alt.Axis(tickSize=0)
sentiment_bars = alt.Chart(data_sents_by_subject).mark_bar()
chart = (
    sentiment_bars.encode(
        x=alt.X("size:Q", title=None, axis=tickless_axis),
        y=alt.Y("sentiment:O", title=None, axis=tickless_axis),
        # Colour duplicates the y position, so the legend is switched off.
        color=alt.Color(
            "sentiment:N",
            title=None,
            legend=None,
            scale=alt.Scale(scheme="set1"),
        ),
        row=alt.Row("subject:N", title=None, spacing=7.5),
        tooltip=["sentiment", "size"],
    )
    .properties(**fig_size)
    .configure_view(strokeWidth=0)
    .configure_axis(grid=False, domain=False)
)
chart

In [84]:
# Keep the rows whose subject, compared case-insensitively, is not in the
# unwanted list.
is_unwanted_subject = df["subject"].str.lower().isin(unwanted_subjects)
df_wanted_subjects = df.loc[~is_unwanted_subject]

In [85]:
# Tweets per (title-cased) subject after the unwanted subjects were removed.
# NOTE: despite the variable name, this table holds subject counts.
data_filtered_sentiment = (
    df_wanted_subjects["subject"]
    .str.title()
    .value_counts()
    # Name the index and the values explicitly: the default column names of
    # value_counts().reset_index() changed in pandas 2.0, and the previous
    # rename mapping produces two "count" columns there.
    .rename_axis("subject")
    .reset_index(name="count")
)
n_kept, n_total = len(df_wanted_subjects), len(df)
# Guard the ratio so an empty input frame reports 0.00% instead of raising
# ZeroDivisionError.
pct_kept = 100 * n_kept / n_total if n_total else 0.0
print(f"Kept {n_kept:,} out of {n_total:,} ({pct_kept:.2f}%) tweets")
data_filtered_sentiment

Kept 165 out of 415 (39.76%) tweets


Unnamed: 0,subject,count
0,Jwst-Mission,75
1,Nasa-Science,39
2,Jwst-Facts,34
3,Telescopes,10
4,Nasa-Funding,7
