# Exploratory Data Analysis

## Introduction

After cleaning the data we are going to take a look at it. And since we want to know how the information changes across time, we will be looking at tweets from different weeks.

1. **Most common words:** Find them and create word clouds. See if anything needs to be removed.
2. **Size of vocabulary:** Look at the number of unique words used
3. **Engagement metrics across time:** A more insightful look into the stats obtained during data cleaning.


In [None]:
import json
import numpy as np
import os
import pandas as pd
import plotly.express as px
import plotly.io as pio
import re
import spacy

from dash import Dash, dcc, html, Input, Output
from dotenv import load_dotenv
from itertools import product
from jupyter_dash import JupyterDash

In [None]:
# Load variables from a local .env file into the process environment.
load_dotenv()

_env = os.environ
# Project root; every data/report path in this notebook is built from it.
BASE_DIR = _env.get("BASE_DIR")
# Bearer token read from the environment (not referenced by the cells below).
BEARER_TOKEN = _env.get("BEARER_TOKEN")

In [None]:
# Pandas display preferences for the notebook output.
_display_options = {
    "display.max_colwidth": 300,
    "display.max_rows": 100,
    "display.max_columns": 50,
    "display.precision": 2,
    "display.float_format": "{:,.2f}".format,
}
for _option_name, _option_value in _display_options.items():
    pd.set_option(_option_name, _option_value)

# Plotly defaults: white template, and a 2x scale for kaleido image export.
pio.templates.default = "plotly_white"
pio.kaleido.scope.default_scale = 2

# Discrete colour sequence for every figure in this notebook, taken from the
# Gruvbox palette (bright, neutral and faded tones, then light/gray/dark).
# Two entries were transposed hex digits and have been corrected to the
# palette values: bright red "#fb4934" and faded aqua "#427b58".
gruvbox_colors = [
    "#fabd2f",
    "#b8bb26",
    "#458588",
    "#fe8019",
    "#b16286",
    "#fb4934",
    "#689d6a",
    "#d79921",
    "#98971a",
    "#83a598",
    "#d65d0e",
    "#d3869b",
    "#cc241d",
    "#8ec07c",
    "#b57614",
    "#79740e",
    "#076678",
    "#af3a03",
    "#8f3f71",
    "#9d0006",
    "#427b58",
    "#fbf1c7",
    "#928374",
    "#282828",
]

In [None]:
# Snapshot identifiers; the first and last entries are interpolated into every
# file name below. Presumably (year, ISO week) pairs - TODO confirm against the
# data-cleaning notebook.
TIME_STAMPS = [
    (2022, 35),
    (2022, 40),
    (2022, 45),
    (2022, 50),
    (2023, 3),
]

### Data Loading

The cell below loads five feather files: the corpus frame, the document-term matrix, the data-dtm frame, the stats data and the previously saved top-30 words table.

In [None]:
# All processed artefacts live in one folder and share the same
# "<first stamp>-<last stamp>" suffix, so build both pieces once.
_processed_dir = f"{BASE_DIR}/data/processed"
_stamp_range = f"{TIME_STAMPS[0]}-{TIME_STAMPS[-1]}"

corpus = pd.read_feather(f"{_processed_dir}/corpus-{_stamp_range}.feather")
dtm = pd.read_feather(f"{_processed_dir}/dtm-{_stamp_range}.feather")
data_dtm = pd.read_feather(f"{_processed_dir}/data-dtm-{_stamp_range}.feather")
stats_data = pd.read_feather(f"{_processed_dir}/stats_data-{_stamp_range}.feather")
# Cached copy of the top-30 table; it is rebuilt further down in this notebook.
top30_df = pd.read_feather(f"{_processed_dir}/top30_df-{_stamp_range}.feather")

In [None]:
# The frames were stored with their index as an "index" column; promote that
# column back to the row index on each of them, in place.
for _frame in (dtm, data_dtm, corpus, stats_data):
    _frame.set_index("index", inplace=True)

In [None]:
# Preview the first rows of the document-term matrix.
dtm.head()

In [None]:
# Preview the first rows of the corpus frame.
corpus.head()

In [None]:
# Print column dtypes and non-null counts for the corpus frame.
corpus.info()

In [None]:
# Preview the first rows of the per-tweet stats frame.
stats_data.head()

## Most Common Words

In [None]:
# Distinct newspaper labels present in the corpus, in order of first appearance.
newspapers = corpus["newspaper"].unique()

In [None]:
# Distinct (ISO year, ISO week) pairs present in the corpus, as an array with
# one [year, week] row per pair.
# drop_duplicates() is chained rather than called with inplace=True: the
# column selection is a derived frame, and mutating it in place can raise
# SettingWithCopyWarning.
year_weeks = (
    corpus["created_at"]
    .dt.isocalendar()[["year", "week"]]
    .drop_duplicates()
    .to_numpy()
)

In [None]:
# Empty frame whose rows are the vocabulary terms (the dtm columns).
# The aggregation cell further down creates it again before filling it.
dtm_newspaper = pd.DataFrame(index=dtm.columns)

In [None]:
# Tag every tweet with the ISO year and ISO week of its creation time.
_corpus_iso = corpus["created_at"].dt.isocalendar()
corpus["year"] = _corpus_iso.year
corpus["week"] = _corpus_iso.week

In [None]:
# One row per vocabulary term; one column per "<newspaper>-<year>_<week>".
dtm_newspaper = pd.DataFrame(index=dtm.columns)

for (iso_year, iso_week), paper in product(year_weeks, newspapers):
    # Tweets published by this newspaper during this ISO week.
    in_bucket = (
        (corpus["newspaper"] == paper)
        & (corpus["year"] == iso_year)
        & (corpus["week"] == iso_week)
    )
    tweet_ids = corpus.loc[in_bucket, "id"]

    # Keep the matching dtm rows and add them up term by term.
    bucket_rows = dtm.filter(items=tweet_ids, axis=0)
    dtm_newspaper[f"{paper}-{iso_year}_{iso_week}"] = bucket_rows.sum(axis=0)

In [None]:
# Map each newspaper-week column label to its 30 most frequent terms as
# (word, count) pairs, highest count first.
top30_dict = {}

for newspaper in dtm_newspaper.columns:
    top = dtm_newspaper[newspaper].sort_values(ascending=False).head(30)
    # .tolist() converts NumPy scalars into built-in Python numbers. This dict
    # is later written with json.dump(), which cannot serialise NumPy integers.
    top30_dict[newspaper] = list(zip(top.index.tolist(), top.to_numpy().tolist()))

In [None]:
# Display the newspaper-week -> top-30 (word, count) mapping.
top30_dict

In [None]:
# Tabulate the dict with one column per newspaper-week key; each cell holds a
# (word, count) tuple that the next cell unpacks. This replaces the cached
# top30_df that was read from feather in the data-loading cell.
top30_df = pd.DataFrame.from_records(top30_dict)

In [None]:
# Reshape to long form: one row per (newspaper-week column, top-word tuple).
top30_df = top30_df.melt(
    value_vars=top30_df.columns, var_name="newspaper_date", value_name="word_count"
)

# Column labels look like "<newspaper>-<year>_<week>". Split on the LAST hyphen
# only, so a newspaper name that itself contains "-" still yields two parts.
top30_df[["newspaper", "year_week"]] = top30_df["newspaper_date"].str.rsplit(
    "-", n=1, expand=True
)
top30_df[["year", "week"]] = top30_df["year_week"].str.split(r"_", expand=True)
# Unpack each (word, count) tuple into two proper columns.
top30_df[["word", "count"]] = pd.DataFrame(
    top30_df["word_count"].to_list(), index=top30_df.index
)

# Drop the helper columns now that their content has been split out.
top30_df = top30_df.drop(columns=["word_count", "newspaper_date", "year_week"])

# The split produced strings; convert year and week back to numbers.
top30_df["year"] = pd.to_numeric(top30_df["year"])
top30_df["week"] = pd.to_numeric(top30_df["week"])

In [None]:
# Print column dtypes and non-null counts for the reshaped top-30 table.
top30_df.info()

In [None]:
# Preview the first rows of the reshaped top-30 table.
top30_df.head()

In [None]:
# Cache the top-30 table; this is the file the data-loading cell reads back.
# It is written before the hot_topics column is added in the next cell.
top30_df.to_feather(
    f"{BASE_DIR}/data/processed/top30_df-{TIME_STAMPS[0]}-{TIME_STAMPS[-1]}.feather"
)

In [None]:
# Group related top words under one shared label so they get the same colour
# in the charts.
_hot_topic_by_word = {
    "castillo": "castillo",
    "pedro": "castillo",
    "dina": "boluarte",
    "boluarte": "boluarte",
    "perú": "país",
    "país": "país",
    "congreso": "congreso",
    "covid": "covid",
    "protestas": "protestas",
    "manifestaciones": "protestas",
}

# Words outside the mapping get an empty label. The filled Series is assigned
# back instead of calling fillna(inplace=True) on the column selection: that
# chained in-place form warns on pandas 2.x and has no effect under
# Copy-on-Write.
top30_df["hot_topics"] = top30_df["word"].map(_hot_topic_by_word).fillna("")

In [None]:
# Persist the raw newspaper-week -> top-30 mapping as JSON.
_top30_json_path = (
    f"{BASE_DIR}/data/processed/top_30-{TIME_STAMPS[0]}-{TIME_STAMPS[-1]}.json"
)
with open(_top30_json_path, "w") as json_file:
    json.dump(top30_dict, json_file)

In [None]:
def _top30_facet_value(annotation):
    """Reduce a facet label such as 'week=35' to the part after '='."""
    return annotation.update(text=f"{annotation.text.split('=')[-1]}")


# Grid of bar charts: one row per newspaper, one column per week.
fig = px.bar(
    data_frame=top30_df,
    x="word",
    y="count",
    color="hot_topics",
    facet_row="newspaper",
    facet_col="week",
    color_discrete_sequence=gruvbox_colors,
    title="Top 30 words per newspaper per week",
    width=3200,
    height=3200,
)

fig.for_each_annotation(_top30_facet_value)
# Every panel gets independent axes; bars are ordered by descending total.
fig.update_yaxes(matches=None, showticklabels=True)
fig.update_xaxes(matches=None, showticklabels=True, categoryorder="total descending")

_top30_bar_path = (
    f"{BASE_DIR}/reports/top30_bar-{TIME_STAMPS[0]}-{TIME_STAMPS[-1]}.html"
)
fig.write_html(_top30_bar_path)

fig.show()

## Number of words

In [None]:
# Vocabulary size per newspaper-week column: the number of terms whose summed
# count in the document-term matrix is non-zero.
unique_list = [
    dtm_newspaper[newspaper].to_numpy().nonzero()[0].size
    for newspaper in dtm_newspaper.columns
]

# Create a new dataframe that contains this unique word count
data_words = pd.DataFrame(
    list(zip(dtm_newspaper.columns, unique_list)), columns=["newspaper", "unique_words"]
)
# sort_values() returns a new frame. The original call discarded that result,
# so the ordering was never applied; assign it and renumber the rows.
data_words = data_words.sort_values(by="unique_words", ascending=False).reset_index(
    drop=True
)

In [None]:
# The "newspaper" column still holds labels like "<newspaper>-<year>_<week>".
# Split on the LAST hyphen only, so a newspaper name that itself contains "-"
# still yields exactly two parts.
data_words[["newspaper", "year_week"]] = data_words["newspaper"].str.rsplit(
    "-", n=1, expand=True
)
data_words[["year", "week"]] = data_words["year_week"].str.split(r"_", expand=True)

data_words.drop(["year_week"], axis=1, inplace=True)

# The split produced strings; convert year and week back to numbers.
data_words["year"] = pd.to_numeric(data_words["year"])
data_words["week"] = pd.to_numeric(data_words["week"])

In [None]:
# Preview the unique-word counts per newspaper and week.
data_words.head()

In [None]:
# Print column dtypes and non-null counts for the unique-word table.
data_words.info()

Since the number of unique words might be linked to the number of tweets, I will add a column with the number of tweets for each newspaper.

In [None]:
# Number of tweets per newspaper, year and week, counted on the "id" column.
_counts_per_group = corpus.groupby(by=["newspaper", "year", "week"]).count()
tweet_number = _counts_per_group[["id"]].rename(columns={"id": "tweet_number"})

In [None]:
# Turn the group keys (newspaper, year, week) back into ordinary columns so
# the frame can be merged on them.
tweet_number.reset_index(inplace=True)

In [None]:
# Print column dtypes and non-null counts for the tweet-count table.
tweet_number.info()

In [None]:
# No `on=` is given, so pandas joins on the columns both frames share
# (newspaper, year, week) with its default inner join, adding tweet_number.
data_words = data_words.merge(tweet_number)

In [None]:
# Preview the table after adding the tweet counts.
data_words.head()

In [None]:
# Print column dtypes and non-null counts after the merge.
data_words.info()

In [None]:
# Unique words per tweet for each newspaper-week.
_unique_words = data_words["unique_words"]
_tweet_counts = data_words["tweet_number"]
data_words["word_tweet_ratio"] = _unique_words / _tweet_counts

# Displayed only: the sorted view is the cell output and is not stored.
data_words.sort_values(by="word_tweet_ratio", ascending=False)

In [None]:
# Save the words/tweets table as a report; the row index is written as the
# first CSV column.
data_words.to_csv(
    f"{BASE_DIR}/reports/tables/words_tweets-{TIME_STAMPS[0]}-{TIME_STAMPS[-1]}.csv"
)

In [None]:
# Unique words (x) against tweet count (y), one panel per week, coloured by
# newspaper.
fig = px.scatter(
    data_frame=data_words,
    x="unique_words",
    y="tweet_number",
    color="newspaper",
    facet_col="week",
    color_discrete_sequence=gruvbox_colors,
    title="Unique words per newspaper",
    height=600,
    width=2400,
)

fig.show()

In [None]:
# Reload the words/tweets report from disk; index_col=0 restores the row index
# that to_csv wrote as the first column.
data_words = pd.read_csv(
    f"{BASE_DIR}/reports/tables/words_tweets-{TIME_STAMPS[0]}-{TIME_STAMPS[-1]}.csv",
    index_col=0,
)

## Engagement

Now I will look at the engagement metrics of each newspaper as a whole and how they change over time, as well as their relationship with the most used words and with the amount of vocabulary expressed in those words.

### Raw stats

In [None]:
# Preview the per-tweet stats before aggregating them.
stats_data.head()

In [None]:
# Print column dtypes and non-null counts for the per-tweet stats.
stats_data.info()

In [None]:
# Tag every stats row with the ISO year and ISO week of its creation time.
_stats_iso = stats_data["created_at"].dt.isocalendar()
stats_data["year"] = _stats_iso.year
stats_data["week"] = _stats_iso.week

In [None]:
# Summary statistics of each engagement metric per newspaper, year and week.
# The result has two column levels: (metric, statistic).
_raw_metrics = ["retweet_count", "reply_count", "like_count", "quote_count"]
_raw_statistics = ["count", "min", "mean", "std", "max", "sum"]

_raw_selection = stats_data[["newspaper", *_raw_metrics, "year", "week"]]
stats_summary = _raw_selection.groupby(by=["newspaper", "year", "week"]).agg(
    func=_raw_statistics
)

In [None]:
# Add a "ratio" statistic per metric: the group's sum divided by its count.
for _raw_metric in ("retweet_count", "reply_count", "like_count", "quote_count"):
    stats_summary[(_raw_metric, "ratio")] = (
        stats_summary[(_raw_metric, "sum")] / stats_summary[(_raw_metric, "count")]
    )

In [None]:
# stack() moves the inner column level (the statistic names) into the row
# index, leaving one column per metric.
stats_summary = stats_summary.stack()
# melt() then turns those metric columns into "metric"/"value" rows;
# ignore_index=False keeps the newspaper/year/week/statistic index.
stats_summary = stats_summary.melt(var_name="metric", ignore_index=False)

In [None]:
# Flatten the index into columns. The stacked statistic level has no name, so
# reset_index() calls it "level_3"; give it a readable name.
stats_summary = stats_summary.reset_index().rename(columns={"level_3": "stat"})

In [None]:
# Single time label such as "2022w35", used on the x axis of the chart.
_raw_year_text = stats_summary["year"].astype("str")
_raw_week_text = stats_summary["week"].astype("str")
stats_summary["year_week"] = _raw_year_text + "w" + _raw_week_text

In [None]:
# Preview the long-form engagement summary.
stats_summary.head()

In [None]:
# Print column dtypes and non-null counts for the engagement summary.
stats_summary.info()

In [None]:
def _raw_stats_facet_value(annotation):
    """Reduce a facet label such as 'stat=mean' to the part after '='."""
    return annotation.update(text=f"{annotation.text.split('=')[-1]}")


# Grid of line charts: one row per metric, one column per statistic, one line
# per newspaper.
fig = px.line(
    data_frame=stats_summary,
    x="year_week",
    y="value",
    color="newspaper",
    facet_row="metric",
    facet_col="stat",
    facet_row_spacing=0.08,
    color_discrete_sequence=gruvbox_colors,
    title="Raw engagement stats per newspaper",
    height=1200,
    width=2100,
)

fig.for_each_annotation(_raw_stats_facet_value)
# Each panel gets its own y scale; x tick labels are shown on every panel.
fig.update_yaxes(matches=None, showticklabels=True)
fig.update_xaxes(showticklabels=True, tickangle=-45)

fig.show()

In [None]:
# Save the long-form engagement summary as a report table.
stats_summary.to_csv(
    f"{BASE_DIR}/reports/tables/raw_stats_summary-{TIME_STAMPS[0]}-{TIME_STAMPS[-1]}.csv"
)

### Stats for most used words

With this bit I want to find out which words drive the most engagement for each newspaper during the selected time period. For that I'll be building a small dashboard application using [Plotly Dash](https://dash.plotly.com)

In [None]:
# Preview the top-30 words table (first input of the merges below).
top30_df.head()

In [None]:
# Preview the data-dtm frame (second input of the merges below).
data_dtm.head()

In [None]:
# Preview the per-tweet stats (third input of the merges below).
stats_data.head()

In [None]:
# Tag every data-dtm row with the ISO year and ISO week of its creation time.
_data_dtm_iso = data_dtm["created_at"].dt.isocalendar()
data_dtm["year"] = _data_dtm_iso.year
data_dtm["week"] = _data_dtm_iso.week

In [None]:
# Right join: keep every top-30 (newspaper, year, week, word) entry and attach
# the data-dtm rows that match it, then drop entries that matched no tweet id.
top30_ids = pd.merge(
    left=data_dtm,
    right=top30_df,
    how="right",
    on=["newspaper", "year", "week", "word"],
).dropna(subset=["id"])

In [None]:
# Attach the engagement stats of each tweet to the top-30 word rows; the left
# join keeps every row of top30_ids.
_stats_join_keys = ["id", "created_at", "newspaper", "year", "week"]
top30_stats = pd.merge(
    left=top30_ids,
    right=stats_data,
    how="left",
    on=_stats_join_keys,
)

In [None]:
# Preview the top-30 words joined with their tweets' engagement stats.
top30_stats.head()

In [None]:
# Save the joined table. reset_index() writes the current row index out as an
# "index" column, matching the layout of the frames loaded at the top of this
# notebook (which are re-indexed with set_index("index")).
top30_stats.reset_index().to_feather(
    f"{BASE_DIR}/data/processed/top30-stats-{TIME_STAMPS[0]}-{TIME_STAMPS[-1]}.feather"
)

In [None]:
# Summary statistics of each engagement metric per newspaper, year, week and
# top word. The result has two column levels: (metric, statistic).
_word_metrics = ["retweet_count", "reply_count", "like_count", "quote_count"]
_word_statistics = ["count", "min", "mean", "std", "max", "sum"]

_word_selection = top30_stats[["newspaper", *_word_metrics, "word", "year", "week"]]
top30_stats_summary = _word_selection.groupby(
    by=["newspaper", "year", "week", "word"]
).agg(func=_word_statistics)

In [None]:
# Add a "ratio" statistic per metric: the group's sum divided by its count.
for _word_metric in ("retweet_count", "reply_count", "like_count", "quote_count"):
    top30_stats_summary[(_word_metric, "ratio")] = (
        top30_stats_summary[(_word_metric, "sum")]
        / top30_stats_summary[(_word_metric, "count")]
    )

In [None]:
# stack() moves the inner column level (the statistic names) into the row
# index, leaving one column per metric.
top30_stats_summary = top30_stats_summary.stack()
# melt() then turns those metric columns into "metric"/"value" rows;
# ignore_index=False keeps the newspaper/year/week/word/statistic index.
top30_stats_summary = top30_stats_summary.melt(var_name="metric", ignore_index=False)

In [None]:
# Flatten the index into columns. The stacked statistic level has no name, so
# reset_index() calls it "level_4"; give it a readable name.
top30_stats_summary = top30_stats_summary.reset_index().rename(
    columns={"level_4": "stat"}
)

In [None]:
# Single time label such as "2022w35", used as the facet column in the app.
_word_year_text = top30_stats_summary["year"].astype("str")
_word_week_text = top30_stats_summary["week"].astype("str")
top30_stats_summary["year_week"] = _word_year_text + "w" + _word_week_text

In [None]:
# Preview the long-form per-word engagement summary.
top30_stats_summary.head()

In [None]:
# Group related top words under one shared label so they get the same colour
# in the dashboard.
_summary_hot_topic_by_word = {
    "castillo": "castillo",
    "pedro": "castillo",
    "dina": "boluarte",
    "boluarte": "boluarte",
    "perú": "país",
    "país": "país",
    "congreso": "congreso",
    "covid": "covid",
    "protestas": "protestas",
    "manifestaciones": "protestas",
}

# Words outside the mapping get an empty label. The filled Series is assigned
# back instead of calling fillna(inplace=True) on the column selection: that
# chained in-place form warns on pandas 2.x and has no effect under
# Copy-on-Write.
top30_stats_summary["hot_topics"] = (
    top30_stats_summary["word"].map(_summary_hot_topic_by_word).fillna("")
)

In [None]:
# Print column dtypes and non-null counts for the per-word summary.
top30_stats_summary.info()

In [None]:
# Save the per-word engagement summary; the dashboard section reloads it from
# this file.
top30_stats_summary.to_csv(
    f"{BASE_DIR}/reports/tables/top30_stats_summary-{TIME_STAMPS[0]}-{TIME_STAMPS[-1]}.csv"
)

#### Building the app

In [None]:
# Reload the per-word summary so the app can be run without recomputing it;
# index_col=0 restores the row index that to_csv wrote as the first column.
top30_stats_summary = pd.read_csv(
    f"{BASE_DIR}/reports/tables/top30_stats_summary-{TIME_STAMPS[0]}-{TIME_STAMPS[-1]}.csv",
    index_col=0,
)
# Empty hot_topics labels come back from the CSV as missing values, so restore
# them to "". Assign the result back: fillna(inplace=True) on a column
# selection warns on pandas 2.x and has no effect under Copy-on-Write.
top30_stats_summary["hot_topics"] = top30_stats_summary["hot_topics"].fillna("")

In [None]:
# Dash application object; its layout and callback are attached in the
# following cells.
top30_app = JupyterDash(__name__)

In [None]:
# Text styling shared by the heading, the labels and the radio groups.
_text_style = {"font-family": "Open Sans", "color": "#2a3f5f"}


def _labelled_radio(label_text, column, default_value, component_id):
    """Build a labelled inline radio group over the distinct values of a column."""
    return html.Div(
        [
            html.Label(label_text, style=dict(_text_style)),
            dcc.RadioItems(
                top30_stats_summary[column].unique(),
                default_value,
                id=component_id,
                inline=True,
                style=dict(_text_style),
            ),
        ]
    )


# Page structure: heading, the two selectors side by side, then the graph that
# the callback fills in.
_selector_row = html.Div(
    [
        _labelled_radio("Stat", "stat", "ratio", "stat"),
        _labelled_radio("Metric", "metric", "like_count", "metric"),
    ],
    style={"display": "flex", "justify-content": "space-around"},
)

top30_app.layout = html.Div(
    children=[
        html.H1(
            "Top 30 words engagement stats per newspaper",
            style=dict(_text_style),
        ),
        html.Br(),
        _selector_row,
        html.Br(),
        dcc.Graph(id="stats_graph"),
    ],
    style={"height": "3600px"},
)

In [None]:
@top30_app.callback(
    Output("stats_graph", "figure"), Input("stat", "value"), Input("metric", "value")
)
def update_figure(selected_stat, selected_metric):
    """Rebuild the bar-chart grid for the chosen statistic and metric.

    Args:
        selected_stat: Value of the "stat" radio group.
        selected_metric: Value of the "metric" radio group.

    Returns:
        A Plotly figure with one row per newspaper and one column per
        year_week, showing "value" per word and coloured by hot topic.
    """
    stat_matches = top30_stats_summary["stat"] == selected_stat
    metric_matches = top30_stats_summary["metric"] == selected_metric
    selection = top30_stats_summary.loc[stat_matches & metric_matches]

    def keep_facet_value(annotation):
        # Facet labels look like "newspaper=..."; show only the value part.
        return annotation.update(text=f"{annotation.text.split('=')[-1]}")

    figure = px.bar(
        data_frame=selection,
        x="word",
        y="value",
        color="hot_topics",
        facet_row="newspaper",
        facet_col="year_week",
        color_discrete_sequence=gruvbox_colors,
        width=3200,
        height=3200,
    )

    figure.for_each_annotation(keep_facet_value)
    # Every panel gets independent axes; bars are ordered by descending total.
    figure.update_yaxes(matches=None, showticklabels=True)
    figure.update_xaxes(
        matches=None, showticklabels=True, categoryorder="total descending"
    )

    return figure

In [None]:
# Start the Dash server; mode="inline" is passed so the app is shown in the
# notebook output (presumably as an embedded frame - see the JupyterDash docs).
if __name__ == "__main__":
    top30_app.run_server(mode="inline")