
# Authors EDA 🖋

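This notebook explores the author labels attached to articles: which authors are considered, how they are distributed across publications and topics, and how their title sentiment evolves over time.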

#### Notebook Properties
* Upstream Notebook: `src.engineering.word_counts_and_sentiments`
* Compute Resources: `32 GB RAM, 4 CPUs` (when not performing EDA on a sample of data)
* Last Updated: `Dec 5 2023`

#### Data

| **Name** | **Type** | **Location Type** | **Description** | **Location** | 
| --- | --- | --- | --- | --- | 
| `all_the_news` | `input` | `Delta` | Read full delta dataset of `AllTheNews` | `catalog/text_eda/all_the_news.delta` | 

In [0]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
from tqdm.autonotebook import tqdm
from deltalake import DeltaTable
from src.utils.io import FileSystemHandler

In [0]:
# Notebook-wide pandas / tqdm configuration.
pd.options.display.max_columns = None  # no cap on the number of columns displayed
pd.set_option("plotting.backend", "plotly")  # `.plot(...)` calls below go through plotly
tqdm.pandas()  # hook tqdm into pandas

# File-system handle used by the read step below.
# NOTE(review): "s3" presumably selects S3-backed storage - confirm in `src.utils.io`.
datafs = FileSystemHandler("s3")

In [0]:
LIMIT_PARTITIONS: int | None = None
"""An input parameter to limit the number of table partitions to read from delta. Useful to perform EDA on a sample of data."""

SHUFFLE_PARTITIONS: bool = False
"""Whether to randomize the partitions before reading"""

INPUT_TABLE: str = "all_the_news" 
INPUT_CATALOG: str = "simple_topic"


### Read Data

In [0]:
# Obtain the delta table handle (as_pandas=False), then load its partitions
# into a DataFrame, optionally limited / shuffled via the notebook parameters.
atn_delta_table: DeltaTable = datafs.read_delta(
    catalog_name=INPUT_CATALOG,
    table=INPUT_TABLE,
    as_pandas=False,
)

df: pd.DataFrame = datafs.read_delta_partitions(
    shuffle_partitions=SHUFFLE_PARTITIONS,
    N_partitions=LIMIT_PARTITIONS,
    delta_table=atn_delta_table,
)

# Each step rebinds `df` so the previous frame can be released before the
# next copy is made.
df["date"] = pd.to_datetime(df["date"])
df = df.sort_values("date")
df = df.loc[df["year"].ne(2020)]  # exclude rows whose year is 2020

df = df.loc[df["section"].notna()]
"""Drop articles without sections for this analysis"""

print(df.shape)
df.head()

In [0]:
# An author must have strictly more than this many articles to be considered.
author_article_threshold: int = 100

# Bylines containing any of these words (case-insensitive) are excluded.
# A single regex alternation replaces four separate `str.contains` passes.
non_person_pattern: str = "staff|media|press|network"

# Articles per non-null author.
author_counts = df["author"].dropna().value_counts()
author_counts = author_counts[
    (author_counts > author_article_threshold)
    & ~author_counts.index.str.contains(non_person_pattern, case=False)
]

# `value_counts` already yields exactly one entry per author, so no
# de-duplication is needed after turning it into a two-column frame.
unique_authors = author_counts.rename_axis("author").reset_index(
    name="article_count"
)

# Lower-cased publication names, computed once instead of once per author.
# Missing publications are dropped first: NaN has no `.lower()` and would
# otherwise raise AttributeError.
publication_names: list[str] = [
    name.lower() for name in df["publication"].dropna().unique()
]

# Flag bylines that contain a publication name (case-insensitive substring
# match). `.astype(bool)` keeps the mask boolean even when no author survives
# the filters above.
mentions_publication = (
    unique_authors["author"]
    .str.lower()
    .map(lambda byline: any(name in byline for name in publication_names))
    .astype(bool)
)
unique_authors = unique_authors[~mentions_publication].reset_index(drop=True)

print(unique_authors.shape)
unique_authors.head()

In [0]:
# Restrict the articles to those written by a considered author and not
# labelled with the "Commercial Business" topic, then drop rows that lack an
# author or an article body.
by_considered_author = df["author"].isin(unique_authors["author"])
not_commercial_business = df["simple_topic"].ne("Commercial Business")

author_article_df: pd.DataFrame = df.loc[
    by_considered_author & not_commercial_business
].dropna(subset=["author", "article"])

print(author_article_df.shape)
author_article_df.head()

In [0]:
# Number of distinct considered authors per publication, largest first,
# drawn as a horizontal bar chart.
authors_per_publication = (
    author_article_df.groupby(["publication"])["author"]
    .nunique()
    .sort_values(ascending=False)
)
authors_per_publication.plot(
    kind="barh",
    title="Publication Representation of Articles with Considered Authors",
    template="plotly_white",
)

In [0]:
# Number of distinct considered authors per topic, largest first,
# drawn as a vertical bar chart.
authors_per_topic = (
    author_article_df.groupby(["simple_topic"])["author"]
    .nunique()
    .sort_values(ascending=False)
)
authors_per_topic.plot(
    kind="bar",
    title="Topic Representation of Articles with Considered Authors",
    template="plotly_white",
)

In [0]:
# Article count for every (author, topic) pair, shown as bars per topic
# coloured by author.
articles_per_author_topic = (
    author_article_df.groupby(["author", "simple_topic"])[["article"]]
    .count()
    .reset_index()
)
articles_per_author_topic.plot(
    kind="bar",
    x="simple_topic",
    y="article",
    color="author",
    template="plotly_white",
)

In [0]:
# Topics to compare in the sentiment-over-time chart.
select_topics: list[str] = ["Politics", "Sports", "Technology"]

# Mean title sentiment per month and topic, restricted to the selected topics.
in_selected_topics = author_article_df["simple_topic"].isin(select_topics)
monthly_topic_sentiment = (
    author_article_df.loc[in_selected_topics]
    .groupby([pd.Grouper(key="date", freq="M"), "simple_topic"])[
        ["vader_compound_title"]
    ]
    .mean()
    .reset_index()
)
monthly_topic_sentiment.plot(
    kind="line",
    x="date",
    y="vader_compound_title",
    color="simple_topic",
    markers=True,
    title="Average Sentiments of Topics over Time for Considered Authors",
    template="plotly_white",
)

In [0]:
# Authors to compare in the sentiment-over-time charts.
select_authors: list[str] = ["Dave Quinn", "Alexia Fernandez", "Stephanie Petit"]

# Per month and author: mean title sentiment and number of articles.
# Only the sentiment column is plotted.
by_selected_author = author_article_df["author"].isin(select_authors)
monthly_author_sentiment = (
    author_article_df.loc[by_selected_author]
    .groupby([pd.Grouper(key="date", freq="M"), "author"])
    .agg({"vader_compound_title": "mean", "article": "count"})
    .reset_index()
)
monthly_author_sentiment.plot(
    kind="line",
    x="date",
    y="vader_compound_title",
    color="author",
    markers=True,
    title="Average Sentiments by Author over Time",
    template="plotly_white",
)

In [0]:
# Per month and topic, over the selected authors' articles only: mean title
# sentiment and number of articles. Only the sentiment column is plotted.
selected_author_rows = author_article_df.loc[
    author_article_df["author"].isin(select_authors)
]
monthly_selected_topic_sentiment = (
    selected_author_rows.groupby(
        [pd.Grouper(key="date", freq="M"), "simple_topic"]
    )
    .agg({"vader_compound_title": "mean", "article": "count"})
    .reset_index()
)
monthly_selected_topic_sentiment.plot(
    kind="line",
    x="date",
    y="vader_compound_title",
    color="simple_topic",
    template="plotly_white",
    title="Average Sentiments by Selected Authors on their Topics over Time",
)