In [None]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
pio.templates

import src.constants as constants
from src.common_utils import read_pickled_data

# Template

In [None]:
# Colour palette shared by the figures in this notebook (CSS rgb strings).
green = "rgb(75, 184, 135)"
green_light = "rgb(114, 224, 175)"
grey = "rgb(158, 158, 158)"
grey_light = "rgb(224, 224, 224)"
red = "rgb(255, 33, 0)"
red_light = "rgb(255, 71, 71)"

# Axis styling applied to both the x and the y axis of the template.
axis_common_dict = {
    "linecolor": "black",
    "linewidth": 2,
    "ticks": "outside",
    "title": {"standoff": 15},
}

# Figure-title placement: anchored by its top-left corner at (0.1, 0.9).
_title_layout = {
    "font": {"size": 18},
    "x": 0.1,
    "xanchor": "left",
    "y": 0.9,
    "yanchor": "top",
}

template = go.layout.Template()
template.layout = {
    "paper_bgcolor": "white",
    "plot_bgcolor": grey_light,
    "colorway": [green_light, "orange"],
    "font": {"color": "black"},
    "title": _title_layout,
    "xaxis": {**axis_common_dict},
    # The y axis additionally gets a grid colour on top of the common styling.
    "yaxis": {**axis_common_dict, "gridcolor": grey},
}
# Default look for histogram traces: a 2px green outline around each bar.
template.data.histogram = [
    go.Histogram(marker={"line": {"width": 2, "color": green}})
]

# Reading Data and Stats

In [None]:
# Path components handed to the project helper that reads the pickled data.
# NOTE(review): presumably this unpickles the file; only load pickles from a
# trusted source. Confirm against src.common_utils.read_pickled_data.
news_pickle_path_parts = [constants.CONCAT_ALL_PATH, "preprocessed", "exp_news.pkl"]
data_news_pp = read_pickled_data(news_pickle_path_parts)

# Figures

Histogram of title length.

In [None]:
# Histogram of the `title_length` column, shown inline and exported as PDF.
title_hist_labels = {
    "title_length": "Title Length",
    # Kept from the original cell, which flags this key as an issue with
    # plotly; the y-axis title is therefore set explicitly further down.
    "count": "Number of Articles",
}
fig = px.histogram(
    data_news_pp,
    x="title_length",
    range_x=[0, 25],
    labels=title_hist_labels,
    title="Title Length Histogram",
    width=750,
    height=500,
    template=template,
)
fig.update_layout(yaxis_title="Number of Articles")
# Apply the template's axis styling explicitly to the figure's axes.
fig.update_yaxes(template.layout.yaxis)
fig.update_xaxes(template.layout.xaxis)
# Minor ticks at every unit on the x axis.
fig.update_xaxes(minor={"ticks": "outside", "dtick": 1})
fig.show()
fig.write_image("./visualizations/news/title_length_hist.pdf")

Histogram of abstract length.

In [None]:
# Histogram of the `abstract_length` column, shown inline and exported as PDF.
abstract_hist_kwargs = dict(
    x="abstract_length",
    range_x=[-1, 100],
    labels={
        "abstract_length": "Abstract Length",
        # Kept from the original cell, which flags this key as an issue with
        # plotly; the y-axis title is set explicitly after construction.
        "count": "Number of Articles",
    },
    title="Abstract Length Histogram",
    width=750,
    height=500,
    template=template,
)
fig = px.histogram(data_news_pp, **abstract_hist_kwargs)
fig.update_layout(yaxis_title="Number of Articles")
# Apply the template's axis styling explicitly to the figure's axes.
fig.update_yaxes(template.layout.yaxis)
fig.update_xaxes(template.layout.xaxis)
# Major ticks every 10 units, minor ticks every 5.
fig.update_xaxes(dtick=10, minor={"ticks": "outside", "dtick": 5})
fig.show()
fig.write_image("./visualizations/news/abstract_length_hist.pdf")

Histogram of title length, with and without stopwords.

In [None]:
# Long-format frame: one row per (article, variant), where the variant is the
# full title length or the title length without stopwords.
n_articles = len(data_news_pp)
variant_names = np.concatenate(
    (["Full Title"] * n_articles, ["Without Stopwords"] * n_articles)
)
length_values = np.concatenate(
    (data_news_pp["title_length"], data_news_pp["title_no_stopwords_length"])
)
df = pd.DataFrame({"Title Length": variant_names, "data": length_values})

stopword_hist_labels = {
    "data": "Title Length",
    # Kept from the original cell, which flags this key as an issue with
    # plotly; the y-axis title is set explicitly after construction.
    "count": "Number of Articles",
}
fig = px.histogram(
    df,
    x="data",
    range_x=[0, 25],
    color="Title Length",
    labels=stopword_hist_labels,
    title="Title Length Histogram",
    barmode="overlay",
    width=750,
    height=500,
    template=template,
)
fig.update_layout(yaxis_title="Number of Articles")
# Apply the template's axis styling explicitly to the figure's axes.
fig.update_yaxes(template.layout.yaxis)
fig.update_xaxes(template.layout.xaxis)
fig.update_xaxes(minor={"ticks": "outside", "dtick": 1})
# White bar outlines for every trace in the overlaid histogram.
fig.update_traces(marker={"line": {"width": 2, "color": "white"}})
fig.show()
fig.write_image("./visualizations/news/title_length_stopword_hist.pdf")

Boxplot of title length.

In [None]:
# Horizontal boxplot of the `title_length` column, shown and exported as PDF.
title_box_kwargs = dict(
    x="title_length",
    range_x=[0, 25],
    title="Title Length Boxplot",
    labels={"title_length": "Title Length"},
    width=750,
    height=500,
    template=template,
)
fig = px.box(data_news_pp, **title_box_kwargs)
# Apply the template's x-axis styling explicitly, then add unit minor ticks.
fig.update_xaxes(template.layout.xaxis)
fig.update_xaxes(minor={"ticks": "outside", "dtick": 1})
fig.show()
fig.write_image("./visualizations/news/title_length_boxplot.pdf")

Histogram of abstract length, with and without stopwords.

In [None]:
# Long-format frame: one row per (article, variant), where the variant is the
# full abstract length or the abstract length without stopwords.
df = pd.DataFrame({
    "Abstract Length": np.concatenate((["Full Abstract"]*len(data_news_pp), ["Without Stopwords"]*len(data_news_pp))),
    "data": np.concatenate((data_news_pp["abstract_length"], data_news_pp["abstract_no_stopwords_length"]))
})

fig = px.histogram(
    df,
    x="data",
    color="Abstract Length",
    range_x=[-1, 100],
    labels={
        "data": "Abstract Length",
        # Flagged in the original as an issue with plotly; the y-axis title
        # is therefore set explicitly via update_layout below.
        "count": "Number of Articles"
    },
    title="Abstract Length Histogram",
    barmode="overlay",
    width=750, height=500,
    template=template
)
fig.update_layout(yaxis_title="Number of Articles")
# White bar outlines for every trace in the overlaid histogram.
fig.update_traces(marker=dict(line=dict(width=2, color="white")))
# Apply the template's axis styling explicitly to the figure's axes.
fig.update_yaxes(template.layout.yaxis)
fig.update_xaxes(template.layout.xaxis)
# Major ticks every 10 units, minor ticks every 5.
fig.update_xaxes(dtick=10, minor=dict(ticks="outside", dtick=5))
fig.show()
# Write into the `news/` subdirectory like every other figure in this
# notebook; the original path omitted it and saved one level up.
fig.write_image("./visualizations/news/abstract_length_stopword_hist.pdf")

Boxplot of abstract length.

In [None]:
# Horizontal boxplot of the `abstract_length` column, shown and exported as PDF.
abstract_box_kwargs = dict(
    x="abstract_length",
    range_x=[0, 250],
    title="Abstract Length Boxplot",
    labels={"abstract_length": "Abstract Length"},
    width=750,
    height=500,
    template=template,
)
fig = px.box(data_news_pp, **abstract_box_kwargs)
# Apply the template's x-axis styling explicitly, then add minor ticks every 50.
fig.update_xaxes(template.layout.xaxis)
fig.update_xaxes(minor={"ticks": "outside", "dtick": 50})
fig.show()
fig.write_image("./visualizations/news/abstract_length_boxplot.pdf")

Histogram of the lengths of the concatenated title and abstract, with and without stopwords.

In [None]:
# Long-format frame: one row per (article, variant), where the variant is the
# full title+abstract length or the same length without stopwords.
n_articles = len(data_news_pp)
variant_names = np.concatenate(
    (["Full Title and Abstract"] * n_articles, ["Without Stopwords"] * n_articles)
)
length_values = np.concatenate(
    (
        data_news_pp["title_and_abstract_length"],
        data_news_pp["title_and_abstract_no_stopwords_length"],
    )
)
df = pd.DataFrame({"Title and Abstract Length": variant_names, "data": length_values})

combined_hist_labels = {
    "data": "Title and Abstract Length",
    # Kept from the original cell, which flags this key as an issue with
    # plotly; the y-axis title is set explicitly after construction.
    "count": "Number of Articles",
}
fig = px.histogram(
    df,
    x="data",
    color="Title and Abstract Length",
    range_x=[0, 110],
    labels=combined_hist_labels,
    title="Title and Abstract Length Histogram",
    barmode="overlay",
    width=750,
    height=500,
    template=template,
)
fig.update_layout(yaxis_title="Number of Articles")
# White bar outlines for every trace in the overlaid histogram.
fig.update_traces(marker={"line": {"width": 2, "color": "white"}})
# Apply the template's axis styling explicitly to the figure's axes.
fig.update_xaxes(template.layout.xaxis)
fig.update_yaxes(template.layout.yaxis)
# Major ticks every 10 units, minor ticks every 5.
fig.update_xaxes(dtick=10, minor={"ticks": "outside", "dtick": 5})
fig.show()
fig.write_image("./visualizations/news/title_and_abstract_length_stopword_hist.pdf")

Boxplot of the lengths of the concatenated title and abstract.

In [None]:
# Horizontal boxplot of the `title_and_abstract_length` column, shown and
# exported as PDF.
combined_box_kwargs = dict(
    x="title_and_abstract_length",
    range_x=[0, 300],
    title="Lengths of Article Concatenated Titles and Abstracts",
    labels={"title_and_abstract_length": "Title and Abstract Length"},
    width=750,
    height=500,
    template=template,
)
fig = px.box(data_news_pp, **combined_box_kwargs)
# Apply the template's x-axis styling explicitly.
fig.update_xaxes(template.layout.xaxis)
fig.show()
fig.write_image("./visualizations/news/title_and_abstract_length_boxplot.pdf")

In [None]:
# Summary statistics of the combined title+abstract length, including the
# 90th and 99th percentiles alongside the quartiles.
combined_length_percentiles = [0.25, 0.5, 0.75, 0.9, 0.99]
data_news_pp["title_and_abstract_length"].describe(percentiles=combined_length_percentiles)

Histogram of article survival times.

In [None]:
# Keep only strictly positive survival times before plotting.
# NOTE(review): the summary-statistics cell for this column filters with
# `!= 0` instead of `> 0`; confirm which filter is intended.
survival_time_hrs = np.array(data_news_pp["survival_time_hrs"])
survival_time_hrs = survival_time_hrs[survival_time_hrs > 0]

survival_hist_labels = {
    "x": "Survival Time in Hours",
    # Kept from the original cell, which flags this key as an issue with
    # plotly; the y-axis title is set explicitly after construction.
    "count": "Number of Articles",
}
fig = px.histogram(
    x=survival_time_hrs,
    range_x=[-1, 72],
    labels=survival_hist_labels,
    title="Article Survival Time in Hours",
    width=750,
    height=500,
    template=template,
)
# Apply the template's axis styling explicitly to the figure's axes.
fig.update_xaxes(template.layout.xaxis)
fig.update_yaxes(template.layout.yaxis)
fig.update_layout(yaxis_title="Number of Articles")
# Minor ticks every 5 hours on the x axis.
fig.update_xaxes(minor={"ticks": "outside", "dtick": 5})
fig.show()
fig.write_image("./visualizations/news/survival_time_hist.pdf")

In [None]:
# Summary statistics of survival time over rows whose value is not zero.
survival_series = data_news_pp["survival_time_hrs"]
survival_series[survival_series != 0].describe(percentiles=[0.25, 0.5, 0.75, 0.9, 0.95])

Histogram of number of times articles are shown.

In [None]:
# Histogram of the `shown` column, restricted to rows where it is not zero.
shown_nonzero = data_news_pp[data_news_pp["shown"] != 0]
shown_hist_labels = {
    "shown": "Amount Shown",
    # Kept from the original cell, which flags this key as an issue with
    # plotly; the y-axis title is set explicitly after construction.
    "count": "Number of Articles",
}
fig = px.histogram(
    shown_nonzero,
    x="shown",
    range_x=[0, 300],
    # NOTE(review): very large requested bin count, presumably to get
    # fine-grained bins over the full data range; confirm the intent.
    nbins=100000,
    labels=shown_hist_labels,
    title="Number of Times Articles are Shown",
    width=750,
    height=500,
    template=template,
)
# Apply the template's axis styling explicitly to the figure's axes.
fig.update_xaxes(template.layout.xaxis)
fig.update_yaxes(template.layout.yaxis)
fig.update_layout(yaxis_title="Number of Articles")
fig.show()
fig.write_image("./visualizations/news/news_shown_hist.pdf")

In [None]:
# Summary statistics of `shown` over rows whose value is not zero.
shown_series = data_news_pp["shown"]
shown_series[shown_series != 0].describe(percentiles=[0.25, 0.5, 0.75, 0.9, 0.99])

Histogram of number of times articles are clicked.

In [None]:
# Histogram of the `clicked` column, restricted to rows where it is not zero.
clicked_nonzero = data_news_pp[data_news_pp["clicked"] != 0]
clicked_hist_labels = {
    "clicked": "Amount Clicked",
    # Kept from the original cell, which flags this key as an issue with
    # plotly; the y-axis title is set explicitly after construction.
    "count": "Number of Articles",
}
fig = px.histogram(
    clicked_nonzero,
    x="clicked",
    range_x=[0, 30],
    # NOTE(review): very large requested bin count, presumably to get
    # fine-grained bins over the full data range; confirm the intent.
    nbins=100000,
    labels=clicked_hist_labels,
    title="Number of Times Articles are Clicked",
    width=750,
    height=500,
    template=template,
)
# Apply the template's axis styling explicitly to the figure's axes.
fig.update_xaxes(template.layout.xaxis)
fig.update_yaxes(template.layout.yaxis)
fig.update_layout(yaxis_title="Number of Articles")
fig.show()
fig.write_image("./visualizations/news/news_clicked_hist.pdf")

In [None]:
# Summary statistics of `clicked` over rows whose value is not zero.
clicked_series = data_news_pp["clicked"]
clicked_series[clicked_series != 0].describe(percentiles=[0.25, 0.5, 0.75, 0.9, 0.99])

News engagement percentages.

In [None]:
# Histogram of `engagement_percentage`, restricted to rows where it is not zero.
engaged_nonzero = data_news_pp[data_news_pp["engagement_percentage"] != 0]
engagement_hist_labels = {
    "engagement_percentage": "Percentage Clicked",
    # Kept from the original cell, which flags this key as an issue with
    # plotly; the y-axis title is set explicitly after construction.
    "count": "Number of Articles",
}
fig = px.histogram(
    engaged_nonzero,
    x="engagement_percentage",
    nbins=100,
    labels=engagement_hist_labels,
    title="Percentage Articles are Engaged with",
    width=750,
    height=500,
    template=template,
)
# Apply the template's axis styling explicitly to the figure's axes.
fig.update_xaxes(template.layout.xaxis)
fig.update_yaxes(template.layout.yaxis)
fig.update_layout(yaxis_title="Number of Articles")
fig.show()
fig.write_image("./visualizations/news/engagement_percent_hist.pdf")

In [None]:
# Summary statistics of `engagement_percentage` over rows whose value is not zero.
engagement_series = data_news_pp["engagement_percentage"]
engagement_series[engagement_series != 0].describe(percentiles=[0.25, 0.5, 0.75, 0.9, 0.99])