In [None]:
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
pio.templates

import src.constants as constants
from src.common_utils import read_pickled_data

# Template

In [None]:
# Colour palette shared by the template and the figure cells below.
green_dark = "rgb(75, 184, 135)"
green_light = "rgb(114, 224, 175)"
grey = "rgb(158, 158, 158)"
grey_light = "rgb(224, 224, 224)"

# Axis styling applied to both the x and the y axis of the template.
axis_common_dict = {
    "linecolor": "black",
    "linewidth": 2,
    "ticks": "outside",
    "title": {"standoff": 15},
}

# Custom plotly template: white paper, light-grey plot area, black font and
# a left/top anchored title. The y axis additionally gets a grey grid.
template = go.layout.Template()
template.layout = {
    "paper_bgcolor": "white",
    "plot_bgcolor": grey_light,
    "colorway": [green_light, "blue", "red", "orange"],
    "font": {"color": "black"},
    "title": {
        "font": {"size": 18},
        "x": 0.1,
        "xanchor": "left",
        "y": 0.9,
        "yanchor": "top",
    },
    "xaxis": {**axis_common_dict},
    "yaxis": {**axis_common_dict, "gridcolor": grey},
}

# Default histogram trace style: bars outlined with a 2px dark-green line.
template.data.histogram = [
    go.Histogram(marker={"line": {"width": 2, "color": green_dark}})
]

# Reading Data and Stats

In [None]:
# Load the two pickled tables from the "preprocessed" folder under
# constants.CONCAT_TRAINFULL_PATH; the figure cells below read their columns.
data_behaviors_pp = read_pickled_data(
    [constants.CONCAT_TRAINFULL_PATH, "preprocessed", "exp_behaviors.pkl"]
)
data_users_pp = read_pickled_data(
    [constants.CONCAT_TRAINFULL_PATH, "preprocessed", "exp_users.pkl"]
)

In [None]:
# Load the pickled statistics object stored next to the preprocessed tables.
stats = read_pickled_data(
    [
        constants.CONCAT_TRAINFULL_PATH,
        "preprocessed",
        "exp_stats.pkl",
    ]
)
# Bare last expression: shown as the cell output.
stats

# Figures

Histogram of history length.

In [None]:
# Histogram over the "history_length" column of data_users_pp, with the
# x axis limited to [-1, 60].
fig = px.histogram(
    data_users_pp,
    x="history_length",
    range_x=[-1, 60],
    labels={
        "history_length": "History Length",
        # Original note: "Issue with plotly" -- the same text is applied to
        # the y-axis title explicitly in the update chain below.
        "count": "Number of Users",
    },
    title="User History Lengths Histogram",
    width=750,
    height=500,
    template=template,
)

# Each update_* call returns the figure, so the updates are chained in the
# same order as before: y title, template axis styles, then minor x ticks.
(
    fig.update_layout(yaxis_title="Number of Users")
    .update_yaxes(template.layout.yaxis)
    .update_xaxes(template.layout.xaxis)
    .update_xaxes(minor={"ticks": "outside", "dtick": 2})
)

# Render inline, then export the same figure as a PDF.
fig.show()
fig.write_image("./visualizations/behavior/history_length_hist.pdf")

Histogram of number of impressions per user.

In [None]:
# Count rows of data_behaviors_pp per "user_id" and turn the result into a
# two-column frame: "user_id" and "impression_count".
data_impression_counts = (
    data_behaviors_pp["user_id"]
    .value_counts()
    .rename_axis("user_id")
    .reset_index(name="impression_count")
)

# Histogram over the per-user counts, x axis limited to [0, 15].
fig = px.histogram(
    data_impression_counts,
    x="impression_count",
    range_x=[0, 15],
    labels={
        "impression_count": "Number of Impressions",
        # Original note: "Issue with plotly" -- the same text is applied to
        # the y-axis title explicitly in the update chain below.
        "count": "Number of Users",
    },
    title="User Impression Counts Histogram",
    width=750,
    height=500,
    template=template,
)

# Chained updates, same order as before: y title, template axis styles,
# then a minor x tick every 1 unit.
(
    fig.update_layout(yaxis_title="Number of Users")
    .update_yaxes(template.layout.yaxis)
    .update_xaxes(template.layout.xaxis)
    .update_xaxes(minor={"ticks": "outside", "dtick": 1})
)

# Render inline, then export the same figure as a PDF.
fig.show()
fig.write_image("./visualizations/behavior/impression_count_hist.pdf")

Histogram of number of shown news per impression.

In [None]:
# Histogram over the "shown_news_length" column of data_behaviors_pp,
# requesting 100 bins and limiting the x axis to [0, 100].
fig = px.histogram(
    data_behaviors_pp,
    x="shown_news_length",
    range_x=[0, 100],
    nbins=100,
    labels={
        "shown_news_length": "Number of Shown News",
        # Original note: "Issue with plotly" -- the same text is applied to
        # the y-axis title explicitly in the update chain below.
        "count": "Number of Impressions",
    },
    title="Shown News per Impression Histogram",
    width=750,
    height=500,
    template=template,
)

# Chained updates, same order as before: y title, template axis styles,
# then a minor x tick every 10 units.
(
    fig.update_layout(yaxis_title="Number of Impressions")
    .update_yaxes(template.layout.yaxis)
    .update_xaxes(template.layout.xaxis)
    .update_xaxes(minor={"ticks": "outside", "dtick": 10})
)

# Render inline, then export the same figure as a PDF.
fig.show()
fig.write_image("./visualizations/behavior/shown_news_hist.pdf")

Histogram of number of clicked news per impression.

In [None]:
# Histogram over the "clicked_news_length" column of data_behaviors_pp.
# Both axes use fixed ranges: x in [0, 10], y in [0, 2,000,000].
fig = px.histogram(
    data_behaviors_pp,
    x="clicked_news_length",
    range_x=[0, 10],
    range_y=[0, 2_000_000],
    labels={
        "clicked_news_length": "Clicked News",
        # Original note: "Issue with plotly" -- the same text is applied to
        # the y-axis title explicitly in the update chain below.
        "count": "Number of Impressions",
    },
    title="Number of Clicked News Per Impression Histogram",
    width=750,
    height=500,
    template=template,
)

# Chained updates, same order as before: y title, template axis styles,
# then one major x tick per unit.
(
    fig.update_layout(yaxis_title="Number of Impressions")
    .update_yaxes(template.layout.yaxis)
    .update_xaxes(template.layout.xaxis)
    .update_xaxes(dtick=1)
)

# Render inline, then export the same figure as a PDF.
fig.show()
fig.write_image("./visualizations/behavior/clicked_news_hist.pdf")

Histogram of percentage of ignored news.

In [None]:
# Histogram over the "ignored_news_percent" column of data_behaviors_pp,
# requesting 75 bins and limiting the x axis to [0, 100].
fig = px.histogram(
    data_behaviors_pp,
    x="ignored_news_percent",
    range_x=[0, 100],
    nbins=75,
    labels={
        "ignored_news_percent": "Percentage of Ignored News",
        # Original note: "Issue with plotly" -- the same text is applied to
        # the y-axis title explicitly in the update chain below.
        "count": "Number of Impressions",
    },
    title="Percentage of Ignored News Histogram",
    width=750,
    height=500,
    template=template,
)

# Chained updates, same order as before: y title, template axis styles,
# then major x ticks every 20 units with minor ticks every 10.
(
    fig.update_layout(yaxis_title="Number of Impressions")
    .update_yaxes(template.layout.yaxis)
    .update_xaxes(template.layout.xaxis)
    .update_xaxes(minor={"ticks": "outside", "dtick": 10}, dtick=20)
)

# Render inline, then export the same figure as a PDF.
fig.show()
fig.write_image("./visualizations/behavior/percentage_ignored_news_hist.pdf")

Scatter plot of the relation between the number of shown news and the percentage of ignored news.

In [None]:
# Scatter of "shown_news_length" (y) against "ignored_news_percent" (x),
# one marker per row of data_behaviors_pp, with fixed axis ranges.
fig = px.scatter(
    data_behaviors_pp,
    x="ignored_news_percent",
    y="shown_news_length",
    range_x=[0, 100],
    range_y=[0, 350],
    labels={
        "ignored_news_percent": "Percentage of Ignored News",
        "shown_news_length": "Number of Shown News",
    },
    title="Relation between Shown and Ignored News",
    width=750,
    height=500,
    template=template,
)

# Chained updates, same order as before: small dark-green markers, template
# axis styles, then x ticks (major 20 / minor 10) plus a grey x grid.
(
    fig.update_traces(marker={"size": 4, "color": green_dark})
    .update_yaxes(template.layout.yaxis)
    .update_xaxes(template.layout.xaxis)
    .update_xaxes(
        minor={"ticks": "outside", "dtick": 10},
        dtick=20,
        gridcolor=grey,
    )
)

# Render inline, then export the same figure as a PDF.
fig.show()
fig.write_image("./visualizations/behavior/shown_and_ignored.pdf")

Histogram of number of impressions per time of day.

In [None]:
# Histogram over the "timestamp" column of data_behaviors_pp, requesting
# 100 bins.
fig = px.histogram(
    data_behaviors_pp,
    x="timestamp",
    nbins=100,
    labels={
        "timestamp": "Time",
        # Original note: "Issue with plotly" -- the same text is applied to
        # the y-axis title explicitly in the update chain below.
        "count": "Number of Impressions",
    },
    title="Number of Impressions per Time of Day Histogram",
    width=750,
    height=500,
    template=template,
)

# Chained updates, same order as before. The tick spacings are the original
# values written out as arithmetic: 24 * 60 * 60 * 1000 == 86400000 and
# 12 * 60 * 60 * 1000 == 43200000 (presumably milliseconds, i.e. one day and
# half a day on a date axis -- confirm against the timestamp dtype).
(
    fig.update_layout(yaxis_title="Number of Impressions")
    .update_yaxes(template.layout.yaxis)
    .update_xaxes(template.layout.xaxis)
    .update_xaxes(
        minor={"ticks": "outside", "dtick": 12 * 60 * 60 * 1000},
        dtick=24 * 60 * 60 * 1000,
    )
    # Tick labels: day-of-month followed by the literal suffix "th".
    .update_layout(xaxis_tickformat="%dth")
)

# Render inline, then export the same figure as a PDF.
fig.show()
fig.write_image("./visualizations/behavior/impressions_per_time_of_day.pdf")

Pie chart of the distribution of impressions over the 4 time categories. Refer to the preprocessing code for the specific definition of each category.

In [None]:
# Share of rows per "time_category": the per-category row count divided by
# the total number of rows in data_behaviors_pp. The resulting "percent"
# column therefore holds fractions of the row count, not values scaled to 100.
data_time_category_percent = (
    data_behaviors_pp["time_category"]
    .value_counts()
    .div(len(data_behaviors_pp))
    .rename_axis("time_category")
    .reset_index(name="percent")
)

# One pie slice per time category, sized by its share.
fig = px.pie(
    data_time_category_percent,
    values="percent",
    names="time_category",
    template=template,
)

# Render inline, then export the same figure as a PDF.
fig.show()
fig.write_image("./visualizations/behavior/impressions_per_time_category.pdf")