# Imports and Functions  

In [1]:
import json
from datetime import datetime, timedelta

import altair as alt
import numpy as np
import pandas as pd

def dt_to_min(dt):
    """Convert a time of day into the number of minutes since midnight.

    Args:
        dt: Either a clock string of the form ``"HH:MM"`` (a trailing
            ``":SS"`` part is accepted and ignored) or any object exposing
            ``hour`` and ``minute`` attributes, such as ``datetime`` or
            ``pandas.Timestamp``.

    Returns:
        ``hour * 60 + minute``.

    Raises:
        ValueError: If a string argument does not contain integer hour and
            minute fields separated by ``":"``.
    """
    # isinstance (rather than an exact type comparison) also accepts str subclasses.
    if isinstance(dt, str):
        # Only the first two fields matter, so "HH:MM:SS" no longer breaks the unpacking.
        hour, minute = map(int, dt.split(":")[:2])
        return hour * 60 + minute

    return dt.hour * 60 + dt.minute

def is_subscribed_to(subreddit_name, subscribed_subs_df):
    """Report whether ``subreddit_name`` occurs anywhere in ``subscribed_subs_df``.

    Every cell of the frame is compared with the name, so a match in any
    column counts.

    Returns:
        1 when at least one cell equals ``subreddit_name``, otherwise 0.
    """
    cell_matches = subscribed_subs_df == subreddit_name
    found_anywhere = cell_matches.any().any()
    return 1 if found_anywhere else 0

# Data Visualization

## Tags and Flairs

I will count and visualize the tags and flairs of the subreddits that I am subscribed to. It is worth noting that, while tags have no missing data because they are annotated by hand, flairs do have missing data because some subreddits have no flairs or only a poor flair system.

Also, I will not use flairs because they are inconsistent, missing, and usually unique to each subreddit. I will use tags instead.

In [2]:
# Load the list of subscribed subreddits together with their tags and flairs
subreddits_df = pd.read_csv("data/final_data/subscribed_subreddits.csv")

# Tally the tags: every "Tags" cell is a ", "-separated list
tag_lists = subreddits_df["Tags"].map(lambda cell: cell.split(", "))
tag_count_dict = tag_lists.explode().value_counts().to_dict()
tag_count_df = pd.DataFrame({
    "Tag": list(tag_count_dict.keys()),
    "Count": list(tag_count_dict.values()),
})
tag_count_df["Tag"] = tag_count_df["Tag"].astype(dtype="category")

# Tally the flairs the same way; non-string cells (missing flairs) become None and are not counted
flair_lists = subreddits_df["Flairs"].map(lambda cell: cell.split(", ") if type(cell) == str else None)
flair_count_dict = flair_lists.explode().value_counts().to_dict()
flair_count_df = pd.DataFrame({
    "Flair": list(flair_count_dict.keys()),
    "Count": list(flair_count_dict.values()),
})
flair_count_df["Flair"] = flair_count_df["Flair"].astype(dtype="category")

# Presentation settings shared by the full flair chart and the combined chart
top_level_config = {
    "padding": {"left": 15, "right": 15, "top": 15, "bottom": 15},
    "title": {"fontSize": 18},
}
axis_x_config = {
    "labelFontSize": 13,
    "grid": False,  # Remove the grid
    "domainWidth": 2,  # Set the width of the axis line
    "domainColor": "#000",  # Set the color of the axis line
}
axis_y_config = {
    "labelFontSize": 14,
    "titleFontSize": 18,
    "domain": False,  # Remove the axis line
}

# Horizontal bar chart of the tags, most frequent first
tag_tick_count = tag_count_df["Count"].max() // 2
tags_chart = alt.Chart(tag_count_df).mark_bar().encode(
    alt.X("Count:Q", axis=alt.Axis(tickCount=tag_tick_count)),
    alt.Y("Tag:N", sort="-x", title=None),
    color=alt.value("#1f77b4"),
    tooltip=["Count:Q"],  # Show the count when hovering over the bar
).properties(
    width=500,
    height=333.33,
    title = "Tags"
)

# Vertical bar chart of every flair, including those that occur only once
flairs_chart_full = (
    alt.Chart(flair_count_df).mark_bar().encode(
        alt.X("Flair:N", sort="-y"),
        alt.Y("Count:Q"),
        color=alt.value("#1f77b4"),
        tooltip=["Count:Q"],  # Show the count when hovering over the bar
    )
    .configure(**top_level_config)
    .configure_view(stroke=None)
    .configure_axisX(**axis_x_config)
    .configure_axisY(**axis_y_config)
)

# Keep only flairs seen more than once; a count of 1 means the flair is unique to one subreddit
flair_count_df = flair_count_df[flair_count_df["Count"] > 1]

# Horizontal bar chart of the remaining flairs
flair_tick_count = flair_count_df["Count"].max() // 2
flairs_chart = alt.Chart(flair_count_df).mark_bar().encode(
    alt.X("Count:Q", axis=alt.Axis(tickCount=flair_tick_count)),
    alt.Y("Flair:N", sort="-x", title=None),
    color=alt.value("#1f77b4"),
    tooltip=["Count:Q"],  # Show the count when hovering over the bar
).properties(
    width=500,
    height=333.33,
    title = "Flairs"
)

# Show the tag and flair charts side by side
chart = alt.hconcat(tags_chart, flairs_chart, spacing=120)
chart = (
    chart
    .configure(**top_level_config)
    .configure_view(stroke=None)
    .configure_axisX(**axis_x_config)
    .configure_axisY(**axis_y_config)
)

chart

In [3]:
# Save the charts

# Presentation settings applied to each stand-alone chart before export
export_config = {
    "padding": {"left": 15, "right": 15, "top": 15, "bottom": 15},
    "title": {"fontSize": 18},
}
export_axis_x = {
    "labelFontSize": 13,
    "grid": False,  # Remove the grid
    "domainWidth": 2,  # Set the width of the axis line
    "domainColor": "#000",  # Set the color of the axis line
}
export_axis_y = {
    "labelFontSize": 14,
    "titleFontSize": 18,
    "domain": False,  # Remove the axis line
}

tags_chart = (
    tags_chart
    .configure(**export_config)
    .configure_view(stroke=None)
    .configure_axisX(**export_axis_x)
    .configure_axisY(**export_axis_y)
)
flairs_chart = (
    flairs_chart
    .configure(**export_config)
    .configure_view(stroke=None)
    .configure_axisX(**export_axis_x)
    .configure_axisY(**export_axis_y)
)

# Only the HTML exports are active; the matching .png exports and the
# combined tags+flairs export are left disabled.
tags_chart.save("figures/tags_and_flairs/altair_tags.html")
flairs_chart.save("figures/tags_and_flairs/altair_flairs.html")
flairs_chart_full.save("figures/tags_and_flairs/altair_flairs_full.html")

## Logins
I will count the logins in 30-minute intervals and visualize them in a bar chart to see at what times I am active.  

In the code, I round (floor) the minutes to 30-minute intervals and count the logins. For the _histograms_ to function properly, however, the data must include every interval, even those without any logins. Therefore, I will create a new dataframe with all the intervals, combine it with my data, count the logins, and offset everything by 1 to get the correct counts.

### Logins in Total
This chart shows all the logins in total.

In [4]:
# Load the login log and express every login as minutes since midnight
login_datetime_df = pd.read_csv("data/final_data/ip_logs.csv")
login_datetime_df["Date"] = login_datetime_df["Date"].astype("datetime64[ns]")
login_datetime_df["TimeInMinutes"] = login_datetime_df["Date"].map(dt_to_min)

# Vega expression that renders a minutes-since-midnight tick value as HH:MM
time_label_expr = "utcFormat(utcParse(datum.value >= 60 ? floor(datum.value / 60) + ':' + datum.value % 60 : '0:' + datum.value, '%H:%M'), '%H:%M')"

chart_login_total = alt.Chart(login_datetime_df).mark_bar().encode(
    # The extent pins the 30-minute bins to the whole day, so the axis always
    # runs from 00:00 to 24:00 and matches the per-weekday charts.
    x=alt.X(
        "TimeInMinutes:Q",
        bin=alt.Bin(step=30, extent=[0, 24*60-1]),
        axis=alt.Axis(tickCount=48, labelAngle=-90, labelExpr=time_label_expr),
    ),
    y=alt.Y("count()"),
    tooltip=["count()"],  # Show the count when hovering over the bar
).properties(
    width=1200,
    height=400,
    title="Total Logins by Time",
)

chart_login_total = chart_login_total.configure(
    padding={"left": 15, "right": 15, "top": 15, "bottom": 15},
    title={"fontSize": 18},
).configure_view(
    stroke=None,
).configure_axisX(
    labelFontSize=13,
    grid=False,  # Remove the grid
    domainWidth=2,  # Set the width of the axis line
    domainColor="#000"  # Set the color of the axis line
).configure_axisY(
    labelFontSize=14,
    titleFontSize=18,
    domain=False,  # Remove the axis line
)

# Save the chart
chart_login_total.save("figures/login_times/altair_login_total.html")
# chart_login_total.save("figures/login_times/altair_login_total.png")

chart_login_total

### Logins by Weekdays

Instead of looking at my login times in total, I will look at them on a daily basis. I will count the logins in 30-minute intervals and visualize them in a bar chart to see at what times I am active.

In [5]:
# Read my weekly schedule from json file (json is imported with the other imports at the top)
with open("./data/final_data/schedule.json", "r") as f:
    weekly_schedule = json.load(f)

# Display order of the per-day charts
WEEK_ORDER = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
# Columns kept for the class-hour rules; other keys of a schedule entry are dropped
RULE_COLUMNS = ["name", "startTime", "endTime", "Time", "code"]
# Vega expression that renders a minutes-since-midnight tick value as HH:MM
TIME_LABEL_EXPR = "utcFormat(utcParse(datum.value >= 60 ? floor(datum.value / 60) + ':' + datum.value % 60 : '0:' + datum.value, '%H:%M'), '%H:%M')"

# Group by day
login_datetime_grouped = login_datetime_df.groupby("DayName")

# Create the charts for each day
charts_login_daily = {}
for name, group in login_datetime_grouped:
    # assign() builds a new frame instead of writing into the groupby slice
    day_logins = group.assign(Time=group["Date"].map(dt_to_min))

    # Histogram of the logins of this day in 30-minute bins spanning the whole day
    chart = alt.Chart(day_logins).mark_bar().encode(
        x=alt.X(
            "Time:Q",
            bin=alt.Bin(step=30, extent=[0, 24*60-1]),
            axis=alt.Axis(tickCount=48, labelAngle=-90, labelExpr=TIME_LABEL_EXPR),
        ),
        y=alt.Y("count()", title="Number of Logins"),
        tooltip=["count()"],  # Show the count when hovering over the bar
    ).properties(
        title=name,
        width=1200,
        height=400,
    )

    # Only the schedule lookup may legitimately fail (no classes on weekends);
    # a malformed class entry now surfaces instead of silently hiding the rules.
    try:
        day_classes = weekly_schedule[name]["classes"]
    except KeyError:
        day_classes = None

    if day_classes is not None:
        # One rule per class, placed at the midpoint between its start and end time.
        # The records are collected first so the frame is built in a single step.
        rule_records = [
            {**cl, "Time": (dt_to_min(cl["startTime"]) + dt_to_min(cl["endTime"])) / 2}
            for cl in day_classes
        ]
        d = pd.DataFrame(rule_records, columns=RULE_COLUMNS)

        # The rules are hidden until the checkbox is ticked
        checkbox = alt.binding_checkbox(name="Show class hours")
        checkbox_selection = alt.param(bind=checkbox, name="show")
        rule = alt.Chart(d).mark_rule(color="red", size=3).encode(
            alt.X("Time:Q", title="Time"),
            tooltip=["name:N", "startTime:N", "endTime:N", "code:N"],
        ).add_params(
            checkbox_selection
        ).transform_filter(
            checkbox_selection
        )
        # Combine the chart and the rule
        chart = chart + rule

    charts_login_daily[name] = chart

# Sort the charts by day in week order; days without any login are skipped
# instead of raising a KeyError
charts_list = [charts_login_daily[day] for day in WEEK_ORDER if day in charts_login_daily]
# Combine the charts
chart_login_daily = alt.vconcat(*charts_list, spacing=10)

chart_login_daily = chart_login_daily.configure(
    padding={"left": 15, "right": 15, "top": 15, "bottom": 15},
    title={"fontSize": 18},
).configure_view(
    stroke=None,
).configure_axisX(
    labelFontSize=13,
    grid=False,  # Remove the grid
    domainWidth=2,  # Set the width of the axis line
    domainColor="#000"  # Set the color of the axis line
).configure_axisY(
    labelFontSize=14,
    titleFontSize=18,
    domain=False,  # Remove the axis line
)

chart_login_daily

In [6]:
# Save the charts

# Presentation settings applied to each per-day chart before export
daily_config = {
    "padding": {"left": 15, "right": 15, "top": 15, "bottom": 15},
    "title": {"fontSize": 18},
}
daily_axis_x = {
    "labelFontSize": 13,
    "grid": False,  # Remove the grid
    "domainWidth": 2,  # Set the width of the axis line
    "domainColor": "#000",  # Set the color of the axis line
}
daily_axis_y = {
    "labelFontSize": 14,
    "titleFontSize": 18,
    "domain": False,  # Remove the axis line
}

# One HTML file per weekday (the .png export is left disabled)
for name, ch in charts_login_daily.items():
    ch = (
        ch
        .configure(**daily_config)
        .configure_view(stroke=None)
        .configure_axisX(**daily_axis_x)
        .configure_axisY(**daily_axis_y)
    )
    ch.save(f"figures/login_times/altair_{name}.html")

# Plus the stacked overview of all days
chart_login_daily.save("figures/login_times/altair_login_times.html")

## Voted Posts and Comments

In [7]:
# Load my vote history; the post file calls its score column "Scores",
# so it is renamed to the "Score" name used by the comment file.
post_votes_raw = pd.read_csv("data/final_data/post_votes.csv")
post_votes_df = post_votes_raw.rename(columns={"Scores": "Score"})
comment_votes_df = pd.read_csv("data/final_data/comment_votes.csv")

### Subs

Count the number of posts and comments that I have voted in each subreddit and visualize them in a bar chart

In [8]:
# Count the appearances of each subreddit (filter out the users).
# User profiles appear as "u_<username>", so only names that START with "u_"
# are removed; a substring test would also drop subreddits that merely contain "u_".
is_profile_p = post_votes_df["SubredditName"].str.startswith("u_", na=False)
subreddit_count_df_p = post_votes_df.loc[~is_profile_p].drop(columns=["MyVote", "Score"])
is_profile_c = comment_votes_df["SubredditName"].str.startswith("u_", na=False)
subreddit_count_df_c = comment_votes_df.loc[~is_profile_c].drop(columns=["MyVote", "Score"])

# Concat
voted_subs_df = pd.concat([subreddit_count_df_p, subreddit_count_df_c]).reset_index(drop=True)

# One row per subreddit: how many voted items it has and whether any of them
# is flagged as subscribed. The explicit aggregation avoids the
# replace(False, None) + count() trick, whose result depends on the pandas version.
subscribed_by_sub = voted_subs_df.groupby("SubredditName")["IsSubscribed"]
subreddit_count_df = pd.DataFrame({
    "Count": subscribed_by_sub.size(),
    "IsSubscribed": subscribed_by_sub.any().astype(bool),
}).reset_index()
subreddit_count_df["SubredditName"] = subreddit_count_df["SubredditName"].astype(dtype="category")


# Add sliders to filter by minimum and maximum count
slider1 = alt.binding_range(min=1, max=subreddit_count_df["Count"].max(), step=1, name="Minimum Count")
slider2 = alt.binding_range(min=1, max=subreddit_count_df["Count"].max(), step=1, name="Max Count")
selector1 = alt.selection_point(fields=["minCount"], bind=slider1, name="minCount", value=[{"minCount": 1}])
selector2 = alt.selection_point(fields=["maxCount"], bind=slider2, name="maxCount", value=[{"maxCount": subreddit_count_df["Count"].max()}])
# Create the chart
chart = alt.Chart(subreddit_count_df).mark_bar().encode(
    alt.X("SubredditName:N", sort="-y", title="Subreddit", axis=alt.Axis(labelAngle=-45)),
    alt.Y("Count:Q"),
    tooltip=["Count:Q", "SubredditName:N"],  # Show the count when hovering over the bar
    color = alt.Color(
        "IsSubscribed:N", 
        scale=alt.Scale(domain=[True, False], range=["#1f77b4", "#ff7f0e"]),
        legend=alt.Legend(title="Is Subscribed", labelFontSize=12, titleFontSize=14, offset=0, orient="top-right"),
        ),
).properties(
    width=1200,
    height=400,
).add_params(
    selector1,
    selector2,
).transform_filter(
    # Keep only the bars whose count lies between the two slider values
    (alt.datum.Count >= selector1.minCount) & (alt.datum.Count <= selector2.maxCount)
)

chart = chart.configure(
    padding={"left": 15, "right": 15, "top": 15, "bottom": 15},
    title={"fontSize": 18},
).configure_view(
    stroke=None,
).configure_axisX(
    labelFontSize=11,
    titleFontSize=18,
    grid=False,  # Remove the grid
    domainWidth=2,  # Set the width of the axis line
    domainColor="#000"  # Set the color of the axis line
).configure_axisY(
    labelFontSize=14,
    titleFontSize=18,
    grid=False,  # Remove the grid
    domain=False,  # Remove the axis line
)

chart.save("figures/voted/altair_subreddit_counts.html")
# chart.save("figures/voted/altair_subreddit_counts.png")
chart


### Tags

Do the same for each tag.

In [9]:
# Every voted post/comment, reduced to the subreddit it belongs to
tags_df = pd.concat([post_votes_df[['SubredditName']], comment_votes_df[['SubredditName']]]).reset_index(drop=True)

# Look the tags up once per subreddit instead of filtering subreddits_df for
# every row. Keeping the first occurrence mirrors the previous "first match" lookup.
tags_by_subreddit = subreddits_df.drop_duplicates(subset="subreddit").set_index("subreddit")["Tags"]
tags_df["Tags"] = tags_df["SubredditName"].map(tags_by_subreddit)

# Rows from subreddits without tag information are dropped
tags_df = tags_df.dropna().reset_index(drop=True).drop(columns=["SubredditName"])
tag_count_dict = tags_df["Tags"].map(lambda x: x.split(", ")).explode().value_counts().to_dict()
tag_count_df = pd.DataFrame({"Tag": list(tag_count_dict.keys()), "Count": list(tag_count_dict.values())})
tag_count_df["Tag"] = tag_count_df["Tag"].astype(dtype="category")


# Horizontal bar chart of the tags, most frequent first
tags_chart = alt.Chart(tag_count_df).mark_bar().encode(
    alt.X("Count:Q"),
    alt.Y("Tag:N", sort="-x", title=None),
    color=alt.value("#1f77b4"),
    tooltip=["Count:Q"],  # Show the count when hovering over the bar
).properties(
    # Set the size of the chart
    width=500,
    height=333.33,
    title = "Tags"
).configure(
    padding={"left": 15, "right": 15, "top": 15, "bottom": 15},
    title={"fontSize": 18},
).configure_view(
    stroke=None,
).configure_axisX(
    labelFontSize=13,
    grid=False,  # Remove the grid
    domainWidth=2,  # Set the width of the axis line
    domainColor="#000"  # Set the color of the axis line
).configure_axisY(
    labelFontSize=14,
    titleFontSize=18,
    grid=False,  # Remove the grid
    domain=False,  # Remove the axis line
)

tags_chart.save("figures/voted/altair_tag_counts.html")
# tags_chart.save("figures/voted/altair_tag_counts.png")
tags_chart

### Number of Upvotes and Downvotes and If I am Subscribed

Count the total number of upvotes and downvotes, and the number of posts and comments that I have voted on in subreddits that I am subscribed/not subscribed to, and visualize them in bar charts.

In [10]:
# Drop np.nan values from the MyVote column
post_votes_df = post_votes_df.dropna(subset=["MyVote"]).reset_index(drop=True)
comment_votes_df = comment_votes_df.dropna(subset=["MyVote"]).reset_index(drop=True)
# Concat
votes_df = pd.concat([post_votes_df, comment_votes_df]).reset_index(drop=True)

# Count the number of votes
chart_votes = alt.Chart(votes_df).mark_bar().encode(
    # Sort the bars by their height ("-y"), like the subscription chart below;
    # sorting the x channel by itself ("-x") does not define an order.
    alt.X("MyVote:O", sort="-y", axis=alt.Axis(labelAngle=-45, labelExpr="datum.label == '1' ? 'Upvote' : 'Downvote'")),
    alt.Y("count()"),
    tooltip=["count()"],  # Show the count when hovering over the bar
).properties(
    width=600,
    height=300,
)

# Count the number of subscribed subs
chart_subscribed = alt.Chart(votes_df).mark_bar().encode(
    alt.X("IsSubscribed:N", sort="-y", axis=alt.Axis(labelAngle=-45, labelExpr="datum.label == 'true' ? 'Subscribed' : 'Not Subscribed'")),
    alt.Y("count()"),
    tooltip=["count()"],  # Show the count when hovering over the bar
).properties(
    width=600,
    height=300,
)

# Show both charts side by side
chart = alt.hconcat(chart_votes, chart_subscribed)

chart = chart.configure(
    padding={"left": 15, "right": 15, "top": 15, "bottom": 15},
    title={"fontSize": 18},
).configure_view(
    stroke=None,
).configure_axisX(
    labelFontSize=14,
    titleFontSize=14,
    grid=False,  # Remove the grid
    domainWidth=2,  # Set the width of the axis line
    domainColor="#000"  # Set the color of the axis line
).configure_axisY(
    labelFontSize=14,
    titleFontSize=14,
    grid=False,  # Remove the grid
    domain=False,  # Remove the axis line
)

chart

In [11]:
# Save the charts

# Presentation settings applied to each stand-alone chart before export
vote_export_config = {
    "padding": {"left": 15, "right": 15, "top": 15, "bottom": 15},
    "title": {"fontSize": 18},
}
vote_export_axis_x = {
    "labelFontSize": 14,
    "titleFontSize": 14,
    "grid": False,  # Remove the grid
    "domainWidth": 2,  # Set the width of the axis line
    "domainColor": "#000",  # Set the color of the axis line
}
vote_export_axis_y = {
    "labelFontSize": 14,
    "titleFontSize": 14,
    "grid": False,  # Remove the grid
    "domain": False,  # Remove the axis line
}

chart_votes = (
    chart_votes
    .configure(**vote_export_config)
    .configure_view(stroke=None)
    .configure_axisX(**vote_export_axis_x)
    .configure_axisY(**vote_export_axis_y)
)
# The styled subscription chart must be stored under its own name: assigning
# it to chart_votes would write the subscription chart into the vote-count
# file and save the subscription chart itself without any styling.
chart_subscribed = (
    chart_subscribed
    .configure(**vote_export_config)
    .configure_view(stroke=None)
    .configure_axisX(**vote_export_axis_x)
    .configure_axisY(**vote_export_axis_y)
)

chart_votes.save("figures/voted/altair_vote_counts.html")
# chart_votes.save("figures/voted/altair_vote_counts.png")
chart_subscribed.save("figures/voted/altair_subscribed_counts.html")
# chart_subscribed.save("figures/voted/altair_subscribed_counts.png")

### Histogram for Total Score

Histogram for the total score of the posts and comments that I have voted in each subreddit.  

Even though scores are generally on the low side, there are some outliers with high scores. Therefore, I will use both a histogram with a selectable range and a log histogram to see the distribution better.

In [12]:
# Scores of every post and comment I voted on (rows without a score are dropped)
post_scores = post_votes_df[["Score"]].dropna()
comment_scores = comment_votes_df[["Score"]].dropna()
# Concat
scores = pd.concat([post_scores, comment_scores]).reset_index(drop=True)

# Horizontal brush: the range selected on the first histogram filters the second one
brush = alt.selection_interval(encodings=['x'])
# Create a histogram for the net votes
chart_scores_1 = alt.Chart(scores).mark_bar().encode(
    x=alt.X("Score:Q").bin(alt.Bin(extent=[scores["Score"].min(), scores["Score"].max()])),
    y=alt.Y("count()"),
    tooltip=["count()"],  # Show the count when hovering over the bar
).properties(
    width=1200,
    height=400,
    title="All Scores",
).add_params(
    brush
)

# Histogram of only the scores inside the brushed range
chart_scores_2 = alt.Chart(scores).mark_bar().encode(
    x=alt.X("Score:Q").bin(alt.Bin(maxbins=7)),
    y=alt.Y("count()"),
    tooltip=["count()"],  # Show the count when hovering over the bar
).properties(
    width=1200,
    height=400,
).transform_filter(
    brush
)

# Histogram over log10(score); non-positive scores are mapped to 0 and
# therefore share a bin with a score of 1.
scores_logged = scores.copy()
scores_logged["Score"] = scores_logged["Score"].map(lambda x: np.log10(x) if x > 0 else 0)
chart_scores_log = alt.Chart(scores_logged).mark_bar().encode(
    # Label only whole powers of ten, as the "created" score histogram does;
    # fractional exponents would otherwise print values such as 3.1622776...
    x=alt.X("Score:Q", title="Score (log10 spacing)", axis=alt.Axis(labelExpr="datum.value == round(datum.value) ? pow(10, datum.value) : ''")).bin(),
    y=alt.Y("count()"),
    tooltip=["count()"],  # Show the count when hovering over the bar
).properties(
    width=1200,
    height=400,
    # NOTE(review): no filter on the score is applied here, so this title may be outdated - confirm
    title="Scores less than 1000",
)

chart_scores = alt.vconcat(chart_scores_1, chart_scores_2, chart_scores_log).configure(
    padding={"left": 15, "right": 15, "top": 15, "bottom": 15},
    title={"fontSize": 18},
).configure_view(
    stroke=None,
).configure_axisX(
    labelFontSize=14,
    titleFontSize=14,
    grid=False,  # Remove the grid
    domainWidth=2,  # Set the width of the axis line
    domainColor="#000"  # Set the color of the axis line
).configure_axisY(
    labelFontSize=14,
    titleFontSize=14,
    grid=False,  # Remove the grid
    domain=False,  # Remove the axis line
)

chart_scores

In [13]:
# Save the charts

# Presentation settings applied to both exports of this cell
score_export_config = {
    "padding": {"left": 15, "right": 15, "top": 15, "bottom": 15},
    "title": {"fontSize": 18},
}
score_export_axis_x = {
    "labelFontSize": 14,
    "titleFontSize": 14,
    "grid": False,  # Remove the grid
    "domainWidth": 2,  # Set the width of the axis line
    "domainColor": "#000",  # Set the color of the axis line
}
score_export_axis_y = {
    "labelFontSize": 14,
    "titleFontSize": 14,
    "grid": False,  # Remove the grid
    "domain": False,  # Remove the axis line
}

# The brushable overview next to the histogram it filters
ch = chart_scores_1 | chart_scores_2
ch = (
    ch
    .configure(**score_export_config)
    .configure_view(stroke=None)
    .configure_axisX(**score_export_axis_x)
    .configure_axisY(**score_export_axis_y)
)
chart_scores_log = (
    chart_scores_log
    .configure(**score_export_config)
    .configure_view(stroke=None)
    .configure_axisX(**score_export_axis_x)
    .configure_axisY(**score_export_axis_y)
)

# HTML exports only; the .png export is left disabled
ch.save("figures/voted/altair_scores_interactive.html")
chart_scores_log.save("figures/voted/altair_scores_log.html")

## Created Posts and Comments

In [14]:
# Load the posts and comments I created; the post file calls its score
# column "Scores", so it is renamed to the "Score" name used by the comment file.
post_headers_raw = pd.read_csv("data/final_data/post_headers.csv")
posts_df = post_headers_raw.rename(columns={"Scores": "Score"})
comments_df = pd.read_csv("data/final_data/comment_headers.csv")

### Subs

In [15]:
# Every post/comment I created, reduced to the subreddit it was made in.
# created_df stays the raw list; the counts are derived from it, so the two
# names no longer alias one frame that is mutated afterwards.
created_df = pd.concat([posts_df[["Subreddit"]], comments_df[["Subreddit"]]]).reset_index(drop=True)
subreddit_count_df = created_df.groupby("Subreddit").size().reset_index(name="Count")

# A subreddit counts as subscribed only when its name appears in the
# "subreddit" column; comparing against every column of subreddits_df could
# also match a tag or flair cell that happens to equal the name.
subreddit_count_df["IsSubscribed"] = subreddit_count_df["Subreddit"].isin(subreddits_df["subreddit"])
subreddit_count_df["Subreddit"] = subreddit_count_df["Subreddit"].astype(dtype="category")


# Add sliders to filter by minimum and maximum count
slider1 = alt.binding_range(min=1, max=subreddit_count_df["Count"].max(), step=1, name="Minimum Count")
slider2 = alt.binding_range(min=1, max=subreddit_count_df["Count"].max(), step=1, name="Max Count")
selector1 = alt.selection_point(fields=["minCount"], bind=slider1, name="minCount", value=[{"minCount": 1}])
selector2 = alt.selection_point(fields=["maxCount"], bind=slider2, name="maxCount", value=[{"maxCount": subreddit_count_df["Count"].max()}])
# Create the chart
chart = alt.Chart(subreddit_count_df).mark_bar().encode(
    alt.X("Subreddit:N", sort="-y", title="Subreddit", axis=alt.Axis(labelAngle=-45)),
    alt.Y("Count:Q"),
    tooltip=["Count:Q", "Subreddit:N"],  # Show the count when hovering over the bar
    color = alt.Color(
        "IsSubscribed:N", 
        scale=alt.Scale(domain=[True, False], range=["#1f77b4", "#ff7f0e"]),
        legend=alt.Legend(title="Is Subscribed", labelFontSize=12, titleFontSize=14, offset=0, orient="top-right"),
        ),
).properties(
    width=1200,
    height=400,
).add_params(
    selector1,
    selector2,
).transform_filter(
    # Keep only the bars whose count lies between the two slider values
    (alt.datum.Count >= selector1.minCount) & (alt.datum.Count <= selector2.maxCount)
)

chart = chart.configure(
    padding={"left": 15, "right": 15, "top": 15, "bottom": 15},
    title={"fontSize": 18},
).configure_view(
    stroke=None,
).configure_axisX(
    labelFontSize=11,
    titleFontSize=18,
    grid=False,  # Remove the grid
    domainWidth=2,  # Set the width of the axis line
    domainColor="#000"  # Set the color of the axis line
).configure_axisY(
    labelFontSize=14,
    titleFontSize=18,
    grid=False,  # Remove the grid
    domain=False,  # Remove the axis line
)

chart.save("figures/created/altair_subreddit_counts.html")
# chart.save("figures/created/altair_subreddit_counts.png")
chart

### Tags

In [16]:
# Every post/comment I created, reduced to the subreddit it was made in
created_df = pd.concat([posts_df[["Subreddit"]], comments_df[["Subreddit"]]]).reset_index(drop=True)

# Look the tags up once per subreddit instead of filtering subreddits_df for
# every row. Keeping the first occurrence mirrors the previous "first match" lookup.
tags_by_subreddit = subreddits_df.drop_duplicates(subset="subreddit").set_index("subreddit")["Tags"]
tags_df = created_df.assign(Tags=created_df["Subreddit"].map(tags_by_subreddit))

# Rows from subreddits without tag information are dropped
tags_df = tags_df.dropna().reset_index(drop=True).drop(columns=["Subreddit"])
tag_count_dict = tags_df["Tags"].map(lambda x: x.split(", ")).explode().value_counts().to_dict()
tag_count_df = pd.DataFrame({"Tag": list(tag_count_dict.keys()), "Count": list(tag_count_dict.values())})
tag_count_df["Tag"] = tag_count_df["Tag"].astype(dtype="category")


# Horizontal bar chart of the tags, most frequent first
tags_chart = alt.Chart(tag_count_df).mark_bar().encode(
    alt.X("Count:Q"),
    alt.Y("Tag:N", sort="-x", title=None),
    color=alt.value("#1f77b4"),
    tooltip=["Count:Q"],  # Show the count when hovering over the bar
).properties(
    # Set the size of the chart
    width=500,
    height=333.33,
    title = "Tags"
).configure(
    padding={"left": 15, "right": 15, "top": 15, "bottom": 15},
    title={"fontSize": 18},
).configure_view(
    stroke=None,
).configure_axisX(
    labelFontSize=13,
    grid=False,  # Remove the grid
    domainWidth=2,  # Set the width of the axis line
    domainColor="#000"  # Set the color of the axis line
).configure_axisY(
    labelFontSize=14,
    titleFontSize=18,
    grid=False,  # Remove the grid
    domain=False,  # Remove the axis line
)

tags_chart.save("figures/created/altair_tag_counts.html")
# tags_chart.save("figures/created/altair_tag_counts.png")
tags_chart

### If I am Subscribed

In [17]:
# Subscription flag of every post and comment I created
subscription_frames = [posts_df[["IsSubscribed"]], comments_df[["IsSubscribed"]]]
created_df = pd.concat(subscription_frames).reset_index(drop=True)

# Presentation settings of the chart
subscribed_config = {
    "padding": {"left": 15, "right": 15, "top": 15, "bottom": 15},
    "title": {"fontSize": 18},
}
subscribed_axis_x = {
    "labelFontSize": 14,
    "titleFontSize": 14,
    "grid": False,  # Remove the grid
    "domainWidth": 2,  # Set the width of the axis line
    "domainColor": "#000",  # Set the color of the axis line
}
subscribed_axis_y = {
    "labelFontSize": 14,
    "titleFontSize": 14,
    "grid": False,  # Remove the grid
    "domain": False,  # Remove the axis line
}

# How many created items fall into subscribed and not subscribed subreddits
chart_subscribed = (
    alt.Chart(created_df).mark_bar().encode(
        alt.X("IsSubscribed:N", sort="-y", axis=alt.Axis(labelAngle=-45)),
        alt.Y("count()"),
        tooltip=["count()"],  # Show the count when hovering over the bar
    )
    .properties(width=600, height=300)
    .configure(**subscribed_config)
    .configure_view(stroke=None)
    .configure_axisX(**subscribed_axis_x)
    .configure_axisY(**subscribed_axis_y)
)

# HTML export only; the .png export is left disabled
chart_subscribed.save("figures/created/altair_subscribed_counts.html")
chart_subscribed

### Score Histograms

In [18]:
# Scores of every post and comment I created (rows without a score are dropped)
score_frames = [posts_df[["Score"]], comments_df[["Score"]]]
scores = pd.concat(score_frames).dropna().reset_index(drop=True)

# Presentation settings of the stacked chart
stacked_config = {
    "padding": {"left": 15, "right": 15, "top": 15, "bottom": 15},
    "title": {"fontSize": 18},
}
stacked_axis_x = {
    "labelFontSize": 14,
    "titleFontSize": 14,
    "grid": False,  # Remove the grid
    "domainWidth": 2,  # Set the width of the axis line
    "domainColor": "#000",  # Set the color of the axis line
}
stacked_axis_y = {
    "labelFontSize": 14,
    "titleFontSize": 14,
    "grid": False,  # Remove the grid
    "domain": False,  # Remove the axis line
}

# Horizontal brush: the range selected on the first histogram filters the second one
brush = alt.selection_interval(encodings=['x'])

# Histogram over the full score range
full_extent = [scores["Score"].min(), scores["Score"].max()]
chart_scores_1 = alt.Chart(scores).mark_bar().encode(
    x=alt.X("Score:Q").bin(alt.Bin(extent=full_extent)),
    y=alt.Y("count()"),
    tooltip=["count()"],  # Show the count when hovering over the bar
).properties(
    width=1200,
    height=400,
    title="All Scores",
).add_params(
    brush
)

# Histogram of only the scores inside the brushed range
chart_scores_2 = alt.Chart(scores).mark_bar().encode(
    x=alt.X("Score:Q").bin(alt.Bin(maxbins=7)),
    y=alt.Y("count()"),
    tooltip=["count()"],  # Show the count when hovering over the bar
).properties(
    width=1200,
    height=400,
).transform_filter(
    brush
)

# Histogram over log10(score); non-positive scores are mapped to 0.
# Only whole powers of ten get an axis label.
scores_logged = scores.copy()
scores_logged["Score"] = scores_logged["Score"].map(lambda x: np.log10(x) if x > 0 else 0)
log_label_expr = "datum.value == round(datum.value) ? pow(10, datum.value) : ''"
chart_scores_log = alt.Chart(scores_logged).mark_bar().encode(
    x=alt.X("Score:Q", title="Score (log10 spacing)", axis=alt.Axis(labelExpr=log_label_expr)).bin(),
    y=alt.Y("count()"),
    tooltip=["count()"],  # Show the count when hovering over the bar
).properties(
    width=1200,
    height=400,
    title="Scores less than 1000",
)

# Stack the three histograms vertically
chart_scores = (
    alt.vconcat(chart_scores_1, chart_scores_2, chart_scores_log)
    .configure(**stacked_config)
    .configure_view(stroke=None)
    .configure_axisX(**stacked_axis_x)
    .configure_axisY(**stacked_axis_y)
)

chart_scores

In [19]:
# Save the charts

# Presentation settings applied to both exports of this cell
score_export_config = {
    "padding": {"left": 15, "right": 15, "top": 15, "bottom": 15},
    "title": {"fontSize": 18},
}
score_export_axis_x = {
    "labelFontSize": 14,
    "titleFontSize": 14,
    "grid": False,  # Remove the grid
    "domainWidth": 2,  # Set the width of the axis line
    "domainColor": "#000",  # Set the color of the axis line
}
score_export_axis_y = {
    "labelFontSize": 14,
    "titleFontSize": 14,
    "grid": False,  # Remove the grid
    "domain": False,  # Remove the axis line
}

# The brushable overview next to the histogram it filters
ch = chart_scores_1 | chart_scores_2
ch = (
    ch
    .configure(**score_export_config)
    .configure_view(stroke=None)
    .configure_axisX(**score_export_axis_x)
    .configure_axisY(**score_export_axis_y)
)
chart_scores_log = (
    chart_scores_log
    .configure(**score_export_config)
    .configure_view(stroke=None)
    .configure_axisX(**score_export_axis_x)
    .configure_axisY(**score_export_axis_y)
)

# HTML exports only; the .png export is left disabled
ch.save("figures/created/altair_scores_interactive.html")
chart_scores_log.save("figures/created/altair_scores_log.html")

### If I comment on my own posts

Count the number of comments that I have made on my own posts/others' posts and visualize them in a bar chart.

In [20]:
# Presentation settings of the chart
owned_config = {
    "padding": {"left": 15, "right": 15, "top": 15, "bottom": 15},
    "title": {"fontSize": 18},
}
owned_axis_x = {
    "labelFontSize": 14,
    "titleFontSize": 14,
    "grid": False,  # Remove the grid
    "domainWidth": 2,  # Set the width of the axis line
    "domainColor": "#000",  # Set the color of the axis line
}
owned_axis_y = {
    "labelFontSize": 14,
    "titleFontSize": 14,
    "grid": False,  # Remove the grid
    "domain": False,  # Remove the axis line
}

# Axis labels: a value equal to 1 is shown as 'Owned', anything else as 'Not Owned'
owned_label_expr = "datum.value == 1 ? 'Owned' : 'Not Owned'"

# Number of my comments per value of the IsPostOwned flag
chart = (
    alt.Chart(comments_df).mark_bar().encode(
        alt.X("IsPostOwned:N", sort="-y", axis=alt.Axis(labelAngle=-45, labelExpr=owned_label_expr)),
        alt.Y("count()"),
        tooltip=["count()"],  # Show the count when hovering over the bar
    )
    .properties(width=600, height=300)
    .configure(**owned_config)
    .configure_view(stroke=None)
    .configure_axisX(**owned_axis_x)
    .configure_axisY(**owned_axis_y)
)

# HTML export only; the .png export is left disabled
chart.save("figures/created/altair_is_post_owned.html")
chart
