In [None]:
import numpy as np
import pandas as pd
import seaborn as sns

# Limit results so this can be minified + shared on pastebin.
TOP_POSTER_LIMIT = 10  # how many of the most active posters are kept
DOMAIN_LIMIT = 25      # how many of the most common domains are kept

# In the spirit of freepathons being quarterly this will track how
# discourse on the frontpage of freerepublic.com evolves over a
# quarterly time frame.
# NOTE(review): this query is not executed by the cells visible here;
# presumably it documents how data.csv was exported -- confirm.
Q1_2021_TIME_SERIES_QUERY = """
SELECT
    timestamp,
    title,
    posted_by,
    source_url
FROM
    frontpages;
"""

df = pd.read_csv('data.csv')

# The timestamp column is interpreted as seconds since the Unix epoch.
df["timestamp"] = pd.to_datetime(df["timestamp"], unit='s')

# read_csv turns empty fields into NaN, but the rest of the notebook
# identifies link-less threads by comparing source_url against "".
# Normalise missing URLs to the empty string so those comparisons work.
df["source_url"] = df["source_url"].fillna("")

# These columns repeat heavily, so store them as categoricals.
df["title"] = df["title"].astype("category")
df["posted_by"] = df["posted_by"].astype("category")
df["source_url"] = df["source_url"].astype("category")

In [None]:
# Sourced threads: every row whose source_url is something other than "".
is_sourced = df["source_url"] != ""
sourced = df.loc[is_sourced]

# Vanity threads: rows whose source_url is exactly "". This is only an
# approximation, since some vanity posts do carry links.
is_vanity = df["source_url"] == ""
vanity = df.loc[is_vanity]

In [None]:
def extract_domain(url) -> str:
    """Return the host part of ``url`` without scheme or leading ``www.``.

    Only an ``http://`` / ``https://`` prefix at the very start of the URL
    and a ``www.`` prefix at the very start of the host are removed, so
    text that merely contains those substrings elsewhere is left intact.
    The host ends at the first ``/``, ``?`` or ``#``. The result is
    lower-cased because host names are case-insensitive and differently
    cased spellings of one site should be counted together.

    Args:
        url: The URL to reduce. Values that are not strings (for example
            the NaN pandas uses for missing data) are treated as "no URL".

    Returns:
        The host name, or ``""`` when ``url`` is empty or not a string.
    """
    if not isinstance(url, str):
        return ""

    host = url.strip().lower()

    # Strip the scheme only when it is a real prefix of the URL.
    for scheme in ("http://", "https://"):
        if host.startswith(scheme):
            host = host[len(scheme):]
            break

    # The host stops at the first path, query or fragment delimiter.
    for delimiter in "/?#":
        host = host.split(delimiter, 1)[0]

    # Drop a leading "www." label, but never one embedded inside the host.
    if host.startswith("www."):
        host = host[len("www."):]

    return host


# DataFrame.insert raises ValueError when the column already exists, which
# made this cell fail on a second run. Start from a frame without any
# "domain" column left over from a previous run.
sourced = sourced.drop(columns="domain", errors="ignore")

# Derive the domain of every source URL and place the new column just
# before the last existing column.
sourced.insert(
    len(sourced.columns) - 1,
    "domain",
    sourced["source_url"].apply(extract_domain).astype("category"),
)

In [None]:
# Bar chart of where discussion threads originate: one bar per domain,
# counting each distinct source URL once, limited to the DOMAIN_LIMIT
# most frequent domains.
data = sourced[["source_url", "domain"]].drop_duplicates()
busiest_domains = data["domain"].value_counts().head(DOMAIN_LIMIT).index
sns.countplot(data=data, y="domain", order=busiest_domains)

In [None]:
# Bar chart of who starts URL-sourced threads: one bar per poster,
# counting each distinct (source URL, poster) pair once, limited to the
# TOP_POSTER_LIMIT most active posters.
data = sourced.loc[:, ["source_url", "posted_by"]].drop_duplicates()
poster_counts = data["posted_by"].value_counts()
top_sourced_posters = poster_counts.head(TOP_POSTER_LIMIT).index
sns.countplot(data=data, y="posted_by", order=top_sourced_posters)

In [None]:
# Show the whole table instead of pandas' truncated preview.
pd.set_option("display.max_rows", None, "display.max_columns", None)

# Draws a table that shows which freepers posted topics from a sourced link that originated
# from a particular domain. Only the top sourced posters are included.
who_is_posting_from_what = (
    sourced[["posted_by", "source_url", "domain"]].
    query('posted_by in @top_sourced_posters').
    drop_duplicates().
    # Both keys are categorical. observed=True keeps only the
    # (domain, poster) pairs that actually occur, instead of depending on
    # the pandas-version default, which can expand the result to every
    # category combination (including posters filtered out above).
    groupby(["domain", "posted_by"], observed=True)
)
who_is_posting_from_what[["domain"]].describe()