In [121]:
import time
from datetime import datetime
import pandas as pd
import pytz
import plotly.express as px
import plotly.io as pio
import plotly.graph_objects as go
import numpy as np
from plotly.subplots import make_subplots
from ast import literal_eval

pio.renderers.default = "notebook"


In [122]:
df = pd.read_csv(
    "./dataset/medium_articles_add_info_5.csv",
    engine="c",
    usecols=lambda x: x != "creator_country",
)

df.head()


Unnamed: 0.1,Unnamed: 0,id,title,authors,url,tags,text,is_deleted,clap_count,voter_count,...,topics,collection_name,collection_slug,collection_subscribers,creator_id,creator_follower_count,creator_following_count,creator_medium_member_at,creator_ats_qualified_at,creator_has_subdomain
0,0,969b6a42443f,Mental Note Vol. 24,['Ryan Fan'],https://medium.com/invisible-illness/mental-no...,"['Mental Health', 'Health', 'Psychology', 'Sci...",Photo by Josh Riemer on Unsplash\n\nMerry Chri...,False,153.0,9.0,...,['Mental Health'],Invisible Illness,invisible-illness,53361.0,63463afc4a3f,11406.0,4449.0,1648761000000.0,1629705000000.0,True
1,1,ae2ec0a9fc1d,Your Brain On Coronavirus,['Simon Spichak'],https://medium.com/age-of-awareness/how-the-pa...,"['Mental Health', 'Coronavirus', 'Science', 'P...",Your Brain On Coronavirus\n\nA guide to the cu...,False,80.0,5.0,...,"['Coronavirus', 'Health']",Age of Awareness,age-of-awareness,46698.0,71fb8c6e7cce,2033.0,122.0,0.0,1622733000000.0,False
2,2,f0b097d533bb,Mind Your Nose,[],https://medium.com/neodotlife/mind-your-nose-f...,"['Biotechnology', 'Neuroscience', 'Brain', 'We...",Mind Your Nose\n\nHow smell training can chang...,False,50.0,1.0,...,['Science'],NEO.LIFE,neodotlife,8721.0,c638ea84c9f5,1505.0,4.0,0.0,1631215000000.0,False
3,3,fc6719090e75,The 4 Purposes of Dreams,['Eshan Samaranayake'],https://medium.com/science-for-real/the-4-purp...,"['Health', 'Neuroscience', 'Mental Health', 'P...",Passionate about the synergy between science a...,True,,,...,,,,,,,,,,
4,4,2e5d74db978,Surviving a Rod Through the Head,['Rishav Sinha'],https://medium.com/live-your-life-on-purpose/s...,"['Brain', 'Health', 'Development', 'Psychology...","You’ve heard of him, haven’t you? Phineas Gage...",False,271.0,19.0,...,['Neuroscience'],Science For Life,science-for-life,405.0,3be91f80efb8,610.0,573.0,0.0,1620202000000.0,True


In [123]:
df.info(memory_usage="deep")


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 192368 entries, 0 to 192367
Data columns (total 29 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   Unnamed: 0                192368 non-null  int64  
 1   id                        192368 non-null  object 
 2   title                     192363 non-null  object 
 3   authors                   192368 non-null  object 
 4   url                       192368 non-null  object 
 5   tags                      192368 non-null  object 
 6   text                      192368 non-null  object 
 7   is_deleted                192368 non-null  bool   
 8   clap_count                180002 non-null  float64
 9   voter_count               180002 non-null  float64
 10  post_responses            180002 non-null  float64
 11  reading_time              180002 non-null  float64
 12  curation_status           179322 non-null  object 
 13  seo_title                 6225 non-null    o

In [124]:
def get_current_time():
    epoch = time.time()
    tz_naive = datetime.now()
    tz_aware = pytz.timezone("UTC").localize(tz_naive)

    result = {"epoch": epoch, "tz_naive": tz_naive, "tz_aware": tz_aware}
    return result


# process data
df = df.dropna(subset=["title", "url", "is_deleted"])
df.drop(
    [
        "seo_title",
        "seo_description",
        "collection_slug",
        "creator_ats_qualified_at",
    ],
    axis=1,
    inplace=True,
)
df = df.loc[~df["is_deleted"], :]
now = get_current_time()
epoch_now = now["epoch"]
df["latest_published_at"] = df["latest_published_at"].apply(
    lambda x: x / 1000 if x > epoch_now else x
)
df["latest_published_at_epoch"] = df["latest_published_at"]
df["latest_published_at"] = pd.to_datetime(df["latest_published_at"], unit="s")
df["in_collection"] = ~df["collection_name"].isna()
df["is_locked"] = df["is_locked"].astype("bool")
df["creator_has_subdomain"] = df["creator_has_subdomain"].astype("bool")
df["collection_name_with_self_publish"] = df["collection_name"].fillna("Self Publish")
df["is_creator_medium_member"] = df["creator_medium_member_at"].apply(lambda x: x > 0)
df["curation_status"] = pd.Categorical(
    df["curation_status"],
    categories=[
        "CURATION_STATUS_DISTRIBUTED",
        "CURATION_STATUS_DISTRIBUTED_AND_DISABLED",
        "CURATION_STATUS_NOT_DISTRIBUTED",
        "CURATION_STATUS_NOT_REVIEWED",
        "CURATION_STATUS_DISABLED",
    ],
    ordered=True,
)
df["collection_name"] = pd.Categorical(df["collection_name"])
df["month_year"] = df["latest_published_at"].dt.strftime("%Y-%m")

unnamed_columns = [col for col in df if "Unnamed" in col]
df.drop(columns=unnamed_columns, inplace=True)

print(df.info(memory_usage="deep"))
print(len(df.index))


<class 'pandas.core.frame.DataFrame'>
Int64Index: 179997 entries, 0 to 192367
Data columns (total 29 columns):
 #   Column                             Non-Null Count   Dtype         
---  ------                             --------------   -----         
 0   id                                 179997 non-null  object        
 1   title                              179997 non-null  object        
 2   authors                            179997 non-null  object        
 3   url                                179997 non-null  object        
 4   tags                               179997 non-null  object        
 5   text                               179997 non-null  object        
 6   is_deleted                         179997 non-null  bool          
 7   clap_count                         179997 non-null  float64       
 8   voter_count                        179997 non-null  float64       
 9   post_responses                     179997 non-null  float64       
 10  reading_time        

In [125]:
# nb articles over time
df_count_by_month = df[["latest_published_at", "title"]].copy()
df_count_by_month = df_count_by_month.set_index("latest_published_at")
df_count_by_month = df_count_by_month["title"].resample("M").count()
df_count_by_month = df_count_by_month.reset_index()
df_count_by_month.rename(columns={"title": "count"}, inplace=True)
px.bar(
    df_count_by_month,
    x="latest_published_at",
    y="count",
    labels={"latest_published_at": "Publish Time", "count": "Number articles"},
    title="Number articles over time",
    width=1000,
)


In [126]:
q_low = df["latest_published_at"].quantile(0.25)
median = df["latest_published_at"].median()
q_high = df["latest_published_at"].quantile(0.75)
print(q_low)
print(median)
print(q_high)


2020-02-17 16:05:54.536000
2020-12-05 16:37:22.348999936
2020-12-26 23:32:13.447000064


In [127]:
# reaction over time
df_reactions_by_month = df[
    ["latest_published_at", "clap_count", "voter_count", "post_responses", "title"]
].copy()
df_reactions_by_month["month_year"] = df_reactions_by_month[
    "latest_published_at"
].dt.strftime("%Y-%m")
df_reactions_by_month = (
    df_reactions_by_month.groupby(by=["month_year"])
    .agg(
        clap_avg=("clap_count", "median"),
        voter_avg=("voter_count", "median"),
        nb_articles=("title", "count"),
    )
    .reset_index()
)
df_reactions_by_month = df_reactions_by_month.loc[
    df_reactions_by_month["nb_articles"] > 100, :
]

trace1 = go.Scatter(
    x=df_reactions_by_month["month_year"],
    y=df_reactions_by_month["clap_avg"],
    mode="lines",
    name="Average claps per article",
)
trace2 = go.Scatter(
    x=df_reactions_by_month["month_year"],
    y=df_reactions_by_month["voter_avg"],
    mode="lines",
    name="Average voters per article",
)
fig = go.Figure(data=[trace1, trace2], layout=go.Layout(title="Reactions", width=1000))
fig.update_layout(yaxis_range=[0, 200])
fig.add_vline(
    x=datetime.strptime("2017-03-21", "%Y-%m-%d").timestamp() * 1000,
    annotation_text="Introduction of Partner Program in March 2017",
)
fig.show()


In [130]:
def compare_curation_status_on_bool_column(df, column, true_group_name, false_group_name):
    df = df[[column, "curation_status", "title"]].copy()
    nb_article_false = len(df.loc[~df[column], :])
    nb_article_true = len(df.loc[df[column], :])
    df_grouped_by_column_and_curation = df.groupby(
        by=[column, "curation_status"], as_index=False
    ).count()
    df_grouped_by_column_and_curation.rename(columns={"title": "count"}, inplace=True)
    df_grouped_by_column_and_curation[
        "percentage"
    ] = df_grouped_by_column_and_curation.apply(
        (
            lambda x: x["count"] * 100.0 / nb_article_true
            if x[column]
            else x["count"] * 100.0 / nb_article_false,
        ),
        axis=1,
    )
    curation_true = df_grouped_by_column_and_curation.loc[
        df_grouped_by_column_and_curation[column]
    ]["curation_status"]
    percentage_true = df_grouped_by_column_and_curation.loc[
        df_grouped_by_column_and_curation[column]
    ]["percentage"]
    curation_false = df_grouped_by_column_and_curation.loc[
        ~df_grouped_by_column_and_curation[column]
    ]["curation_status"]
    percentage_false = df_grouped_by_column_and_curation.loc[
        ~df_grouped_by_column_and_curation[column]
    ]["percentage"]

    colors = [
        "#00CC96",
        "#FFA15A",
        "#EF543B",
        "#636EFA",
        "#AB63FA",
    ]
    fig = make_subplots(rows=1, cols=2, specs=[[{"type": "pie"}, {"type": "pie"}]])
    trace1 = go.Pie(
        labels=curation_true,
        values=percentage_true,
        domain=dict(x=[0, 0.5]),
        name=true_group_name,
        sort=False,
        marker=dict(colors=colors),
    )
    trace2 = go.Pie(
        labels=curation_false,
        values=percentage_false,
        domain=dict(x=[0.5, 1]),
        name=false_group_name,
        sort=False,
        marker=dict(colors=colors),
    )
    data = [trace1, trace2]
    layout = go.Layout(
        title=f"Article distribution status. {true_group_name} vs {false_group_name}",
        width=1200,
    )
    fig = go.Figure(data=data, layout=layout)

    return fig


In [131]:
fig = compare_curation_status_on_bool_column(
    df, "in_collection", "Publish in Collection", "Self Publish"
)
fig.show()


In [133]:
fig = compare_curation_status_on_bool_column(
    df,"is_locked", "Paid articles", "Public articles"
)
fig.show()


In [134]:
fig = compare_curation_status_on_bool_column(
    df, "is_creator_medium_member", "Medium member creators", "Not medium member creators"
)
fig.show()


In [135]:
fig = compare_curation_status_on_bool_column(
    df, "creator_has_subdomain", "Creator use subdomain", "Creator use sub path in URL"
)
fig.show()


In [136]:
def compare_metric_on_bool_column(
    df, bool_column, metric_column, agg_mode, metric_name, true_group_name, false_group_name
):
    df_reaction_by_column = df[[bool_column, metric_column]].copy()
    df_reaction_by_column = df_reaction_by_column.groupby(
        by=[bool_column], as_index=False
    ).agg({metric_column: agg_mode})
    df_reaction_by_column["category"] = df_reaction_by_column[bool_column].map(
        {True: true_group_name, False: false_group_name}
    )

    trace = go.Bar(
        x=df_reaction_by_column["category"],
        y=df_reaction_by_column[metric_column],
        width=0.3,
        name=metric_name,
    )
    layout = go.Layout(
        {
            "title": f"{metric_name}. {false_group_name} vs {true_group_name}",
            "yaxis_title": metric_name,
            "width": 1200,
        }
    )
    fig = go.Figure(data=[trace], layout=layout)
    return fig


In [137]:
# fig = make_subplots(rows=1, cols=1)
fig = compare_metric_on_bool_column(
    df,
    "in_collection",
    "clap_count",
    "median",
    "Number of average claps (median)",
    "Publish in collection",
    "Self Publish",
)
# fig.add_trace(trace1, row=1, col=1)
# # trace2 = compare_metric_on_bool_column(
# #     "in_collection",
# #     "clap_count",
# #     "mean",
# #     "Number of average claps (mean)",
# #     "Self Publish",
# #     "Publish in collection",
# # )
# # fig.add_trace(trace2, row=1, col=2)
# layout = go.Layout(
#     {
#         "title": "Number of average claps. Self Publish vs Publish in collection",
#         "yaxis_title": "Number of average claps",
#     },
#     width=1200,
# )
# fig.update_layout(layout)
fig.show()


In [138]:
# fig = make_subplots(rows=1, cols=1)
fig = compare_metric_on_bool_column(
    df,
    "is_locked",
    "clap_count",
    "median",
    "Number of average claps (median)",
    "Paid articles",
    "Public articles",
)
# fig.add_trace(trace1, row=1, col=1)
# # trace2 = compare_metric_on_bool_column(
# #     "is_locked",
# #     "clap_count",
# #     "mean",
# #     "Number of average claps (mean)",
# #     "Paid articles",
# #     "Public articles",
# # )
# # fig.add_trace(trace2, row=1, col=2)
# layout = go.Layout(
#     {
#         "title": "Number of average claps. Paid articles vs Public articles",
#         "yaxis_title": "Number of average claps",
#     },
#     width=1000,
# )
# fig.update_layout(layout)
fig.show()


In [139]:
# fig = make_subplots(rows=1, cols=1)
fig = compare_metric_on_bool_column(
    df,
    "is_creator_medium_member",
    "clap_count",
    "median",
    "Number of average claps (median)",
    "Medium Members",
    "Not Medium Members",
)
# fig.add_trace(trace1, row=1, col=1)
# # trace2 = compare_metric_on_bool_column(
# #     "is_creator_medium_member",
# #     "clap_count",
# #     "mean",
# #     "Number of average claps (mean)",
# #     "Medium Members",
# #     "Not Medium Members",
# # )
# # fig.add_trace(trace2, row=1, col=2)
# layout = go.Layout(
#     {
#         "title": "Number of average claps. Medium Members vs Not Medium Members",
#         "yaxis_title": "Number of average claps",
#     },
#     width=1000,
# )
# fig.update_layout(layout)
fig.show()


In [140]:
fig = make_subplots(rows=1, cols=1)
fig = compare_metric_on_bool_column(
    df,
    "creator_has_subdomain",
    "clap_count",
    "median",
    "Number of average claps (median)",
    "Sub Domain",
    "Path in URL",
)
# fig.add_trace(trace1, row=1, col=1)
# # trace2 = compare_metric_on_bool_column(
# #     "creator_has_subdomain",
# #     "clap_count",
# #     "mean",
# #     "Number of average claps (mean)",
# #     "Sub Domain",
# #     "Path in URL",
# # )
# # fig.add_trace(trace2, row=1, col=2)
# layout = go.Layout(
#     {
#         "title": "Number of average claps. Sub Domain vs Path in URL",
#         "yaxis_title": "Number of average claps",
#     },
#     width=1000,
# )
# fig.update_layout(layout)
fig.show()


In [141]:
# reaction by reading time
df_reaction_by_reading_time = df[
    ["title", "voter_count", "clap_count", "post_responses", "reading_time"]
].copy()
df_reaction_by_reading_time["reading_time"] = df_reaction_by_reading_time[
    "reading_time"
].apply(round)
reading_time_high = df_reaction_by_reading_time["reading_time"].quantile(0.99)
df_reaction_by_reading_time = df_reaction_by_reading_time[
    df_reaction_by_reading_time["reading_time"] < reading_time_high
]

df_reaction_by_reading_time = (
    df_reaction_by_reading_time.groupby(by=["reading_time"])
    .agg(nb_articles=("title", "count"), clap_avg=("clap_count", "median"))
    .reset_index()
)
df_reaction_by_reading_time = df_reaction_by_reading_time.sort_values(
    by=["reading_time"]
)
px.bar(
    df_reaction_by_reading_time,
    x="reading_time",
    y=["nb_articles"],
    barmode="group",
    labels={"reading_time": "Reading time"},
    title="Distribution of articles by reading time",
    width=1000,
)


In [142]:
px.bar(
    df_reaction_by_reading_time,
    x="reading_time",
    y=["clap_avg"],
    barmode="group",
    labels={"reading_time": "Reading time"},
    title="Distribution of average claps by reading time",
    width=1000,
)


In [143]:
# top collection
df_reaction_by_collection = df[
    [
        "collection_name",
        "title",
        "voter_count",
        "clap_count",
        "post_responses",
        "reading_time",
    ]
].copy()
df_reaction_by_collection = (
    df_reaction_by_collection.groupby(by=["collection_name"], as_index=False)
    .agg(
        nb_articles=("title", "count"),
        voter_avg=("voter_count", "median"),
        clap_avg=("clap_count", "median"),
        responses_avg=("post_responses", "median"),
        reading_time_avg=("reading_time", "median"),
    )
    .sort_values(by=["nb_articles"], ascending=[False])
)
df_reaction_by_collection["claps_per_voter_avg"] = (
    df_reaction_by_collection["clap_avg"] / df_reaction_by_collection["voter_avg"]
)

df_collection = df[["collection_name", "collection_subscribers"]].drop_duplicates(
    subset=["collection_name"], keep="last"
)
df_reaction_by_collection = pd.merge(
    df_reaction_by_collection, df_collection, how="inner", on="collection_name"
)
display(df_reaction_by_collection.head(10))

px.bar(
    df_reaction_by_collection.head(10),
    x="collection_name",
    y=["voter_avg", "clap_avg"],
    barmode="group",
    labels={"collection_name": "Collection"},
    title="Reactions of popular collections",
)


Unnamed: 0,collection_name,nb_articles,voter_avg,clap_avg,responses_avg,reading_time_avg,claps_per_voter_avg,collection_subscribers
0,Towards Data Science,4074,25.0,113.0,1.0,6.09827,4.52,647198.0
1,The Startup,3623,11.0,110.0,0.0,5.308491,10.0,764069.0
2,Better Programming,1429,30.0,221.0,1.0,4.757547,7.366667,209786.0
3,ILLUMINATION,1386,5.0,109.5,1.0,2.914623,21.9,61987.0
4,Analytics Vidhya,1364,5.0,24.0,0.0,4.747799,4.8,57302.0
5,DataDrivenInvestor,1034,5.0,85.5,0.0,4.672327,17.1,59305.0
6,UX Collective,695,39.0,236.0,1.0,5.888679,6.051282,450591.0
7,HackerNoon.com,602,12.5,123.0,1.0,5.024057,9.84,468783.0
8,JavaScript in Plain English,588,10.0,89.5,0.0,3.729245,8.95,67853.0
9,An Injustice!,542,19.0,296.5,2.0,5.422799,15.605263,20292.0


In [146]:
# top author

df_authors = df[
    [
        "title",
        "authors",
        "voter_count",
        "clap_count",
        "post_responses",
        "creator_has_subdomain",
        "creator_follower_count",
        "topics",
    ]
].copy()
df_authors["authors"] = df_authors["authors"].apply(literal_eval)
df_authors = df_authors.explode("authors")
print(df_authors["authors"].nunique())

df_authors_stats = (
    df_authors.groupby(by=["authors"])
    .agg(
        nb_articles=("title", "count"),
        voter_avg=("voter_count", "median"),
        voter_sum=("voter_count", "sum"),
        voter_max=("voter_count", "max"),
        clap_avg=("clap_count", "median"),
        clap_sum=("clap_count", "sum"),
        clap_max=("clap_count", "max"),
        creator_follower_count=("creator_follower_count", "max"),
        creator_has_subdomain=("creator_has_subdomain", "max"),
    )
    .reset_index()
)

df_authors_many_articles_stats = df_authors_stats.loc[
    df_authors_stats["nb_articles"] >= 10, :
]
print(len(df_authors_many_articles_stats))


80951
1626


In [151]:
df_top_authors_by_voters = df_authors_many_articles_stats.sort_values(
    by=["voter_avg"], ascending=[False]
)[:10]
px.bar(
    df_top_authors_by_voters[::-1],
    x="voter_avg",
    y="authors",
    orientation="h",
    labels={"voter_avg": "Average voters per article", "authors": "Authors"},
    title="Top authors on Medium (by number voters per article)",
    width=1000,
)


In [152]:
display(df_top_authors_by_voters)

Unnamed: 0,authors,nb_articles,voter_avg,voter_sum,voter_max,clap_avg,clap_sum,clap_max,creator_follower_count,creator_has_subdomain
65390,Sarah Cooper,13,1222.0,44927.0,22341.0,7409.0,187230.0,79166.0,143908.0,True
52662,Nassim Nicholas Taleb,12,1033.5,17927.0,5954.0,3230.0,91915.0,26295.0,120881.0,True
23656,Eric Elliott,25,1012.0,46257.0,11868.0,5619.0,231666.0,50370.0,110453.0,False
53665,Nick Babich,10,963.0,21955.0,8632.0,3536.0,97699.0,44424.0,81370.0,False
35254,Jessica Valenti,11,932.0,18207.0,5304.0,9098.0,148963.0,41326.0,48633.0,True
76532,Vaidehi Joshi,11,706.0,8898.0,1632.0,5183.0,64111.0,12112.0,28462.0,False
64700,Samer Buna,13,630.0,24199.0,12595.0,3928.0,144966.0,79156.0,25998.0,False
49802,Michael Thompson,10,619.0,8111.0,2447.0,3950.5,48420.0,11062.0,77491.0,True
48648,Medium Creators,11,582.0,7322.0,1341.0,4842.0,54572.0,8731.0,72043.0,False
18057,Danny Sapio,11,482.0,5858.0,1216.0,1940.0,25758.0,4916.0,14068.0,True


In [153]:
df_top_authors_by_voters_with_topics = df_authors.merge(
    df_top_authors_by_voters, how="inner", on="authors"
)
df_top_authors_by_voters_with_topics["topics"] = df_top_authors_by_voters_with_topics[
    "topics"
].apply(lambda x: literal_eval(x) if type(x) == str else None)
df_top_authors_by_voters_with_topics = df_top_authors_by_voters_with_topics.explode(
    "topics"
)
df_top_authors_by_voters_with_topics_count = (
    df_top_authors_by_voters_with_topics.groupby(by=["authors", "topics"])
    .agg(nb_articles=("topics", "count"), voter_avg =("voter_avg", "mean"))
    .reset_index().sort_values(by=["voter_avg"], ascending=[False])
)

rows = 5
cols = 2
fig = make_subplots(rows=rows, cols=cols, subplot_titles=df_top_authors_by_voters_with_topics_count["authors"].unique())


def create_topic_bar_chart(df, author):
    df_filtered = df.loc[df["authors"] == author, :]
    x = df_filtered["topics"]
    y = df_filtered["nb_articles"]
    return go.Bar(x=x, y=y)


c = 0
for i in range(1, 1 + rows):
    for j in range(1, 1 + cols):
        author = df_top_authors_by_voters_with_topics_count["authors"].unique()[c]
        trace = create_topic_bar_chart(
            df_top_authors_by_voters_with_topics_count, author
        )
        c = c + 1
        fig.add_trace(trace, row=i, col=j)

fig.update_layout(go.Layout(title=f"Top author topics", height=1200, width=1200))
fig.show()


In [154]:
df_top_authors_by_claps = df_authors_many_articles_stats.sort_values(
    by=["clap_avg"], ascending=[False]
)
px.bar(
    df_top_authors_by_claps[20::-1],
    x="clap_avg",
    y="authors",
    orientation="h",
    labels={"clap_avg": "Average claps per article"},
    title="Top authors on Medium (by number claps received per article)",
)


In [155]:
nb_total_authors = len(df_authors_stats.index)
df_top_10per_authors = df_authors_stats.sort_values(
    by=["voter_sum"], ascending=[False]
).head(int(0.1 * nb_total_authors))

perc_voter_of_top_10per_author = (
    df_top_10per_authors["voter_sum"].sum() / df_authors_stats["voter_sum"].sum() * 100
)
print(f"10% top authors receive: {perc_voter_of_top_10per_author}% voters")
df_authors_stats_removed_outline = df_authors_stats[df_authors_stats["voter_sum"] <df_authors_stats["voter_sum"].quantile(0.99)]
px.histogram(
    df_authors_stats_removed_outline,
    x="voter_sum",
    labels={"voter_sum": "Voter received"},
    title="Distribution of voters received",width=1000
).update_layout(yaxis_title="Number authors")


10% top authors receive: 91.35839479279964% voters


In [156]:
df_top_10per_authors["subdomain_label"] = df_top_10per_authors[
    "creator_has_subdomain"
].map({True: "Creators use subdomain", False: "Creators don't use subdomain"})

df_top_10per_authors_group = (
    df_top_10per_authors.groupby(by=["subdomain_label"])
    .agg(nb_authors=("authors", "count"))
    .reset_index()
)
fig = px.pie(
    df_top_10per_authors_group,
    values="nb_authors",
    names="subdomain_label",
    title="Top authors prefer subdomain or sub path?",
    width=1000,
)
fig.show()


In [159]:
q_high_follower = df_authors_stats["creator_follower_count"].quantile(0.99)
q_high_clap = df_authors_stats["voter_avg"].quantile(0.99)
df_authors_stats_filtered = df_authors_stats[
    (df_authors_stats["creator_follower_count"] < q_high_follower)
]
df_authors_stats_filtered = df_authors_stats_filtered[
    df_authors_stats_filtered["voter_avg"] < q_high_clap
]
px.scatter(
    x=df_authors_stats_filtered["creator_follower_count"],
    y=df_authors_stats_filtered["voter_avg"],
    width=1000,
    labels={"x": "Number of followers", "y": "Average voter per article"},
)


In [160]:
q_high_follower = df_authors_stats["creator_follower_count"].quantile(0.99)
q_high_clap = df_authors_stats["voter_max"].quantile(0.99)
df_authors_stats_filtered_2 = df_authors_stats[
    df_authors_stats["creator_follower_count"] < q_high_follower
]
df_authors_stats_filtered_2 = df_authors_stats_filtered_2[
    df_authors_stats_filtered_2["voter_max"] < q_high_clap
]
px.scatter(
    x=df_authors_stats_filtered_2["creator_follower_count"],
    y=df_authors_stats_filtered_2["voter_max"],
    width=1000,
    labels={"x": "Number of followers", "y": "Max voter received"},
)


In [161]:
# most frequent topics
df_topics = df[["title", "topics", "latest_published_at"]].copy()
df_topics["topics"] = df_topics["topics"].apply(literal_eval)
df_topics = df_topics.explode("topics")

df_topics_count = (
    df_topics.groupby(by=["topics"])
    .agg(nb_articles=("title", "count"))
    .reset_index()
    .sort_values(by=["nb_articles"], ascending=[False])
    .head(10)
)
px.bar(
    df_topics_count[::-1],
    x="nb_articles",
    y="topics",
    orientation="h",
    labels={"nb_articles": "Number articles"},
    title="Most popular topics on Medium",
    width=1000,
)


In [108]:
list_topics_ref = [
    {"topics": "Poetry", "nb_articles": 597000},
    {"topics": "Cryptocurrency", "nb_articles": 552000},
    {"topics": "Politics", "nb_articles": 458000},
    {"topics": "Marketing", "nb_articles": 357000},
    {"topics": "Design", "nb_articles": 303000},
    {"topics": "Programming", "nb_articles": 265000},
    {"topics": "Machine Learning", "nb_articles": 208000},
    {"topics": "JavaScript", "nb_articles": 204000},
    {"topics": "Data Science", "nb_articles": 188000},
    {"topics": "Self", "nb_articles": 155000},
    {"topics": "Work", "nb_articles": 107000},
]
df_topics_reference = pd.DataFrame(list_topics_ref)
px.bar(
    df_topics_reference[::-1],
    x="nb_articles",
    y="topics",
    orientation="h",
    labels={"nb_articles": "Number articles"},
    title="Most popular topics on Medium - Reference",
)


In [162]:
def filter_topic(topic):
    return topic in [
        "Programming",
        "Cryptocurrency",
        "Coronavirus",
        "Health",
        "Politics",
    ]


# topics evolution over time
df_topics_to_analyze = df_topics.copy()
df_topics_to_analyze["keep"] = df_topics_to_analyze["topics"].apply(filter_topic)
df_topics_to_analyze = df_topics_to_analyze.loc[df_topics_to_analyze["keep"], :]
df_topics_to_analyze["month_year"] = df_topics_to_analyze[
    "latest_published_at"
].dt.strftime("%Y-%m")

# count nb articles for each topic by month
df_topic_by_month = (
    df_topics_to_analyze.groupby(by=["month_year", "topics"])
    .agg(nb_articles=("title", "count"))
    .reset_index()
)
# compute cummulative nb articles for each topics
df_topic_by_month["cummulative_nb_articles"] = df_topic_by_month.groupby(["topics"])[
    "nb_articles"
].cumsum()

df_count_by_month["month_year"] = df_count_by_month["latest_published_at"].dt.strftime(
    "%Y-%m"
)
df_count_by_month["cummulative_nb_articles"] = df_count_by_month["count"].cumsum()
df_topic_by_month = df_topic_by_month.merge(
    df_count_by_month, how="inner", on="month_year", suffixes=["", "_total"]
)
df_topic_by_month["percentage"] = (
    df_topic_by_month["cummulative_nb_articles"]
    * 100.0
    / df_topic_by_month["cummulative_nb_articles_total"]
)

fig = px.line(
    df_topic_by_month,
    x="month_year",
    y="percentage",
    color="topics",
    title="Evolution of topics over time",
    labels={"percentage": "% article in topic", "month_year": "Time"},
    width=1000,
)
fig.show()


In [163]:
# most frequent tags
df_tags = df[["title", "tags"]].copy()
df_tags["tags"] = df_tags["tags"].apply(literal_eval)
df_tags = df_tags.explode("tags")

df_tags_count = (
    df_tags.groupby(by=["tags"])
    .agg(nb_articles=("title", "count"))
    .reset_index()
    .sort_values(by=["nb_articles"], ascending=[False])
    .head(20)
)
px.bar(
    df_tags_count[::-1],
    x="nb_articles",
    y="tags",
    orientation="h",
    labels={"nb_articles": "Number articles"},
    title="Most popular tags on Medium",
)


In [111]:
def count_words(text: str):
    return len(text.split(" "))


In [112]:
df["title_length"] = df["title"].apply(count_words)
df_by_title_length = df[["title", "title_length"]].copy()
q_high = df_by_title_length["title_length"].quantile(0.99)
df_by_title_length = df_by_title_length[df_by_title_length["title_length"] < q_high]

px.histogram(
    df_by_title_length,
    x="title_length",
    labels={"title_length": "Title length"},
    title="Distribution of title length",
).update_layout(yaxis_title="Number articles", width=1000)


In [113]:
import re

df["text_processed"] = df["text"].apply(lambda x: re.sub("\s{2,}", " ", x))
df["text_processed_length"] = df["text_processed"].apply(count_words)
df_by_text_length = df[["title", "text_processed_length"]].copy()
q_high = df_by_text_length["text_processed_length"].quantile(0.99)
df_by_text_length = df_by_text_length[
    df_by_text_length["text_processed_length"] < q_high
]

px.histogram(
    df_by_text_length,
    x="text_processed_length",
    labels={"text_processed_length": "Content length"},
    title="Distribution of content length",
).update_layout(yaxis_title="Number articles", width=1000)


In [114]:
# relation between article length & reading times
df_art_len_n_reading_time = df.loc[
    df["is_locked"] == False, ["title", "text_processed", "image_count", "reading_time"]
].copy()
df_art_len_n_reading_time["total_words"] = df_art_len_n_reading_time[
    ["title", "text_processed"]
].apply(
    lambda row: count_words(row["title"]) + count_words(row["text_processed"]), axis=1
)
# q_high_len = df_art_len_n_reading_time["total_words"].quantile(0.99)
# q_low_len = df_art_len_n_reading_time["total_words"].quantile(0.01)
q_high_img = df_art_len_n_reading_time["image_count"].quantile(0.75)
q_low_img = df_art_len_n_reading_time["image_count"].quantile(0.25)
# q_high_time = df_art_len_n_reading_time["reading_time"].quantile(0.99)
# q_low_time = df_art_len_n_reading_time["reading_time"].quantile(0.01)
df_art_len_n_reading_time = df_art_len_n_reading_time.loc[
    (df_art_len_n_reading_time["image_count"] < q_high_img)
    & (df_art_len_n_reading_time["image_count"] > q_low_img),
    :,
]
px.scatter(
    x=df_art_len_n_reading_time["total_words"],
    y=df_art_len_n_reading_time["reading_time"],
    trendline="ols",
    width=1000,
    labels={"x": "Total words", "y": "Reading time"},
)


In [115]:
from sklearn.linear_model import LinearRegression


X = df_art_len_n_reading_time[["total_words", "image_count"]].values
y = df_art_len_n_reading_time["reading_time"].values
X = X.reshape(-1, 2)
model = LinearRegression(fit_intercept=True)
model.fit(X, y)
coef = model.coef_
intercept = model.intercept_
print(
    f"Linear Regression model: reading_time = {coef[0]}*total_words + {coef[1]}*nb_images + {intercept}" 
)
test = pd.Series([264, 0])
test = test.values.reshape(-1, 2)
predictions = model.predict(test)[0]
print(predictions)


Linear Regression model: reading_time = 0.003744765179416645*total_words + 0.28548385418461236*nb_images + 0.06752432629795857
1.0561423336639528


In [116]:
import string


def get_strong_words():
    with open("./dataset/strong_words.txt") as f:
        words = [line.strip() for line in f.readlines()]
    return words


strong_words = get_strong_words()
strong_words = [word.lower() for word in strong_words]


def rm_punc_from_word(word):
    clean_alphabet_list = [
        alphabet for alphabet in word if alphabet not in string.punctuation
    ]
    return "".join(clean_alphabet_list)


def rm_punc_from_text(text):
    clean_word_list = [rm_punc_from_word(word) for word in text]
    return "".join(clean_word_list)


def count_strong_words(title: str) -> int:

    title = title.lower()
    title = rm_punc_from_text(title)
    words = title.split()

    strong_words_in_title = sum([1 if word in strong_words else 0 for word in words])
    return strong_words_in_title


df["title_strong_words_count"] = df["title"].apply(count_strong_words)

df_reaction_by_good_title = df[
    [
        "title",
        "title_strong_words_count",
        "voter_count",
        "clap_count",
        "post_responses",
    ]
]
df_reaction_by_good_title = df_reaction_by_good_title.groupby(
    by=["title_strong_words_count"], as_index=False
).agg({"title": "count", "voter_count": "median", "clap_count": "median"})
df_reaction_by_good_title.rename(
    columns={"title": "count", "title_strong_words_count": "nb_strong_words_in_title"},
    inplace=True,
)
df_reaction_by_good_title["claps_by_voter"] = (
    df_reaction_by_good_title["clap_count"] / df_reaction_by_good_title["voter_count"]
)
px.bar(
    df_reaction_by_good_title,
    x="nb_strong_words_in_title",
    labels={"nb_strong_words_in_title": "Number of strong words in the title"},
    title="Reaction by strong words in title",
    y=["clap_count", "voter_count"],
    barmode="group",
    width=1000,
)


In [117]:
df["curation_status_numeric"] = (
    df["curation_status"]
    .map(
        {
            "CURATION_STATUS_DISTRIBUTED": 5,
            "CURATION_STATUS_DISTRIBUTED_AND_DISABLED": 4,
            "CURATION_STATUS_NOT_DISTRIBUTED": 3,
            "CURATION_STATUS_NOT_REVIEWED": 2,
            "CURATION_STATUS_DISABLED": 1,
        }
    )
    .astype(float)
)


In [118]:
df_numeric = df[
    [
        "reading_time",
        "text_processed_length",
        "collection_subscribers",
        "clap_count",
        "voter_count",
        "post_responses",
        "title_length",
        "title_strong_words_count",
        "is_locked",
        "in_collection",
        "is_creator_medium_member",
        "creator_has_subdomain",
        "creator_follower_count",
        "curation_status_numeric",
    ]
]
df_corr = df_numeric.corr()  # Generate correlation matrix

fig = go.Figure()
fig.add_trace(
    go.Heatmap(
        x=df_corr.columns,
        y=df_corr.index,
        z=np.array(df_corr),
        colorscale=px.colors.diverging.RdBu,
    )
)


In [119]:
df_id = df[["id", "latest_published_at"]].copy()
df_id["len"] = df_id["id"].apply(len)
df_id["month_year"] = df_id["latest_published_at"].dt.strftime("%Y-%m")
df_id = (
    df_id.groupby(by=["month_year", "len"])
    .agg(len_count=("len", "count"))
    .reset_index()
)
df_id["len_count_12"] = df_id.apply(
    lambda x: x["len_count"] if x["len"] == 12 else 0, axis=1
)
df_id["len_count_11"] = df_id.apply(
    lambda x: x["len_count"] if x["len"] == 11 else 0, axis=1
)

px.bar(df_id, x="month_year", y=["len_count_12", "len_count_11"], barmode="group")


In [120]:
def extract_domain(url):
    if not url:
        return None
    domain = url.split("/")[2]
    return domain


df["domain"] = df["url"].apply(extract_domain)
df_domain_count = (
    df[["domain", "title", "month_year"]]
    .groupby(by=["domain", "month_year"])
    .agg(nb_articles=("title", "count"))
    .reset_index()
)
df_domain_count_2 = (
    df_domain_count.groupby(by=["month_year"])
    .agg(nb_articles=("nb_articles", "sum"))
    .reset_index()
)
df_domain_count = df_domain_count[df_domain_count["domain"] == "medium.com"]
df_domain_count = df_domain_count.merge(
    df_domain_count_2, how="inner", on=["month_year"], suffixes=["_medium", "_total"]
)
df_domain_count = df_domain_count[df_domain_count["nb_articles_total"] > 100]
df_domain_count["percentage"] = (
    df_domain_count["nb_articles_medium"] * 100.0 / df_domain_count["nb_articles_total"]
)
px.line(
    df_domain_count,
    x="month_year",
    y="percentage",
    width=1000,
    labels={"percentage": "% article published on medium.com", "month_year": "Time"},
)
