In [None]:
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np

import plotly.express as px
import wordcloud

In [None]:
# Load the article posting schedule from the Excel workbook.
medium = pd.read_excel(io='medium-posting-schedule.xlsx')

In [None]:
# How many rows share each distinct "Stats updated" value.
medium.loc[:, 'Stats updated'].value_counts()

In [None]:
# Load the audience statistics and give the follower column a snake_case name.
audience = pd.read_excel(io='medium-audience-analysis.xlsx')
audience = audience.rename(columns={'New followers': 'new_followers'})
audience

## Audience stats

In [None]:
# Followers that already existed before each row = running total of all earlier
# `new_followers` values (0 for the first row).
# Missing values are replaced by 0 *before* accumulating; filling after the
# cumsum (as before) reset the running total to 0 on any row following a gap.
audience['old_followers'] = (
    audience['new_followers']
    .fillna(0)
    .shift(1, fill_value=0)
    .cumsum()
    .astype(int)
)
audience


In [None]:
# Bar chart of previously existing vs. newly gained followers per date.
follower_cols = ['old_followers', 'new_followers']
followers_long = audience.melt(id_vars='Date', value_vars=follower_cols)

fig = px.bar(
    followers_long,
    x='Date',
    y='value',
    color='variable',
    category_orders={'variable': follower_cols},
    labels={'variable': 'Follower type'},
    color_discrete_sequence=px.colors.qualitative.T10,
)

# Replace the raw column names shown in the legend by short display names.
legend_handles = {'old_followers': 'Old', 'new_followers': 'New'}
for trace in fig.data:
    trace.update(name=legend_handles[trace.name])

fig.update_xaxes(title='Date')
fig.update_yaxes(title='Follower count')
fig.show()

## Article stats

In [None]:
# Number of articles per workflow status.
# `value_counts().reset_index()` produces the columns ('index', 'Status') on
# pandas < 2.0 but ('Status', 'count') on pandas >= 2.0, so both columns are
# named explicitly here to make the cell work on either version.
status_counts = (
    medium['Status']
    .value_counts()
    .rename_axis('status')
    .reset_index(name='count')
)

fig = px.bar(
    status_counts,
    x='status', y='count', color='status',
    category_orders={'status': ['published', 'submitted', 'scheduled',
                                'finished draft', 'early draft', 'idea']},
    labels={'status': 'Article status', 'count': 'Article count'},
    color_discrete_sequence=px.colors.qualitative.G10,
    width=500,
    text_auto='.0f',
)
# The x axis already names every bar, so the colour legend is redundant.
fig.update_layout(showlegend=False)
fig.show()

In [None]:
# Keep only the articles whose status is 'published'; copy so that columns
# added later do not write into a view of `medium`.
cond = medium['Status'].eq('published')
published = medium.loc[cond].copy()
published

In [None]:
# Publications ordered from most to fewest published articles; reused as the
# category order of later charts.
publication_order = published['Publication'].value_counts().index.to_numpy()

In [None]:
# Number of published articles per publication.
# Columns are named explicitly because `value_counts().reset_index()` returns
# ('index', 'Publication') on pandas < 2.0 but ('Publication', 'count') on
# pandas >= 2.0. The former label key 'Status' matched no plotted column.
publication_counts = (
    published['Publication']
    .value_counts()
    .rename_axis('publication')
    .reset_index(name='count')
)

fig = px.bar(
    publication_counts,
    x='publication', y='count', color='publication',
    labels={'publication': 'Publication', 'count': 'Count'},
    category_orders={'publication': publication_order},
    color_discrete_sequence=px.colors.qualitative.G10,
    width=500,
    text_auto='.0f',
)
# The x axis already names every bar, so the colour legend is redundant.
fig.update_layout(showlegend=False)
fig.update_yaxes(title='Count')
fig.show()

In [None]:
# Number of articles published on each date, shown as a bubble per date.
# Columns are named explicitly because the names produced by
# `value_counts().reset_index()` differ between pandas < 2.0 and >= 2.0.
articles_per_date = (
    published['Publication date']
    .value_counts()
    .rename_axis('publication_date')
    .reset_index(name='article_count')
)

fig = px.scatter(
    articles_per_date,
    x='publication_date', y='article_count', size='article_count',
    labels={'publication_date': 'Publication date',
            'article_count': 'Article count'},
)
fig.show()

In [None]:
# TODO divide this by posts accepted in publications
# and posts published on my page

# Gap, in whole days, between each publication date and the previous one.
publication_dates = published['Publication date'].sort_values()
this_df = publication_dates.diff().dropna().dt.days

fig = px.violin(
    this_df,
    orientation='h',
    box=True,
    points='all',
    width=800,
    labels={'variable': '', 'value': 'Post frequency (days)'},
)

# Median gap, pinned to the top-left corner of the plotting area.
median_note = f'Median: {this_df.median():.1f} days'
fig.add_annotation(
    text=median_note,
    x=0, y=1, xref='paper', yref='paper',
    showarrow=False,
    bgcolor='white',
)
fig.update_xaxes(tick0=0)
# Blank out the series name that would otherwise label the single violin.
fig.update_yaxes(labelalias={'Publication date': ''})
fig.show()

In [None]:
# Distribution of lifetime earnings per publication on a logarithmic y axis.
fig = px.box(
    published,
    x='Publication',
    y='Lifetime earnings',
    color='Publication',
    log_y=True,
    width=800,
    category_orders={'Publication': publication_order},
    color_discrete_sequence=px.colors.qualitative.G10,
    labels={'Lifetime earnings': 'Lifetime earnings ($)'},
)

# Sum over all published articles, shown near the top-right corner.
total_note = f'Total earnings: {published["Lifetime earnings"].sum():.2f} $'
fig.add_annotation(
    text=total_note,
    x=0.9, y=0.9, xref='paper', yref='paper',
    showarrow=False,
    bgcolor='white',
)
fig.show()

In [None]:
# Lifetime earnings per publication, one bar segment per article.
fig = px.bar(
    published,
    x='Publication',
    y='Lifetime earnings',
    color='Publication',
    width=800,
    category_orders={'Publication': publication_order},
    color_discrete_sequence=px.colors.qualitative.G10,
    labels={'Lifetime earnings': 'Lifetime earnings ($)'},
)
fig.show()

In [None]:
# Whole days between an article's publication and its last stats update.
published['Lifetime'] = published['Stats updated'].sub(published['Publication date']).dt.days

by_lifetime = published.sort_values('Lifetime')
fig = px.scatter(
    by_lifetime,
    x='Lifetime',
    y='Lifetime earnings',
    color='Publication',
    width=600,
    category_orders={'Publication': publication_order},
    color_discrete_sequence=px.colors.qualitative.G10,
    labels={'Lifetime': 'Lifetime (days)', 'Lifetime earnings': 'Lifetime earnings ($)'},
)

# Enlarge and outline the markers and connect the points of each publication.
marker_style = dict(size=12, line=dict(width=2, color='DarkSlateGrey'))
fig.update_traces(selector=dict(mode='markers'), mode='lines+markers', marker=marker_style)

# Reversed x axis: the largest lifetime values are drawn on the left.
fig.update_xaxes(autorange="reversed")
fig.show()

In [None]:
# Lifetime earnings per article title, coloured by the earnings value itself.
fig = px.bar(
    published,
    x='Lifetime earnings',
    y='Title',
    color='Lifetime earnings',
    color_continuous_scale=px.colors.sequential.Viridis,
    labels={'Title': '', 'Lifetime earnings': 'Lifetime earnings ($)'},
)

# Order the title axis by each title's total earnings.
fig.update_layout(yaxis=dict(categoryorder='total ascending'))
fig.show()

In [None]:
# Lifetime earnings per article title, coloured by publication.
fig = px.bar(
    published,
    x='Lifetime earnings',
    y='Title',
    color='Publication',
    color_discrete_sequence=px.colors.qualitative.G10,
    category_orders={'Publication': publication_order},
    labels={'Title': '', 'Lifetime earnings': 'Lifetime earnings ($)'},
)

# Horizontal legend above the plot; title axis ordered by total earnings.
fig.update_layout(
    legend=dict(orientation="h", xanchor="left", x=0.01, yanchor="top", y=1.2),
    yaxis=dict(categoryorder='total ascending'),
)
fig.show()

In [None]:
# Submission-to-publication timeline per publication.
# The bar end is moved 23 h 59 min past the publication date (presumably so
# that the bar covers the whole publication day -- confirm the intent).
# The shift is applied to a plotting-only copy: the former in-place `+=` on
# `published` pushed the dates further forward on every re-run of this cell
# and changed the dates seen by every cell executed afterwards.
timeline_data = published.assign(**{
    'Publication date': published['Publication date'] + pd.DateOffset(hours=23, minutes=59),
})

fig = px.timeline(
    timeline_data,
    x_start='Submission date', x_end='Publication date',
    y='Publication',
    category_orders={'Publication': publication_order},
    color='Publication',
    color_discrete_sequence=px.colors.qualitative.G10,
)

# Horizontal legend placed above the plot.
fig.update_layout(
    legend=dict(yanchor="top", y=1.2, xanchor="left", x=0.01, orientation="h"),
)
fig.update_xaxes(title='Time line')
fig.show()

## Topic processing

In [None]:
# The five topic columns ('Topic 1' ... 'Topic 5') of the published articles.
topic_columns = [f'Topic {n}' for n in range(1, 6)]
topics = published[topic_columns]

In [None]:
# Sorted array of every distinct topic label used in any topic column.
# Missing entries (NaN) are dropped first: when NaN was mixed with strings,
# NumPy coerced it to the literal string 'nan', which then showed up
# downstream as a bogus topic with zero articles.
topic_values = pd.Series(topics.to_numpy().ravel()).dropna()
unique_topics = np.unique(topic_values.to_numpy(dtype=str))  # np.unique returns sorted values

In [None]:
# Add one boolean 'TOPIC_<name>' column per topic to `published`: True where
# the article carries that topic in any of its topic columns.
# `newhead` collects the names of the added columns.
newhead = ['TOPIC_' + ut for ut in unique_topics]
for column_name, ut in zip(newhead, unique_topics):
    published[column_name] = topics.eq(ut).any(axis=1).to_numpy()

In [None]:
# Articles per topic: sum of each boolean TOPIC_* column, with the prefix
# stripped from the topic names again.
tmp = published[newhead].sum().rename_axis('topic').reset_index(name='count')
tmp['topic'] = tmp['topic'].str.replace('TOPIC_', '', regex=False)

# Written as header-less "count,topic" rows.
# https://www.wortwolken.com/
tmp = tmp.loc[:, ['count', 'topic']]
tmp.to_csv('medium_topic_count.csv', header=False, index=False)

In [None]:
# Number of articles per topic, bars sorted from most to least used topic.
fig = px.bar(
    tmp,
    x='topic',
    y='count',
    color='count',
    color_continuous_scale=px.colors.sequential.Viridis,
)

fig.update_xaxes(title='Topic', categoryorder='total descending')
fig.update_yaxes(title='Articles under topic')
fig.show()

In [None]:
# Read the header-less "count,topic" file back, drop incomplete rows and build
# the {topic: count} mapping used as word-cloud frequencies.
df_wordcloud = pd.read_csv(
    'medium_topic_count.csv',
    header=None,
    names=['count', 'topic'],
)
df_wordcloud = df_wordcloud.dropna()
dict_wordcloud = {
    topic: count
    for topic, count in zip(df_wordcloud['topic'], df_wordcloud['count'])
}

In [None]:
# 300x300 mask: 255 outside a circle of radius 130 centred at (150, 150) and
# 0 inside it.
rows, cols = np.ogrid[:300, :300]
outside_circle = (rows - 150) ** 2 + (cols - 150) ** 2 > 130 ** 2
mask = 255 * outside_circle.astype(int)

# Build the cloud from the topic frequencies.
wc = wordcloud.WordCloud(background_color='white', colormap='magma', mask=mask)
wc.generate_from_frequencies(dict_wordcloud)

# Show the rendered cloud without axes and with square pixels.
ax = plt.gca()
ax.imshow(wc, interpolation='bilinear')
ax.axis('off')
ax.set_aspect('equal')
plt.show()

## New followers per month and articles published

In [None]:
# One row per published article (value 1), then summed per month of the
# publication date to get the number of articles per month.
dummy = pd.DataFrame({'Date': published['Publication date'], 'dummy': 1})
dummy = (
    dummy
    .groupby(pd.Grouper(key='Date', freq='M'))['dummy']
    .sum()
    .reset_index()
)

In [None]:
# Join the monthly article counts with the audience table (pandas joins on the
# columns the two frames have in common) and add running totals.
dummy = dummy.merge(audience)
dummy = dummy.assign(
    total_article_count=dummy['dummy'].cumsum(),
    total_follower_count=dummy['new_followers'].cumsum(),
)
# Natural log of the follower total, used for the exponential trend fit.
dummy['log_total_follower_count'] = np.log(dummy['total_follower_count'])

In [None]:
# Display the current contents of `dummy` for inspection.
dummy

In [None]:
# Exponential trendline: followers ~ exp(a * articles + b), obtained as a
# straight-line least-squares fit of log(followers) against the cumulative
# article count. np.polyfit solves this linear problem directly, so neither an
# iterative optimiser nor an import in the middle of the notebook is needed.
article_counts = dummy['total_article_count'].to_numpy()
follower_counts = dummy['total_follower_count'].to_numpy()
log_follower_counts = dummy['log_total_follower_count'].to_numpy()

# log(0) is -inf and would make the fit fail; fit on the finite rows only.
finite = np.isfinite(log_follower_counts)
p0 = np.polyfit(article_counts[finite], log_follower_counts[finite], deg=1)  # [a, b]

fit_fig, ax = plt.subplots()
ax.plot(article_counts, np.exp(p0[0] * article_counts + p0[1]), 'o--', label='Exponential fit')
ax.plot(article_counts, follower_counts, 's--', label='Observed')
ax.set_xlabel('Cumulated published articles')
ax.set_ylabel('Cumulated followers')
ax.legend()
plt.show()

In [None]:
# Cumulative followers against cumulative published articles.
axis_labels = {
    'total_article_count': 'Cumulated published articles',
    'total_follower_count': 'Cumulated followers',
}
fig = px.scatter(
    dummy,
    x='total_article_count',
    y='total_follower_count',
    labels=axis_labels,
)

# Enlarge and outline the markers and connect consecutive points with a line.
marker_style = dict(size=12, line=dict(width=2, color='DarkSlateGrey'))
fig.update_traces(selector=dict(mode='markers'), mode='lines+markers', marker=marker_style)
fig.show()

In [None]:
# Distribution of new followers, grouped by the number of articles published
# in the same month.
fig = px.box(
    dummy,
    x='dummy',
    y='new_followers',
    labels={
        'new_followers': 'New followers per month',
        'dummy': 'Articles published per month',
    },
)
# One tick per whole article count.
fig.update_xaxes(dtick=1)
fig.show()

In [None]:
# Monthly payout against monthly new articles, restricted to rows dated
# 2023-08-01 or later; marker size and colour encode the follower total.
from_august_2023 = dummy['Date'].ge(pd.to_datetime('2023-08-01'))

fig = px.scatter(
    dummy.loc[from_august_2023],
    x='dummy',
    y='Payout',
    size='total_follower_count',
    color='total_follower_count',
    color_continuous_scale=px.colors.sequential.Viridis,
    labels={
        'dummy': 'Monthly new articles',
        'Payout': 'Monthly revenue (US-$)',
        'total_follower_count': 'Total followers (end of month)',
    },
)

fig.show()

## Analysis

In [None]:
# Lifetime earnings against reads, with a single OLS trendline fitted over all
# articles. `category_orders` is passed (as in the earlier charts) so that each
# publication keeps the same colour throughout the notebook; without it the
# colours follow the order in which publications happen to appear in the data.
fig = px.scatter(
    published,
    x='Reads',
    y='Lifetime earnings',
    color='Publication',
    category_orders={'Publication': publication_order},
    trendline='ols',
    trendline_scope="overall",
    width=600,
    height=400,
    color_discrete_sequence=px.colors.qualitative.G10,
)

# Horizontal legend placed above the plot.
fig.update_layout(
    legend=dict(yanchor="top", y=1.2, xanchor="left", x=0.01, orientation="h"),
)

fig.show()

# Print the regression summary of the overall trendline.
results = px.get_trendline_results(fig)
print(results.iloc[0]["px_fit_results"].summary())

In [None]:
# Lifetime earnings against views, with a single OLS trendline fitted over all
# articles. `category_orders` is passed (as in the earlier charts) so that each
# publication keeps the same colour throughout the notebook; without it the
# colours follow the order in which publications happen to appear in the data.
fig = px.scatter(
    published,
    x='Views',
    y='Lifetime earnings',
    color='Publication',
    category_orders={'Publication': publication_order},
    trendline='ols',
    trendline_scope="overall",
    width=600,
    height=400,
    color_discrete_sequence=px.colors.qualitative.G10,
)

# Horizontal legend placed above the plot.
fig.update_layout(
    legend=dict(yanchor="top", y=1.2, xanchor="left", x=0.01, orientation="h"),
)

fig.show()

# Print the regression summary of the overall trendline.
results = px.get_trendline_results(fig)
print(results.iloc[0]["px_fit_results"].summary())

In [None]:
# Lifetime earnings against claps, with a single OLS trendline fitted over all
# articles. `category_orders` is passed (as in the earlier charts) so that each
# publication keeps the same colour throughout the notebook; without it the
# colours follow the order in which publications happen to appear in the data.
fig = px.scatter(
    published,
    x='Claps',
    y='Lifetime earnings',
    color='Publication',
    category_orders={'Publication': publication_order},
    trendline='ols',
    trendline_scope="overall",
    width=600,
    height=400,
    color_discrete_sequence=px.colors.qualitative.G10,
)

# Horizontal legend placed above the plot.
fig.update_layout(
    legend=dict(yanchor="top", y=1.2, xanchor="left", x=0.01, orientation="h"),
)

fig.show()

# Print the regression summary of the overall trendline.
results = px.get_trendline_results(fig)
print(results.iloc[0]["px_fit_results"].summary())

In [None]:
# Reads as a percentage of views for every published article.
published['reads_views_frac'] = published['Reads'].div(published['Views']).mul(100)

In [None]:
# Read ratio (reads / views, in percent) per publication; every article is
# drawn as a point and hovering shows its title.
# `category_orders` is passed (as in the earlier charts) so that each
# publication keeps the same colour and position throughout the notebook.
fig = px.box(
    published,
    y='reads_views_frac', x='Publication',
    color='Publication',
    category_orders={'Publication': publication_order},
    color_discrete_sequence=px.colors.qualitative.G10,
    points='all',
    hover_name='Title',
)

# Horizontal legend placed above the plot.
fig.update_layout(
    legend=dict(yanchor="top", y=1.2, xanchor="left", x=0.01, orientation="h"),
)
fig.update_yaxes(title='Read ratio (%)')
fig.show()

## Sklearn

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OrdinalEncoder

In [None]:
# Feature matrix and target for the earnings model.
# Columns previously listed as commented-out candidates and not used here:
# 'Member reads', 'Non-member reads', 'Member read ratio', 'Highlights',
# 'Replies', 'Follows'.
feature_columns = ['Publication', 'Views', 'Reads', 'Length (mins)', 'Claps', 'Responses']
X = published.loc[:, feature_columns].copy()
y = published.loc[:, 'Lifetime earnings'].copy()

In [None]:
# Replace the publication name by an ordinal code in 'Publication_enc'.
# The values are read from `published` (X holds the same rows, it was copied
# from it) and the drop tolerates an already-removed column, so re-running
# this cell no longer raises KeyError once 'Publication' is gone from X.
enc = OrdinalEncoder()
publication_values = published['Publication'].to_numpy().reshape(-1, 1)
X['Publication_enc'] = enc.fit_transform(publication_values).ravel()
X = X.drop(columns=['Publication'], errors='ignore')

In [None]:
# Fit a random forest on all rows. The seed is fixed so that the fitted model,
# and with it the score and feature importances computed from it, is the same
# on every run.
rf = RandomForestRegressor(random_state=42)
rf.fit(X, y)

In [None]:
# Score on the same X, y the forest was fitted on (no held-out data), i.e. a
# training-set score rather than an estimate of performance on new articles.
rf.score(X, y)

In [None]:
# Feature importances of the fitted forest, sorted from least to most important.
tmp = pd.DataFrame({'fimp': rf.feature_importances_, 'name': rf.feature_names_in_})
tmp = tmp.sort_values(by='fimp')

In [None]:
# Horizontal bar chart of the feature importances.
ax = sns.barplot(data=tmp, x='fimp', y='name')
ax.set_xlabel('Feature importance')
ax.set_ylabel('Feature name')