In [None]:
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np

import plotly.express as px

In [None]:
# Load the posting-schedule spreadsheet into a DataFrame.
# The path is relative, so the file must sit in the notebook's working directory.
medium = pd.read_excel('medium-posting-schedule.xlsx')

In [None]:
# Tally how many rows share each 'Stats updated' value.
medium['Stats updated'].value_counts()

## Article stats

In [None]:
# Tally articles per status. The columns of the count table are named
# explicitly because the output of value_counts().reset_index() differs between
# pandas versions ('index'/'Status' in 1.x, 'Status'/'count' in 2.x), so
# hard-coding 'index' raises on pandas >= 2.0.
status_counts = (
    medium['Status']
    .value_counts()
    .rename_axis('Article status')
    .reset_index(name='Article count')
)

fig = px.bar(
    status_counts,
    x='Article status', y='Article count', color='Article status',
    # Show the bars in this fixed order rather than by count.
    category_orders={'Article status': ['published', 'submitted', 'finished draft', 'early draft', 'idea']},
    color_discrete_sequence=px.colors.qualitative.G10,
    width=500,
    text_auto='.0f',
)
# Every bar is already named on the x axis, so the colour legend is redundant.
fig.update_layout(showlegend=False)
fig.show()

In [None]:
# Boolean mask selecting the rows whose status is exactly 'published'.
cond = medium['Status'].eq('published')

# Independent copy, so columns added later do not touch `medium`.
published = medium.loc[cond].copy()
published

In [None]:
# Count published articles per publication. Columns are named explicitly so
# the cell works on both pandas 1.x and 2.x (the default column names of
# value_counts().reset_index() changed in 2.0). This also fixes the y-axis
# title: the old labels dict mapped 'Status', which is not a column of this
# table, so the axis was never labelled 'Article count'.
publication_counts = (
    published['Publication']
    .value_counts()
    .rename_axis('Publication')
    .reset_index(name='Article count')
)

fig = px.bar(
    publication_counts,
    x='Publication', y='Article count', color='Publication',
    color_discrete_sequence=px.colors.qualitative.G10,
    width=500,
    text_auto='.0f',
)
# Every bar is already named on the x axis, so the colour legend is redundant.
fig.update_layout(showlegend=False)
fig.show()

In [None]:
# Number of articles accepted on each date. Columns are named explicitly so
# the cell does not depend on the pandas-version-specific output of
# value_counts().reset_index() ('index' only exists on pandas < 2.0).
acceptance_counts = (
    published['Acceptance date']
    .value_counts()
    .rename_axis('Acceptance date')
    .reset_index(name='Article count')
)

# Marker size encodes the same count as the y axis.
fig = px.scatter(
    acceptance_counts,
    x='Acceptance date', y='Article count', size='Article count',
)
fig.show()

In [None]:
# TODO divide this by posts accepted in publications
# and posts published on my page

# Gaps, in whole days, between consecutive acceptance dates (sorted ascending).
# The first date has no predecessor, so its NaT gap is dropped.
acceptance_sorted = published['Acceptance date'].sort_values()
this_df = (acceptance_sorted - acceptance_sorted.shift()).dropna().dt.days

violin_options = dict(
    labels={'variable': '', 'value': 'Post frequency (days)'},
    width=800,
    box=True,
    points='all',
    orientation='h',
)
fig = px.violin(this_df, **violin_options)

# Pin the median to the top-left corner of the plotting area.
median_note = f'Median: {this_df.median():.1f} days'
fig.add_annotation(
    text=median_note,
    x=0, xref='paper',
    y=1, yref='paper',
    showarrow=False,
    bgcolor='white',
)
# One tick per day, starting at zero.
fig.update_xaxes(tick0=0, dtick=1)
# Blank out the series name that would otherwise label the single violin.
fig.update_yaxes(labelalias={'Acceptance date': ''})
fig.show()

In [None]:
# Distribution of lifetime earnings per publication, on a log-scaled y axis.
fig = px.box(
    published,
    x='Publication',
    y='Lifetime earnings',
    color='Publication',
    labels={'Lifetime earnings': 'Lifetime earnings ($)'},
    color_discrete_sequence=px.colors.qualitative.G10,
    log_y=True,
    width=600,
)

# Horizontal legend placed above the plotting area.
legend_above = {'yanchor': 'top', 'y': 1.2, 'xanchor': 'left', 'x': 0.01, 'orientation': 'h'}
fig.update_layout(legend=legend_above)

# Annotate the summed earnings over all published articles.
total_earnings = published["Lifetime earnings"].sum()
fig.add_annotation(
    text=f'Total earnings: {total_earnings:.2f} $',
    x=0.9, xref='paper',
    y=0.9, yref='paper',
    showarrow=False,
    bgcolor='white',
)
fig.show()

In [None]:
# Days between an article's acceptance and the last time its stats were updated.
days_tracked = published['Stats updated'].sub(published['Acceptance date']).dt.days
published['Lifetime'] = days_tracked

# Sort by lifetime so the connecting lines run monotonically along x.
by_lifetime = published.sort_values('Lifetime')
fig = px.scatter(
    by_lifetime,
    x='Lifetime',
    y='Lifetime earnings',
    color='Publication',
    color_discrete_sequence=px.colors.qualitative.G10,
    labels={'Lifetime earnings': 'Lifetime earnings ($)', 'Lifetime': 'Lifetime (days)'},
    width=600,
)

# Enlarge and outline the markers, and join the points of each trace with lines.
outlined_marker = {'size': 12, 'line': {'width': 2, 'color': 'DarkSlateGrey'}}
fig.update_traces(
    marker=outlined_marker,
    mode='lines+markers',
    selector={'mode': 'markers'},
)
# Flip the x axis so the largest lifetimes appear on the left.
fig.update_xaxes(autorange="reversed")
# Horizontal legend placed above the plotting area.
fig.update_layout(
    legend={'yanchor': 'top', 'y': 1.2, 'xanchor': 'left', 'x': 0.01, 'orientation': 'h'},
)
fig.show()

In [None]:
# Horizontal bars: one per article title, coloured by the earnings value itself.
bar_options = dict(
    x='Lifetime earnings',
    y='Title',
    color='Lifetime earnings',
    labels={'Lifetime earnings': 'Lifetime earnings ($)', 'Title': ''},
    color_continuous_scale=px.colors.sequential.Viridis,
)
fig = px.bar(published, **bar_options)

# Order the titles by their total bar length.
fig.update_layout(yaxis=dict(categoryorder='total ascending'))
fig.show()

In [None]:
# Same per-title earnings bars, this time coloured by publication.
fig = px.bar(
    published,
    x='Lifetime earnings',
    y='Title',
    color='Publication',
    labels={'Lifetime earnings': 'Lifetime earnings ($)', 'Title': ''},
    color_discrete_sequence=px.colors.qualitative.G10,
)

fig.update_layout(
    # Horizontal legend placed above the plotting area.
    legend={'yanchor': 'top', 'y': 1.2, 'xanchor': 'left', 'x': 0.01, 'orientation': 'h'},
    # Order the titles by their total bar length.
    yaxis={'categoryorder': 'total ascending'},
)
fig.show()

In [None]:
# Violin (with an embedded box plot) of the member read ratio per publication.
violin_options = dict(
    x='Publication',
    y='Member read ratio',
    color='Publication',
    color_discrete_sequence=px.colors.qualitative.G10,
    box=True,
    width=600,
)
fig = px.violin(published, **violin_options)

# Horizontal legend placed above the plotting area.
legend_above = {'yanchor': 'top', 'y': 1.2, 'xanchor': 'left', 'x': 0.01, 'orientation': 'h'}
fig.update_layout(legend=legend_above)
fig.show()

In [None]:
# One bar per article spanning submission to acceptance, grouped by publication.
fig = px.timeline(
    published,
    x_start='Submission date',
    x_end='Acceptance date',
    y='Publication',
    color='Publication',
    color_discrete_sequence=px.colors.qualitative.G10,
)

# Horizontal legend placed above the plotting area.
legend_above = {'yanchor': 'top', 'y': 1.2, 'xanchor': 'left', 'x': 0.01, 'orientation': 'h'}
fig.update_layout(legend=legend_above)
fig.update_xaxes(title='Time line')
fig.show()

## Topic processing

In [None]:
# The five topic columns of the published articles.
topic_columns = ['Topic 1', 'Topic 2', 'Topic 3', 'Topic 4', 'Topic 5']
topics = published[topic_columns]

In [None]:
# Collect every distinct topic label used in any of the topic columns.
# Missing cells are dropped first: otherwise NaN lands in the list and is
# coerced by np.unique into a spurious 'nan' topic, which then shows up as an
# all-False TOPIC_nan column, a zero-count CSV row and an empty bar.
# np.unique already returns its result sorted, so no extra np.sort is needed.
unique_topics = np.unique([
    label
    for c in topics.columns
    for label in topics[c].dropna().unique()
])

In [None]:
# Add one boolean column per topic: True where the article lists that topic in
# any of its topic columns. `newhead` records the names of the added columns.
topic_rows = topics.to_numpy()  # one array of topic cells per article

newhead = []
for ut in unique_topics:
    flag_column = 'TOPIC_' + ut
    published[flag_column] = [ut in row for row in topic_rows]
    newhead.append(flag_column)

In [None]:
# Summing each boolean TOPIC_ column gives the number of articles per topic.
tmp = (
    published[newhead]
    .sum()
    .rename_axis('topic')
    .reset_index(name='count')
)
# Strip the column prefix to recover the plain topic label.
tmp['topic'] = tmp['topic'].str.replace('TOPIC_', '', regex=False)

# https://www.wortwolken.com/
# Written as headerless "count,topic" rows.
tmp = tmp[['count', 'topic']]
tmp.to_csv('medium_topic_count.csv', index=False, header=False)

In [None]:
# Articles per topic; bar colour repeats the count on a Viridis scale.
fig = px.bar(
    tmp,
    x='topic',
    y='count',
    color='count',
    color_continuous_scale=px.colors.sequential.Viridis,
)

# Most frequent topics first.
fig.update_xaxes(title='Topic', categoryorder='total descending')
fig.update_yaxes(title='Articles under topic')
fig.show()

## Analysis

In [None]:
# Lifetime earnings against reads, with a single OLS trendline fitted over all
# publications together (trendline_scope="overall").
scatter_options = dict(
    x='Reads',
    y='Lifetime earnings',
    color='Publication',
    trendline='ols',
    trendline_scope="overall",
    width=600,
    height=400,
    color_discrete_sequence=px.colors.qualitative.G10,
)
fig = px.scatter(published, **scatter_options)

# Horizontal legend placed above the plotting area.
fig.update_layout(
    legend={'yanchor': 'top', 'y': 1.2, 'xanchor': 'left', 'x': 0.01, 'orientation': 'h'},
)

fig.show()

# Print the regression summary of the (only) fitted trendline.
results = px.get_trendline_results(fig)
overall_fit = results.iloc[0]["px_fit_results"]
print(overall_fit.summary())

In [None]:
# Lifetime earnings against member reads, with a single OLS trendline fitted
# over all publications together (trendline_scope="overall").
scatter_options = dict(
    x='Member reads',
    y='Lifetime earnings',
    color='Publication',
    trendline='ols',
    trendline_scope="overall",
    width=600,
    height=400,
    color_discrete_sequence=px.colors.qualitative.G10,
)
fig = px.scatter(published, **scatter_options)

# Horizontal legend placed above the plotting area.
fig.update_layout(
    legend={'yanchor': 'top', 'y': 1.2, 'xanchor': 'left', 'x': 0.01, 'orientation': 'h'},
)

fig.show()

# Print the regression summary of the (only) fitted trendline.
results = px.get_trendline_results(fig)
overall_fit = results.iloc[0]["px_fit_results"]
print(overall_fit.summary())

## Predicting lifetime earnings with scikit-learn

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OrdinalEncoder

In [None]:
# Feature matrix and target for the earnings model.
# The column is 'Reads' (no trailing space), matching the name used for the
# Reads-vs-earnings scatter plot; the previous 'Reads ' key does not match it.
# Further candidate features, currently left out:
#   'Member reads', 'Non-member reads', 'Member read ratio', 'Claps',
#   'Highlights', 'Replies', 'Follows'
feature_columns = ['Publication', 'Views', 'Reads', 'Length (mins)']
X = published[feature_columns].copy()

# Target: lifetime earnings per article.
y = published['Lifetime earnings'].copy()

In [None]:
# Replace the textual 'Publication' column with an ordinal code.
enc = OrdinalEncoder()

# The encoder expects a 2-D input, hence the single-column reshape.
publication_2d = X['Publication'].to_numpy().reshape(-1, 1)
X['Publication_enc'] = enc.fit_transform(publication_2d).squeeze()

# Drop the raw text column so only numeric features remain.
X = X.drop(columns='Publication')

In [None]:
# Fix the seed so the fitted forest, and the score and feature importances
# derived from it, are the same on every Restart & Run All.
rf = RandomForestRegressor(random_state=42)
rf.fit(X, y)

In [None]:
# Score of the forest on the same X, y it was fitted on (an in-sample score),
# so it is not an estimate of performance on unseen articles.
rf.score(X, y)

In [None]:
# Pair each feature name with its importance, least important first.
importance_table = pd.DataFrame({
    'fimp': rf.feature_importances_,
    'name': rf.feature_names_in_,
})
tmp = importance_table.sort_values('fimp')

In [None]:
# Horizontal bar chart of the feature importances; barplot returns the Axes it
# drew on, so the axis titles are set on that object directly.
ax = sns.barplot(data=tmp, x='fimp', y='name')
ax.set_xlabel('Feature importance')
ax.set_ylabel('Feature name')