In [None]:
import calendar
import datetime
import math
import os
from pprint import pprint

import matplotlib
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

# %matplotlib widget
%matplotlib inline

# TODO:
* error bars for the "by month" plots
* top_x authors over time - all months not just top months
* top_x_all authors over time
* make by-month and per-month graphs more smooth

In [None]:
# Base name of the parquet files to analyse; swap in the commented line to use the 'test' files
FILE_NAME = 'all'
# FILE_NAME = 'test'

# Parquet files this notebook reads (they live under data/)
OUTPUT_DATA_FILE = os.path.join('data', FILE_NAME + '_data.parquet')
OUTPUT_METADATA_FILE = os.path.join('data', FILE_NAME + '_meta.parquet')

# Width, in minutes, of the bins used when resampling messages into a rate
AGGREGATION_MIN = 5

# Figures are saved under pngs/, so make sure the directory exists
os.makedirs('pngs', exist_ok=True)

In [None]:
def plot_dataframe(df_to_plot, xvalues=None, title=None, xlabel=None, ylabel=None, ylim_bottom=0, yscale=None, lables=None, override_font=False, show_plot=True, output_file_name=None):
    """Draw ``df_to_plot`` as a line chart in matplotlib's XKCD style.

    Args:
        df_to_plot: Data handed to ``plt.plot`` (callers in this notebook pass
            pandas Series/DataFrames and numpy arrays).
        xvalues: Optional x values; when given the data is plotted against them
            with ``plt.plot(xvalues, df_to_plot)``.
        title: Figure title.
        xlabel: Label of the x axis.
        ylabel: Label of the y axis.
        ylim_bottom: Lower limit of the y axis, or ``None`` to leave it
            automatic. A non-positive value is skipped when ``yscale`` is
            ``'log'`` because a log axis cannot start at or below zero.
        yscale: Optional scale name passed to ``plt.yscale`` (e.g. ``'log'``).
        lables: Optional legend entries passed to ``plt.legend``. The spelling
            of the parameter name is kept so existing keyword calls keep working.
        override_font: Switch the font family to Arial; the stylized XKCD font
            has little unicode coverage.
        show_plot: Call ``plt.show()`` when true.
        output_file_name: Path to save the figure to; nothing is saved when ``None``.
    """
    with plt.xkcd(scale=0.5):
        # The stylized font XKCD uses doesn't have very much unicode coverage,
        # override the font if you need to use unicode text
        if override_font:
            matplotlib.rc('font', family='Arial')

        # 24 x 13.5 inch figure (1920x1080 pixels at 80 dpi)
        plt.figure(figsize=(1920/80, 1080/80))

        # Plot against explicit x values if provided, else against the data's own index
        if xvalues is not None:
            plt.plot(xvalues, df_to_plot)
        else:
            plt.plot(df_to_plot)

        # Select the scale before touching the limits so the limit is
        # interpreted on the final axis type (e.g. semilog-y plots)
        if yscale is not None:
            plt.yscale(yscale)

        # The default bottom of 0 is meaningless on a log axis; keep the
        # automatic lower limit there instead of forcing an invalid one
        bottom_is_valid = not (yscale == 'log' and ylim_bottom is not None and ylim_bottom <= 0)
        if ylim_bottom is not None and bottom_is_valid:
            plt.ylim(bottom=ylim_bottom)

        # Style plot and add text
        plt.grid(True, lw=0.5, zorder=0)
        plt.title(title)
        plt.xlabel(xlabel)
        plt.ylabel(ylabel)

        # Add legend labels if provided
        if lables is not None:
            plt.legend(lables)

        # Save and plot!
        if output_file_name is not None:
            plt.savefig(output_file_name)
        if show_plot:
            plt.show()

def plot_windowed_msg_per_min(msg_per_min, window_length=10, show_plot=True, output_file_name='test.png'):
    """Smooth a message-rate series with a centered Blackman-Harris window and plot it.

    The series is padded on both sides by half a window before filtering so the
    centered window has data at the edges: zeros before the first sample and
    the mean of the last ``pad_length`` samples after the last one. The padding
    is sliced off again before plotting.

    Args:
        msg_per_min: Series of message rates on a regularly spaced DatetimeIndex.
        window_length: Window size in samples.
        show_plot: Forwarded to ``plot_dataframe``.
        output_file_name: Forwarded to ``plot_dataframe``.

    Raises:
        ValueError: If the index has no set or inferable frequency, so the
            padding timestamps cannot be generated.
    """
    pad_length = int(math.ceil(window_length/2.0))

    padded = msg_per_min
    added = 0
    if len(msg_per_min) > 0:
        index = msg_per_min.index
        freq = index.freq if index.freq is not None else index.inferred_freq
        if freq is None:
            raise ValueError('msg_per_min needs a regularly spaced DatetimeIndex to be padded')

        # Build the pad timestamps explicitly (Series.tshift no longer exists in
        # pandas 2.x). Generating exactly pad_length steps on each side also keeps
        # the index gap-free when the history is shorter than half a window.
        pre_index = pd.date_range(end=index[0], periods=pad_length + 1, freq=freq)[:-1]
        post_index = pd.date_range(start=index[-1], periods=pad_length + 1, freq=freq)[1:]

        # Zeros before the start of the server, recent average after the end
        pre_pad = pd.Series(0.0, index=pre_index, name=msg_per_min.name)
        post_pad = pd.Series(msg_per_min.iloc[-pad_length:].mean(), index=post_index, name=msg_per_min.name)

        padded = pd.concat([pre_pad, msg_per_min, post_pad])
        added = pad_length

    # Filter with centered blackman-harris window function and slice off the pad data
    filtered_msg_per_min = padded.rolling(window_length, center=True, win_type='blackmanharris').mean()
    filtered_msg_per_min = filtered_msg_per_min.iloc[added:len(filtered_msg_per_min) - added]

    # Plot filtered data
    plot_dataframe(
        filtered_msg_per_min,
        title=f'Averaged Smoothed Message Rate Over History ({window_length*AGGREGATION_MIN/60/24} day window)',
        xlabel='Datetime (ref:UTC)',
        ylabel='msg/min (avg)',
        show_plot=show_plot,
        output_file_name=output_file_name
    )

In [None]:
# Load the message table and its metadata from the configured parquet files
data = pd.read_parquet(OUTPUT_DATA_FILE)
metadata = pd.read_parquet(OUTPUT_METADATA_FILE)

In [None]:
# Show the loaded message table as this cell's output
data

In [None]:
# Show the loaded metadata table as this cell's output
metadata

In [None]:
# Message count per channel. Stored under its own name: these are channel
# counts, and `msgs_by_user` is reserved for the per-author counts computed
# later in the notebook.
print(f'Number of messages per channel out of {len(data)} total messages:')
msgs_by_channel = data['channel_name'].value_counts()
msgs_by_channel

In [None]:
# Count the messages falling in each AGGREGATION_MIN-minute bin, then divide
# by the bin width to express the count as messages per minute
msg_per_min = (
    pd.Series(1, index=data['creation_datetime'])
    .resample(f'{AGGREGATION_MIN}min')
    .count()
    / AGGREGATION_MIN
)

In [None]:
# 7-day smoothing window, expressed as a number of AGGREGATION_MIN-minute samples
window_length = int(round(datetime.timedelta(days=7) / datetime.timedelta(minutes=AGGREGATION_MIN)))
plot_windowed_msg_per_min(
    msg_per_min,
    window_length,
    output_file_name='pngs/msg_rate_7day_window.png',
)

In [None]:
# 30-day smoothing window, expressed as a number of AGGREGATION_MIN-minute samples
window_length = int(round(datetime.timedelta(days=30) / datetime.timedelta(minutes=AGGREGATION_MIN)))
plot_windowed_msg_per_min(
    msg_per_min,
    window_length,
    output_file_name='pngs/msg_rate_30day_window.png',
)

In [None]:
# 365-day smoothing window, expressed as a number of AGGREGATION_MIN-minute samples
window_length = int(round(datetime.timedelta(days=365) / datetime.timedelta(minutes=AGGREGATION_MIN)))
plot_windowed_msg_per_min(
    msg_per_min,
    window_length,
    output_file_name='pngs/msg_rate_365day_window.png',
)

In [None]:
# window_length = int(round(datetime.timedelta(days=7).total_seconds()/60/AGGREGATION_MIN))
# plot_windowed_msg_per_min(msg_per_min, window_length, show_plot=False)

# window_length = int(round(datetime.timedelta(days=30).total_seconds()/60/AGGREGATION_MIN))
# plot_windowed_msg_per_min(msg_per_min, window_length, show_plot=False)

# window_length = int(round(datetime.timedelta(days=365).total_seconds()/60/AGGREGATION_MIN))
# plot_windowed_msg_per_min(msg_per_min, window_length, show_plot=True, output_file_name='msg_rate_multi_window.png')

In [None]:
# Mean message rate for each hour of the day, averaged over the whole history
rate_by_hour = msg_per_min.groupby(msg_per_min.index.hour).mean()
plot_dataframe(
    rate_by_hour,
    output_file_name='pngs/hour_of_day.png',
    title='Average message rate over hour of the day',
    xlabel='Hour of the day (ref:UTC)',
    ylabel='msg/min (avg)',
    # yscale='log',
)

In [None]:
# Message rate over day of the week
# dayofweek numbers Monday as 0, which matches the Monday-first order of
# calendar.day_name. Reindexing to all seven days keeps the values aligned
# with the seven labels even when the history does not contain every weekday
# (a missing day becomes a NaN gap instead of a length-mismatch error).
rate_by_weekday = msg_per_min.groupby(msg_per_min.index.dayofweek).mean().reindex(range(7))
plot_dataframe(
    rate_by_weekday,
    xvalues=list(calendar.day_name),
    title='Average message rate over day of the week',
    xlabel='Day of the week (ref:UTC)',
    ylabel='msg/min (avg)',
#     yscale='log'
    output_file_name='pngs/day_of_week.png',
)

In [None]:
# Message rate over ISO week of the year
# DatetimeIndex.weekofyear was removed in pandas 2.0; isocalendar().week is its
# replacement. Convert to a plain int64 array so the grouped result has an
# ordinary integer index for plotting.
week_of_year = msg_per_min.index.isocalendar().week.to_numpy(dtype='int64')
plot_dataframe(
    msg_per_min.groupby(week_of_year).mean(),
    title='Message rate over week of the year',
    xlabel='Week of the Year (ref:UTC)',
    ylabel='msg/min (avg)',
#     yscale='log'
    output_file_name='pngs/week_of_year.png',
)

In [None]:
# Fraction of all messages written by each user, most prolific user first
# (value_counts sorts in descending order)
user_msg_fraction = data['author'].value_counts().to_numpy() / len(data)
plot_dataframe(
    user_msg_fraction,
    output_file_name='pngs/msg_by_user_fraction.png',
    title='Fraction of total messages by user',
    xlabel='Users',
    ylabel='Fraction of total messages',
    # yscale='log',
)

In [None]:
# Cumulative fraction of all messages, accumulated from the least to the most
# prolific user (the descending value_counts order is reversed first)
ascending_counts = data['author'].value_counts().iloc[::-1]
user_msg_cumulative_fraction = ascending_counts.cumsum().to_numpy() / len(data)
plot_dataframe(
    user_msg_cumulative_fraction,
    output_file_name='pngs/msg_by_user_cumsum.png',
    title='Cumsum of fraction of total messages by user',
    xlabel='Users',
    ylabel='Fraction of total messages',
    # yscale='log',
)

In [None]:
# One (month label, per-author message counts) pair per calendar month
author_counts_by_month = [
    (month, month_msgs['author'].value_counts())
    for month, month_msgs in data.groupby(pd.Grouper(key='creation_datetime', freq='M'))
]

# Month labels, reused as the index of both monthly summaries
month_labels = [month for month, _ in author_counts_by_month]

# Mean number of messages per author who posted that month
mean_msgs_per_author_counts_by_month = pd.Series(
    [counts.mean() for _, counts in author_counts_by_month],
    month_labels,
)

# Number of distinct authors who posted that month
active_users_by_month = pd.Series(
    [counts.count() for _, counts in author_counts_by_month],
    month_labels,
)

In [None]:
# Share of all messages written by each of the ten most prolific authors.
# `msgs_by_user` (message count per author, descending) is reused by later cells.
print('% of total messages for the top 10 most prolific authors:')
msgs_by_user = data['author'].value_counts()
top_ten_share = msgs_by_user.iloc[:10] / len(data) * 100
print(top_ten_share)

# print('')
# # Replace "author" with the author string of your choice, the format is "name#1234"
# print(f'msgs by "author": {msgs_by_user["author"]/len(data)*100}%    #{msgs_by_user.index.get_loc("author")+1} on the server')

In [None]:
# Mean messages per active author, one point for every month of the history
plot_dataframe(
    mean_msgs_per_author_counts_by_month,
    output_file_name='pngs/msg_per_user_per_month.png',
    title='Average messages per active user per month',
    xlabel='Datetime (ref:UTC)',
    ylabel='Average messages/user',
    # yscale='log',
)

In [None]:
# Mean messages per active author for each calendar month (1-12), averaging
# the monthly values across all years
msgs_per_user_by_calendar_month = mean_msgs_per_author_counts_by_month.groupby(
    mean_msgs_per_author_counts_by_month.index.month
).mean()
plot_dataframe(
    msgs_per_user_by_calendar_month,
    output_file_name='pngs/msg_per_user_by_month.png',
    title='Average msgs per active user by month',
    xlabel='Month (ref:UTC)',
    ylabel='Average messages/user',
    # yscale='log',
)

In [None]:
# Number of active users (authors with at least one message) in every month of the history
plot_dataframe(
    active_users_by_month,
    title='Active users per month',
    xlabel='Datetime (ref:UTC)',
    ylabel='Active Users',
#     yscale='log'
    output_file_name='pngs/active_users_by_month.png',
)

In [None]:
# Average number of active users for each calendar month (1-12), averaging
# the monthly values across all years. The title, y label and file name
# describe active users; the file name is distinct so this figure does not
# overwrite the messages-per-user plot saved as pngs/msg_per_user_by_month.png.
plot_dataframe(
    active_users_by_month.groupby(active_users_by_month.index.month).mean(),
    title='Average active users by month',
    xlabel='Month (ref:UTC)',
    ylabel='Active Users',
#     yscale='log'
    output_file_name='pngs/active_users_by_month_of_year.png',
)

In [None]:
# How many of each month's busiest authors to keep
TOP_N_PER_MONTH = 1

# Per month, the counts of only the TOP_N_PER_MONTH busiest authors
# (value_counts is already sorted in descending order)
top_author_counts_by_month = [
    (month, counts.iloc[:TOP_N_PER_MONTH]) for month, counts in author_counts_by_month
]

# Union of every author who made the cut in at least one month
all_top_authors = set()
for _, monthly_top in top_author_counts_by_month:
    all_top_authors.update(monthly_top.index.to_list())

print(f'All users that have been in the top {TOP_N_PER_MONTH} authors in any given month in the history of the server:')
pprint(sorted(all_top_authors))

In [None]:
# Fix one column order up front; it is used for both the frame and the legend
author_columns = list(all_top_authors)

# Rows are months, columns are the monthly top authors; each cell is the number
# of messages that author sent in that month (0 if they sent none)
top_authors_across_months_count = pd.DataFrame(
    [
        [float(counts.get(author, 0)) for author in author_columns]
        for _, counts in author_counts_by_month
    ],
    index=[month for month, _ in top_author_counts_by_month],
    columns=author_columns,
)

# Monthly counts converted to messages per hour, treating a month as 30 days
plot_dataframe(
    top_authors_across_months_count/(30*24),
    output_file_name='pngs/rate_top_bymonth_users.png',
    title=f'Average msgs per hour for each of the top {TOP_N_PER_MONTH} users in any month',
    xlabel='Datetime (ref:UTC)',
    ylabel='messages/hour',
    # yscale='log',
    lables=author_columns,
    override_font=True,
)

In [None]:
# Fix one column order up front; it is used for both the frame and the legend
author_columns = list(all_top_authors)

# Rows are months, columns are the monthly top authors; each cell is that
# author's percentage of the month's messages. A month without any messages
# yields 0 for everyone instead of a 0/0 NaN.
percent_rows = []
for _, counts in author_counts_by_month:
    month_total = counts.sum()
    percent_rows.append([
        100*counts.get(author, 0)/month_total if month_total else 0.0
        for author in author_columns
    ])

top_authors_across_months_perc = pd.DataFrame(
    percent_rows,
    index=[month for month, _ in top_author_counts_by_month],
    columns=author_columns,
    dtype=float,
)

# Percentage of each month's messages for every monthly top author
plot_dataframe(
    top_authors_across_months_perc,
    title=f'Percentage of total msgs per month for each of the top {TOP_N_PER_MONTH} users in any month',
    xlabel='Datetime (ref:UTC)',
    ylabel='Percent of total messages/month',
#     yscale='log'
    output_file_name='pngs/perc_top_bymonth_users.png',
    lables=author_columns,
    override_font=True,
)

In [None]:
# Number of all-time most prolific authors to track
TOP_N_EVER = 5
# msgs_by_user is sorted by message count (descending), so its first entries are the top authors
top_n_users = msgs_by_user.index[:TOP_N_EVER].to_list()

In [None]:
# Init the dataframe
top_authors_perc = pd.DataFrame(index=[i for i, j in top_author_counts_by_month])
for i in top_n_users:
    top_authors_perc[i] = 0.0

# over all months and the top authors of all time, calculate the number of messages send, zero if they had no messages that month
for i, j in author_counts_by_month:
    for k in top_n_users:
#         print('**********')
#         print(i)
#         print(j)
#         print(k)
#         raise ValueError()
        top_authors_perc.at[i, k] = 100.0*j.get(k, 0)/float(j.sum())

top_authors_perc

# Average msgs per active user by month
plot_dataframe(
    top_authors_perc,
    title=f'Percent of total msgs per month for each of the top {TOP_N_EVER} users of all time',
    xlabel='Datetime (ref:UTC)',
    ylabel='Percentage of messages/month',
#     yscale='log'
    output_file_name='pngs/rate_top_users.png',
    lables=msgs_by_user[:TOP_N_EVER].index.to_list(),
    override_font=True,
)

In [None]:
# Init the dataframe
top_authors_count = pd.DataFrame(index=[i for i, j in top_author_counts_by_month])
for i in top_n_users:
    top_authors_count[i] = 0

# over all months and the top authors of all time, calculate the number of messages send, zero if they had no messages that month
for i, j in author_counts_by_month:
    for k in top_n_users:
        top_authors_count.at[i, k] = j.get(k, 0)

top_authors_count

# Average msgs per active user by month
plot_dataframe(
    top_authors_count/(30*24),
    title=f'Average msgs per hour for each of the top {TOP_N_EVER} authors ever',
    xlabel='Datetime (ref:UTC)',
    ylabel='messages/hour',
#     yscale='log'
    output_file_name='pngs/perc_top_users.png',
    lables=msgs_by_user[:TOP_N_EVER].index.to_list(),
    override_font=True,
)

In [None]:
# Show the TOP_N_EVER most prolific authors of all time
msgs_by_user.index[:TOP_N_EVER].to_list()

In [None]:
# Show the set of authors who were among the top TOP_N_PER_MONTH in at least one month
all_top_authors