In [None]:
# Standard library
import os
import sys
import pathlib
import json

# Numerical / tabular stack
import numpy as np
import pandas as pd
# Never truncate columns when a DataFrame is rendered.
pd.set_option('display.max_columns', None)

import seaborn as sns

import scipy

import matplotlib.pylab as plt
# The root logger below runs at DEBUG; keep matplotlib's own logging at 'error'.
plt.set_loglevel('error')
%matplotlib inline

import logging
# Send log records to stdout (instead of the default stderr) at DEBUG level.
logging.basicConfig(stream=sys.stdout, format='%(asctime)-15s %(message)s',
                level=logging.DEBUG, datefmt=None)
logger = logging.getLogger("db-ingest")

from IPython.display import display, Markdown

In [None]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last
# one. Cells in this notebook rely on this to show several results at once.
InteractiveShell.ast_node_interactivity = "all"

In [None]:
# Load the raw user and event tables; the first CSV column becomes the index.
users_df, events_df = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('../data/users.csv', '../data/events.csv')
)
# Preview the first rows of each table.
users_df.head(3)
events_df.head(3)

In [None]:
# List the columns available in each raw table before selecting a subset.
events_df.columns
users_df.columns

In [None]:
# Columns retained from each table for the rest of the notebook.
user_columns = ['id', 'created', 'storage', 'plan']
event_columns = ['client.user_id', 'direction', 'size', 'status', 'time.backend', 'timestamp']

# Narrow both frames to the selected columns and preview the result.
users_df = users_df.loc[:, user_columns]
users_df.head()
events_df = events_df.loc[:, event_columns]
events_df.head()

In [None]:
### TRANSFER SPEED
# Unit conversion factors.
# NOTE(review): `size` is assumed to be in bytes and `time.backend` in
# milliseconds (the original code divided both by 1e3) -- confirm against the
# data dictionary.
KB = 1e3   # bytes per (decimal) kilobyte
mS = 1e-3  # seconds per millisecond

# Plain vectorised arithmetic instead of a per-element `.apply(lambda ...)`.
events_df['size_KB'] = events_df['size'] / KB
# Use the time factor for the time column (the original divided by `KB`).
events_df['time.backend [s]'] = events_df['time.backend'] * mS

# A non-positive duration would yield an infinite (or negative) speed, and a
# single `inf` makes a user's mean speed infinite. Mask such durations so the
# speed becomes NaN, which `mean` skips by default.
positive_duration_s = events_df['time.backend [s]'].where(events_df['time.backend [s]'] > 0)
events_df['transfer_speed [KB/s]'] = events_df['size_KB'] / positive_duration_s

# Positional preview of the first six rows. The original `.loc[:5]` is a label
# slice whose result depends on the contents of the index.
events_df[['size', 'size_KB', 'time.backend', 'time.backend [s]', 'transfer_speed [KB/s]']].head(6)

In [None]:
# Persist the pre-processed tables as CSV files.
for frame, out_path in ((events_df, '../data/events_post.csv'),
                        (users_df, '../data/users_post.csv')):
    frame.to_csv(out_path)

# Data Analysis: 

### Questions:
1. Are users’ average upload speed and average download speed correlated?
Bonus: what if we only include users with at least 20 upload and
download events each?

2. Are users with a storage plan of >= 100 GB more active (in terms of number
of files uploaded) than users with a < 100 GB plan?

3. Are files larger than 10MB uploaded faster than files smaller than 10MB?
Bonus: is this consistent on a per-user basis?


In [None]:
# Row counts of both inputs, shown for comparison with the merged frame below.
len(events_df)
len(users_df)

# Attach the owning user's account data to every event. The inner join drops
# events whose `client.user_id` has no matching `id` in the users table.
df = events_df.merge(users_df, left_on='client.user_id', right_on='id', how='inner')

# Convert storage to binary gigabytes / terabytes using exact factors.
# The original constants were truncated approximations of 1 / 2**30:
#   * 9.31e-10 maps a plan of exactly 100 * 2**30 bytes to ~99.97, so it would
#     fail a later `>= 100` comparison;
#   * 9.31e-12 for TB is ~10x too large (1 / 2**40 is ~9.09e-13).
# NOTE(review): `storage` is assumed to be expressed in bytes -- confirm.
BYTES_PER_GB = 2 ** 30
BYTES_PER_TB = 2 ** 40
df['storage [GB]'] = df['storage'] / BYTES_PER_GB
df['storage [TB]'] = df['storage'] / BYTES_PER_TB

# Merged row count, resulting columns and a preview.
len(df)
df.columns
df.head()

In [None]:
# Per (user, direction): mean transfer speed and number of events.
speed_and_count = {'transfer_speed [KB/s]': 'mean', 'direction': 'count'}
g = (
    df.groupby(['client.user_id', 'direction'])
      .agg(speed_and_count)
      .rename(columns={'direction': 'direction_count'})
)

# KDE of the per-user mean speed, one curve per direction. The aggregated
# `direction` column was renamed above, so `hue='direction'` refers to the
# group-by index level of the same name.
_ = sns.displot(g, x='transfer_speed [KB/s]', hue='direction', kind='kde')

The mean upload transfer speed is lower than the mean download transfer speed, with short tails.

In [None]:
# Bonus question: keep only users with at least MIN_EVENTS events in *each*
# direction. The original row-wise filter kept e.g. the upload row of a user
# with 25 uploads but only 3 downloads.
MIN_EVENTS = 20

# One row per user, one column per direction present in the data; a user with
# no events in some direction gets NaN there, which fails the `>=` test.
counts_by_direction = g['direction_count'].unstack('direction')
has_enough = counts_by_direction.ge(MIN_EVENTS).all(axis=1)
eligible_users = has_enough[has_enough].index

# Boolean mask aligned with `g`: True for every row of an eligible user.
mask = pd.Series(
    g.index.get_level_values('client.user_id').isin(eligible_users),
    index=g.index,
)
gg = g[mask]
_ = sns.displot(gg, x='transfer_speed [KB/s]', hue='direction', kind='kde')

In [None]:
# Number of merged rows per distinct storage size; missing values are
# counted as their own bucket (dropna=False).
df['storage [GB]'].value_counts(dropna=False)

In [None]:
# Flag every merged row whose user has a storage plan of at least 100 GB.
df['storage >= 100 GB'] = df['storage [GB]'] >= 100.0
counts = df['storage >= 100 GB'].value_counts()

# Count True/False explicitly. `counts[1]` / `counts[0]` on a bool-indexed
# Series does not reliably select True / False: `value_counts` orders by
# frequency, and an integer key may be treated as a position.
n_true = int(df['storage >= 100 GB'].sum())
n_false = int((~df['storage >= 100 GB']).sum())
display(Markdown(f'# value counts storage >= 100 GB: TRUE == {n_true} / FALSE == {n_false}'))

# `df` holds one row per event, so take one row per user before computing the
# share of users. The share is TRUE / total (the mean of the boolean flag),
# not the TRUE / FALSE ratio the original printed as a percentage.
flag_per_user = df.drop_duplicates(subset='client.user_id')['storage >= 100 GB']
pct_users = round(100 * float(flag_per_user.mean()), 2) if len(flag_per_user) else float('nan')
display(Markdown(f'# users with a storage >= 100 GB: % {pct_users}'))

In [None]:
# Number of events per (user, direction, plan-size flag).
g = (
    df.groupby(['client.user_id', 'direction', 'storage >= 100 GB'])
      .agg({'direction': 'count'})
      .rename(columns={'direction': 'direction_count'})
)

# Select each direction explicitly; `~mask` would silently label any other
# direction value as "download".
direction_level = g.index.get_level_values(1)
mask = direction_level == 'upload'
download_mask = direction_level == 'download'

# Report event totals. `len(g)` and `mask.sum()` count groups (rows of `g`),
# not events, so the original labels did not match the numbers shown.
total_events = int(g['direction_count'].sum())
upload_events = int(g.loc[mask, 'direction_count'].sum())
display(Markdown(f'### number of events (both directions): {total_events}'))
display(Markdown(f'### number of upload events: {upload_events}'))

# `common_norm=False` normalises each hue group on its own, so the two curves
# compare distribution shape rather than group size.
upload_grid = sns.displot(g[mask], x='direction_count', hue='storage >= 100 GB',
                          kind='kde', common_norm=False)  # upload
_ = upload_grid.set(title='direction == upload')
download_grid = sns.displot(g[download_mask], x='direction_count', hue='storage >= 100 GB',
                            kind='kde', common_norm=False)  # download
_ = download_grid.set(title='direction == download')

### Statistical correlation

In [None]:
# Rebuild the per-(user, direction) summary used by the statistical tests:
# mean transfer speed and number of events.
per_user_direction = df.groupby(['client.user_id', 'direction'])
g = per_user_direction.agg({'transfer_speed [KB/s]': 'mean', 'direction': 'count'})
g = g.rename(columns={'direction': 'direction_count'})
g

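# Kolmogorov-Smirnov Test:
The null hypothesis is that the two samples are drawn from the same distribution. Thus, the lower the p-value, the stronger the statistical evidence that the two distributions *are different*. Note that this test compares the two distributions as a whole; it does not by itself measure whether a given user's upload and download speeds are correlated.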

In [None]:
# Split the per-user summary by direction. Both directions are selected
# explicitly; `~mask` would treat any other direction value as download.
direction_level = g.index.get_level_values('direction')
mask = direction_level == 'upload'
ats_upload = g[mask]
ats_download = g[direction_level == 'download']

# Two-sample KS test: are the upload and download mean-speed distributions the
# same? This ignores which user each value belongs to.
scipy.stats.ks_2samp(ats_upload['transfer_speed [KB/s]'], ats_download['transfer_speed [KB/s]'])

# Question 1 asks about correlation, which needs the values paired per user.
# Build one row per user with both mean speeds and drop users lacking a finite
# value in either direction.
paired_speeds = (
    g['transfer_speed [KB/s]']
    .unstack('direction')
    .reindex(columns=['upload', 'download'])
    .replace([np.inf, -np.inf], np.nan)
    .dropna()
)
# Spearman rank correlation is used as it is less sensitive to extreme speeds
# than Pearson's r.
scipy.stats.spearmanr(paired_speeds['upload'], paired_speeds['download'])

In [None]:
def ecdf(x):
    """Return the empirical cumulative distribution function of a sample.

    Parameters
    ----------
    x : array-like
        One-dimensional sample (list, NumPy array, pandas Series, ...).

    Returns
    -------
    xs : numpy.ndarray
        The sample values sorted in ascending order, with NaNs removed.
    ys : numpy.ndarray
        For each entry of ``xs``, the fraction of (non-NaN) observations that
        are less than or equal to it; the last value is 1.0. Both arrays are
        empty when the sample has no usable observations.
    """
    values = np.asarray(x)
    # NaNs sort to the end and would inflate the denominator, so the plotted
    # curve would never reach 1. Drop them for floating-point input.
    if values.dtype.kind == 'f':
        values = values[~np.isnan(values)]
    xs = np.sort(values)
    n_obs = len(xs)
    if n_obs == 0:
        # Avoid dividing by zero for an empty sample.
        return xs, np.empty(0, dtype=float)
    ys = np.arange(1, n_obs + 1) / float(n_obs)
    return xs, ys


# Overlay the empirical CDFs of per-user mean speed for both directions.
fig, ax = plt.subplots()
for curve_label, summary in (('upload', ats_upload), ('download', ats_download)):
    speeds_sorted, cumulative = ecdf(summary['transfer_speed [KB/s]'])
    ax.plot(speeds_sorted, cumulative, label=curve_label)
_ = ax.set_title('ECDF for download/upload transfer_speed [KB/s]')
_ = ax.legend()

In [None]:
# Bonus question: restrict to users with at least MIN_EVENTS events in *both*
# directions. Filtering each direction independently (as before) would compare
# two different sets of users.
MIN_EVENTS = 20
user_level = 'client.user_id'

users_enough_uploads = (
    ats_upload[ats_upload.direction_count >= MIN_EVENTS].index.get_level_values(user_level)
)
users_enough_downloads = (
    ats_download[ats_download.direction_count >= MIN_EVENTS].index.get_level_values(user_level)
)
eligible_users = users_enough_uploads.intersection(users_enough_downloads)

ats_upload_gt20 = ats_upload[ats_upload.index.get_level_values(user_level).isin(eligible_users)]
ats_download_gt20 = ats_download[ats_download.index.get_level_values(user_level).isin(eligible_users)]

# Two-sample KS test on the restricted user set.
scipy.stats.ks_2samp(ats_upload_gt20['transfer_speed [KB/s]'], ats_download_gt20['transfer_speed [KB/s]'])

In [None]:
# Same ECDF overlay as above, restricted to the direction-count >= 20 subsets.
fig, ax = plt.subplots()
for curve_label, summary in (('upload', ats_upload_gt20), ('download', ats_download_gt20)):
    speeds_sorted, cumulative = ecdf(summary['transfer_speed [KB/s]'])
    ax.plot(speeds_sorted, cumulative, label=curve_label)
_ = ax.set_title('ECDF for download/upload transfer_speed [KB/s] (direction count >= 20)')
_ = ax.legend()