In [None]:
import pandas as pd
import numpy as np
from scipy import stats
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots 
from tqdm.notebook import tqdm
import re 
# Display configuration for the whole notebook.
# Fully-qualified option names are used on purpose: the bare pattern
# 'max_columns' matches more than one key on pandas versions that also define
# 'styler.render.max_columns', and set_option raises OptionError in that case.
pd.set_option('display.max_columns', None)  # None = never hide columns when displaying a frame
pd.set_option('display.max_colwidth', 100)  # cap the displayed width of a cell at 100 characters

In [None]:
# Load the request log. The first CSV column becomes the row index.
# Later cells read the 'cid' and 'bytes_returned' columns of this frame.
df = pd.read_csv(
    'data.csv',
    index_col=0,
)
df.shape

We will begin by tallying the number of requests for each unique CID and grouping the CIDs by this request count. From there, we can calculate the size of the files within each group, allowing us to understand the distribution of file sizes for the most requested content.

# Request time

In [None]:
# One row per CID:
#   count - number of rows in df with a non-null bytes_returned for that CID
#   size  - mean bytes_returned over those rows, cast to int
# The result is ordered by ascending size.
df1 = (
    df.groupby('cid')['bytes_returned']
    .agg(['count', 'mean'])
    .reset_index()
    .astype({'mean': int})
    .rename(columns={"mean": "size"})
    .sort_values(by=['size'])
)
df1.head()

In [None]:
# Histogram of request counts: 'request_time' is a distinct per-CID request
# count and 'count' is the number of CIDs requested exactly that many times.
# Both output columns are named explicitly (rename_axis / reset_index(name=...))
# instead of renaming a column called 'index': what value_counts() calls its
# index and values differs between pandas 1.x and 2.x, and on 2.x a plain
# reset_index() fails because the index and the values are both named 'count'.
df2 = (
    df1['count']
    .value_counts()
    .rename_axis('request_time')
    .reset_index(name='count')
    .sort_values(by=['request_time'])
)
df2.head()

In [None]:
df3 = pd.DataFrame(columns = ['request_time', 'count'])

def addRow(df3, l, r=None):
    """Return ``df3`` plus one row counting the CIDs whose request count is in [l, r).

    Reads the notebook-level ``df1`` (one row per CID, with a 'count' column).

    Parameters
    ----------
    df3 : DataFrame
        Table built so far, with 'request_time' and 'count' columns.
        It is not modified.
    l : int
        Inclusive lower bound on the per-CID request count.
    r : int or None
        Exclusive upper bound. ``None`` means the bin has no upper bound and
        is labelled '[l,+∞)'.

    Returns
    -------
    DataFrame
        A new frame holding the rows of ``df3`` followed by the new bin row.
    """
    in_bin = df1['count'] >= l
    if r is None:
        label = '[' + str(l) + ',+∞)'
    else:
        in_bin = in_bin & (df1['count'] < r)
        label = '[' + str(l) + ',' + str(r) + ')'
    # Summing the boolean mask gives the number of CIDs in the bin.
    row = pd.DataFrame([{'request_time': label, 'count': int(in_bin.sum())}])
    if df3.empty:
        # Nothing to keep from an empty table; returning the row directly also
        # avoids concatenating with an empty frame.
        return row
    # DataFrame.append no longer exists in pandas 2.x; concat is the replacement.
    return pd.concat([df3, row], ignore_index=True)

df3 = addRow(df3, 1, 2)
df3 = addRow(df3, 2, 10)
df3 = addRow(df3, 10, 100)
df3 = addRow(df3, 100, 1000)
df3 = addRow(df3, 1000, 10000)
# Open-ended last bin: previously this was [10000,100000) relabelled as
# '[10000,+∞)', which silently left out CIDs with 100000 or more requests.
df3 = addRow(df3, 10000)

# The first bin holds a single value, so show it as '1' rather than '[1,2)'.
df3 = df3.replace('[1,2)', '1')

# Share of all counted CIDs that falls in each bin (a fraction, not a percent).
total = df3['count'].sum()
df3['percentage'] = df3['count']/total
df3.head()

In [None]:
# Bar chart of how many CIDs fall in each request-count bin.
# Each bar is labelled with the CID count and, on a second line, its share of
# all CIDs as a percentage. Plotly text needs '<br>' for a line break; a '\n'
# in the label does not start a new line.
fig = px.bar(
    df3,
    x='request_time',
    y='count',
    text=[
        str(count) + '<br>{0:1.2f}%'.format(share * 100)
        for count, share in zip(df3['count'], df3['percentage'])
    ],
)

fig.update_xaxes(title='range of times each cid is requested')
fig.update_yaxes(title='cid count')

fig.show()

In [None]:
# Copy of the per-CID table with an extra 'type' column holding the label of
# the request-count bin each CID belongs to.
df5 = df1.copy()

df5['type'] = ''

def addType(l, r=None):
    """Label the rows of the notebook-level ``df5`` whose 'count' is in [l, r).

    Modifies ``df5`` in place by writing the bin label into its 'type' column.

    Parameters
    ----------
    l : int
        Inclusive lower bound on the per-CID request count.
    r : int or None
        Exclusive upper bound. ``None`` means the bin has no upper bound and
        is labelled '[l,+∞)'.
    """
    in_bin = df5['count'] >= l
    if r is None:
        label = '[' + str(l) + ',+∞)'
    else:
        in_bin = in_bin & (df5['count'] < r)
        label = '[' + str(l) + ',' + str(r) + ')'
    df5.loc[in_bin, 'type'] = label

# CIDs requested exactly once get the plain label '1'.
df5.loc[df5['count'] == 1, 'type'] = '1'
addType(2, 10)
addType(10, 100)
addType(100, 1000)
addType(1000, 10000)
# Lower bound is inclusive: the previous `> 10000` test left a CID with exactly
# 10000 requests with the empty label, outside every bin.
addType(10000)

df5.head()

In [None]:
# Named quantile helpers. They are passed to ``agg`` in the next cell, where
# the function name is what identifies each statistic.

def q1(x):
    """Return the 0.01 quantile of ``x``."""
    return x.quantile(q=0.01)


def q10(x):
    """Return the 0.1 quantile of ``x``."""
    return x.quantile(q=0.1)


def q90(x):
    """Return the 0.9 quantile of ``x``."""
    return x.quantile(q=0.9)


def q99(x):
    """Return the 0.99 quantile of ``x``."""
    return x.quantile(q=0.99)


# Spread of 'size' within each request-count bin: min, the 0.01 / 0.1
# quantiles, median, the 0.9 / 0.99 quantiles and max.
df6 = df5.groupby('type')[['size']].agg(
    ['min', q1, q10, 'median', q90, q99, 'max']
)
# Same table with every value divided by 1024**2.
df6_MB = df6 / (1024 ** 2)
df6_MB

In [None]:
# Box plot of file size per request-count bin, y axis on a log scale.
# assign() builds a new frame with 'size' divided by 1024; writing the column
# into a slice of df5 instead triggers pandas' SettingWithCopyWarning.
data = df5[['type', 'size']].assign(size=lambda frame: frame['size'] / 1024)

# Pin the x-axis order to ascending request count. Without category_orders
# Plotly Express orders the categories by first appearance in the data, and
# df5 is ordered by size, not by bin.
fig = px.box(
    data,
    x="type",
    y="size",
    log_y=True,
    category_orders={
        'type': ['1', '[2,10)', '[10,100)', '[100,1000)', '[1000,10000)', '[10000,+∞)'],
    },
)
fig.update_xaxes(title='range of times each cid is requested')
fig.update_yaxes(title='file size in KB (log scale)')
fig.show()