In [None]:
import pandas as pd
import numpy as np
from scipy import stats
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots 
from tqdm.notebook import tqdm
import re 
# Display configuration: show every column and allow up to 100 characters per cell.
# The fully qualified key is required: the short pattern 'max_columns' matches more
# than one option on recent pandas releases and raises OptionError.
pd.set_option('display.max_columns', None)
pd.options.display.max_colwidth = 100
from pandas.api.types import CategoricalDtype

In [None]:
# Read the dataset from data.csv; the first CSV column is used as the row index.
# Later cells rely on the 'cid' and 'agent' columns being present.
df = pd.read_csv(filepath_or_buffer='data.csv', index_col=0)
df.shape

In [None]:
# Per-cid statistics over the 'agent' column:
#   count  - number of non-null agent values recorded for the cid
#   unique - number of distinct agent values recorded for the cid
# Rows are ordered from the most to the least frequent cid.
df7 = (
    df.groupby('cid')['agent']
    .agg(['count', 'nunique'])
    .reset_index()
    .rename(columns={'nunique': 'unique'})
    .astype({'count': int, 'unique': int})
    .sort_values(by=['count'], ascending=False)
)
df7.head()

In [None]:
# Every cid starts without a bucket label; addCountType fills the labels in.
df7['count_type'] = ''


def addCountType(l, r, name):
    """Write `name` into df7['count_type'] for rows whose 'count' lies in [l, r).

    Mutates the notebook-level frame `df7` in place and returns nothing.
    """
    in_bucket = df7['count'].ge(l) & df7['count'].lt(r)
    df7.loc[in_bucket, 'count_type'] = name

# Label each cid with a half-open request-count bucket [lower, upper).
addCountType(1, 2, '1')
addCountType(2, 10, '[2,10)')
addCountType(10, 100, '[10,100)')
addCountType(100, 1000, '[100,1000)')
addCountType(1000, 10000, '[1000,10000)')
addCountType(10000, 100000, '[10000,100000)')

# Sanity check: number of cids that fall above the last bucket and therefore keep
# the empty label. The comparison is >= because the top bucket excludes 100000
# itself; a strict > would silently miss a cid with exactly 100000 requests.
df7[df7['count'] >= 100000].shape[0]

In [None]:
# Every cid starts without a bucket label; addUniqueType fills the labels in.
df7['unique_type'] = ''


def addUniqueType(l, r, name):
    """Write `name` into df7['unique_type'] for rows whose 'unique' lies in [l, r).

    Mutates the notebook-level frame `df7` in place and returns nothing.
    """
    in_bucket = df7['unique'].ge(l) & df7['unique'].lt(r)
    df7.loc[in_bucket, 'unique_type'] = name

# Label each cid with a half-open bucket [lower, upper) of its distinct-agent count.
addUniqueType(l=1, r=2, name='1')
addUniqueType(l=2, r=10, name='[2,10)')
addUniqueType(l=10, r=100, name='[10,100)')
addUniqueType(l=100, r=1000, name='[100,1000)')

# Sanity check: number of cids at or above the last bucket's upper bound, which
# therefore keep the empty label.
len(df7.loc[df7['unique'] >= 1000])

In [None]:
# Cross-tabulate the two bucket labels: one row per (count_type, unique_type)
# pair with the number of cids that carry that pair of labels.
df8 = (
    df7.groupby(['count_type', 'unique_type'])['cid']
    .count()
    .rename('count')
    .reset_index()
)

# Share of all cids in each pair, in percent, rounded to three decimals.
total = df8['count'].sum()
df8['percentage'] = (df8['count'] / total * 100).apply(lambda pct: round(pct, 3))

In [None]:
# Preview the first five bucket pairs.
df8.head(5)

In [None]:
# Totals per unique-agent bucket. Only the numeric columns are summed: summing the
# whole frame would also try to "sum" the count_type labels, which concatenates
# strings (or fails for categoricals) on pandas versions that no longer drop
# non-numeric columns automatically.
df8.groupby('unique_type')[['count', 'percentage']].sum()

In [None]:
# Ordered categorical dtypes so that the bucket labels sort by magnitude rather
# than alphabetically.
cat_unique_order = CategoricalDtype(
    categories=['1', '[2,10)', '[10,100)', '[100,1000)'],
    ordered=True,
)
cat_count_order = CategoricalDtype(
    categories=['1', '[2,10)', '[10,100)', '[100,1000)', '[1000,10000)', '[10000,100000)'],
    ordered=True,
)

df8['unique_type'] = df8['unique_type'].astype(cat_unique_order)
df8['count_type'] = df8['count_type'].astype(cat_count_order)

# Labels outside the category lists (such as the empty placeholder label) become
# NaN in the casts above, and dropna() removes those rows.
df8 = df8.sort_values('unique_type').sort_values('count_type').dropna()
df8.head()

In [None]:
# Heatmap matrix: rows are unique-agent buckets, columns are request-count buckets,
# values are the natural log of the number of cids in each pair.
# Pairs that do not occur are NaN after the pivot and are filled with 0.
counts_wide = df8.pivot(index='unique_type', columns='count_type', values='count')
data = np.log(counts_wide).fillna(0)
data.head()

In [None]:
# Cell annotations for the heatmap: the percentage of cids in each bucket pair,
# rendered as strings such as "12.345%".
text = df8.pivot(index='unique_type', columns='count_type', values='percentage')

# Convert the whole frame at once instead of naming each bucket explicitly, so the
# cell keeps working when a bucket is absent from the data (no KeyError) and does
# not need the transpose round-trip.
text = text.astype(str) + "%"

# Bucket pairs that never occur are NaN after the pivot; show them as 0%.
text = text.replace("nan%", "0%")
text.head()

In [None]:
# Heatmap of the bucket matrix, annotated with the percentage strings in `text`.
# NOTE(review): the colour scale encodes `data` (log of the cid count) while the
# colour label reads "percentage of total cid" - confirm the label is intended.
fig = px.imshow(
    data,
    origin='lower',
    labels={'color': "percentage of total cid"},
    text_auto=True,
)
fig.update_traces(text=text, texttemplate="%{text}", hovertemplate=None)

fig.update_xaxes(side='bottom', title="range of times each cid is requested")
fig.update_yaxes(title="range of unique user agents each cid is requested by")

fig.show()

In [None]:
# Keep cids requested more than 100 times, renumber them 0..n-1 in their current
# order, and expose that position as an 'index' column.
data = df7.loc[df7['count'] > 100].reset_index(drop=True).reset_index()

# Position of each cid as a fraction of the number of rows kept.
data['idx_percentage'] = data['index'] / len(data)
data.head()

In [None]:
# Two line traces over the same x positions: total requests per cid and the
# number of distinct user agents per cid.
fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=data['idx_percentage'],
        y=data['count'],
        mode='lines',
        name='total number of times<br>cid is requested',
    )
)
fig.add_trace(
    go.Scatter(
        x=data['idx_percentage'],
        y=data['unique'],
        mode='lines',
        name='number of unique user agent<br>cid is requested by',
    )
)

# x is a fraction, shown as whole percentages; y uses a log scale.
fig.update_xaxes(tickformat=',.0%')
fig.update_yaxes(type="log")

fig.update_layout(
    xaxis_title='cid (request time >100 and sorted by request time)',
    yaxis_title='count (log scale)',
)

fig.show()