# Statistics of KuaiRec

## Load data

In [None]:
import ast

import pandas as pd

# Load every KuaiRec table from the local "data/" directory into a DataFrame.
# Two columns are stored in the CSV files as text and are expected to hold
# Python list literals; they are parsed with ast.literal_eval, which only
# accepts literals, instead of eval, which would execute arbitrary
# expressions found in the file.

print("Loading big matrix...")
big_matrix = pd.read_csv("data/big_matrix.csv")
print("Loading small matrix...")
small_matrix = pd.read_csv("data/small_matrix.csv")

print("Loading social network...")
social_network = pd.read_csv("data/social_network.csv")
# Turn the textual friend lists into real Python lists.
social_network["friend_list"] = social_network["friend_list"].map(ast.literal_eval)

print("Loading item features...")
item_feat = pd.read_csv("data/item_categories.csv")
# Turn the textual tag lists into real Python lists.
item_feat["feat"] = item_feat["feat"].map(ast.literal_eval)

print("Loading user features...")
user_feat = pd.read_csv("data/user_features.csv")

print("Loading items' daily features...")
item_daily_feat = pd.read_csv("data/item_daily_feat.csv")

print("All data loaded.")

## Visualization of the six tables

In [None]:
# Display the DataFrame loaded from data/big_matrix.csv.
big_matrix

In [None]:
# Display the DataFrame loaded from data/small_matrix.csv.
small_matrix

In [None]:
# Display the DataFrame loaded from data/item_categories.csv ("feat" holds parsed lists).
item_feat

In [None]:
# Display the DataFrame loaded from data/social_network.csv ("friend_list" holds parsed lists).
social_network

In [None]:
# Display the DataFrame loaded from data/item_daily_feat.csv.
item_daily_feat

In [None]:
# Display the DataFrame loaded from data/user_features.csv.
user_feat

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
# Font properties built from the local SimHei font file.
# NOTE(review): presumably meant for rendering Chinese text in figures; it is
# not referenced by any of the cells visible here — confirm it is still needed.
myfont = FontProperties(fname="./SimHei.ttf")

def visual_continue(df, func=None):
    """Plot the distribution of a continuous variable and show the figure.

    Args:
        df: Data to plot. It must expose a ``name`` attribute, which is used
            for the figure title and the x-axis label.
        func: Optional callable invoked with the axes returned by seaborn
            right after plotting, before the title and labels are set.
    """
    # NOTE(review): sns.distplot is deprecated in recent seaborn releases;
    # it is kept here so the rendered figure stays the same.
    ax = sns.distplot(df)
    if func:
        func(ax)

    gca = plt.gca()
    fig_title = "Statistics of {}".format(df.name)
    gca.set_title(fig_title, fontsize=14)
    gca.set_ylabel("Density", fontsize=14)
    gca.set_xlabel(df.name, fontsize=14)

    plt.show()

def visual_statistics_discrete(df, var="my_variable", display_ratio=True, func=None, order=None, size=(6, 4.5)):
    """Draw a count plot of a discrete variable, save it as a PNG and show it.

    Args:
        df: Values to count; passed to ``sns.countplot`` as ``x``.
        var: Variable name used in the title, the x-axis label and the
            output file name (``"<var>.png"`` in the working directory).
        display_ratio: When true, annotate every bar with its share of
            ``len(df)`` and add a secondary y-axis scaled in percent.
        func: Optional callable invoked with the count-plot axes after the
            title and labels are set (for example to adjust the ticks).
        order: Optional category order forwarded to ``sns.countplot``.
        size: Figure size as ``(width, height)``, passed to ``plt.figure``.
    """
    ncount = len(df)

    fig = plt.figure(figsize=size)
    ax1 = fig.add_axes([0.14, 0.15, 0.74, 0.75])
    sns.countplot(x=df, color="#9fc5e8", linewidth=.6, edgecolor='k', ax=ax1, order=order)

    ax1.grid(axis='y', linestyle='-.')

    fig_title = "Statistics of {}".format(var)
    ax1.set_title(fig_title, fontsize=14)
    ax1.set_ylabel("Count", fontsize=14)
    ax1.set_xlabel(var, fontsize=14)

    if func:
        func(ax1)

    # Skip the ratio axis for empty input: every ratio would divide by zero.
    if display_ratio and ncount:
        # Twin axis that shows the same bars on a percentage scale.
        ax2 = ax1.twinx()
        ax2.set_ylabel("ratio (%)", fontsize=14)

        for p in ax1.patches:
            (x0, _), (x1, y1) = p.get_bbox().get_points()
            # Label the top centre of the bar with its share of all values.
            ax1.annotate('{:.1f}%'.format(100. * y1 / ncount), ((x0 + x1) / 2, y1),
                         ha='center', va='bottom', fontsize=10, rotation=30)

        # Align the percentage scale with the count scale of the left axis.
        ax2.set_ylim(0, ax1.get_ylim()[1] / ncount * 100)

    # Name the file after the plotted variable. The previous literal
    # "f{var}.png" had the f-prefix inside the quotes, so every call
    # overwrote one file with that literal name.
    plt.savefig("{}.png".format(var))
    plt.show()
    

In [None]:
# Install a catch-all "ignore" filter so that no warnings are shown by the cells run after this one.
import warnings; warnings.simplefilter('ignore')

## Statistics of social network

In [None]:
# Length of the friend list in each row of the social network table.
num_friends = social_network["friend_list"].map(len)
print(num_friends.describe())
visual_statistics_discrete(num_friends, "number of friends")

## Statistics of video features

In [None]:
# Length of the "feat" list in each row of the item feature table.
num_feat = item_feat["feat"].map(len)
print(num_feat)
visual_statistics_discrete(num_feat, var="number of tags")

## Count of 31 tags 

In [None]:
import collections
import itertools

# Per-row occurrence counts of the entries in each "feat" list.
cnt = item_feat.feat.map(collections.Counter)
# Occurrence counts over all rows: count every entry of every "feat" list.
cnt_all = collections.Counter(itertools.chain.from_iterable(item_feat["feat"]))
# Expand the counts back into one sorted value per occurrence so that the
# count plot can tally them.
all_feat = pd.Series(sorted(cnt_all.elements()), name="feat")
visual_statistics_discrete(all_feat, "tag", size=(12, 4.5))

## Watch_ratio in big matrix

In [None]:
# Restrict the plot to rows whose watch_ratio is at most 5.
big_watch_ratio = big_matrix.loc[big_matrix["watch_ratio"] <= 5, "watch_ratio"]
print(big_watch_ratio.describe())
visual_continue(big_watch_ratio)

## Watch_ratio in small matrix

In [None]:
# Restrict the plot to rows whose watch_ratio is at most 5.
small_watch_ratio = small_matrix.loc[small_matrix["watch_ratio"] <= 5, "watch_ratio"]
print("watch_ratio in small matrix")
print(small_watch_ratio.describe())
visual_continue(small_watch_ratio)

## Video duration of the big matrix (in milliseconds)

In [None]:
big_video_duration = big_matrix["video_duration"]
# Summary statistics cover every row; the plot below does not.
print(big_video_duration.describe())
# Plot only durations below 100000 so that the bulk of the distribution stays readable.
visual_continue(big_video_duration.loc[big_video_duration < 100000])

## Video duration of the small matrix (in milliseconds)

In [None]:
small_video_duration = small_matrix["video_duration"]
# Summary statistics cover every row; the plot below does not.
print(small_video_duration.describe())
# Plot only durations below 100000 so that the bulk of the distribution stays readable.
visual_continue(small_video_duration.loc[small_video_duration < 100000])

## Total number of plays per user in the big matrix

In [None]:
# Count the rows of each user by taking len() of that user's "date" values.
big_play_time = big_matrix.groupby(by="user_id").agg({"date": len})
# visual_continue reads `.name` for the title and the x-axis label.
# NOTE(review): the dict form of agg appears to return a DataFrame, in which
# case this sets a plain Python attribute rather than a Series name — confirm.
big_play_time.name = "play times"
print(big_play_time.describe())
visual_continue(big_play_time)

## Total number of plays per user in the small matrix

In [None]:
# Count the rows of each user by taking len() of that user's "date" values.
small_play_time = small_matrix.groupby(by="user_id").agg({"date": len})
# visual_continue reads `.name` for the title and the x-axis label.
# NOTE(review): the dict form of agg appears to return a DataFrame, in which
# case this sets a plain Python attribute rather than a Series name — confirm.
small_play_time.name = "play times"
print(small_play_time.describe())
visual_continue(small_play_time)

## Daily number of plays per user in the big matrix

In [None]:
# Number of rows for every (user_id, date) pair, named for the plot labels.
big_daily_play_time = big_matrix.groupby(by=["user_id", "date"]).size().rename("play times")
print(big_daily_play_time.describe())
visual_continue(big_daily_play_time)

## Daily number of plays per user in the small matrix

In [None]:
# Number of rows for every (user_id, date) pair, named for the plot labels.
small_daily_play_time = small_matrix.groupby(by=["user_id", "date"]).size().rename("play times")
print(small_daily_play_time.describe())
visual_continue(small_daily_play_time)

## Distribution of play dates in the big matrix

In [None]:
import functools
def adjust_xticks(ax, step=3, rotation=45):
    """Thin out and rotate the x tick labels of a plot.

    Keeps every ``step``-th tick, starting at position 0, and rotates the
    remaining labels.

    Args:
        ax: Axes to adjust. Its x ticks are assumed to sit at the integer
            positions ``0..n-1`` (one per current tick label).
        step: Distance between the ticks that are kept.
        rotation: Rotation applied to the remaining labels, in degrees.
    """
    labels = [tick.get_text() for tick in ax.get_xticklabels()]
    kept = list(range(0, len(labels), step))
    ax.set_xticks(kept)

    # Re-attach each surviving tick's own label. When the labels were attached
    # as a fixed list, moving the ticks alone leaves the first len(kept)
    # labels on the new positions, so the ticks would name the wrong category.
    # Skipped when the label texts are not populated, to avoid blanking labels
    # that are generated on the fly.
    if any(labels):
        ax.set_xticklabels([labels[i] for i in kept])

    for tick in ax.get_xticklabels():
        tick.set_rotation(rotation)

In [None]:
# NOTE(review): the category order is taken from the dates of the SMALL
# matrix, in order of first appearance — confirm this is intended for a plot
# of the big matrix.
visual_statistics_discrete(
    big_matrix.date,
    "date",
    display_ratio=False,
    func=adjust_xticks,
    order=small_matrix.date.dropna().map(int).unique(),
)

## Distribution of play dates in the small matrix

In [None]:
# Drop rows without a date and convert the remaining dates to int before counting.
visual_statistics_discrete(
    small_matrix.date.dropna().map(int),
    "date",
    display_ratio=False,
    func=adjust_xticks,
)