In [2]:
import sys
import json
import re
import pandas as pd
from collections import defaultdict, Counter
from math import log
from random import shuffle
import csv
import multiprocessing
from multiprocessing import Pool
import numpy as np
import pickle

time_start = pd.Timestamp('2020-01-12')
time_mid = pd.Timestamp('2020-04-12')
time_end = pd.Timestamp('2020-07-13')

print('process vod/tv data')
df = pd.read_parquet('/home/chlee/kktv/transfer/tv_data/interaction_log_v6_20200724_train.parquet', engine='pyarrow')
df = df[ (df['client_upload_timestamp'] > time_start) & (df['client_upload_timestamp'] <= time_end) ]
vod_df = df[ (df['item_type']=='movie') | (df['item_type']=='series') ]
tv_df = df[ ~((df['item_type']=='movie') | (df['item_type']=='series')) ]

vod_play_df = vod_df[vod_df['interaction'] == 'play']
tv_play_df = tv_df[tv_df['interaction'] == 'play']

overlap_users = set(vod_play_df['user_id'].unique()).intersection(set(tv_play_df['user_id'].unique()))
print("overlap users amount: {}".format(len(overlap_users)))

overlap_tv_play_df = tv_play_df[tv_play_df.user_id.isin(overlap_users)]
overlap_vod_play_df = vod_play_df[vod_play_df.user_id.isin(overlap_users)]


print('process item meta...')
with open('/home/chlee/kktv/transfer/tv_data/item_metadata_v6_20200724_train.json', 'r') as j_file:
    item_meta = json.load(j_file)

item_meta_df = pd.DataFrame(item_meta)
vod_item_meta_df = item_meta_df[(item_meta_df['type'] == 'movie') | (item_meta_df['type'] == 'series')]
tv_item_meta_df = item_meta_df[~((item_meta_df['type'] == 'movie') | (item_meta_df['type'] == 'series'))]


tv_genre_table = pd.read_csv('/home/chlee/kktv/transfer/tv_data/catv_genre.csv')
vod_genre_table = pd.read_csv('/home/chlee/kktv/transfer/tv_data/vod_genre.csv')

tv_genre = tv_genre_table.main_genre.unique()
vod_genre = vod_genre_table.main_genre.unique()

def filter_tv_genre(genre_list): # input a list
    item_big_genre_list = []
    for i in range(len(genre_list)):
        if genre_list[i] in tv_genre:
            big_genre = genre_list[i]
            item_big_genre_list.append(big_genre)
    
    return item_big_genre_list


def filter_vod_genre(genre_list): # input a list
    item_big_genre_list = []
    for i in range(len(genre_list)):
        if genre_list[i] in vod_genre:
            big_genre = genre_list[i]
            item_big_genre_list.append(big_genre)
    
    return item_big_genre_list

tv_item_meta_df['big_genre'] = tv_item_meta_df['genre'].apply(lambda x: filter_tv_genre(x))
vod_item_meta_df['big_genre'] = vod_item_meta_df['genre'].apply(lambda x: filter_vod_genre(x))

print('finish processing item meta')


process vod/tv data
overlap users amount: 22071
process item meta...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


finish processing item meta


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [3]:
# read sub genre dist.
with open('./genre_analysis/user_tv_sub_genre_dist.pickle', 'rb') as pickle_file:
    user_tv_sub_genre_dist_pc = pickle.load(pickle_file)
with open('./genre_analysis/user_vod_sub_genre_dist.pickle', 'rb') as pickle_file:
    user_vod_sub_genre_dist_pc = pickle.load(pickle_file)

### Total combination

In [4]:
new_overlap_users = set(user_vod_sub_genre_dist_pc.keys()).intersection(set(user_tv_sub_genre_dist_pc.keys()))
print(len(new_overlap_users))

21924


In [5]:
total_lenth = len(new_overlap_users)

total_dict = defaultdict(float)
for user in new_overlap_users:
    for k,v in user_tv_sub_genre_dist_pc[user].items():
        for _k,_v in user_vod_sub_genre_dist_pc[user].items():
            value = v * _v
            key = (k, _k)
            total_dict[key] += value
sort_total_sub_dict = {k:round((v/total_lenth), 4) for k,v in sorted(total_dict.items(), key=lambda x:x[1], reverse=True)}

In [6]:
list(sort_total_sub_dict.items())[:10]

[(('113:定時・総合', 'J306:アクション･ファンタジー'), 0.025),
 (('113:定時・総合', 'J314:バンダイチャンネル'), 0.0246),
 (('87:トークバラエティ', 'J306:アクション･ファンタジー'), 0.0243),
 (('113:定時・総合', 'J704:バンダイチャンネル'), 0.0241),
 (('87:トークバラエティ', 'J314:バンダイチャンネル'), 0.0241),
 (('87:トークバラエティ', 'J704:バンダイチャンネル'), 0.0237),
 (('113:定時・総合', 'J304:ファミリー･キッズ(シリーズ)'), 0.021),
 (('87:トークバラエティ', 'J304:ファミリー･キッズ(シリーズ)'), 0.0206),
 (('95:その他', 'J306:アクション･ファンタジー'), 0.0202),
 (('95:その他', 'J314:バンダイチャンネル'), 0.02)]

In [7]:
with open('./genre_analysis/sorted_sub_genre_combination_dist.pickle', 'wb') as pickle_file:
    pickle.dump(sort_total_sub_dict, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

### Given VOD genre

In [8]:
new_overlap_users_sub = set(user_vod_sub_genre_dist_pc.keys()).intersection(set(user_tv_sub_genre_dist_pc.keys()))
len(new_overlap_users_sub)

21924

In [9]:
tv_genre = set(tv_genre_table.genre)
tv_main_genre = set(tv_genre_table.main_genre)
tv_sub_genre = tv_genre - tv_main_genre
print(len(tv_sub_genre))

vod_genre = set(vod_genre_table.genre)
vod_main_genre = set(vod_genre_table.main_genre)
vod_sub_genre = vod_genre - vod_main_genre
print(len(vod_sub_genre))

106
137


In [10]:
given_vod_sub_genre = {}
for v in vod_sub_genre:
    given_vod_sub_genre[v] = {}

In [11]:
# 分母改為有看過那個vod genre的人數
vod_sub_genre_user_amount = defaultdict(int)

for user in new_overlap_users_sub:
    for genre in user_vod_sub_genre_dist_pc[user].keys():
        vod_sub_genre_user_amount[genre] += 1
        
        for _tv_genre in user_tv_sub_genre_dist_pc[user].keys():           
            if _tv_genre not in given_vod_sub_genre[genre].keys():
                given_vod_sub_genre[genre][_tv_genre] = user_tv_sub_genre_dist_pc[user][_tv_genre]
            else:
                given_vod_sub_genre[genre][_tv_genre] += user_tv_sub_genre_dist_pc[user][_tv_genre]

#sort and normalize
for key in given_vod_sub_genre.keys():
    temp_dict = {k:round(v/vod_sub_genre_user_amount[key], 4) for k,v in sorted(given_vod_sub_genre[key].items(), \
                                                                        key=lambda x:x[1], reverse=True)}
    given_vod_sub_genre[key] = temp_dict

In [12]:
with open('./genre_analysis/given_vod_sub_genre.pickle', 'wb') as pickle_file:
    pickle.dump(given_vod_sub_genre, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

### Given TV genre

In [13]:
given_tv_sub_genre = {}
for t in tv_sub_genre:
    given_tv_sub_genre[t] = {}

In [14]:
tv_sub_genre_user_amount = defaultdict(int)

for user in new_overlap_users_sub:
    for genre in user_tv_sub_genre_dist_pc[user].keys():
        tv_sub_genre_user_amount[genre] += 1
        
        for _vod_genre in user_vod_sub_genre_dist_pc[user].keys():
            if _vod_genre not in given_tv_sub_genre[genre].keys():
                given_tv_sub_genre[genre][_vod_genre] = user_vod_sub_genre_dist_pc[user][_vod_genre]
            else:
                given_tv_sub_genre[genre][_vod_genre] += user_vod_sub_genre_dist_pc[user][_vod_genre]

#sort and normalize
for key in given_tv_sub_genre.keys():
    temp_dict = {k:round(v/tv_sub_genre_user_amount[key], 4) for k,v in sorted(given_tv_sub_genre[key].items(), \
                                                                        key=lambda x:x[1], reverse=True)}
    given_tv_sub_genre[key] = temp_dict

In [15]:
with open('./genre_analysis/given_tv_sub_genre.pickle', 'wb') as pickle_file:
    pickle.dump(given_tv_sub_genre, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

### Overall

In [18]:
# tv

In [16]:
overall_tv_sub_genre = {}
for user in list(new_overlap_users_sub):
    for _tv_genre in user_tv_sub_genre_dist_pc[user].keys():
        if _tv_genre not in overall_tv_sub_genre.keys():
            overall_tv_sub_genre[_tv_genre] = user_tv_sub_genre_dist_pc[user][_tv_genre]
        else:
            overall_tv_sub_genre[_tv_genre] += user_tv_sub_genre_dist_pc[user][_tv_genre]

#sort and normalize
s_n_overall_tv_sub_genre = {k:round(v/len(new_overlap_users_sub), 4) for k,v in sorted(overall_tv_sub_genre.items(), \
                                  key=lambda x:x[1], reverse=True)}

In [17]:
with open('./genre_analysis/overall_tv_sub_genre_dist.pickle', 'wb') as pickle_file:
    pickle.dump(s_n_overall_tv_sub_genre, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

In [19]:
# vod

In [20]:
overall_vod_sub_genre = {}
for user in list(new_overlap_users_sub):
    for _vod_genre in user_vod_sub_genre_dist_pc[user].keys():
        if _vod_genre not in overall_vod_sub_genre.keys():
            overall_vod_sub_genre[_vod_genre] = user_vod_sub_genre_dist_pc[user][_vod_genre]
        else:
            overall_vod_sub_genre[_vod_genre] += user_vod_sub_genre_dist_pc[user][_vod_genre]

#sort and normalize
s_n_overall_vod_sub_genre = {k:round(v/len(new_overlap_users_sub), 4) for k,v in sorted(overall_vod_sub_genre.items(), \
                                  key=lambda x:x[1], reverse=True)}

In [21]:
with open('./genre_analysis/overall_vod_sub_genre_dist.pickle', 'wb') as pickle_file:
    pickle.dump(s_n_overall_vod_sub_genre, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

### Compare

In [22]:
## s_n_overall_tv_sub_genre
## given_vod_sub_genre

In [23]:
vod_sub_comp_dict = defaultdict(float)
for _vod_genre in vod_sub_genre:
    for _tv_genre in tv_sub_genre:
        try:
            if given_vod_sub_genre[_vod_genre][_tv_genre] > s_n_overall_tv_sub_genre[_tv_genre]:
                dif = (given_vod_sub_genre[_vod_genre][_tv_genre] - s_n_overall_tv_sub_genre[_tv_genre])
                dif = round(dif, 4)
                vod_sub_comp_dict[(_vod_genre, _tv_genre)] = dif
        except KeyError:
            continue

In [24]:
s_vod_sub_comp_dict = {k:v for k,v in sorted(vod_sub_comp_dict.items(), key=lambda x: x[1], reverse=True)}
s_vod_sub_comp_dict

{('J502:海外(刑事･医療)', '35:海外ドラマ'): 0.0773,
 ('J506:海外(歴史･その他)', '35:海外ドラマ'): 0.072,
 ('J1301:囲碁将棋チャンネル', '180:囲碁・将棋'): 0.0588,
 ('J514:華流(ラブストーリー･その他)', '35:海外ドラマ'): 0.0573,
 ('J610:パチンコ･パチスロ', '191:その他'): 0.0563,
 ('n401:ライフスタイル／語学', '113:定時・総合'): 0.0539,
 ('J8566:北九州', '35:海外ドラマ'): 0.0514,
 ('J8565:福岡', '35:海外ドラマ'): 0.0514,
 ('J611:麻雀･競馬', '181:麻雀・パチンコ'): 0.051,
 ('n406:クラシック／オペラ', '113:定時・総合'): 0.0507,
 ('J313:オトメ･声優･ライブ', '129:国内アニメ'): 0.05,
 ('J611:麻雀･競馬', '191:その他'): 0.0486,
 ('J501:海外(見逃し･新作)', '35:海外ドラマ'): 0.0472,
 ('n407:美術', '113:定時・総合'): 0.0471,
 ('n406:クラシック／オペラ', '147:歴史・紀行'): 0.0459,
 ('J601:バラエティ(見逃し･新作)', '52:ゴルフ'): 0.0436,
 ('J513:華流(歴史)', '35:海外ドラマ'): 0.0428,
 ('n502:スポーツ', '113:定時・総合'): 0.042,
 ('n405:歌舞伎／古典／演劇', '122:特集・ドキュメント'): 0.042,
 ('n401:ライフスタイル／語学', '122:特集・ドキュメント'): 0.0417,
 ('n406:クラシック／オペラ', '145:社会・時事'): 0.0416,
 ('J611:麻雀･競馬', '57:競馬・公営競技'): 0.0414,
 ('n407:美術', '122:特集・ドキュメント'): 0.0405,
 ('J312:美少女', '129:国内アニメ'): 0.0404,
 ('n502:スポーツ', '122:特集・ドキュメント'):

In [25]:
list(s_vod_sub_comp_dict.items())[:10]

[(('J502:海外(刑事･医療)', '35:海外ドラマ'), 0.0773),
 (('J506:海外(歴史･その他)', '35:海外ドラマ'), 0.072),
 (('J1301:囲碁将棋チャンネル', '180:囲碁・将棋'), 0.0588),
 (('J514:華流(ラブストーリー･その他)', '35:海外ドラマ'), 0.0573),
 (('J610:パチンコ･パチスロ', '191:その他'), 0.0563),
 (('n401:ライフスタイル／語学', '113:定時・総合'), 0.0539),
 (('J8566:北九州', '35:海外ドラマ'), 0.0514),
 (('J8565:福岡', '35:海外ドラマ'), 0.0514),
 (('J611:麻雀･競馬', '181:麻雀・パチンコ'), 0.051),
 (('n406:クラシック／オペラ', '113:定時・総合'), 0.0507)]

In [31]:
with open('./genre_analysis/sorted_vod_sub_dif.pickle', 'wb') as pickle_file:
    pickle.dump(s_vod_sub_comp_dict, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

In [26]:
## s_n_overall_vod_sub_genre
## given_tv_sub_genre

In [28]:
tv_sub_comp_dict = defaultdict(float)
for _tv_genre in tv_sub_genre:
    for _vod_genre in vod_sub_genre:
        try:
            if given_tv_sub_genre[_tv_genre][_vod_genre] > s_n_overall_vod_sub_genre[_vod_genre]:
                dif = (given_tv_sub_genre[_tv_genre][_vod_genre] - s_n_overall_vod_sub_genre[_vod_genre])
                dif = round(dif, 4)
                tv_sub_comp_dict[(_tv_genre, _vod_genre)] = dif
        except KeyError:
            continue

In [29]:
s_tv_sub_comp_dict = {k:v for k,v in sorted(tv_sub_comp_dict.items(), key=lambda x: x[1], reverse=True)}
s_tv_sub_comp_dict

{('193:社会福祉', 'J304:ファミリー･キッズ(シリーズ)'): 0.1302,
 ('163:ダンス・バレエ', 'J304:ファミリー･キッズ(シリーズ)'): 0.0983,
 ('184:会話・語学', 'J304:ファミリー･キッズ(シリーズ)'): 0.0976,
 ('196:障害者', 'J304:ファミリー･キッズ(シリーズ)'): 0.0968,
 ('187:大学生・受験', 'J304:ファミリー･キッズ(シリーズ)'): 0.0965,
 ('165:歌舞伎・古典', 'J304:ファミリー･キッズ(シリーズ)'): 0.094,
 ('75:民族音楽・ワールドミュージック', 'J304:ファミリー･キッズ(シリーズ)'): 0.0892,
 ('207:その他', 'J304:ファミリー･キッズ(シリーズ)'): 0.0785,
 ('130:海外アニメ', 'J304:ファミリー･キッズ(シリーズ)'): 0.0619,
 ('74:童謡・キッズ', 'J304:ファミリー･キッズ(シリーズ)'): 0.06,
 ('19:アニメ', 'J304:ファミリー･キッズ(シリーズ)'): 0.0579,
 ('180:囲碁・将棋', 'J304:ファミリー･キッズ(シリーズ)'): 0.0556,
 ('186:中学生・高校生', 'J304:ファミリー･キッズ(シリーズ)'): 0.0529,
 ('143:その他', 'J304:ファミリー･キッズ(シリーズ)'): 0.0503,
 ('151:文学・文芸', 'J304:ファミリー･キッズ(シリーズ)'): 0.0426,
 ('185:幼児・小学生', 'J304:ファミリー･キッズ(シリーズ)'): 0.0423,
 ('180:囲碁・将棋', 'J613:趣味･教養'): 0.038,
 ('73:民謡・邦楽', 'J304:ファミリー･キッズ(シリーズ)'): 0.0357,
 ('180:囲碁・将棋', 'J1301:囲碁将棋チャンネル'): 0.031,
 ('195:高齢者', 'J304:ファミリー･キッズ(シリーズ)'): 0.0294,
 ('54:バスケットボール', 'J308:スポーツ･青春'): 0.0281,
 ('188:生涯教育・資格'

In [30]:
list(s_tv_sub_comp_dict.items())[:10]

[(('193:社会福祉', 'J304:ファミリー･キッズ(シリーズ)'), 0.1302),
 (('163:ダンス・バレエ', 'J304:ファミリー･キッズ(シリーズ)'), 0.0983),
 (('184:会話・語学', 'J304:ファミリー･キッズ(シリーズ)'), 0.0976),
 (('196:障害者', 'J304:ファミリー･キッズ(シリーズ)'), 0.0968),
 (('187:大学生・受験', 'J304:ファミリー･キッズ(シリーズ)'), 0.0965),
 (('165:歌舞伎・古典', 'J304:ファミリー･キッズ(シリーズ)'), 0.094),
 (('75:民族音楽・ワールドミュージック', 'J304:ファミリー･キッズ(シリーズ)'), 0.0892),
 (('207:その他', 'J304:ファミリー･キッズ(シリーズ)'), 0.0785),
 (('130:海外アニメ', 'J304:ファミリー･キッズ(シリーズ)'), 0.0619),
 (('74:童謡・キッズ', 'J304:ファミリー･キッズ(シリーズ)'), 0.06)]

In [32]:
with open('./genre_analysis/sorted_tv_sub_dif.pickle', 'wb') as pickle_file:
    pickle.dump(s_tv_sub_comp_dict, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)