In [1]:
import datetime
import gc
import json
import os
import pickle
import sys
from itertools import groupby
from multiprocessing import Pool as ProcessPool

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, random_split
from tqdm import tqdm

from function import *

In [2]:
def seqget_data_from_id( uid ):
    """Build one row per end date for a user, holding the launch history up to that date.

    Reads two module-level globals (they are not passed in, so they must be
    assigned before this function is called or shipped to a worker pool):

    * ``data_dict``   -- maps ``uid`` to an iterable of end dates.
    * ``launch_dict`` -- maps ``uid`` to a ``[launch_type, date]`` pair that is
      indexed with a boolean mask, i.e. two equally long array-likes.

    Parameters
    ----------
    uid : key present in both ``data_dict`` and ``launch_dict``.

    Returns
    -------
    pandas.DataFrame
        One row per end date with columns ``user_id``, ``end_date``,
        ``launch_type`` and ``launch_date``; the last two hold only the entries
        whose launch date is ``<=`` the row's end date. Every row carries index
        label 0 because the one-row frames are concatenated as-is.
    """
    def _history_until( end ):
        # Keep only the launches that happened on or before this end date.
        kinds , days = launch_dict[ uid ]
        on_or_before = days <= end
        record = pd.Series( {
            'user_id' : uid ,
            'end_date' : end ,
            'launch_type' : kinds[ on_or_before ] ,
            'launch_date' : days[ on_or_before ] ,
        } )
        # A Series becomes a single column; transpose it into a single row.
        return pd.DataFrame( record ).T

    return pd.concat( [ _history_until( end ) for end in data_dict[uid] ] )

def seq_seq( df , need_2 = True ):
    """Turn each row's launch sequence into model inputs plus a 7-day "future" target.

    Reads the module-level global ``launch_dic`` (``user_id`` -> collection of
    launch dates); it must be assigned before this function is called.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``user_id``, ``end_date``, ``launch_seq`` and
        ``label_list``.
    need_2 : bool, default True
        When True the values of ``launch_seq`` are copied unchanged; when False
        every value is replaced by 1 if it is ``> 0`` and by 0 otherwise.

    Returns
    -------
    pandas.DataFrame
        Columns ``launch_seq``, ``end_date``, ``future``, ``label_list`` followed
        by ``index`` (the index labels of ``df``) and ``user_id``. ``future`` is
        a list of seven 0/1 flags, one per day from ``end_date + 1`` through
        ``end_date + 7``, set to 1 when that day is in ``launch_dic[user_id]``.
    """
    def _launched_on( row , day ):
        return 1 if day in launch_dic[ row.user_id ] else 0

    def _encode( row ):
        last_day = row.end_date
        if need_2:
            history = list( row.launch_seq )
        else:
            history = [ 1 if value > 0 else 0 for value in row.launch_seq ]
        # The seven days immediately after the observation window.
        upcoming = [ _launched_on( row , day ) for day in range( last_day + 1 , last_day + 8 ) ]
        return pd.Series( { 'launch_seq' : history , 'end_date' : last_day ,
                            'future' : upcoming , 'label_list' : row.label_list } )

    encoded = df.apply( _encode , axis=1 ).reset_index()
    del encoded['index']
    # reset_index() keeps the original labels as an 'index' column next to user_id
    # and gives both halves the same positional index for the column-wise concat.
    users = pd.DataFrame( df.user_id ).reset_index()
    return pd.concat( [ encoded , users ] , axis=1 )

In [3]:
# Load the playback log, order it by date and attach the video metadata.
# item_id is read as a string on both sides so the merge keys share a dtype.
playback = pd.read_csv("data/user_playback_data.csv", dtype={"item_id": str})
playback = playback.sort_values( 'date' )
video_data = pd.read_csv("data/video_related_data.csv", dtype=str)
# Metadata rows without an item_id are dropped before the left join.
playback = playback.merge(video_data[video_data.item_id.notna()], how="left", on="item_id")

# Collapse to one row per user; every aggregated column holds that user's values
# as a list, in the date order established by the sort above.
playback_grp = playback.groupby(["user_id"]).agg(
    playtime_list=("playtime", list ),
    item_seq = ("item_id", list ),
    date_list=("date", list ),
    duration_list=("duration", list),
    father_id_list=("father_id", list),
    tag_list=("tag_list", list),
    cast_list=("cast", list)
).reset_index()

# Only these three columns are converted to numpy arrays; the remaining
# *_list columns stay plain Python lists.
for array_col in ( 'playtime_list' , 'item_seq' , 'date_list' ):
    playback_grp[ array_col ] = playback_grp[ array_col ].apply( np.array )

In [4]:
# Cache the per-user playback table on disk as 'playback_grp.pkl'.
# NOTE(review): to_pickle is not defined in this notebook -- presumably it comes
# from `from function import *`; confirm its signature there.
to_pickle( playback_grp , 'playback_grp.pkl' )

In [5]:
# Read the app-launch log and collapse it to one row per user: after
# reset_index() the columns are user_id, date and launch_type, the last two
# holding one numpy array per user.
launch = pd.read_csv("data/app_launch_logs.csv")
launch_gp = launch.groupby("user_id")
launch_grp = pd.concat(
    [ launch_gp[ column ].apply( np.array ) for column in ( 'date' , 'launch_type' ) ] ,
    axis = 1 ,
).reset_index()

In [6]:
# Cache the per-user launch table on disk; the feature-building cell below
# reloads it with load_pickle( 'launch_grp.pkl' ).
# NOTE(review): to_pickle is not defined in this notebook -- presumably it comes
# from `from function import *`; confirm its signature there.
to_pickle( launch_grp , 'launch_grp.pkl' )

In [8]:
def _parallel_map( func , chunks , processes = 8 ):
    """Run ``func`` over ``chunks`` in a fresh worker pool and return the list of results.

    The pool is always closed and joined, even when a worker raises, so a
    failing step does not leave orphaned processes behind. The pool is created
    here, at call time, so workers see the module globals as they are at the
    moment of the call (the mapped helpers read ``data_dict``, ``launch_dict``
    and ``launch_dic`` as globals).
    """
    pool = ProcessPool( processes )
    try:
        return pool.map( func , chunks )
    finally:
        pool.close()
        pool.join()


where = 'online'

# The launch history does not depend on the dataset, so it is loaded and indexed
# once instead of twice per loop iteration.
launch_all = load_pickle( 'launch_grp.pkl' )

# user_id -> [launch_type array, date array]; read as a global by seqget_data_from_id.
launch_dict = {
    uid : [ kinds , days ]
    for uid , kinds , days in tqdm(
        zip( launch_all['user_id'] , launch_all['launch_type'] , launch_all['date'] ) ,
        total = len( launch_all ) )
}

for dataset in ['trainb', 'testb']:
    print( f'data/{where}_{dataset}.csv' )
    data = pd.read_csv( f'data/{where}_{dataset}.csv' )

    # user_id -> list of end dates; read as a global by seqget_data_from_id.
    data_dict = data.groupby( 'user_id' )['date'].agg(list).to_dict()

    # One row per (user, end date) with the launch history up to that date.
    tmpdata = pd.concat( _parallel_map( seqget_data_from_id , list( data_dict.keys() ) , 10 ) )

    # user_id -> launch-date array for the users of this dataset only;
    # read as a global by seq_seq.
    launch_grp = launch_all.rename( columns = { 'date' : 'launch_date' } )
    launch_grp = launch_grp[ launch_grp.user_id.isin( tmpdata.user_id.unique() ) ]
    launch_dic = dict( zip(launch_grp.user_id , launch_grp.launch_date ) )

    # fill_launch_seq / get_label_list / df_split are not defined in this
    # notebook (presumably provided by `from function import *`).
    tmpdata = pd.concat( _parallel_map( fill_launch_seq , df_split( tmpdata , 6000 ) ) )
    tmpdata = pd.concat( _parallel_map( get_label_list , df_split( tmpdata , 10000 ) ) )
    seq = pd.concat( _parallel_map( seq_seq , df_split( tmpdata , 10000 ) ) )

    # Attach every user's playback lists to each of their rows.
    tmpdata = seq.merge( playback_grp , on = 'user_id' , how = 'left'  )

    # Process the merged frame in large outer chunks so memory can be reclaimed
    # between them.
    ls = df_split( tmpdata,100000 )
    res = []
    for df in tqdm( ls ):
        res.extend( _parallel_map( seqmodifylist , df_split( df,1000 ) , 10 ) )
        gc.collect()
    # FIX: the original concatenated `ls` (the untouched input chunks) here,
    # which silently threw away everything seqmodifylist had computed.
    # NOTE(review): assumes seqmodifylist returns DataFrames like the other
    # mapped helpers -- confirm in `function`.
    tmpdata = pd.concat( res )

    tmpdata = pd.concat( _parallel_map( seqget_playtime , df_split( tmpdata , 10000 ) ) )

    # A float in playtime_seq is treated as "no sequence" (presumably NaN for
    # users without playback rows) and replaced by 64 zeros; everything else is
    # converted to a numpy array.
    tmpdata['playtime_seq'] = tmpdata['playtime_seq'].apply(
        lambda x : np.array( [0]*64 ) if isinstance(x,float) else np.array(x) )
    tmpdata = pd.concat( _parallel_map( new_seq , df_split( tmpdata , 10000 ) ) )

    cols = [ 'user_id' , 'end_date' , 'future' , 'list' ]

    # The file name deliberately stays as in the original (no separators between
    # the parts) so existing readers of this CSV keep working.
    tmpdata[cols].to_csv(f'data/{where}{dataset}seqex3.csv')

data/online_testb.csv


600000it [00:20, 29884.57it/s]
100%|██████████| 6/6 [00:00<00:00, 33244.15it/s]
100%|██████████| 4/4 [00:00<00:00, 26132.74it/s]
100%|██████████| 4/4 [00:00<00:00, 17476.27it/s]
100%|██████████| 1/1 [00:00<00:00, 8272.79it/s]
  0%|          | 0/1 [00:00<?, ?it/s]
100%|██████████| 36/36 [00:00<00:00, 53869.05it/s]
100%|██████████| 1/1 [00:16<00:00, 16.64s/it]
100%|██████████| 4/4 [00:00<00:00, 23399.19it/s]
100%|██████████| 4/4 [00:00<00:00, 26973.02it/s]
