In [1]:
%load_ext autoreload
%autoreload 2
import sys
from pathlib import Path
import pickle
import pandas as pd
import cudf
import numpy as np
import matplotlib.pyplot as plt
from gensim.models import word2vec
from sklearn.manifold import TSNE
from sklearn.cluster import BisectingKMeans

In [2]:
# --- Experiment configuration -------------------------------------------
# Identifier of this run; it names the output sub-directory below.
EXP_NO = 'exp005'

# Product categories; the cells below iterate over this list.
CATEGORY_TYPES = ["A", "B", "C", "D"]

# Input event log (CSV).
TRAIN_PATH = '../data/processed/train.csv'

# Every artefact written by this notebook goes under OUTPUT_DIR.
OUTPUT_DIR = Path(f'../model/item2vec/{EXP_NO}')
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)  # safe to re-run: no error if it already exists

In [3]:
# Read the event log with cuDF, then convert to a pandas DataFrame for the
# rest of the notebook.
df = cudf.read_csv(TRAIN_PATH, parse_dates=['time_stamp']).to_pandas()

# Snapshot every product id BEFORE filtering, so products that only occur in
# the rows dropped below are still known.
all_product_ids = df['product_id'].unique()

# Keep rows whose event_type is positive and order each user's events by
# time_stamp.
df = (
    df.loc[df['event_type'] > 0]
    .sort_values(['user_id', 'time_stamp'])
    .reset_index(drop=True)
)

In [4]:
# Train one item2vec (gensim Word2Vec) model per category.
# For each category:
#   1. build one product_id sequence per user (row order of `df` is kept),
#   2. write the sequences to a text file, one user per line,
#   3. fit Word2Vec on that file and pickle the fitted model.
for category in CATEGORY_TYPES:
    corpus_path = OUTPUT_DIR/f'items_{category}.txt'
    model_path = OUTPUT_DIR/f'item2vec_{category}.pickle'

    # Series indexed by user_id whose values are lists of product ids.
    in_category = df['category'] == category
    histories = df[in_category].groupby('user_id')['product_id'].agg(list)

    with open(corpus_path, 'w') as corpus:
        for history in histories:
            # Collapse runs of the same product: keep an item only when it
            # differs from the one immediately before it.
            deduped = [
                pid
                for pos, pid in enumerate(history)
                if pos == 0 or pid != history[pos-1]
            ]
            corpus.write(' '.join(deduped) + '\n')

    sentences = word2vec.LineSentence(corpus_path)
    # NOTE(review): gensim documents that `seed` alone does not give a fully
    # reproducible model when workers > 1 -- confirm whether that matters here.
    model = word2vec.Word2Vec(
        sentences,
        vector_size=64,
        epochs=20,
        window=5,
        min_count=5,  # products seen fewer than 5 times get no vector
        seed=0,
        workers=8,
    )
    print(f'vectors.shape: {model.wv.vectors.shape}')

    with open(model_path, 'wb') as model_file:
        pickle.dump(model, model_file)

vectors.shape: (13383, 64)
vectors.shape: (14141, 64)
vectors.shape: (94331, 64)
vectors.shape: (44374, 64)


In [13]:
# Cluster the item embeddings of every category and collect
# (product_id, cluster_id) pairs in `item_cluster`.
#
# Each category gets its own BisectingKMeans fit, so cluster ids restart at 0
# for every category. The fitted clusterer is pickled next to the embeddings.
item_cluster = []
k = 20  # clusters per category; loop-invariant, so defined once
for category in CATEGORY_TYPES:
    # Load the Word2Vec model pickled by the training cell of this notebook.
    with open(OUTPUT_DIR/f'item2vec_{category}.pickle', 'rb') as f:
        model = pickle.load(f)

    vectors = model.wv.vectors
    item_names = model.wv.index_to_key
    # Second '_'-separated token of each item name; only the disabled
    # visualisation code at the bottom of this loop refers to it.
    categories = np.array(list(map(lambda s: s.split('_')[1], item_names)))

    # L2-normalise each embedding ROW. `axis=1` is essential: without it
    # np.linalg.norm returns one scalar for the whole matrix and every row is
    # merely divided by the same constant, which leaves the vectors
    # un-normalised.
    norms = np.linalg.norm(vectors, axis=1, keepdims=True)
    norms[norms == 0] = 1  # keep all-zero rows as zeros instead of NaN
    vectors = vectors / norms

    kmeans = BisectingKMeans(n_clusters=k, bisecting_strategy='largest_cluster', random_state=0)
    C = kmeans.fit_predict(vectors)
    item_cluster += list(zip(item_names, C))

    with open(OUTPUT_DIR/f'kmeans_{category}.pickle', 'wb') as f:
        pickle.dump(kmeans, f)

    # Disabled: cluster-size summary and t-SNE scatter plot of a subsample.
    #print(pd.Series(C).value_counts())
    #n = 20 if category in ['A', 'B'] else 200
    #vectors = vectors[::n]
    #item_names = item_names[::n]
    #categories = categories[::n]
    #C = C[::n]
    #
    #tsne = TSNE(perplexity=5)
    #Z = tsne.fit_transform(vectors)
    #
    #for c in range(kmeans.n_clusters):
    #    idx = C==c
    #    plt.scatter(Z[idx, 0], Z[idx, 1], s=3)
    #plt.show()

In [10]:
# Build the final product -> cluster lookup table and write it to CSV.
item_cluster = pd.DataFrame(item_cluster, columns=['product_id', 'cluster_id'])

# Products that appear in the data but received no cluster are assigned the
# sentinel cluster -1. The ids are sorted because iterating a set of strings
# has no stable order across interpreter runs, which would otherwise shuffle
# the rows of the exported CSV. `key=str` keeps the sort safe should the ids
# ever contain mixed types.
rare_items = sorted(set(all_product_ids) - set(item_cluster['product_id']), key=str)
rare_items = pd.DataFrame({'product_id': rare_items, 'cluster_id': -1})

item_cluster = pd.concat([item_cluster, rare_items], ignore_index=True)
item_cluster.to_csv(OUTPUT_DIR/'item_cluster.csv', index=False)

In [11]:
# Quick look at the first five rows of the exported table.
item_cluster.head(5)

Unnamed: 0,product_id,cluster_id
0,00013874_a,0
1,00005175_a,0
2,00001225_a,0
3,00001943_a,0
4,00001429_a,0
