In [1]:
#The data for this ex can be downloaded from http://s3-eu-west-1.amazonaws.com/yc-rdata/yoochoose-dataFull.7z

import pandas as pd
from collections import Counter
import tensorflow as tf
from tffm import TFFMRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import numpy as np

In [2]:
buys = open('yoochoose-buys.dat', 'r')
clicks = open('yoochoose-clicks.dat', 'r')

In [3]:
initial_buys_df = pd.read_csv(buys, names=['Session ID', 'Timestamp', 'Item ID', 'Category', 'Quantity'],
                              dtype={'Session ID': 'float32', 'Timestamp': 'str', 'Item ID': 'float32',
                                     'Category': 'str'})
initial_buys_df.set_index('Session ID', inplace=True)

initial_clicks_df = pd.read_csv(clicks, names=['Session ID', 'Timestamp', 'Item ID', 'Category'],
                                dtype={'Category': 'str'})
initial_clicks_df.set_index('Session ID', inplace=True)


In [4]:
initial_buys_df.head()

Unnamed: 0_level_0,Timestamp,Item ID,Category,Quantity
Session ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
420374.0,2014-04-06T18:44:58.314Z,214537888.0,12462,1
420374.0,2014-04-06T18:44:58.325Z,214537856.0,10471,1
281626.0,2014-04-06T09:40:13.032Z,214535648.0,1883,1
420368.0,2014-04-04T06:13:28.848Z,214530576.0,6073,1
420368.0,2014-04-04T06:13:28.858Z,214835024.0,2617,1


In [5]:
initial_clicks_df.head()

Unnamed: 0_level_0,Timestamp,Item ID,Category
Session ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,2014-04-07T10:51:09.277Z,214536502,0
1,2014-04-07T10:54:09.868Z,214536500,0
1,2014-04-07T10:54:46.998Z,214536506,0
1,2014-04-07T10:57:00.306Z,214577561,0
2,2014-04-07T13:56:37.614Z,214662742,0


In [6]:
initial_buys_df = initial_buys_df.drop('Timestamp', 1)
initial_clicks_df = initial_clicks_df.drop('Timestamp', 1)

In [7]:
x = Counter(initial_buys_df.index).most_common(10000)
x

[(5638444.0, 144),
 (10808253.0, 120),
 (9014734.0, 81),
 (428198.0, 72),
 (6832724.0, 72),
 (6149111.0, 64),
 (601904.0, 62),
 (2233614.0, 54),
 (7586548.0, 54),
 (10683806.0, 54),
 (2920884.0, 48),
 (4543178.0, 48),
 (8211827.0, 48),
 (9209529.0, 48),
 (953756.0, 45),
 (8301793.0, 44),
 (10449092.0, 42),
 (10966823.0, 42),
 (4630111.0, 41),
 (1753739.0, 40),
 (2561659.0, 40),
 (5015396.0, 40),
 (8749796.0, 40),
 (1655343.0, 39),
 (6958333.0, 39),
 (10129729.0, 39),
 (980883.0, 36),
 (3394936.0, 36),
 (4382259.0, 36),
 (5033466.0, 36),
 (6529877.0, 36),
 (7614712.0, 36),
 (8166894.0, 36),
 (9010472.0, 36),
 (9822698.0, 36),
 (10317602.0, 36),
 (1032524.0, 35),
 (2081373.0, 35),
 (6006624.0, 34),
 (10760937.0, 34),
 (216528.0, 33),
 (557899.0, 33),
 (5563564.0, 33),
 (8117182.0, 33),
 (8653737.0, 33),
 (10043431.0, 33),
 (10219991.0, 33),
 (3734474.0, 32),
 (4932653.0, 32),
 (5022813.0, 32),
 (8959164.0, 32),
 (10162424.0, 32),
 (10393571.0, 32),
 (3460796.0, 31),
 (312929.0, 30),
 (17

In [8]:
top_k = dict(x).keys()
top_k

dict_keys([5638444.0, 10808253.0, 9014734.0, 428198.0, 6832724.0, 6149111.0, 601904.0, 2233614.0, 7586548.0, 10683806.0, 2920884.0, 4543178.0, 8211827.0, 9209529.0, 953756.0, 8301793.0, 10449092.0, 10966823.0, 4630111.0, 1753739.0, 2561659.0, 5015396.0, 8749796.0, 1655343.0, 6958333.0, 10129729.0, 980883.0, 3394936.0, 4382259.0, 5033466.0, 6529877.0, 7614712.0, 8166894.0, 9010472.0, 9822698.0, 10317602.0, 1032524.0, 2081373.0, 6006624.0, 10760937.0, 216528.0, 557899.0, 5563564.0, 8117182.0, 8653737.0, 10043431.0, 10219991.0, 3734474.0, 4932653.0, 5022813.0, 8959164.0, 10162424.0, 10393571.0, 3460796.0, 312929.0, 1755124.0, 2626047.0, 2420953.0, 2267228.0, 3392844.0, 3459694.0, 3459889.0, 4184669.0, 5241462.0, 6494147.0, 6647051.0, 7368184.0, 7086686.0, 7264851.0, 7533293.0, 7932302.0, 8688934.0, 9438617.0, 9458982.0, 10769751.0, 10921266.0, 5422111.0, 8503882.0, 11376688.0, 11275514.0, 813918.0, 1338984.0, 2209084.0, 3031139.0, 4338377.0, 4342126.0, 5037817.0, 5951718.0, 8600704.0, 965

In [9]:
initial_buys_df = initial_buys_df[initial_buys_df.index.isin(top_k)]
initial_clicks_df = initial_clicks_df[initial_clicks_df.index.isin(top_k)]

In [10]:
initial_clicks_df.head()

Unnamed: 0_level_0,Item ID,Category
Session ID,Unnamed: 1_level_1,Unnamed: 2_level_1
932,214826906,0
932,214826906,0
932,214826906,0
932,214826955,0
932,214826955,0


In [11]:
initial_buys_df['_Session ID'] = initial_buys_df.index

In [12]:
initial_buys_df.info()

<class 'pandas.core.frame.DataFrame'>
Float64Index: 106956 entries, 420471.0 to 11422804.0
Data columns (total 4 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   Item ID      106956 non-null  float32
 1   Category     106956 non-null  object 
 2   Quantity     106956 non-null  int64  
 3   _Session ID  106956 non-null  float64
dtypes: float32(1), float64(1), int64(1), object(1)
memory usage: 3.7+ MB


In [13]:
transformed_buys = pd.get_dummies(initial_buys_df)
transformed_clicks = pd.get_dummies(initial_clicks_df)

In [14]:
transformed_buys.head()

Unnamed: 0_level_0,Item ID,Quantity,_Session ID,Category_0,Category_1024,Category_1036,Category_10367,Category_1037,Category_104,Category_1041,...,Category_931,Category_932,Category_936,Category_937,Category_941,Category_9424,Category_97,Category_99,Category_994,Category_9947
Session ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
420471.0,214717888.0,1,420471.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
420471.0,214821024.0,1,420471.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
420471.0,214829280.0,1,420471.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
420471.0,214819552.0,1,420471.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
420471.0,214746384.0,1,420471.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
filtered_buys = transformed_buys.filter(regex="Item.*|Category.*")
filtered_clicks = transformed_clicks.filter(regex="Item.*|Category.*")

In [16]:
historical_buy_data = filtered_buys.groupby(filtered_buys.index).sum()
historical_buy_data = historical_buy_data.rename(columns=lambda column_name: 'buy history:' + column_name)

In [17]:
historical_click_data = filtered_clicks.groupby(filtered_clicks.index).sum()
historical_click_data = historical_click_data.rename(columns=lambda column_name: 'click history:' + column_name)

In [18]:
merged1 = pd.merge(transformed_buys, historical_buy_data, left_index=True, right_index=True)

In [19]:
merged1

Unnamed: 0_level_0,Item ID,Quantity,_Session ID,Category_0,Category_1024,Category_1036,Category_10367,Category_1037,Category_104,Category_1041,...,buy history:Category_931,buy history:Category_932,buy history:Category_936,buy history:Category_937,buy history:Category_941,buy history:Category_9424,buy history:Category_97,buy history:Category_99,buy history:Category_994,buy history:Category_9947
Session ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
932.0,214826960.0,2,932.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
932.0,214826624.0,2,932.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
932.0,214826912.0,1,932.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
932.0,214709744.0,2,932.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
932.0,214819744.0,1,932.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11561722.0,214853104.0,4,11561722.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11561722.0,214835008.0,1,11561722.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11561722.0,214853104.0,2,11561722.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11561722.0,214853120.0,1,11561722.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
merged2 = pd.merge(merged1, historical_click_data, left_index=True, right_index=True)

In [21]:
merged2.columns.to_list()

['Item ID',
 'Quantity',
 '_Session ID',
 'Category_0',
 'Category_1024',
 'Category_1036',
 'Category_10367',
 'Category_1037',
 'Category_104',
 'Category_1041',
 'Category_1042',
 'Category_1046',
 'Category_1047',
 'Category_10471',
 'Category_106709',
 'Category_1089',
 'Category_10996',
 'Category_114',
 'Category_1140',
 'Category_1141',
 'Category_11414',
 'Category_1146',
 'Category_1151',
 'Category_11624',
 'Category_1182',
 'Category_120',
 'Category_1203',
 'Category_12043',
 'Category_1245',
 'Category_1246',
 'Category_12462',
 'Category_125',
 'Category_1250',
 'Category_1256',
 'Category_1257',
 'Category_12776',
 'Category_1287',
 'Category_1297',
 'Category_12985',
 'Category_1308',
 'Category_1309',
 'Category_13090',
 'Category_131',
 'Category_1340',
 'Category_135',
 'Category_1350',
 'Category_13509',
 'Category_1351',
 'Category_1356',
 'Category_1360',
 'Category_14032',
 'Category_141',
 'Category_1413',
 'Category_1444',
 'Category_14451',
 'Category_1455',


In [22]:
# Create the MF model, you can play around with the parameters 
model = TFFMRegressor(
    order=2,
    rank=7,
    optimizer=tf.optimizers.Adam(learning_rate=0.1),
    n_epochs=100,
    batch_size=-1,
    init_std=0.001,
    input_type='dense'
)
merged2.drop(['Item ID', '_Session ID', 'click history:Item ID', 'buy history:Item ID'], axis=1, inplace=True)
X = np.array(merged2)
X = np.nan_to_num(X)
y = np.array(merged2['Quantity'].to_numpy())
# Split data into train, test
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2)
#Split testing data in half: Full information vs Cold-start
X_te, X_te_cs, y_te, y_te_cs = train_test_split(X_te, y_te, test_size=0.5)
cold_start = pd.DataFrame(X_te_cs, columns=merged2.columns)
# What happens if we only have access to categories and no historical click/purchase data?
# Let's delete historical click and purchasing data for the cold_start test set
for column in cold_start.columns:
    if ('buy' in column or 'click' in column) and ('Category' not in column):
        cold_start[column] = 0
# Compute the mean squared error for both test sets
model.fit(X_tr, y_tr, show_progress=True)
predictions = model.predict(X_te)
cold_start_predictions = model.predict(X_te_cold)
print('MSE: {}'.format(mean_squared_error(y_te, predictions)))
print('Cold-start MSE: {}'.format(mean_squared_error(y_te_cold, predictions)))
model.destroy()
# Fun fact: Dropping the category columns in the training dataset makes the MSE even smaller 
# but doing so means that we cannot tackle the cold-start recommendation problem

RuntimeError: Variable += value not supported. Use variable.assign_add(value) to modify the variable value and variable = variable + value to get a new Tensor object.