In [8]:
# Import libraries
import datetime
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# Display options: show up to 500 rows and up to 1000 characters per cell
# before pandas truncates the rendered output.
pd.set_option("display.max_rows", 500)
pd.options.display.max_colwidth = 1000
from scipy.sparse.linalg import spsolve
import scipy.sparse as sparse
import seaborn as sns

In [11]:
# Locations of the train and test CSV files (relative to the notebook)
train_path, test_path = (
    "trivagoRecSysChallengeData2019_v2/train.csv",
    "trivagoRecSysChallengeData2019_v2/test.csv",
)

In [15]:
# Load the train and test CSV files (in that order) into DataFrames
trivago_df, trivago_test_df = (
    pd.read_csv(csv_path) for csv_path in (train_path, test_path)
)

In [6]:
# Sanity check: preview the first five rows of the training data as loaded
trivago_df.head()

Unnamed: 0,user_id,session_id,timestamp,step,action_type,reference,platform,city,device,current_filters,impressions,prices
0,00RL8Z82B2Z1,aff3928535f48,1541037460,1,search for poi,Newtown,AU,"Sydney, Australia",mobile,,,
1,00RL8Z82B2Z1,aff3928535f48,1541037522,2,interaction item image,666856,AU,"Sydney, Australia",mobile,,,
2,00RL8Z82B2Z1,aff3928535f48,1541037522,3,interaction item image,666856,AU,"Sydney, Australia",mobile,,,
3,00RL8Z82B2Z1,aff3928535f48,1541037532,4,interaction item image,666856,AU,"Sydney, Australia",mobile,,,
4,00RL8Z82B2Z1,aff3928535f48,1541037532,5,interaction item image,109038,AU,"Sydney, Australia",mobile,,,


In [16]:
# Convert the UNIX timestamp column (seconds since the epoch) into UTC date and
# time strings. pd.to_datetime converts the whole column at once and replaces
# datetime.utcfromtimestamp, which is deprecated since Python 3.12; with
# unit="s" the integers are interpreted as seconds relative to the UTC epoch.
timestamps_utc = pd.to_datetime(trivago_df["timestamp"], unit="s")
trivago_df["date"] = timestamps_utc.dt.strftime("%Y-%m-%d")
trivago_df["time"] = timestamps_utc.dt.strftime("%H:%M:%S")

In [17]:
# Sanity check: the frame should now end with the derived date and time columns
trivago_df.head()

Unnamed: 0,user_id,session_id,timestamp,step,action_type,reference,platform,city,device,current_filters,impressions,prices,date,time
0,00RL8Z82B2Z1,aff3928535f48,1541037460,1,search for poi,Newtown,AU,"Sydney, Australia",mobile,,,,2018-11-01,01:57:40
1,00RL8Z82B2Z1,aff3928535f48,1541037522,2,interaction item image,666856,AU,"Sydney, Australia",mobile,,,,2018-11-01,01:58:42
2,00RL8Z82B2Z1,aff3928535f48,1541037522,3,interaction item image,666856,AU,"Sydney, Australia",mobile,,,,2018-11-01,01:58:42
3,00RL8Z82B2Z1,aff3928535f48,1541037532,4,interaction item image,666856,AU,"Sydney, Australia",mobile,,,,2018-11-01,01:58:52
4,00RL8Z82B2Z1,aff3928535f48,1541037532,5,interaction item image,109038,AU,"Sydney, Australia",mobile,,,,2018-11-01,01:58:52


In [18]:
# Drop the raw timestamp now that the date and time columns have been derived from it.
# errors="ignore" keeps the cell re-runnable: a second run is a no-op instead of a KeyError.
trivago_df = trivago_df.drop("timestamp", axis = 1, errors = "ignore")

In [19]:
# Sanity check: the timestamp column should no longer be present
trivago_df.head()

Unnamed: 0,user_id,session_id,step,action_type,reference,platform,city,device,current_filters,impressions,prices,date,time
0,00RL8Z82B2Z1,aff3928535f48,1,search for poi,Newtown,AU,"Sydney, Australia",mobile,,,,2018-11-01,01:57:40
1,00RL8Z82B2Z1,aff3928535f48,2,interaction item image,666856,AU,"Sydney, Australia",mobile,,,,2018-11-01,01:58:42
2,00RL8Z82B2Z1,aff3928535f48,3,interaction item image,666856,AU,"Sydney, Australia",mobile,,,,2018-11-01,01:58:42
3,00RL8Z82B2Z1,aff3928535f48,4,interaction item image,666856,AU,"Sydney, Australia",mobile,,,,2018-11-01,01:58:52
4,00RL8Z82B2Z1,aff3928535f48,5,interaction item image,109038,AU,"Sydney, Australia",mobile,,,,2018-11-01,01:58:52


In [24]:
# Create dataframe for sparse matrix: keep only the user, the referenced item
# and the action type. (No empty placeholder frame is needed beforehand; the
# column selection already returns a new DataFrame.)
matrix_df = trivago_df[["user_id", "reference", "action_type"]]

In [25]:
# Sanity check: only user_id, reference and action_type should remain
matrix_df.head()

Unnamed: 0,user_id,reference,action_type
0,00RL8Z82B2Z1,Newtown,search for poi
1,00RL8Z82B2Z1,666856,interaction item image
2,00RL8Z82B2Z1,666856,interaction item image
3,00RL8Z82B2Z1,666856,interaction item image
4,00RL8Z82B2Z1,109038,interaction item image


In [27]:
# Drop rows where action_type is not clickout - clickout will be the proxy for
# popularity, since the dataframe only has implicit feedback.
# .copy() makes the result an independent frame, so assigning to its columns
# later does not trigger pandas' SettingWithCopyWarning.
matrix_df = matrix_df[matrix_df["action_type"] == "clickout item"].copy()

In [28]:
# Sanity check: every remaining row should have action_type "clickout item"
matrix_df.head(10)

Unnamed: 0,user_id,reference,action_type
13,00RL8Z82B2Z1,109038,clickout item
15,00RL8Z82B2Z1,1257342,clickout item
115,02SRUT1NQYH1,2795374,clickout item
121,03K8AXBL4BX2,1032816,clickout item
122,03K8AXBL4BX2,1032816,clickout item
176,03P4VFKK12UO,65685,clickout item
177,03P4VFKK12UO,1320460,clickout item
180,0473FZ8UNXRS,3143258,clickout item
181,066TUPQWUEV5,2552514,clickout item
184,06S61EKCW1JY,110591,clickout item


In [29]:
# Sanity check: row count, non-null counts and dtypes after the clickout filter
matrix_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1586586 entries, 13 to 15932991
Data columns (total 3 columns):
user_id        1586586 non-null object
reference      1586586 non-null object
action_type    1586586 non-null object
dtypes: object(3)
memory usage: 48.4+ MB


In [30]:
# Check number of unique values per column (distinct users, distinct references,
# and the number of distinct action types left after filtering)
matrix_df.nunique()

user_id        717774
reference      289506
action_type         1
dtype: int64

In [46]:
# Replace "clickout item" with the integer 1 so that occurrences can be summed per user.
# The dict lookup maps any other value to NaN, so the int64 cast fails loudly if
# something unexpected slipped through the filter. assign() builds a new frame
# instead of writing into a filtered slice (which raises SettingWithCopyWarning).
matrix_df = matrix_df.assign(
    action_type=matrix_df["action_type"].map({"clickout item": 1}).astype("int64")
)

In [51]:
# Aggregate to one row per (user, item) pair, adding up that pair's clicks
matrix_df = (
    matrix_df
    .groupby(by=["user_id", "reference"])[["action_type"]]
    .sum()
)

In [52]:
# Sanity check: first 100 rows of the summed action_type per (user_id, reference) pair
matrix_df.head(100)

Unnamed: 0_level_0,Unnamed: 1_level_0,action_type
user_id,reference,Unnamed: 2_level_1
0001VQMGUI65,477811,1
0001VQMGUI65,950829,1
0001VQMGUI65,2019467,1
0001VQMGUI65,3133074,2
0003QTCX5MJX,2195060,1
0004IOZI7CKF,110985,1
0004IOZI7CKF,2627602,1
0004IOZI7CKF,7822344,1
0004WCFRV3FB,1439375,1
0006W0R5A5V8,6776722,1


In [56]:
# Change column name: action_type now holds summed click counts, so call it "clicks".
# rename() ignores labels that are absent, so re-running the cell is harmless
# (copying the column and dropping the old one in place raised KeyError on a re-run).
matrix_df = matrix_df.rename(columns={"action_type": "clicks"})

In [58]:
# Reset index: move the user_id / reference group keys out of the index and
# back into regular columns
matrix_df = matrix_df.reset_index()

In [59]:
# Sanity check: expect flat user_id, reference and clicks columns
matrix_df.head()

Unnamed: 0,user_id,reference,clicks
0,0001VQMGUI65,477811,1
1,0001VQMGUI65,950829,1
2,0001VQMGUI65,2019467,1
3,0001VQMGUI65,3133074,2
4,0003QTCX5MJX,2195060,1


In [63]:
# Get unique users, sorted so that row i of the matrix corresponds to users[i]
users = list(np.sort(matrix_df.user_id.unique()))

# Get unique properties that were clicked out (in order of first appearance);
# column j of the matrix corresponds to properties[j]
properties = list(matrix_df.reference.unique())

# Get all click counts, one per (user, property) row of matrix_df
clicks = list(matrix_df.clicks)

# Get row indices: position of each row's user in `users`.
# Series.astype("category", categories=...) was removed in pandas 0.25; passing
# an explicit CategoricalDtype is the supported way to fix the category order.
user_dtype = pd.api.types.CategoricalDtype(categories=users)
rows = matrix_df.user_id.astype(user_dtype).cat.codes

# Get column indices: position of each row's property in `properties`
property_dtype = pd.api.types.CategoricalDtype(categories=properties)
cols = matrix_df.reference.astype(property_dtype).cat.codes

# Create sparse matrix of click counts (users x properties)
sparse_matrix = sparse.csr_matrix(
    (clicks, (rows, cols)),
    shape = (len(users), len(properties)),
)

In [64]:
# Sanity check: display the matrix summary (shape, dtype, number of stored elements)
sparse_matrix

<717774x289506 sparse matrix of type '<class 'numpy.int64'>'
	with 1306901 stored elements in Compressed Sparse Row format>

In [65]:
# Measure how sparse sparse_matrix is

# Total number of user/property cells in the matrix
matrix_size = sparse_matrix.shape[0] * sparse_matrix.shape[1]

# Cells that actually hold a non-zero click count
num_clicks = int(sparse_matrix.count_nonzero())

# Percentage of cells that are empty (needs to be less than 99%)
sparsity = 100 * (1 - (num_clicks / matrix_size))

print(sparsity)

99.99937107711408


In [None]:
# Can't use collaborative filtering because matrix sparsity is over 99%