In [55]:
import pandas as pd
import numpy as np

In [56]:
df = pd.read_excel("Sample 5000.xlsx")

In [57]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10001 entries, 0 to 10000
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   event_time     10001 non-null  object 
 1   event_type     10001 non-null  object 
 2   product_id     10001 non-null  object 
 3   category_id    10001 non-null  int64  
 4   category_code  6623 non-null   object 
 5   category1      6623 non-null   object 
 6   category2      6623 non-null   object 
 7   category3      2881 non-null   object 
 8   category4      2 non-null      object 
 9   brand          8595 non-null   object 
 10  price          10001 non-null  float64
 11  user_id        10001 non-null  int64  
 12  user_session   10001 non-null  object 
 13  interaction    10001 non-null  int64  
dtypes: float64(1), int64(3), object(10)
memory usage: 1.1+ MB


In [58]:
df = df.drop(['category1', 'category2', 'category3', 'category4'], axis=1)

## Removing Null Values
- For brands that have null values, we're gonna replace with 'unknown' brand.
- For category_code with null values, we're going to remove them because it would be useless.

In [59]:
df['category_code'] = df['category_code'].fillna(value="empty")
df['category_code'].astype(str)

df['brand'] = df['brand'].fillna(value="unknown")
df['brand'].astype(str)

0          samsung
1          samsung
2             sony
3            intel
4          samsung
           ...    
9996           msi
9997           msi
9998     garanterm
9999       samsung
10000      samsung
Name: brand, Length: 10001, dtype: object

In [60]:
users_count = dict(df.user_id.value_counts())

for row in df.iterrows():
    
    # Retrieving value
    category_code_col = row[1]['category_code']
    user_id_col = row[1]['user_id']
    # Apply logic
        ## We set {user-id: count_it_appears}
        ## If count_it_appear == 1 and (category_code_col = nan), then we remove them
        
    count = users_count[user_id_col]
    if count == 1 and category_code_col == 'empty':
        df = df.drop(df[df.user_id == user_id_col].index)

In [61]:
print("Null Values check: ", "\n", df.isnull().sum())

Null Values check:  
 event_time       0
event_type       0
product_id       0
category_id      0
category_code    0
brand            0
price            0
user_id          0
user_session     0
interaction      0
dtype: int64


## Feature Engineer

In [62]:
df['user_score'] = df['event_type'].map({'view':1, 'cart':10, 'purchase':50})
df['user_purchase'] = df['event_type'].apply(lambda x: 1 if x == 'purchase' else 0)

In [63]:
df.head()

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session,interaction,user_score,user_purchase
0,2019-10-01 08:47:35 UTC,view,1001588,2053013555631879936,electronics.smartphone,samsung,460.5,244951053,91769fdf-461b-4e43-9c73-88a07481b75c,1,1,0
1,2019-10-01 08:48:28 UTC,view,1003535,2053013555631879936,electronics.smartphone,samsung,460.5,244951053,91769fdf-461b-4e43-9c73-88a07481b75c,1,1,0
3,2019-10-01 16:48:28 UTC,view,6400036,2053013554121929984,computers.components.cpu,intel,338.23,295655799,eb8f2cea-4c5b-4e00-880f-3bcfa28549ff,1,1,0
4,2019-10-01 17:07:37 UTC,view,1004870,2053013555631879936,electronics.smartphone,samsung,286.84,306087674,a15f469a-968f-4c8c-8317-6dffed3f5523,1,1,0
5,2019-10-01 17:15:42 UTC,view,44100021,2100065069302799872,empty,smoby,655.03,306087674,0c032f47-6050-4609-b07a-bf82d4b7c515,1,1,0


In [69]:
# df['price_category'] = 1
# for category_code in df['category_code'].unique():
#     df.loc[df['category_code']==category_code,'price_category'] = pd.qcut(x=df['price'][df['category_code']==category_code],q=5, duplicates='drop')

In [78]:
for category_code in df['category_code'].unique():
    df['price_category'] = pd.qcut(df['price'],  
                            q=[0, .2, .4, .6, .8, 1],
                              labels=[1,2,3,4,5])

In [84]:
group = df.groupby(['user_id', 'product_id'])['user_score', 'user_purchase'].sum().reset_index()
group['user_purchase'] = group['user_purchase'].apply(lambda x: 1 if x>1 else x)
group['user_score'] = group['user_score'].apply(lambda x: 100 if x>100 else x)

# apply MinMaxScaler to the user scores to obtain an interaction score with a value between 0 and 1
# >=0.5: a very high probability that a purchase has occurred
# <0.5: no purchase occurs below the threshold of 0.5

from sklearn.preprocessing import MinMaxScaler

std = MinMaxScaler(feature_range=(0.025, 1))
std.fit(group['user_score'].values.reshape(-1,1))
group['interaction_score'] = std.transform(group['user_score'].values.reshape(-1,1))

group = group.merge(df[['product_id','category_code','brand','price','price_category']].drop_duplicates('product_id'),on=['product_id'])

  """Entry point for launching an IPython kernel.


## Modelling

In [174]:
inputs = group.drop('interaction_score', axis =1)
X = inputs
y = group['interaction_score']

In [175]:
from sklearn.model_selection import train_test_split
X_train, X_rem, y_train, y_rem = train_test_split(X,y, train_size=0.8)

# Now since we want the valid and test size to be equal (10% each of overall data). 
# we have to define valid_size=0.5 (that is 50% of remaining data)
test_size = 0.5
X_valid, X_test, y_valid, y_test = train_test_split(X_rem,y_rem, test_size=0.5)

print({X_train.shape}), print(y_train.shape)
print({X_valid.shape}), print(y_valid.shape)
print(X_test.shape), print(y_test.shape)

{(4803, 8)}
(4803,)
{(600, 8)}
(600,)
(601, 8)
(601,)


(None, None)

In [183]:
X_train_matrix = pd.pivot_table(X_train,values='user_score',index='user_id',columns='product_id')
X_train_matrix = X_train_matrix.fillna(0)

In [184]:
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [235]:
# filtering by item category, price category and brand

product_cat = X_train[['product_id','price_category','category_code','brand']].drop_duplicates('product_id')
product_cat['product_id'] = product_cat['product_id'].astype(str)
product_cat.sort_values(by='product_id', ascending=True, inplace=True)
product_cat = product_cat.sort_values(by='product_id')

# Reciprocal of 2 is 0.5, cos 1/2. 
price_cat_matrix = np.reciprocal(euclidean_distances(np.array(product_cat['price_category']).reshape(-1,1))+1)
euclidean_matrix = pd.DataFrame(price_cat_matrix,columns=product_cat['product_id'],index=product_cat['product_id'])

# TfidfVectorizer() converts texts to word freq counts... 
tfidf_vectorizer = TfidfVectorizer()
doc_term = tfidf_vectorizer.fit_transform(list(product_cat['category_code']))
dt_matrix = pd.DataFrame(doc_term.toarray().round(3), index=[i for i in product_cat['product_id']], columns=tfidf_vectorizer.get_feature_names())
cos_similar_matrix = pd.DataFrame(cosine_similarity(dt_matrix.values),columns=product_cat['product_id'],index=product_cat['product_id'])

tfidf_vectorizer = TfidfVectorizer()
doc_term = tfidf_vectorizer.fit_transform(list(product_cat['brand']))
dt_matrix1 = pd.DataFrame(doc_term.toarray().round(3), index=[i for i in product_cat['product_id']], columns=tfidf_vectorizer.get_feature_names())
dt_matrix1 = dt_matrix1 + 0.01
cos_similar_matrix1 = pd.DataFrame(cosine_similarity(dt_matrix1.values),columns=product_cat['product_id'],index=product_cat['product_id'])

similarity_matrix = cos_similar_matrix.multiply(euclidean_matrix).multiply(cos_similar_matrix1)
# content_matrix = X_train_matrix.dot(similarity_matrix)
content_matrix = np.matrix(X_train_matrix)*np.matrix(similarity_matrix)



# apply MinMaxScaler again to obtain the trained User-Item Matrix of predicted interaction scores
content_matrix = pd.DataFrame(content_matrix)
std = MinMaxScaler(feature_range=(0, 1))
std.fit(content_matrix.values)
content_matrix = std.transform(content_matrix.values)

In [241]:
content_matrix = pd.DataFrame(content_matrix,columns=sorted(X_train['product_id'].unique()),index=sorted(X_train['user_id'].unique()))
content_df = content_matrix.stack().reset_index()
content_df = content_df.rename(columns={'level_0':'user_id','level_1':'product_id',0:'predicted_interaction'})

In [240]:
content_df

Unnamed: 0,level_0,level_1,0
0,244951053,1001588,0.025732
1,244951053,1002099,0.007452
2,244951053,1002101,0.007452
3,244951053,1002102,0.007452
4,244951053,1002367,0.001417
...,...,...,...
4806195,512393698,52900039,0.002475
4806196,512393698,52900050,0.002475
4806197,512393698,52900077,0.002475
4806198,512393698,52900079,0.005540


In [None]:
content_matrix = pd.DataFrame(content_matrix,columns=sorted(X_train['product_id'].unique()),index=sorted(X_train['user_id'].unique()))
content_df = content_matrix.stack().reset_index()
content_df = content_df.rename(columns={'level_0':'user_id','level_1':'product_id',0:'predicted_interaction'})
X_valid = X_valid.merge(content_df,on=['user_id','product_id'])

X_valid['predicted_purchase'] = X_valid['predicted_interaction'].apply(lambda x:1 if x>=0.5 else 0)

In [188]:
similarity_matrix.shape

(3433, 3433)