In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_excel("Sample 20000.xlsx")

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10001 entries, 0 to 10000
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   event_time     10001 non-null  object 
 1   event_type     10001 non-null  object 
 2   product_id     10001 non-null  int64  
 3   category_id    10001 non-null  int64  
 4   category_code  6623 non-null   object 
 5   brand          8595 non-null   object 
 6   price          10001 non-null  float64
 7   user_id        10001 non-null  int64  
 8   user_session   10001 non-null  object 
dtypes: float64(1), int64(3), object(5)
memory usage: 703.3+ KB


In [4]:
df.head()

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
0,2019-10-01 08:47:35 UTC,view,1001588,2053013555631879936,electronics.smartphone,samsung,460.5,244951053,91769fdf-461b-4e43-9c73-88a07481b75c
1,2019-10-01 08:48:28 UTC,view,1003535,2053013555631879936,electronics.smartphone,samsung,460.5,244951053,91769fdf-461b-4e43-9c73-88a07481b75c
2,2019-10-01 17:06:51 UTC,view,4100129,2053013561218690048,,sony,463.02,292071852,0051531b-c007-442f-88c8-2cbf9537bd02
3,2019-10-01 16:48:28 UTC,view,6400036,2053013554121929984,computers.components.cpu,intel,338.23,295655799,eb8f2cea-4c5b-4e00-880f-3bcfa28549ff
4,2019-10-01 17:07:37 UTC,view,1004870,2053013555631879936,electronics.smartphone,samsung,286.84,306087674,a15f469a-968f-4c8c-8317-6dffed3f5523


## Removing Null Values
- For brands that have null values, we're gonna replace with 'unknown' brand.
- For category_code with null values, we're going to remove them because it would be useless.

In [5]:
df['category_code'] = df['category_code'].fillna(value="empty")
df['category_code'].astype(str)

df['brand'] = df['brand'].fillna(value="unknown")
df['brand'].astype(str)

0          samsung
1          samsung
2             sony
3            intel
4          samsung
           ...    
9996           msi
9997           msi
9998     garanterm
9999       samsung
10000      samsung
Name: brand, Length: 10001, dtype: object

In [6]:
users_count = dict(df.user_id.value_counts())

for row in df.iterrows():
    
    # Retrieving value
    category_code_col = row[1]['category_code']
    user_id_col = row[1]['user_id']
    # Apply logic
        ## We set {user-id: count_it_appears}
        ## If count_it_appear == 1 and (category_code_col = nan), then we remove them
        
    count = users_count[user_id_col]
    if count == 1 and category_code_col == 'empty':
        df = df.drop(df[df.user_id == user_id_col].index)

In [7]:
print("Null Values check: ", "\n", df.isnull().sum())

Null Values check:  
 event_time       0
event_type       0
product_id       0
category_id      0
category_code    0
brand            0
price            0
user_id          0
user_session     0
dtype: int64


## Feature Engineer

In [8]:
df['user_score'] = df['event_type'].map({'view':1, 'cart':10, 'purchase':50})
df['user_purchase'] = df['event_type'].apply(lambda x: 1 if x == 'purchase' else 0)

In [10]:
for category_code in df['category_code'].unique():
    df['price_category'] = pd.qcut(df['price'],  
                            q=[0, .2, .4, .6, .8, 1],
                              labels=[1,2,3,4,5])

In [11]:
group = df.groupby(['user_id', 'product_id'])['user_score', 'user_purchase'].sum().reset_index()
group['user_purchase'] = group['user_purchase'].apply(lambda x: 1 if x>1 else x)
group['user_score'] = group['user_score'].apply(lambda x: 100 if x>100 else x)

# apply MinMaxScaler to the user scores to obtain an interaction score with a value between 0 and 1
# >=0.5: a very high probability that a purchase has occurred
# <0.5: no purchase occurs below the threshold of 0.5

from sklearn.preprocessing import MinMaxScaler

std = MinMaxScaler(feature_range=(0.025, 1))
std.fit(group['user_score'].values.reshape(-1,1))
group['interaction_score'] = std.transform(group['user_score'].values.reshape(-1,1))

group = group.merge(df[['product_id','category_code','brand','price','price_category']].drop_duplicates('product_id'),
                    on=['product_id'])

  """Entry point for launching an IPython kernel.


## Modelling

In [14]:
inputs = group.drop('interaction_score', axis =1)
X = inputs
y = group['interaction_score']

In [15]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, train_size=0.7)


print({X_train.shape}), print(y_train.shape)
print(X_test.shape), print(y_test.shape)

{(4202, 8)}
(4202,)
(1802, 8)
(1802,)


(None, None)

In [16]:
X_train_matrix = pd.pivot_table(X_train,values='user_score',
                                index='user_id',columns='product_id')
X_train_matrix = X_train_matrix.fillna(0)

In [17]:
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [18]:
# filtering by item category, price category and brand

product_cat = X_train[['product_id','price_category',
                       'category_code','brand']].drop_duplicates('product_id')
product_cat['product_id'] = product_cat['product_id'].astype(str)
product_cat.sort_values(by='product_id', ascending=True, inplace=True)
product_cat = product_cat.sort_values(by='product_id')

# Reciprocal of 2 is 0.5, cos 1/2. 
price_cat_matrix = np.reciprocal(euclidean_distances(np.array(product_cat['price_category']).reshape(-1,1))+1)
euclidean_matrix = pd.DataFrame(price_cat_matrix,columns=product_cat['product_id'],index=product_cat['product_id'])

# TfidfVectorizer() converts texts to word freq counts... 
tfidf_vectorizer = TfidfVectorizer()
doc_term = tfidf_vectorizer.fit_transform(list(product_cat['category_code']))
dt_matrix = pd.DataFrame(doc_term.toarray().round(3), index=[i for i in product_cat['product_id']], columns=tfidf_vectorizer.get_feature_names())
cos_similar_matrix = pd.DataFrame(cosine_similarity(dt_matrix.values),columns=product_cat['product_id'],index=product_cat['product_id'])

tfidf_vectorizer = TfidfVectorizer()
doc_term = tfidf_vectorizer.fit_transform(list(product_cat['brand']))
dt_matrix1 = pd.DataFrame(doc_term.toarray().round(3), index=[i for i in product_cat['product_id']], columns=tfidf_vectorizer.get_feature_names())
dt_matrix1 = dt_matrix1 + 0.01
cos_similar_matrix1 = pd.DataFrame(cosine_similarity(dt_matrix1.values),columns=product_cat['product_id'],index=product_cat['product_id'])

similarity_matrix = cos_similar_matrix.multiply(euclidean_matrix).multiply(cos_similar_matrix1)
# content_matrix = X_train_matrix.dot(similarity_matrix)
content_matrix = np.matrix(X_train_matrix)*np.matrix(similarity_matrix)



# apply MinMaxScaler again to obtain the trained User-Item Matrix of predicted interaction scores
content_matrix = pd.DataFrame(content_matrix)
std = MinMaxScaler(feature_range=(0, 1))
std.fit(content_matrix.values)
content_matrix = std.transform(content_matrix.values)

In [19]:
content_matrix = pd.DataFrame(content_matrix,
                              columns=sorted(X_train['product_id'].unique()),
                              index=sorted(X_train['user_id'].unique()))
content_df = content_matrix.stack().reset_index()
content_df = content_df.rename(columns={'level_0':'user_id','level_1':'product_id',0:'predicted_interaction'})
X_test = X_test.merge(content_df,on=['user_id','product_id'])

X_test['predicted_purchase'] = X_test['predicted_interaction'].apply(lambda x:1 if x>=0.5 else 0)

In [20]:
X_test

Unnamed: 0,user_id,product_id,user_score,user_purchase,category_code,brand,price,price_category,predicted_interaction,predicted_purchase
0,512388097,1005124,1,0,electronics.smartphone,apple,1634.51,5,0.020000,0
1,495589687,1004038,1,0,electronics.smartphone,xiaomi,151.35,3,0.234525,0
2,512379919,1004250,2,0,electronics.smartphone,apple,803.96,5,0.080000,0
3,512389330,26300332,2,0,empty,unknown,361.82,4,0.145282,0
4,376532563,1004258,1,0,electronics.smartphone,apple,733.08,5,0.011421,0
...,...,...,...,...,...,...,...,...,...,...
556,436339065,5801510,2,0,electronics.audio.subwoofer,alphard,115.83,2,0.018269,0
557,512382281,3700755,1,0,appliances.environment.vacuum,samsung,95.21,2,0.333061,0
558,512389114,3601437,1,0,appliances.kitchen.washer,lg,380.85,4,0.000000,0
559,512372197,22700536,1,0,empty,jonnesway,338.49,4,0.002924,0


In [24]:
X_test

Unnamed: 0,user_id,product_id,user_score,user_purchase,category_code,brand,price,price_category,predicted_interaction,predicted_purchase
0,512388097,1005124,1,0,electronics.smartphone,apple,1634.51,5,0.020000,0
1,495589687,1004038,1,0,electronics.smartphone,xiaomi,151.35,3,0.234525,0
2,512379919,1004250,2,0,electronics.smartphone,apple,803.96,5,0.080000,0
3,512389330,26300332,2,0,empty,unknown,361.82,4,0.145282,0
4,376532563,1004258,1,0,electronics.smartphone,apple,733.08,5,0.011421,0
...,...,...,...,...,...,...,...,...,...,...
556,436339065,5801510,2,0,electronics.audio.subwoofer,alphard,115.83,2,0.018269,0
557,512382281,3700755,1,0,appliances.environment.vacuum,samsung,95.21,2,0.333061,0
558,512389114,3601437,1,0,appliances.kitchen.washer,lg,380.85,4,0.000000,0
559,512372197,22700536,1,0,empty,jonnesway,338.49,4,0.002924,0


In [28]:
len(X_train['user_id'].unique())

1322

In [38]:
user_test = X_test[['user_id', 'category_code', 'user_purchase', 'predicted_purchase']]
user_test

Unnamed: 0,user_id,category_code,user_purchase,predicted_purchase
0,512388097,electronics.smartphone,0,0
1,495589687,electronics.smartphone,0,0
2,512379919,electronics.smartphone,0,0
3,512389330,empty,0,0
4,376532563,electronics.smartphone,0,0
...,...,...,...,...
556,436339065,electronics.audio.subwoofer,0,0
557,512382281,appliances.environment.vacuum,0,0
558,512389114,appliances.kitchen.washer,0,0
559,512372197,empty,0,0


In [31]:
user_train = X_train[['user_id', 'category_code', 'user_purchase']]
user_train

Unnamed: 0,user_id,category_code,user_purchase
5643,512387901,empty,0
3569,506623177,computers.notebook,0
4358,512368302,sport.bicycle,0
1943,472764123,kids.skates,0
2463,486126757,electronics.clocks,0
...,...,...,...
816,436339065,computers.components.power_supply,0
5559,512385838,empty,0
1578,459555560,electronics.camera.video,0
1295,497416786,empty,0


In [36]:
user_train[user_train['user_purchase'] == 1]

Unnamed: 0,user_id,category_code,user_purchase
1095,512372746,electronics.smartphone,1
1341,451971397,appliances.kitchen.refrigerators,1
4333,512367874,empty,1
3669,512379450,empty,1
4572,512372458,appliances.kitchen.refrigerators,1
...,...,...,...
476,512373231,electronics.smartphone,1
587,512386086,electronics.smartphone,1
1559,512386086,electronics.smartphone,1
1631,511231471,electronics.smartphone,1


In [43]:
user_test[user_test['predicted_purchase'] == 1]

Unnamed: 0,user_id,category_code,user_purchase,predicted_purchase
12,512386351,electronics.smartphone,0,1
98,489110084,empty,0,1
121,512372746,empty,0,1
135,512379450,empty,0,1
151,512363681,empty,0,1
161,512379865,electronics.smartphone,0,1
177,450225323,empty,0,1
201,512371142,empty,0,1
261,476943866,appliances.kitchen.dishwasher,0,1
274,501192468,electronics.video.tv,0,1
