In [452]:
import pandas as pd
import numpy as np

In [453]:
df = pd.read_excel("Sample 5000.xlsx")

In [454]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10001 entries, 0 to 10000
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   event_time     10001 non-null  object 
 1   event_type     10001 non-null  object 
 2   product_id     10001 non-null  int64  
 3   category_id    10001 non-null  int64  
 4   category_code  6623 non-null   object 
 5   brand          8595 non-null   object 
 6   price          10001 non-null  float64
 7   user_id        10001 non-null  int64  
 8   user_session   10001 non-null  object 
dtypes: float64(1), int64(3), object(5)
memory usage: 703.3+ KB


## Removing Null Values
- For brands that have null values, we're gonna replace with 'unknown' brand.
- For category_code with null values, we're going to remove them because it would be useless.

In [455]:
df['category_code'] = df['category_code'].fillna(value="empty")
df['category_code'].astype(str)

df['brand'] = df['brand'].fillna(value="unknown")
df['brand'].astype(str)

0          samsung
1          samsung
2             sony
3            intel
4          samsung
           ...    
9996           msi
9997           msi
9998     garanterm
9999       samsung
10000      samsung
Name: brand, Length: 10001, dtype: object

In [456]:
users_count = dict(df.user_id.value_counts())

for row in df.iterrows():
    
    # Retrieving value
    category_code_col = row[1]['category_code']
    user_id_col = row[1]['user_id']
    # Apply logic
        ## We set {user-id: count_it_appears}
        ## If count_it_appear == 1 and (category_code_col = nan), then we remove them
        
    count = users_count[user_id_col]
    if count == 1 and category_code_col == 'empty':
        df = df.drop(df[df.user_id == user_id_col].index)

In [457]:
print("Null Values check: ", "\n", df.isnull().sum())

Null Values check:  
 event_time       0
event_type       0
product_id       0
category_id      0
category_code    0
brand            0
price            0
user_id          0
user_session     0
dtype: int64


## Feature Engineer

In [458]:
df['user_score'] = df['event_type'].map({'view':1, 'cart':10, 'purchase':50})
df['user_purchase'] = df['event_type'].apply(lambda x: 1 if x == 'purchase' else 0)

In [459]:
## FIGURE THIS OUT DARREN!
df['price_category'] = 1
for i in df['category_code'].unique():
    df.loc[df['category_code']==i,'price_category'] = pd.qcut(x=df['price'][df['category_code']==i],q=5, labels=[1,2,3,4,5])

ValueError: Bin edges must be unique: array([338.23 , 403.714, 447.37 , 447.37 , 553.342, 712.3  ]).
You can drop duplicate edges by setting the 'duplicates' kwarg

In [460]:
group = df.groupby(['user_id', 'product_id'])['user_score', 'user_purchase'].sum().reset_index()
group['user_purchase'] = group['user_purchase'].apply(lambda x: 1 if x>1 else x)
group['user_score'] = group['user_score'].apply(lambda x: 100 if x>100 else x)

# apply MinMaxScaler to the user scores to obtain an interaction score with a value between 0 and 1
# >=0.5: a very high probability that a purchase has occurred
# <0.5: no purchase occurs below the threshold of 0.5

from sklearn.preprocessing import MinMaxScaler

std = MinMaxScaler(feature_range=(0.025, 1))
std.fit(group['user_score'].values.reshape(-1,1))
group['interaction_score'] = std.transform(group['user_score'].values.reshape(-1,1))

group = group.merge(df[['product_id','category_code','brand','price','price_category']].drop_duplicates('product_id'),on=['product_id'])

  """Entry point for launching an IPython kernel.


## Modelling

In [461]:
inputs = group.drop('interaction_score', axis =1)
X = inputs
y = group['interaction_score']

In [462]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, 
                                                          random_state = 1)

In [463]:
X_train_matrix = pd.pivot_table(X_train,values='user_score',index='user_id',columns='product_id')
X_train_matrix = X_train_matrix.fillna(0)

In [464]:
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [465]:
# filtering by item category, price category and brand

product_cat = X_train[['product_id','price_category','category_code','brand']].drop_duplicates('product_id')
product_cat = product_cat.sort_values(by='product_id')

# Reciprocal of 2 is 0.5, cos 1/2. 
price_cat_matrix = np.reciprocal(euclidean_distances(np.array(product_cat['price_category']).reshape(-1,1))+1)
euclidean_matrix = pd.DataFrame(price_cat_matrix,columns=product_cat['product_id'],index=product_cat['product_id'])

# TfidfVectorizer() converts texts to word freq counts... 
tfidf_vectorizer = TfidfVectorizer()
doc_term = tfidf_vectorizer.fit_transform(list(product_cat['category_code']))
dt_matrix = pd.DataFrame(doc_term.toarray().round(3), index=[i for i in product_cat['product_id']], columns=tfidf_vectorizer.get_feature_names())
cos_similar_matrix = pd.DataFrame(cosine_similarity(dt_matrix.values),columns=product_cat['product_id'],index=product_cat['product_id'])

tfidf_vectorizer = TfidfVectorizer()
doc_term = tfidf_vectorizer.fit_transform(list(product_cat['brand']))
dt_matrix1 = pd.DataFrame(doc_term.toarray().round(3), index=[i for i in product_cat['product_id']], columns=tfidf_vectorizer.get_feature_names())
dt_matrix1 = dt_matrix1 + 0.01
cos_similar_matrix1 = pd.DataFrame(cosine_similarity(dt_matrix1.values),columns=product_cat['product_id'],index=product_cat['product_id'])

similarity_matrix = cos_similar_matrix.multiply(euclidean_matrix).multiply(cos_similar_matrix1)
content_matrix = X_train_matrix.dot(similarity_matrix)

# apply MinMaxScaler again to obtain the trained User-Item Matrix of predicted interaction scores
std = MinMaxScaler(feature_range=(0, 1))
std.fit(content_matrix.values)
content_matrix = std.transform(content_matrix.values)