# Install Libraries

In [None]:
!pip install lightfm

# Import

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import pandas
from sklearn.decomposition import PCA
import numpy
import matplotlib.pyplot as plot
import random
from scipy import sparse
from lightfm import LightFM
from lightfm.evaluation import auc_score

# Load Data

In [None]:
# Directory holding the raw Olist CSV files; change this one constant to relocate the data.
DATA_DIR = "/project/data/raw"

# Preferred dataframe names, in the same order as the files in the directory
customers = pd.read_csv(f"{DATA_DIR}/olist_customers_dataset.csv")
location = pd.read_csv(f"{DATA_DIR}/olist_geolocation_dataset.csv")
orders = pd.read_csv(f"{DATA_DIR}/olist_orders_dataset.csv")
order_items = pd.read_csv(f"{DATA_DIR}/olist_order_items_dataset.csv")
payments = pd.read_csv(f"{DATA_DIR}/olist_order_payments_dataset.csv")
reviews = pd.read_csv(f"{DATA_DIR}/olist_order_reviews_dataset.csv")
products = pd.read_csv(f"{DATA_DIR}/olist_products_dataset.csv")
sellers = pd.read_csv(f"{DATA_DIR}/olist_sellers_dataset.csv")
translation = pd.read_csv(f"{DATA_DIR}/product_category_name_translation.csv")

# Looking at customer table

In [None]:
# Preview the first rows of the customers table to see which columns it offers
customers.head()

The customers table carries very little information, so we cannot build meaningful customer features from it.

But we can use each customer's purchase interactions as the user signal instead (i.e. collaborative filtering).

# Merge Order with order item

In [None]:
# Join every table needed for the recommender into one wide frame, then order
# the rows chronologically by purchase time. Each merge uses pandas' default
# inner join, so rows without a match in any of the tables are dropped.
master_df = (
    orders
    .merge(order_items, on="order_id")
    .merge(customers, on="customer_id")
    .merge(sellers, on="seller_id")
    .merge(products, on="product_id")
    .merge(translation, on='product_category_name')
    .merge(reviews, on="order_id")
    .sort_values(by=['order_purchase_timestamp'])
)

# Master Table is the giant table

In [None]:
# Show the first five rows of the merged table
master_df.head(5)

# Check how many customers purchased more than 1 item

In [None]:
# Count the rows (purchased items) recorded for each unique customer
temp = master_df.groupby("customer_unique_id")["order_id"].count().reset_index()
# Keep only the customers with more than one purchased item
temp = temp.loc[temp["order_id"] > 1]
# Display them, heaviest buyers first
temp.sort_values(by="order_id", ascending=False).reset_index(drop=True)

We focus on the 11612 customers who bought at least two items, since we can use their engagement with the system to predict what they will buy next. We will hold out 20% of the transactions for testing and use the remaining 80% to train our LightFM recommender system.

After merging, a few rows are missing because their product category is NA.

# Create a unique list of user and a unique list of item

In [None]:
# Distinct customers and products, in order of first appearance in master_df
user_list = pd.unique(master_df["customer_unique_id"])
item_list = pd.unique(master_df["product_id"])

# Create the mappings between IDs and matrix indices

In [None]:
# Two-way lookup between a customer id and its row position in the interaction matrix
index_to_user_mapping = dict(enumerate(user_list))
user_to_index_mapping = {
    customer_id: position for position, customer_id in enumerate(user_list)
}

In [None]:
# Two-way lookup between a product id and its column position in the interaction matrix
index_to_item_mapping = dict(enumerate(item_list))
item_to_index_mapping = {
    product_id: position for position, product_id in enumerate(item_list)
}

In [None]:
# One row per purchased item: which customer bought which product, grouped by customer
user_to_product_rating = (
    master_df.loc[:, ['customer_unique_id', 'product_id']]
    .sort_values(by='customer_unique_id')
    .reset_index(drop=True)
)

In [None]:
# Display the customer/product interaction table
user_to_product_rating

# Split Train Test

In [None]:
np.random.seed(10)

# Draw one Bernoulli(0.8) flag per interaction row:
# True -> training set (~80%), False -> testing set (~20%)
rows = np.random.binomial(1, .8, size=user_to_product_rating.shape[0]).astype('bool')

user_to_product_rating_train = user_to_product_rating.loc[rows].reset_index(drop=True)
user_to_product_rating_test = user_to_product_rating.loc[~rows].reset_index(drop=True)

# Convert Interactions into a matrix

In [None]:
def get_interaction_matrix(df, df_column_as_row, df_column_as_col, row_indexing_map, 
                          col_indexing_map):
    """Build a sparse COO matrix with one entry of 1 per row of ``df``.

    Parameters
    ----------
    df : DataFrame holding one interaction per row.
    df_column_as_row : name of the column whose values select the matrix row.
    df_column_as_col : name of the column whose values select the matrix column.
    row_indexing_map : dict mapping values of ``df_column_as_row`` to row positions.
    col_indexing_map : dict mapping values of ``df_column_as_col`` to column positions.

    Returns
    -------
    scipy.sparse.coo_matrix of shape
    ``(len(row_indexing_map), len(col_indexing_map))``. A (row, col) pair that
    occurs several times in ``df`` is stored as several entries.

    Raises
    ------
    KeyError if a value in either column is missing from its indexing map.
    """
    # Translate ids to positions; __getitem__ keeps the KeyError on unknown ids
    row_positions = df[df_column_as_row].map(row_indexing_map.__getitem__).values
    col_positions = df[df_column_as_col].map(col_indexing_map.__getitem__).values
    ones = [1] * len(df)
    matrix_shape = (len(row_indexing_map), len(col_indexing_map))
    return sparse.coo_matrix((ones, (row_positions, col_positions)), shape=matrix_shape)

In [None]:
# Customer x product interaction matrices; both share the full user/item index
# maps, so train and test have the same shape.
user_to_product_interaction_train = get_interaction_matrix(
    df=user_to_product_rating_train,
    df_column_as_row="customer_unique_id",
    df_column_as_col="product_id",
    row_indexing_map=user_to_index_mapping,
    col_indexing_map=item_to_index_mapping,
)
user_to_product_interaction_test = get_interaction_matrix(
    df=user_to_product_rating_test,
    df_column_as_row="customer_unique_id",
    df_column_as_col="product_id",
    row_indexing_map=user_to_index_mapping,
    col_indexing_map=item_to_index_mapping,
)

## Model with only collaborative interactions

In [None]:
from lightfm import LightFM
from lightfm.evaluation import auc_score

#Create simple model
# Seed LightFM's own random state (same seed as the train/test split) so the fitted
# model and the AUC printed below are reproducible on Restart & Run All.
# NOTE(review): LightFM() defaults to loss="logistic", which its documentation describes
# as suited to data with both positive and negative interactions; this matrix holds only
# positives, so loss="warp" may be the better choice -- confirm before comparing models.
model_without_features = LightFM(random_state=10)
model_without_features.fit(user_to_product_interaction_train)

# Check AUC score
# NOTE(review): train_interactions is not passed, so known training purchases are not
# excluded from the ranking used for the score.
auc_without_features = auc_score(model = model_without_features, test_interactions = user_to_product_interaction_test)
print("Average AUC with only collaborative interactions:", auc_without_features.mean())

In [None]:
def recommendation_for_user(model, items, user_to_product_interaction_matrix, user,
                            item_features=None, user_features=None):
    """Print up to three known purchases and the three top-scored products for a customer.

    Relies on the notebook-level ``user_to_index_mapping`` (customer id -> matrix row)
    and ``master_df`` (to look up each product's English category name).

    Parameters
    ----------
    model : fitted model exposing ``predict(user_ids, item_ids, item_features, user_features)``.
    items : array of product ids, positioned like the columns of the interaction matrix.
    user_to_product_interaction_matrix : sparse customer x product matrix; its non-zero
        columns in the customer's row are reported as known purchases.
    user : customer id to produce recommendations for.
    item_features : optional item-feature matrix forwarded to ``model.predict``; pass the
        matrix the model was fitted with when the model uses item features.
    user_features : optional user-feature matrix forwarded to ``model.predict``.

    Returns
    -------
    None. Results are printed; an unknown customer prints "Not a valid customer".
    """
    # getting the userindex
    userindex = user_to_index_mapping.get(user)
    if userindex is None:
        print("Not a valid customer")
        return None

    # products already bought
    known_positives = items[user_to_product_interaction_matrix.tocsr()[userindex].indices]

    # scores from model prediction, one per product column
    n_items = user_to_product_interaction_matrix.shape[1]
    scores = model.predict(user_ids=userindex, item_ids=np.arange(n_items),
                           item_features=item_features, user_features=user_features)

    # top items: highest score first
    top_items = items[np.argsort(-scores)]

    # printing out the result
    print("User:\t\t\t", user)
    sections = (("Known Purchases:", known_positives[:3]),
                ("Recommended Item:", top_items[:3]))
    for heading, product_ids in sections:
        print(heading)
        for product_id in product_ids:
            print("\t\t\t", product_id)
            print("\t\t\t", master_df[master_df['product_id'] == product_id]['product_category_name_english'].iloc[0])

In [None]:
# Example: recommendations from the collaborative-only model for one customer
recommendation_for_user(
    model_without_features,
    item_list,
    user_to_product_interaction_train,
    '3a51803cc0d012c3b5dc8b7528cb05f7',
)

# Adding Content Filtering

<b>Product-feature interactions df:</b>

We want to now create a dataframe that describes the relationship between product and features.

In [None]:
# One row per product with its descriptive attributes (first occurrence in master_df wins)
product_to_feature = (
    master_df[["product_id", "product_category_name_english", "seller_id", "seller_city", "seller_state"]]
    .drop_duplicates(subset='product_id')
    .reset_index(drop=True)
)

In [None]:
# Reshape the wide product table (one row per product) into a long table with one
# (product_id, feature) row per product attribute. Rows keep the product order, and
# within a product the attributes keep the column order of the wide table.
# NOTE: this cell overwrites product_to_feature, so re-run the cell that builds the
# wide table before re-running this one.
feature_columns = [col for col in product_to_feature.columns if col != 'product_id']

features = pd.DataFrame({
    # each product_id repeated once per feature column
    'product_id': np.repeat(product_to_feature['product_id'].to_numpy(), len(feature_columns)),
    # row-major flattening yields all features of product 1, then product 2, ...
    'feature': product_to_feature[feature_columns].to_numpy().ravel(),
})

# change name back to product_to_feature
product_to_feature = features

product_to_feature.head(10)

In [None]:
def get_item_feature_list(df, product_category_col, seller_col, seller_city, seller_state):
    """Return the distinct feature values found across the four given columns.

    The columns are stacked in the order category, seller, seller city, seller
    state, and the unique values are returned in order of first appearance.

    Parameters
    ----------
    df : DataFrame containing the four columns.
    product_category_col, seller_col, seller_city, seller_state : column names.
    """
    feature_columns = [product_category_col, seller_col, seller_city, seller_state]
    stacked = pd.concat([df[name] for name in feature_columns], ignore_index=True)
    return stacked.unique()

In [None]:
# Every distinct category, seller id, seller city and seller state seen in master_df
item_features_list = get_item_feature_list(
    master_df,
    "product_category_name_english",
    "seller_id",
    "seller_city",
    "seller_state",
)

In [None]:
# Two-way lookup between a feature value and its column position in the feature matrix
index_to_item_feature_mapping = dict(enumerate(item_features_list))
item_feature_to_index_mapping = {
    feature_value: position for position, feature_value in enumerate(item_features_list)
}


In [None]:
# Product x feature matrix: rows follow item_to_index_mapping,
# columns follow item_feature_to_index_mapping
product_to_feature_interaction = get_interaction_matrix(
    product_to_feature,
    "product_id",
    "feature",
    item_to_index_mapping,
    item_feature_to_index_mapping,
)

## Model with collaborative interactions + content filtering

In [None]:
#Create simple model
# Seed LightFM's own random state (same seed as the train/test split) so the fitted
# model and the AUC printed below are reproducible on Restart & Run All.
# NOTE(review): the item-feature matrix passed here has no per-item identity columns,
# so items are presumably represented by their shared features only -- confirm against
# the LightFM documentation whether an identity block should be stacked on.
model_with_features = LightFM(random_state=10)
model_with_features.fit(user_to_product_interaction_train,
          item_features=product_to_feature_interaction)

# Check AUC score (the same item features must be supplied at evaluation time)
auc_with_features = auc_score(model = model_with_features, test_interactions = user_to_product_interaction_test,
          item_features=product_to_feature_interaction )
print("Average AUC with only collaborative interactions + content filtering :", auc_with_features.mean())