# Checking all files in data folder

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/brazilian-ecommerce/olist_customers_dataset.csv
/kaggle/input/brazilian-ecommerce/olist_sellers_dataset.csv
/kaggle/input/brazilian-ecommerce/olist_order_reviews_dataset.csv
/kaggle/input/brazilian-ecommerce/olist_order_items_dataset.csv
/kaggle/input/brazilian-ecommerce/olist_products_dataset.csv
/kaggle/input/brazilian-ecommerce/olist_geolocation_dataset.csv
/kaggle/input/brazilian-ecommerce/product_category_name_translation.csv
/kaggle/input/brazilian-ecommerce/olist_orders_dataset.csv
/kaggle/input/brazilian-ecommerce/olist_order_payments_dataset.csv


In [52]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import tqdm

# LOADING DATA

In [2]:
data_path = '../input/'

customer_data = pd.read_csv(data_path+'brazilian-ecommerce/olist_customers_dataset.csv')
seller_data = pd.read_csv(data_path+'../input/brazilian-ecommerce/olist_sellers_dataset.csv')
reviews_data = pd.read_csv(data_path+'../input/brazilian-ecommerce/olist_order_reviews_dataset.csv')
items_data = pd.read_csv(data_path+'../input/brazilian-ecommerce/olist_order_items_dataset.csv')
product_data = pd.read_csv(data_path+'../input/brazilian-ecommerce/olist_products_dataset.csv')
geo_data = pd.read_csv(data_path+'../input/brazilian-ecommerce/olist_geolocation_dataset.csv')
orders_data = pd.read_csv(data_path+'../input/brazilian-ecommerce/olist_orders_dataset.csv')

# CLEANING DATA

**Overview of data**

In [3]:
print("customer data")
print(customer_data.info())
print()
print("selller data")
print(seller_data.info())
print()
print("reviews data")
print(reviews_data.info())
print()
print("items data")
print(items_data.info())
print()
print("product data")
print(product_data.info())
print()
print("geo data")
print(geo_data.info())
print()
print("orders data")
print(orders_data.info())
print()

customer data
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99441 entries, 0 to 99440
Data columns (total 5 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   customer_id               99441 non-null  object
 1   customer_unique_id        99441 non-null  object
 2   customer_zip_code_prefix  99441 non-null  int64 
 3   customer_city             99441 non-null  object
 4   customer_state            99441 non-null  object
dtypes: int64(1), object(4)
memory usage: 3.8+ MB
None

selller data
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3095 entries, 0 to 3094
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   seller_id               3095 non-null   object
 1   seller_zip_code_prefix  3095 non-null   int64 
 2   seller_city             3095 non-null   object
 3   seller_state            3095 non-null   object
dtypes: int64

We see that only reviews-data, product-data,order-data have empty cells in them

**Removing or filling null values**

Since we are using only review score, so we can remove review title and review messages.
From the product table since we use only product id we can remove other columns
For orders table, since number of empty rows is less than 3% we can remove them

In [4]:
reviews_data = reviews_data[['order_id', 'review_score']]
product_data.dropna(inplace=True)
orders_data.dropna(inplace=True)

Uptill here all null or empty columns have been removed. Further preprocessing would be done according to the needs of the model

**Breaking into train and test dataset for orders**

In [5]:
num_train_rows = int(orders_data.shape[0]*0.8)

train_orders = orders_data.iloc[:num_train_rows]
test_orders = orders_data.iloc[num_train_rows:]

# Merging Data

We create a merged dataframe category_df to understand which user is interested in which category of items. Here we merge orders data, items data and product data.

In [6]:
df = pd.merge(train_orders, items_data, on='order_id')

df = df[['customer_id','product_id','order_id']]
category_df = pd.merge(df,product_data,on='product_id')
category_df = category_df[['customer_id','product_category_name']]


We make a merged table to be used to select top rated products. Here orders table is merged with product as well as reviews table. In order to get a holistic view of the product the mean of all reviews are used.

We also keep this table sorted in descending order for review score to ensure that the top rated products appear on top.

In [7]:
review_df = pd.merge(items_data,reviews_data,on='order_id')
review_df = review_df[['product_id','review_score']].groupby('product_id').mean()
product_df = pd.merge(review_df,product_data,on='product_id')

product_df.sort_values(by='review_score',ascending=False,inplace=True)

We create a table to monitor customer purchasing patterns. Latter this table is also used to create collaborative filtering recommendation systems

In [8]:
customer_df = pd.merge(df,reviews_data,on='order_id')
customer_df = customer_df[['customer_id','product_id','review_score']]

Finally we create a table for each user. Here the index is customer_id and the columns are different product categories, and the entries are the count of products the customer bought of a particular category.

In [9]:
grouped_df = category_df.groupby(['customer_id', 'product_category_name'])

# Count the number of times each product_category_name appears for each customer_id
count_df = grouped_df.size().unstack()
count_df.fillna(0,inplace=True)
count_df = count_df.astype(int)

# Recommendation Models

We create an recommendation model which uses content based filtering, to filter products in the category the user is interested in and picks products with maximum ratings

**Content based filtering**

In [10]:
cat = count_df.columns
def content_recommend(customer_id):
    terms = count_df[count_df.index==customer_id].values[0]
    sort = np.argsort(terms)
    req = cat[sort].values[-5:]
    return product_df[product_df.product_category_name.isin(req)]['product_id'].iloc[:5].values


Testing the recommendation models

In [11]:
content_recommend(count_df.index[39])

array(['c6ab3d8f4227913c4fa99e94ef84aa70',
       'c6a988c762b18da642d772a1f1ddb782',
       'c6a5b2b8610b3206be072c211b70ba69',
       '5f76bb88bf4410580362879247ef7248',
       '5f79ee3b64a3af922735487fb06da429'], dtype=object)

**COLLABORATIVE FILTERING**

We tried to setup collaborative filtering for items but due to the large number of products in the dataset it was neither feasible nor effective.
3072403790 cells had to be created to implement it which was not feasible in a limited resource environment. Also since the products reviewed by users were different to each other, hence similarities between them were few and was not an effective measure to create an recommendation model out of it.

Thus we used collaborative filtering on categories instead

In [12]:
#user_item_matrix = customer_df.pivot_table(values='review_score', index='customer_id', columns='product_id', fill_value=0)
#Calculate the cosine similarity between users
#user_similarity = user_item_matrix.corr(method='cosine')
# recommend_collaborative_filtering(customer_id):
#     similar_users = user_similarity[customer_id].sort_values(ascending=False)[:5]
#     customer_bought_products = customer_df[customer.customer_id == customer_id]['product_id'].tolist()
#     # Get the products that the similar users have rated
#     similar_products = df[df['customer_id'].isin(similar_users.index)]['product_id'].tolist()
#     return [product for product in similar_products if product not in customer_bought_products]

In [13]:
customer_reviews = pd.merge(df,reviews_data,on='order_id')
category_review = pd.merge(customer_reviews,product_data,on='product_id')
category_review = category_review[['customer_id','review_score','product_category_name']]
category_review = category_review.groupby(['customer_id', 'product_category_name']).mean()
user_item_matrix  = category_review.pivot_table(index='customer_id',columns='product_category_name',values='review_score')
user_item_matrix.fillna(0,inplace=True)
user_item_matrix = user_item_matrix.astype(int)

In [14]:
def collaborative_recommendation(customer_id):
    customer = user_item_matrix[user_item_matrix.index==customer_id].values[0]
    cosine_similarities = np.dot(customer, np.transpose(user_item_matrix.values))

# Sort the similarities in descending order and return the top 5 indices
    top_indices = np.argsort(cosine_similarities, axis=0)[-5:]
    customer_bought_products = customer_df[customer_df.customer_id == customer_id]['product_id'].tolist()
    similar_users = [user_item_matrix.index[indices] for indices in top_indices]
    similar_table = customer_df[customer_df['customer_id'].isin(similar_users)]
    similar_table.sort_values(by='review_score',ascending=False,inplace=True)
    similar_products = df[df['customer_id'].isin(similar_users)]['product_id'].tolist()
    final_product =  [product for product in similar_products if product not in customer_bought_products]
    return final_product

**Combined hybrid model**

We finally combine both recommendation models to generate final recommendation.
Here we also check if previous data of this customer is available or not. If not available then it returns the top 10 rated products to the customer

In [15]:
def hybrid_recommendation(customer_id):
    if(customer_id in user_item_matrix.index):
        final_list = list(set(content_recommend(customer_id)) | set(collaborative_recommendation(customer_id)))
        product_table = product_df[product_df['product_id'].isin(final_list)]
        product_table.sort_values(by='review_score',ascending=False,inplace=True)
        return product_table['product_id'].values[:10]
    return product_df['product_id'].values[:10]

In [16]:
hybrid_recommendation('qwertyi')

array(['00066f42aeeb9f3007548bb9d3f33c38',
       '6000cda32a1e7f6919ae1fcdf1d6259a',
       '5fe490e61e7a37f2f0d1ad1771ac027d',
       'c6ab810300ecce0784a66d36fcb5560f',
       'c6ab3d8f4227913c4fa99e94ef84aa70',
       'c6aa164ce2d179e87318dacf63ba9d9f',
       '5fef486f2057e10fd91e167348812b7a',
       '5ff4076c0f01eeba4f728c9e3fa2653c',
       'c6a988c762b18da642d772a1f1ddb782',
       '5ff59c5f2db7600fa8143442c8b1e4f6'], dtype=object)

# Testing the model

In [17]:
test_items = pd.merge(test_orders, items_data, on='order_id')
test_items = test_items[['customer_id','product_id','order_id']]
test_dataset = pd.merge(test_items,reviews_data,on='order_id')

In [34]:
recom_prod = []
def is_recommendation_correct(y_data,products:list):
    prod_list_cat = product_df[product_df['product_id'].isin(products)]['product_category_name'].values
    prod_cat = product_df[product_df['product_id']==y_data['product_id']]['product_category_name'].values
    recom_prod.append(prod_cat in prod_list_cat)
    if y_data['review_score']>1:
        return prod_cat in prod_list_cat
    elif y_data['review_score']<0:
        return prod_cat in prod_list_cat
    else:
        return True

In [19]:
X_test = test_dataset['customer_id'].values
y_test =  test_dataset[['product_id','review_score']]

In [39]:
import warnings

warnings.filterwarnings('ignore', category=FutureWarning)

# This code will not generate any warnings from the FutureWarning category


success_array = []
recom_prod = []
for i in tqdm.tqdm(range(len(X_test))):
    products = hybrid_recommendation(X_test[i])
    is_correct = is_recommendation_correct(y_test.iloc[i],products)
    success_array.append(is_correct)
    
print(np.sum(success_array))

  recom_prod.append(prod_cat in prod_list_cat)
  return prod_cat in prod_list_cat
100%|██████████| 22128/22128 [04:19<00:00, 85.13it/s] 

10603





In [50]:
actual

array([5, 5, 1, ..., 2, 2, 5])

In [53]:
actual = y_test['review_score'].values>3
pred = recom_prod

precision = precision_score(actual, pred)
print("Precision :", precision)
recall = recall_score(actual, pred)
print("Recall    :", recall)
F1_score = f1_score(actual, pred)
print("F1-score  :", F1_score)

Precision : 0.7599121361889072
Recall    : 0.4082728506520328
F1-score  : 0.5311684323660371


In [45]:
print("percentage of accurate recommendations:",np.sum(success_array)/len(success_array)*100,"%")

percentage of accurate recommendations: 47.91666666666667 %


The above results show that of every 2 recommendations whenever the customer buys one he would rate the product over 3