In [1]:
import os
import sys
import datetime
import random
from typing import List

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas_profiling
import implicit
from scipy.sparse import coo_matrix
import joblib

In [2]:
# Seed Python's `random` module so the random.sample-based padding of recommendations is repeatable.
random.seed(0)
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Add ../utils/ to the import path (presumably where the MFRecommender module imported next lives — confirm).
sys.path.append('../utils/')

In [3]:
from collaborative_filtering_matrix_factorization import MFRecommender


# Load data

In [4]:
# Walk the data directory recursively and print the path of every file found.
for dirname, _, filenames in os.walk('./../data'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


./../data/Customer.csv
./../data/prod_cat_info.csv
./../data/Transactions.csv


In [5]:
# Load the three raw CSV files. The *_raw_df frames are kept untouched;
# later cells derive cleaned tables from them.
customer_raw_df = pd.read_csv('./../data/Customer.csv')
prod_cat_info_raw_df = pd.read_csv('./../data/prod_cat_info.csv')
transactions_raw_df = pd.read_csv('./../data/Transactions.csv')


In [6]:
# Cleaned transactions table, derived from the raw frame in a single chain so that no
# column is ever assigned on a filtered slice (which triggers SettingWithCopyWarning):
#   - keep only rows with a positive quantity,
#   - parse tran_date as day-month-year (the raw file mixes '/' and '-' separators),
#   - build item_id as '<prod_cat_code>_<prod_subcat_code>',
#   - keep only the columns used downstream.
transactions_df = (
    transactions_raw_df
    .loc[transactions_raw_df.Qty > 0]
    .assign(
        tran_date=lambda df: pd.to_datetime(df['tran_date'].str.replace('/', '-'), format='%d-%m-%Y'),
        item_id=lambda df: df.prod_cat_code.astype(str) + '_' + df.prod_subcat_code.astype(str),
    )
    .loc[:, ['transaction_id', 'cust_id', 'item_id', 'tran_date', 'Qty']]
)
transactions_df

Unnamed: 0,transaction_id,cust_id,item_id,tran_date,Qty
10,29258453508,270384,3_5,2014-02-20,5
11,25455265351,267750,6_12,2014-02-20,3
12,1571002198,275023,5_6,2014-02-20,4
14,36554696014,269345,5_3,2014-02-20,3
15,56814940239,268799,5_7,2014-02-20,5
...,...,...,...,...,...
23048,94340757522,274550,5_12,2011-01-25,1
23049,89780862956,270022,1_4,2011-01-25,1
23050,85115299378,271020,6_2,2011-01-25,4
23051,72870271171,270911,5_11,2011-01-25,3


In [7]:
# Product catalogue keyed by item_id ('<prod_cat_code>_<prod_sub_cat_code>'),
# with item_id moved to the first column. `assign` works on a copy, so the raw frame is untouched.
items_df = (
    prod_cat_info_raw_df
    .assign(item_id=lambda df: df.prod_cat_code.astype(str) + '_' + df.prod_sub_cat_code.astype(str))
    .loc[:, ['item_id', 'prod_cat_code', 'prod_cat', 'prod_sub_cat_code', 'prod_subcat']]
)
items_df

Unnamed: 0,item_id,prod_cat_code,prod_cat,prod_sub_cat_code,prod_subcat
0,1_4,1,Clothing,4,Mens
1,1_1,1,Clothing,1,Women
2,1_3,1,Clothing,3,Kids
3,2_1,2,Footwear,1,Mens
4,2_3,2,Footwear,3,Women
5,2_4,2,Footwear,4,Kids
6,3_4,3,Electronics,4,Mobiles
7,3_5,3,Electronics,5,Computers
8,3_8,3,Electronics,8,Personal Appliances
9,3_9,3,Electronics,9,Cameras


In [8]:
# Enrich every transaction with its customer attributes and its product description.
# Both joins are left joins, so each transaction row is kept even when no match is found.
full_info_transactions = pd.merge(
    pd.merge(
        transactions_df, customer_raw_df,
        how='left', left_on='cust_id', right_on='customer_Id',
    ),
    items_df,
    how='left', on='item_id',
)

# ALS

## Items table

In [9]:
def get_sales_until_date(transactions: pd.DataFrame, max_date:str='01-01-2012', fill_na_with_zero: bool=True)-> pd.DataFrame:
    """
    Build the sales table (index: customers, columns: items, values: total quantity
    bought) from every transaction strictly before ``max_date``. This is the
    equivalent of a recommendation table at that date.

    Every item present in ``transactions`` gets a column, even when it was not sold
    before ``max_date``, and the columns are returned in sorted order.

    :param transactions: pd.DataFrame, all transactions; needs the columns
        ``tran_date`` (datetime), ``cust_id``, ``item_id`` and ``Qty``
    :param max_date: str, exclusive upper bound on ``tran_date``, formatted dd-mm-yyyy
    :param fill_na_with_zero: bool, if True customer/item pairs without sales hold 0,
        otherwise they are left as NaN
    :return: pd.DataFrame, sales table (equivalent to recommendation table)
    """
    cutoff = pd.to_datetime(max_date, format='%d-%m-%Y')
    transactions_until_date = transactions[transactions.tran_date < cutoff]
    qty_by_customer_item = (
        transactions_until_date.groupby(['cust_id','item_id'])[['Qty']].sum().reset_index()
    )
    sales_table = pd.pivot_table(
        qty_by_customer_item, index='cust_id', columns='item_id', values='Qty'
    )

    # Add the items that were never sold before the cut-off and sort the columns.
    # reindex moves the data together with the labels; assigning sorted labels to
    # `.columns` would only rename the columns and mislabel the quantities.
    all_items = sorted(transactions.item_id.unique())
    sales_table = sales_table.reindex(columns=all_items)
    # Plain, unnamed column index (drops the 'item_id' axis label left by pivot_table).
    sales_table.columns.name = None

    if fill_na_with_zero:
        sales_table = sales_table.fillna(0)
    return sales_table


In [10]:
# Sales table for transactions before 1 April 2011 (dates are dd-mm-yyyy); NaN is kept where nothing was sold.
sales = get_sales_until_date(transactions_df, '01-04-2011', False)

In [11]:
# Display the table: one row per customer, one column per item, NaN = no recorded sale for that pair.
sales

Unnamed: 0_level_0,1_1,1_3,1_4,2_1,2_3,2_4,3_10,3_4,3_5,3_8,...,5_10,5_11,5_12,5_3,5_6,5_7,6_10,6_11,6_12,6_2
cust_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
266785,,,,,,5.0,,,,,...,,,,,,,,,,
266794,,3.0,,,,,,,,,...,,,,,,,,,,
266810,,,3.0,,,,,,,,...,,,,,,,,,,
266822,,,,,,,,,,,...,,,,,,,,,,
266829,,,,,1.0,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
275224,4.0,,,,,,,,,,...,,,,,,,,,4.0,
275226,,,,,,,,,,,...,,,4.0,,,,,,,
275227,,,,,,,,,,,...,,,,,,,,,,
275233,,,,,,,,,,,...,,,,,3.0,,,,,


In [12]:
# Same data as a raw NumPy array, transposed so that rows are items and columns are customers.
sales.T.values

array([[nan, nan, nan, ..., nan, nan, nan],
       [nan,  3., nan, ..., nan, nan, nan],
       [nan, nan,  3., ..., nan, nan, nan],
       ...,
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan]])

In [13]:
# Item ids that make up the columns of the sales table.
sales.columns

Index(['1_1', '1_3', '1_4', '2_1', '2_3', '2_4', '3_10', '3_4', '3_5', '3_8',
       '3_9', '4_1', '4_4', '5_10', '5_11', '5_12', '5_3', '5_6', '5_7',
       '6_10', '6_11', '6_12', '6_2'],
      dtype='object')

In [14]:
# Long-format view of the sparse sales table: one row per (customer, item) pair that has a
# recorded quantity. The item columns are taken from the table itself rather than from a
# hand-copied list, so the cell stays correct when the set of items changes.
a = pd.melt(
    sales.reset_index(),
    id_vars=['cust_id'],
    value_vars=sales.columns.tolist(),
    var_name='a',
).dropna(subset=['value'])

# Evaluate the model:

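We evaluate the model on the year 2013, one month at a time.
For each month, the model is trained on all transactions recorded before the first day of that month (so the first model sees the 2011-2012 history), and its recommendations are compared with the items customers actually bought during that month.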

In [15]:
# Cut-off dates (dd-mm-yyyy): the first day of every month of 2013,
# followed by 1 January 2014, which closes the last evaluated month.
dates_to_train_on = [f'01-{str(month).zfill(2)}-2013' for month in range(1,13)]
dates_to_train_on.append('01-01-2014')

In [16]:
# All item ids of the product table; read as a module-level global by fill_with_other_items_randomly.
items_ids = items_df.item_id.unique()
def fill_with_other_items_randomly(items_recommended: List, all_items=None) -> List:
    """
    Complete the list of recommended items with the rest of the items, in random order
    :param items_recommended: list, list of items recommended
    :param all_items: iterable, every item that can be recommended;
        defaults to the module-level ``items_ids``
    :return: list, ordered list of all the items to be recommended: the given
        recommendations first (order unchanged), then the remaining items shuffled
    """
    if all_items is None:
        all_items = items_ids
    # Set lookup keeps the membership test constant-time per item.
    already_recommended = set(items_recommended)
    items_not_recommended = [item_id for item_id in all_items if item_id not in already_recommended]
    # Sampling the whole list without replacement yields a shuffled copy.
    random_items_not_recommended = random.sample(items_not_recommended, len(items_not_recommended))
    return items_recommended + random_items_not_recommended

In [17]:
def evaluate_recommendation(items_sold: List[List[str]], items_predicted: List[List[str]]):
    """
    Mean (Size(item |  item in items_sold and item in items_predicted) / Size(item |  item in items_sold) for each user)
    i.e. the average, over users, of the share of the items a user bought that were also recommended.
    Users are matched by position: the i-th entry of both arguments belongs to the same user.
    A user with an empty list of sold items contributes 0 to the mean, and an empty
    ``items_sold`` gives a score of 0.0 (both cases used to raise ZeroDivisionError).
    :param items_sold: array-like, List of users' lists of items sold
    :param items_predicted: array-like, List of users' lists of recommended items
    :return: float, mean per-user score between 0 and 1
    """
    n_users = len(items_sold)
    if n_users == 0:
        return 0.0
    total_score = 0.0
    for sold, predicted in zip(items_sold, items_predicted):
        if len(sold) == 0:
            # Nothing was bought, so nothing could be recovered: counts as 0.
            continue
        total_score += len(set(sold) & set(predicted)) / len(sold)
    return total_score / n_users

In [18]:
def evaluate_on_n_items(n_items=(1, 3, 5)):
    """
    Back-test the matrix-factorization recommender month by month and print its scores.

    For every pair of consecutive cut-off dates in ``dates_to_train_on``, a fresh
    ``MFRecommender`` is fitted on the sales recorded before the first date and its
    ranked recommendations are compared with the items bought between the two dates.
    The per-customer results of all months are pooled and scored with
    ``evaluate_recommendation`` on the first n recommendations.

    :param n_items: iterable of int, numbers of top recommendations to score
        (default: 1, 3 and 5)
    :return: None, the scores (in percent) are printed
    """
    yearly_sold = []
    yearly_mfr_predictions = []

    # Each cut-off is paired with the following one: train before `date`,
    # evaluate on what was sold in [date, next_date).
    for date, next_date in zip(dates_to_train_on[:-1], dates_to_train_on[1:]):

        cumulative_sales = get_sales_until_date(transactions_df, date, False)

        # Default fill_na_with_zero=True already returns a zero-filled table.
        cumulative_sales_next_month = get_sales_until_date(transactions_df, next_date)
        # NOTE(review): customers whose first purchase falls in the evaluated month have no
        # row in cumulative_sales, so the subtraction yields NaN for them and the final
        # fillna(0) turns their sales into zeros -- they are left out of the evaluation.
        # Confirm this cold-start exclusion is intended.
        sales_next_month = (cumulative_sales_next_month-cumulative_sales.fillna(0)).fillna(0)
        total_sales_next_month = sales_next_month.sum(axis=1)
        customers_next_month = total_sales_next_month[total_sales_next_month>0].index.tolist()

        items_sold = sales_next_month.loc[customers_next_month].apply(lambda s : s[s>0].index.tolist(),axis=1)
        yearly_sold.append(items_sold)

        # Matrix-factorization recommender, refitted from scratch for every month
        mfr = MFRecommender(factors=7, iterations=5, random_state=0, calculate_training_loss=True)
        mfr.fit(cumulative_sales)
        predictions = mfr.predict(customers_next_month)
        # Rank the items with a positive score (best first), then pad with the other items at random.
        mfr_predicted_items = (predictions
                               .apply(lambda s : s[s>0].sort_values(ascending=False).index.tolist(),axis=1)
                               .map(fill_with_other_items_randomly)
        )
        yearly_mfr_predictions.append(mfr_predicted_items)


    yearly_sold = pd.concat(yearly_sold,ignore_index=True)
    yearly_mfr_predictions = pd.concat(yearly_mfr_predictions,ignore_index=True)

    # The printed label still reads 'CBR' although the scored model is MFRecommender;
    # it is kept unchanged so new runs stay comparable with the recorded outputs.
    for n in n_items:
        print(f'Evaluation on {n} predictions')
        print('Score of CBR:', evaluate_recommendation(yearly_sold, yearly_mfr_predictions.map(lambda x: x[:n]))*100)


In [19]:
# Run the month-by-month evaluation; prints the score for the top 1, 3 and 5 recommendations.
evaluate_on_n_items()



HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


Evaluation on 1 predictions
Score of CBR: 4.92063492063492
Evaluation on 3 predictions
Score of CBR: 13.158182813355227
Evaluation on 5 predictions
Score of CBR: 21.778872468527645


In [20]:
# Final model: sales table built from every transaction before 1 January 2014 (NaN kept where nothing was sold).
cumulative_sales = get_sales_until_date(transactions_df, '01-01-2014', False)
# Matrix-factorization recommender (MFRecommender), same hyperparameters as in the monthly evaluation
mfr = MFRecommender(factors=7, iterations=5, random_state=0, calculate_training_loss=True)
mfr.fit(cumulative_sales)


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




In [22]:
# Persist the fitted model under the name 'mfr_model_01_01_2014'
# (file format and location are decided by MFRecommender.save_fitted_model).
mfr.save_fitted_model('mfr_model_01_01_2014')

In [24]:
# Sanity check: per-item scores for two customer ids.
# NOTE(review): id 1 is presumably not a customer seen in training (cold-start case) — confirm.
mfr.predict([1,266783])

Unnamed: 0,1_1,1_3,1_4,2_1,2_3,2_4,3_10,3_4,3_5,3_8,...,5_10,5_11,5_12,5_3,5_6,5_7,6_10,6_11,6_12,6_2
266783,-0.04976,-0.034,0.651784,0.582541,0.18261,0.64975,0.299692,-0.020104,0.545705,-0.128252,...,0.574053,-0.11481,-0.090685,0.058694,0.359587,-0.157063,0.325214,0.238346,0.152753,0.882335
1,0.248917,0.228681,0.226248,0.241382,0.25073,0.223789,0.233082,0.271494,0.233777,0.23304,...,0.258415,0.240953,0.232202,0.240193,0.226805,0.243515,0.241433,0.242893,0.239151,0.25168
