In [1]:
import os
import sys
import datetime
import random
from typing import List
import shutil

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas_profiling


In [2]:
# Seed Python's `random` module so the random draws made further down
# (random.sample in fill_with_other_items_randomly) are repeatable.
random.seed(0)
%matplotlib inline
# Add ../utils/ to the import search path; the recommender modules imported in
# the next cell are presumably located there (confirm against the repository).
sys.path.append('../utils/')

In [3]:
from content_based_recomender import ContentBasedRecommender
from baseline_last_sold_recommender import LastSoldRecommender
from baseline_most_sold_recommender import MostSoldRecommender

# Load data

In [4]:
# Walk the data directory and print the path of every file found under it.
for dirname, _, filenames in os.walk('./../data'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


./../data/Customer.csv
./../data/prod_cat_info.csv
./../data/Transactions.csv


In [5]:
# Raw tables, read as-is from the CSV files of the data directory.
customer_raw_df = pd.read_csv('./../data/Customer.csv')
prod_cat_info_raw_df = pd.read_csv('./../data/prod_cat_info.csv')

# Any '/' separator in tran_date is replaced by '-' so that a single
# day-month-year format parses the whole column.
transactions_raw_df = pd.read_csv('./../data/Transactions.csv').assign(
    tran_date=lambda raw: pd.to_datetime(
        raw['tran_date'].str.replace('/', '-'), format='%d-%m-%Y'
    )
)


In [6]:
# Build the cleaned transactions table in one chain, starting from the raw
# frame, so the cell can be re-run safely and no column is ever assigned on a
# filtered slice (which triggers pandas' SettingWithCopyWarning).
transactions_df = (
    transactions_raw_df
    # keep only the rows with a strictly positive quantity
    .loc[transactions_raw_df.Qty > 0]
    # an item is identified by '<category code>_<sub-category code>'
    .assign(
        item_id=lambda df: df.prod_cat_code.astype(str) + '_' + df.prod_subcat_code.astype(str)
    )
    [['transaction_id', 'cust_id', 'item_id', 'tran_date', 'Qty']]
)
transactions_df

Unnamed: 0,transaction_id,cust_id,item_id,tran_date,Qty
10,29258453508,270384,3_5,2014-02-20,5
11,25455265351,267750,6_12,2014-02-20,3
12,1571002198,275023,5_6,2014-02-20,4
14,36554696014,269345,5_3,2014-02-20,3
15,56814940239,268799,5_7,2014-02-20,5
...,...,...,...,...,...
23048,94340757522,274550,5_12,2011-01-25,1
23049,89780862956,270022,1_4,2011-01-25,1
23050,85115299378,271020,6_2,2011-01-25,4
23051,72870271171,270911,5_11,2011-01-25,3


# Content-based recommendation: formulating the problem

## Items table

An item's ID is the concatenation of its category code and its sub-category code, joined by an underscore (e.g. `3_5`).

In [7]:
# Items table: one row per (category, sub-category) pair, identified by
# '<category code>_<sub-category code>'. `assign` returns a new frame, so the
# raw product table is left untouched.
items_df = (
    prod_cat_info_raw_df
    .assign(
        item_id=lambda raw: raw['prod_cat_code'].astype(str) + '_' + raw['prod_sub_cat_code'].astype(str)
    )
    [['item_id', 'prod_cat_code', 'prod_cat', 'prod_sub_cat_code', 'prod_subcat']]
)
items_df

Unnamed: 0,item_id,prod_cat_code,prod_cat,prod_sub_cat_code,prod_subcat
0,1_4,1,Clothing,4,Mens
1,1_1,1,Clothing,1,Women
2,1_3,1,Clothing,3,Kids
3,2_1,2,Footwear,1,Mens
4,2_3,2,Footwear,3,Women
5,2_4,2,Footwear,4,Kids
6,3_4,3,Electronics,4,Mobiles
7,3_5,3,Electronics,5,Computers
8,3_8,3,Electronics,8,Personal Appliances
9,3_9,3,Electronics,9,Cameras


In [8]:
# Item feature matrix: one row per item_id, one indicator column per category
# and per sub-category (1 when the item belongs to it, 0 otherwise).
#
# The constant helper column is added to a derived frame instead of being
# written into `items_df` and dropped afterwards, so `items_df` is never
# modified by this cell, even if one of the steps below fails.
items_with_flag = items_df.assign(dummy_column=1)

prod_cat_features = pd.pivot_table(items_with_flag, index='item_id', columns='prod_cat', values='dummy_column')
prod_subcat_features = pd.pivot_table(items_with_flag, index='item_id', columns='prod_subcat', values='dummy_column')

# A missing (item, category) combination comes out of the pivot as NaN: make it 0.
prod_features = prod_cat_features.join(prod_subcat_features).fillna(0)


In [9]:
def get_sales_until_date(transactions: pd.DataFrame, max_date:str='01-01-2012')-> pd.DataFrame:
    """
    Build the cumulative sales table (index: customers, columns: items, values: total
    quantity sold), an equivalent of a recommendation table, from every transaction
    dated strictly before ``max_date``.

    Every item present anywhere in ``transactions`` gets a column, even when it has no
    sale before ``max_date`` (the column is then filled with 0), and the columns are
    returned sorted by item id.

    :param transactions: pd.DataFrame, all transactions; the columns 'cust_id',
        'item_id', 'tran_date' and 'Qty' are used
    :param max_date: str, exclusive upper bound on the transaction date, formatted
        as '%d-%m-%Y'
    :return: pd.DataFrame, sales table (equivalent to a recommendation table)
    """
    cutoff = pd.to_datetime(max_date, format='%d-%m-%Y')
    transactions_until_date = transactions[transactions.tran_date < cutoff]
    quantity_by_customer_item = (
        transactions_until_date.groupby(['cust_id', 'item_id'])[['Qty']].sum().reset_index()
    )
    sales_table = (
        pd.pivot_table(
            quantity_by_customer_item, index='cust_id', columns='item_id', values='Qty'
        ).fillna(0)
    )

    # Add the items that were never sold before the cut-off and sort the columns in
    # one step. Reindexing moves every column together with its label; assigning a
    # sorted list to `.columns` would only relabel the columns positionally and
    # attach the quantities to the wrong items.
    all_item_ids = sorted(set(sales_table.columns).union(transactions.item_id.unique()))
    sales_table = sales_table.reindex(columns=all_item_ids, fill_value=0)
    # Return a plain, unnamed column index, as this function always did.
    sales_table.columns.name = None
    return sales_table


# Evaluate the model:

We evaluate the model on the year 2013 (training on 2011-2012).
Predictions are made every month.

In [10]:
# Cut-off dates ('%d-%m-%Y'): the first day of every month of 2013, followed by
# the first day of 2014.
dates_to_train_on = [
    *(f'01-{str(month).zfill(2)}-2013' for month in range(1, 13)),
    '01-01-2014',
]

In [11]:
# Every distinct item id of the items table, in order of first appearance.
items_ids = items_df['item_id'].unique()
def fill_with_other_items_randomly(items_recommended: List, all_items=None) -> List:
    """
    Complete the list of recommended items with the rest of the items, in random order.

    The recommended items are kept first and in their given order; every other item
    is appended after them, shuffled with the `random` module.

    :param items_recommended: list (or any other iterable), items already recommended
    :param all_items: optional iterable of all the item ids; when omitted, the
        notebook-level `items_ids` is used
    :return: list, ordered list of all the items to be recommended
    """
    if all_items is None:
        all_items = items_ids
    # Work on a real list so that `+` below is a concatenation whatever the
    # caller passed in (tuple, array, index, ...).
    items_recommended = list(items_recommended)
    items_not_recommended = [item_id for item_id in all_items if item_id not in items_recommended]
    # recommend other items randomly
    random_items_not_recommended = random.sample(items_not_recommended, len(items_not_recommended))
    return items_recommended + random_items_not_recommended

In [15]:
# Cumulative sales table built from every transaction dated before 01-01-2014.
cumulative_sales = get_sales_until_date(transactions_df, '01-01-2014')
# Content based recommender
# NOTE(review): ContentBasedRecommender is defined outside this notebook. It is
# presumably constructed from the feature names and fitted on the sales table
# together with the item feature matrix; confirm against the class.
cbr = ContentBasedRecommender(prod_features.columns)
cbr.fit(cumulative_sales, prod_features)


In [16]:
# NOTE(review): presumably writes the fitted model into a 'cbr_model_01_01_2014'
# folder (a later cell reads './cbr_model_01_01_2014/items.csv'); confirm against
# the class.
cbr.save_fitted_model('cbr_model_01_01_2014')

In [None]:
# Round-trip check: read items.csv back from the saved model folder and compare
# it with prod_features (the cell displays True when both frames are equal).
pd.read_csv('./cbr_model_01_01_2014/items.csv',index_col=0).equals(prod_features)

In [19]:
# Display the cumulative sales table the recommender was fitted on
# (pandas shows a truncated view of the rows and columns).
cumulative_sales

Unnamed: 0_level_0,1_1,1_3,1_4,2_1,2_3,2_4,3_10,3_4,3_5,3_8,...,5_10,5_11,5_12,5_3,5_6,5_7,6_10,6_11,6_12,6_2
cust_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
266783,0.0,0.0,4.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
266784,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,...,3.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0
266785,0.0,0.0,0.0,8.0,0.0,7.0,0.0,0.0,0.0,0.0,...,0.0,5.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0
266788,0.0,0.0,0.0,1.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
266794,0.0,7.0,0.0,0.0,3.0,0.0,3.0,0.0,0.0,4.0,...,0.0,3.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
275257,0.0,0.0,0.0,5.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
275261,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,5.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
275262,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
275264,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0


In [18]:
# Example prediction, converted to a {item_id: {requested id: score}} dict.
# NOTE(review): 1 is not among the cust_id values shown in the sales table;
# confirm what predict() expects as input and how it treats an unknown id.
cbr.predict([1]).to_dict()

{'1_1': {1: 0.1291394098422904},
 '1_3': {1: 0.10660304410525448},
 '1_4': {1: 0.12568036075663486},
 '2_1': {1: 0.126420055012397},
 '2_3': {1: 0.1298791040980525},
 '2_4': {1: 0.10734273836101661},
 '3_10': {1: 0.130707993665997},
 '3_4': {1: 0.13146784991370852},
 '3_5': {1: 0.13070985270061536},
 '3_8': {1: 0.13139247085215833},
 '3_9': {1: 0.13092870142639415},
 '4_1': {1: 0.10648406280546305},
 '4_4': {1: 0.10994311189111858},
 '5_10': {1: 0.1511249413536206},
 '5_11': {1: 0.15187374544086069},
 '5_12': {1: 0.15045265792354623},
 '5_3': {1: 0.15250225627100997},
 '5_6': {1: 0.15091252460475635},
 '5_7': {1: 0.15196313534346068},
 '6_10': {1: 0.1120504109501182},
 '6_11': {1: 0.1109442650602439},
 '6_12': {1: 0.11377689853706033},
 '6_2': {1: 0.11064341631926497}}