In [1]:
import os
import sys
import datetime
import random
from typing import List
import shutil

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas_profiling


In [2]:
random.seed(0)
%matplotlib inline
sys.path.append('../utils/')

In [3]:
from content_based_recomender import ContentBasedRecommender
from collaborative_filtering_matrix_factorization import CFRecommender
from baseline_last_sold_recommender import LastSoldRecommender
from baseline_most_sold_recommender import MostSoldRecommender
from modeling_utils import fill_with_other_items_randomly, get_sales_until_date

# Load data

In [4]:
# Walk the data directory recursively and print the path of every file found.
data_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('./../data')
    for name in names
)
for path in data_file_paths:
    print(path)


./../data/Customer.csv
./../data/prod_cat_info.csv
./../data/Transactions.csv


In [5]:
# Read the three raw CSV files into untouched "raw" frames.
customer_raw_df = pd.read_csv('./../data/Customer.csv')
prod_cat_info_raw_df = pd.read_csv('./../data/prod_cat_info.csv')

# Replace '/' separators with '-' in tran_date so that the single
# '%d-%m-%Y' (day-month-year) format can be used to parse the column.
transactions_raw_df = pd.read_csv('./../data/Transactions.csv').assign(
    tran_date=lambda raw: pd.to_datetime(
        raw['tran_date'].str.replace('/', '-'),
        format='%d-%m-%Y',
    )
)


In [6]:
# Build the modelling view of the transactions in one chain:
#   1. keep only rows with a strictly positive quantity,
#   2. derive item_id as "<prod_cat_code>_<prod_subcat_code>",
#   3. keep only the columns used downstream.
# The raw frame is left untouched.
transactions_df = (
    transactions_raw_df
    .loc[lambda raw: raw['Qty'] > 0]
    .assign(
        item_id=lambda sales: (
            sales['prod_cat_code'].astype(str)
            + '_'
            + sales['prod_subcat_code'].astype(str)
        )
    )
    [['transaction_id', 'cust_id', 'item_id', 'tran_date', 'Qty']]
)
transactions_df

Unnamed: 0,transaction_id,cust_id,item_id,tran_date,Qty
10,29258453508,270384,3_5,2014-02-20,5
11,25455265351,267750,6_12,2014-02-20,3
12,1571002198,275023,5_6,2014-02-20,4
14,36554696014,269345,5_3,2014-02-20,3
15,56814940239,268799,5_7,2014-02-20,5
...,...,...,...,...,...
23048,94340757522,274550,5_12,2011-01-25,1
23049,89780862956,270022,1_4,2011-01-25,1
23050,85115299378,271020,6_2,2011-01-25,4
23051,72870271171,270911,5_11,2011-01-25,3


# Content-based recommendation: formulating the problem

## Items table

L'ID d'un 'item' sera la concaténation de la catégorie et de la sous catégorie

In [7]:
# Item catalogue: one row per (category, sub-category) pair, keyed by
# item_id = "<prod_cat_code>_<prod_sub_cat_code>" — the same pattern used
# to build item_id on the transactions frame.
items_df = (
    prod_cat_info_raw_df
    .assign(
        item_id=lambda cats: (
            cats['prod_cat_code'].astype(str)
            + '_'
            + cats['prod_sub_cat_code'].astype(str)
        )
    )
    [['item_id', 'prod_cat_code', 'prod_cat', 'prod_sub_cat_code', 'prod_subcat']]
)
items_df

Unnamed: 0,item_id,prod_cat_code,prod_cat,prod_sub_cat_code,prod_subcat
0,1_4,1,Clothing,4,Mens
1,1_1,1,Clothing,1,Women
2,1_3,1,Clothing,3,Kids
3,2_1,2,Footwear,1,Mens
4,2_3,2,Footwear,3,Women
5,2_4,2,Footwear,4,Kids
6,3_4,3,Electronics,4,Mobiles
7,3_5,3,Electronics,5,Computers
8,3_8,3,Electronics,8,Personal Appliances
9,3_9,3,Electronics,9,Cameras


In [8]:
# Build the item feature matrix: one row per item_id, one indicator column
# per category name and per sub-category name.
#
# The constant dummy_column is only needed as the pivot value, so it is
# added to a temporary copy instead of being written into items_df and
# dropped afterwards. items_df is therefore never mutated, and the cell
# stays safe to re-run even if it is interrupted halfway.
items_with_flag = items_df.assign(dummy_column=1)

prod_cat_features = pd.pivot_table(
    items_with_flag, index='item_id', columns='prod_cat', values='dummy_column'
)
prod_subcat_features = pd.pivot_table(
    items_with_flag, index='item_id', columns='prod_subcat', values='dummy_column'
)

# Cells with no matching category/sub-category come out of the pivot as NaN;
# fill them with 0 so every feature column is a 0/1 indicator.
# NOTE(review): join() raises if a category name equals a sub-category name
# (overlapping column labels) — confirm this cannot happen in the data.
prod_features = prod_cat_features.join(prod_subcat_features).fillna(0)


# Build and export models:

## Content-based recommender

In [9]:
# Fit the content-based recommender.
# get_sales_until_date is a project helper (modeling_utils); presumably it
# returns the sales up to the given cutoff date — TODO confirm its output
# format against the helper's source.
feature_names = prod_features.columns
cumulative_sales = get_sales_until_date(transactions_df, '01-01-2014')

cbr = ContentBasedRecommender(feature_names)
cbr.fit(cumulative_sales, prod_features)


In [10]:
# Persist the fitted content-based model under a name that carries the
# 01-01-2014 cutoff used for training.
# NOTE(review): output location/format is decided by save_fitted_model — confirm.
cbr.save_fitted_model('cbr_model_01_01_2014')

## Collaborative filtering recommender

In [11]:
# Fit the collaborative-filtering (matrix factorization) recommender using
# the same '01-01-2014' cutoff as the content-based model.
# NOTE(review): the third positional argument (False) switches some option
# of get_sales_until_date — its meaning is defined in modeling_utils; confirm.
cf_settings = dict(
    factors=7,
    iterations=5,
    random_state=0,
    calculate_training_loss=True,
)

cumulative_sales = get_sales_until_date(transactions_df, '01-01-2014', False)
cfr = CFRecommender(**cf_settings)
cfr.fit(cumulative_sales)




HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




In [12]:
# Persist the fitted collaborative-filtering model under a name that carries
# the 01-01-2014 cutoff used for training.
# NOTE(review): output location/format is decided by save_fitted_model — confirm.
cfr.save_fitted_model('cfr_model_01_01_2014')