In [1]:
# Import modules

%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import time
import turicreate as tc
from sklearn.model_selection import train_test_split

import sys
sys.path.append("..")

In [2]:
#Load data
data = pd.read_csv('AHG.csv')


In [3]:
# After loading the dataset, we should look at the content of each columns.

# Looking at the  file
print("\nDataFrame:")
print("shape : ", data.shape)
print(data.head())



DataFrame:
shape :  (60398, 3)
   CustomerID  ProductID  SalesOrderLineNumber
0       21768        310                     1
1       28389        346                     1
2       25863        346                     1
3       14501        336                     1
4       11003        346                     1


# Create Dummy

Dummy for marking whether a customer bought that item or not.
If one buys an item, then purchase_dummy are marked as 1

In [4]:
def create_data_dummy(data):
    data_dummy = data.copy()
    data_dummy['purchase_dummy'] = 1
    return data_dummy
data_dummy = create_data_dummy(data)

# Normalize item values across Customers

In [5]:
df_matrix = pd.pivot_table(data, values='SalesOrderLineNumber', 
                           index='CustomerID', columns='ProductID')

df_matrix

ProductID,214,217,222,225,228,231,234,237,310,311,...,594,595,596,597,598,599,600,604,605,606
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11000,4.0,,,,,,,,,,...,,,,,,,,,,
11001,,4.0,,6.0,,,,,,,...,,,,,,,,1.0,,
11002,,,2.0,,,,,,,,...,,,,,,,,,,
11003,,,,4.0,,,,,,,...,,,,,,,,,,
11004,3.0,2.0,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29479,,,,,,,,,,,...,,,,,,,,,,
29480,,4.0,,5.0,,,,,,,...,,,,,,,,,,
29481,,,,,,,,,,,...,,,,,,,,,,
29482,,,,,,,,,,,...,,,,,,,,,,


In [6]:
df_matrix_norm = (df_matrix-df_matrix.min())/(df_matrix.max()-df_matrix.min())

df_matrix_norm

ProductID,214,217,222,225,228,231,234,237,310,311,...,594,595,596,597,598,599,600,604,605,606
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11000,0.500000,,,,,,,,,,...,,,,,,,,,,
11001,,0.500000,,0.714286,,,,,,,...,,,,,,,,0.0,,
11002,,,0.2,,,,,,,,...,,,,,,,,,,
11003,,,,0.428571,,,,,,,...,,,,,,,,,,
11004,0.333333,0.166667,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29479,,,,,,,,,,,...,,,,,,,,,,
29480,,0.500000,,0.571429,,,,,,,...,,,,,,,,,,
29481,,,,,,,,,,,...,,,,,,,,,,
29482,,,,,,,,,,,...,,,,,,,,,,


In [7]:
# create a table for input to the modeling  
d = df_matrix_norm.reset_index() 
d.index.names = ['scaled_purchase_freq'] 
data_norm = pd.melt(d, id_vars=['CustomerID'], value_name='scaled_purchase_freq').dropna()
print(data_norm.shape)
data_norm.head()

(48140, 3)


Unnamed: 0,CustomerID,ProductID,scaled_purchase_freq
0,11000,214,0.5
4,11004,214,0.333333
7,11007,214,0.166667
8,11008,214,0.166667
19,11019,214,0.0


The above steps can be combined to a function defined below:

In [8]:
def normalize_data(data):
    df_matrix = pd.pivot_table(data, values='SalesOrderLineNumber', index='CustomerID', columns='ProductID')
    df_matrix_norm = (df_matrix-df_matrix.min())/(df_matrix.max()-df_matrix.min())
    d = df_matrix_norm.reset_index()
    d.index.names = ['scaled_purchase_freq']
    return pd.melt(d, id_vars=['CustomerID'], value_name='scaled_purchase_freq').dropna()

In this step, we have normalized their purchase history, from 0–1 (with 1 being the most number of purchase for an item and 0 being 0 purchase count for that item).

# Split train and test set

In [9]:
# We use 80:20 ratio for our train-test set size.
#Our training portion will be used to develop a predictive model, while the other to evaluate the model’s performance.

def split_data(data):
    '''
    Splits dataset into training and test set.
    
    Args:
        data (pandas.DataFrame)
        
    Returns
        train_data (tc.SFrame)
        test_data (tc.SFrame)
    '''
    train, test = train_test_split(data, test_size = .2)
    train_data = tc.SFrame(train)
    test_data = tc.SFrame(test)
    return train_data, test_data

In [10]:
#Now that we have three datasets with SalesOrderLineNumber, purchase dummy, and scaled purchase counts, 
#we would like to split each for modeling.

train_data, test_data = split_data(data)
train_data_dummy, test_data_dummy = split_data(data_dummy)
train_data_norm, test_data_norm = split_data(data_norm)

# Define Models using Turicreate library

In [11]:
# constant variables to define field names include:
user_id = 'CustomerID'
item_id = 'ProductID'
users_to_recommend = list(data[user_id])
n_rec = 10 # number of items to recommend
n_display = 30 # to display the first few rows in an output dataset

In [12]:
def model(train_data, name, user_id, item_id, target, users_to_recommend, n_rec, n_display):
    if name == 'popularity':
        model = tc.popularity_recommender.create(train_data, 
                                                    user_id=user_id, 
                                                    item_id=item_id, 
                                                    target=target)
    elif name == 'cosine':
        model = tc.item_similarity_recommender.create(train_data, 
                                                    user_id=user_id, 
                                                    item_id=item_id, 
                                                    target=target, 
                                                    similarity_type='cosine')
    elif name == 'pearson':
        model = tc.item_similarity_recommender.create(train_data, 
                                                    user_id=user_id, 
                                                    item_id=item_id, 
                                                    target=target, 
                                                    similarity_type='pearson')
        
    recom = model.recommend(users=users_to_recommend, k=n_rec)
    recom.print_rows(n_display)
    return model

# Using Popularity Model as Baseline

In [13]:
#Using SalesOrderLineNumber

name = 'popularity'
target = 'SalesOrderLineNumber'
popularity = model(train_data, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+-----------+--------------------+------+
| CustomerID | ProductID |       score        | rank |
+------------+-----------+--------------------+------+
|   21768    |    467    | 3.1763085399449036 |  1   |
|   21768    |    463    | 3.146529562982005  |  2   |
|   21768    |    465    | 3.0554156171284634 |  3   |
|   21768    |    486    | 3.051813471502591  |  4   |
|   21768    |    484    | 2.969401947148818  |  5   |
|   21768    |    471    | 2.891304347826087  |  6   |
|   21768    |    487    |  2.86218487394958  |  7   |
|   21768    |    473    | 2.8089171974522293 |  8   |
|   21768    |    228    | 2.789156626506024  |  9   |
|   21768    |    234    | 2.7591036414565826 |  10  |
|   28389    |    467    | 3.1763085399449036 |  1   |
|   28389    |    463    | 3.146529562982005  |  2   |
|   28389    |    465    | 3.0554156171284634 |  3   |
|   28389    |    486    | 3.051813471502591  |  4   |
|   28389    |    484    | 2.969401947148818  |  5   |
|   28389 

In [14]:
#Using purchase_dummy

name = 'popularity'
target = 'purchase_dummy'
pop_dummy = model(train_data_dummy, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+-----------+-------+------+
| CustomerID | ProductID | score | rank |
+------------+-----------+-------+------+
|   21768    |    490    |  1.0  |  1   |
|   21768    |    479    |  1.0  |  2   |
|   21768    |    225    |  1.0  |  3   |
|   21768    |    477    |  1.0  |  4   |
|   21768    |    528    |  1.0  |  5   |
|   21768    |    491    |  1.0  |  6   |
|   21768    |    465    |  1.0  |  7   |
|   21768    |    379    |  1.0  |  8   |
|   21768    |    478    |  1.0  |  9   |
|   21768    |    537    |  1.0  |  10  |
|   28389    |    310    |  1.0  |  1   |
|   28389    |    479    |  1.0  |  2   |
|   28389    |    225    |  1.0  |  3   |
|   28389    |    477    |  1.0  |  4   |
|   28389    |    528    |  1.0  |  5   |
|   28389    |    491    |  1.0  |  6   |
|   28389    |    465    |  1.0  |  7   |
|   28389    |    379    |  1.0  |  8   |
|   28389    |    478    |  1.0  |  9   |
|   28389    |    537    |  1.0  |  10  |
|   25863    |    355    |  1.0  |

In [15]:
#Using scaled_purchased_freq

name = 'popularity'
target = 'scaled_purchase_freq'
pop_norm = model(train_data_norm, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+-----------+---------------------+------+
| CustomerID | ProductID |        score        | rank |
+------------+-----------+---------------------+------+
|   21768    |    463    | 0.43480519480519514 |  1   |
|   21768    |    465    |  0.4060996563573881 |  2   |
|   21768    |    484    | 0.39768688293370846 |  3   |
|   21768    |    472    |  0.3946540880503145 |  4   |
|   21768    |    473    |  0.3714285714285717 |  5   |
|   21768    |    471    |  0.3661971830985916 |  6   |
|   21768    |    487    |  0.3655231560891922 |  7   |
|   21768    |    234    |  0.3579831932773123 |  8   |
|   21768    |    467    | 0.35208926875593527 |  9   |
|   21768    |    482    |  0.3509259259259266 |  10  |
|   28389    |    463    | 0.43480519480519514 |  1   |
|   28389    |    465    |  0.4060996563573881 |  2   |
|   28389    |    484    | 0.39768688293370846 |  3   |
|   28389    |    472    |  0.3946540880503145 |  4   |
|   28389    |    473    |  0.3714285714285717 |

# Using Cosine similarity

In [16]:
#Using SalesOrderLineNumber

name = 'cosine'
target = 'SalesOrderLineNumber'
cos = model(train_data, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+-----------+----------------------+------+
| CustomerID | ProductID |        score         | rank |
+------------+-----------+----------------------+------+
|   21768    |    376    | 0.06874269247055054  |  1   |
|   21768    |    357    | 0.05043560266494751  |  2   |
|   21768    |    359    | 0.04916238784790039  |  3   |
|   21768    |    374    | 0.044199585914611816 |  4   |
|   21768    |    478    | 0.04331541061401367  |  5   |
|   21768    |    477    | 0.04274839162826538  |  6   |
|   21768    |    361    | 0.04205602407455444  |  7   |
|   21768    |    479    | 0.04035496711730957  |  8   |
|   21768    |    363    | 0.03991955518722534  |  9   |
|   21768    |    234    | 0.03876250982284546  |  10  |
|   28389    |    564    | 0.07186996936798096  |  1   |
|   28389    |    561    | 0.05113101005554199  |  2   |
|   28389    |    578    | 0.04892462491989136  |  3   |
|   28389    |    567    | 0.042257726192474365 |  4   |
|   28389    |    361    | 0.04

In [17]:
#Using purchase dummy

name = 'cosine'
target = 'purchase_dummy'
cos_dummy = model(train_data_dummy, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+-----------+----------------------+------+
| CustomerID | ProductID |        score         | rank |
+------------+-----------+----------------------+------+
|   21768    |    376    |  0.0593295693397522  |  1   |
|   21768    |    589    | 0.05913126468658447  |  2   |
|   21768    |    374    | 0.05415302515029907  |  3   |
|   21768    |    355    | 0.046544015407562256 |  4   |
|   21768    |    479    |  0.0446280837059021  |  5   |
|   21768    |    363    | 0.042894721031188965 |  6   |
|   21768    |    477    | 0.042059242725372314 |  7   |
|   21768    |    378    | 0.03936249017715454  |  8   |
|   21768    |    359    | 0.03635001182556152  |  9   |
|   21768    |    478    | 0.03458744287490845  |  10  |
|   28389    |    561    | 0.06995230913162231  |  1   |
|   28389    |    578    | 0.06789422035217285  |  2   |
|   28389    |    564    | 0.06025433540344238  |  3   |
|   28389    |    353    | 0.05345863103866577  |  4   |
|   28389    |    359    | 0.05

In [18]:
#Using scaled_purchase_freq

name = 'cosine' 
target = 'scaled_purchase_freq' 
cos_norm = model(train_data_norm, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+-----------+----------------------+------+
| CustomerID | ProductID |        score         | rank |
+------------+-----------+----------------------+------+
|   21768    |    477    | 0.020104310512542724 |  1   |
|   21768    |    528    | 0.014780306816101074 |  2   |
|   21768    |    478    | 0.014596302509307862 |  3   |
|   21768    |    485    | 0.014309172630310058 |  4   |
|   21768    |    225    | 0.013090128898620606 |  5   |
|   21768    |    480    | 0.01250722885131836  |  6   |
|   21768    |    479    | 0.01226273775100708  |  7   |
|   21768    |    222    | 0.01191109538078308  |  8   |
|   21768    |    214    | 0.011861095428466797 |  9   |
|   21768    |    217    | 0.011659750938415528 |  10  |
|   28389    |    477    | 0.020104310512542724 |  1   |
|   28389    |    528    | 0.014780306816101074 |  2   |
|   28389    |    478    | 0.014596302509307862 |  3   |
|   28389    |    485    | 0.014309172630310058 |  4   |
|   28389    |    225    | 0.01

# Pearson similarity

In [19]:
#Using SalesOrderLineNumber

name = 'pearson'
target = 'SalesOrderLineNumber'
pear = model(train_data, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+-----------+--------------------+------+
| CustomerID | ProductID |       score        | rank |
+------------+-----------+--------------------+------+
|   21768    |    467    | 3.1560278710306684 |  1   |
|   21768    |    463    | 3.127483262110003  |  2   |
|   21768    |    486    | 3.0531033750059735 |  3   |
|   21768    |    465    | 3.050231807089916  |  4   |
|   21768    |    484    | 2.9639285220002103 |  5   |
|   21768    |    471    | 2.8834866300354816 |  6   |
|   21768    |    487    | 2.8418709167901146 |  7   |
|   21768    |    473    | 2.800920144782705  |  8   |
|   21768    |    228    | 2.7768221353115967 |  9   |
|   21768    |    234    | 2.7459514340707805 |  10  |
|   28389    |    467    | 3.1560278710306684 |  1   |
|   28389    |    463    | 3.127483262110003  |  2   |
|   28389    |    486    | 3.0531033750059735 |  3   |
|   28389    |    465    | 3.050231807089916  |  4   |
|   28389    |    484    | 2.9639285220002103 |  5   |
|   28389 

In [20]:
#Using purchase dummy

name = 'pearson'
target = 'purchase_dummy'
pear_dummy = model(train_data_dummy, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+-----------+-------+------+
| CustomerID | ProductID | score | rank |
+------------+-----------+-------+------+
|   21768    |    490    |  0.0  |  1   |
|   21768    |    479    |  0.0  |  2   |
|   21768    |    225    |  0.0  |  3   |
|   21768    |    477    |  0.0  |  4   |
|   21768    |    528    |  0.0  |  5   |
|   21768    |    491    |  0.0  |  6   |
|   21768    |    465    |  0.0  |  7   |
|   21768    |    379    |  0.0  |  8   |
|   21768    |    478    |  0.0  |  9   |
|   21768    |    537    |  0.0  |  10  |
|   28389    |    310    |  0.0  |  1   |
|   28389    |    479    |  0.0  |  2   |
|   28389    |    225    |  0.0  |  3   |
|   28389    |    477    |  0.0  |  4   |
|   28389    |    528    |  0.0  |  5   |
|   28389    |    491    |  0.0  |  6   |
|   28389    |    465    |  0.0  |  7   |
|   28389    |    379    |  0.0  |  8   |
|   28389    |    478    |  0.0  |  9   |
|   28389    |    537    |  0.0  |  10  |
|   25863    |    355    |  0.0  |

In [21]:
#Using scaled_purchase_freq

name = 'pearson'
target = 'scaled_purchase_freq'
pear_norm = model(train_data_norm, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+-----------+---------------------+------+
| CustomerID | ProductID |        score        | rank |
+------------+-----------+---------------------+------+
|   21768    |    463    |  0.4348051948051949 |  1   |
|   21768    |    465    |  0.4060996563573885 |  2   |
|   21768    |    484    | 0.39768688293370935 |  3   |
|   21768    |    472    |  0.3946540880503144 |  4   |
|   21768    |    473    |  0.3714285714285715 |  5   |
|   21768    |    471    |  0.3661971830985918 |  6   |
|   21768    |    487    | 0.36552315608919406 |  7   |
|   21768    |    234    |  0.3579831932773107 |  8   |
|   21768    |    467    | 0.35208926875593544 |  9   |
|   21768    |    482    |  0.3509259259259258 |  10  |
|   28389    |    463    |  0.4348051948051949 |  1   |
|   28389    |    465    |  0.4060996563573885 |  2   |
|   28389    |    484    | 0.39768688293370935 |  3   |
|   28389    |    472    |  0.3946540880503144 |  4   |
|   28389    |    473    |  0.3714285714285715 |

# Model Evaluation

We will use RMSE and Precision-recall to evalute this task.

In [22]:
#Let’s first create initial callable variables for model evaluation:

models_w_counts = [popularity, cos, pear]
models_w_dummy = [pop_dummy, cos_dummy, pear_dummy]
models_w_norm = [pop_norm, cos_norm, pear_norm]
names_w_counts = ['Popularity Model on SalesOrderLineNumber', 'Cosine Similarity on SalesOrderLineNumber', 'Pearson Similarity on SalesOrderLineNumber']
names_w_dummy = ['Popularity Model on Purchase Dummy', 'Cosine Similarity on Purchase Dummy', 'Pearson Similarity on Purchase Dummy']
names_w_norm = ['Popularity Model on Scaled Purchase Counts', 'Cosine Similarity on Scaled Purchase Counts', 'Pearson Similarity on Scaled Purchase Counts']

In [23]:
#Lets compare all the models we have built based on RMSE and precision-recall characteristics:

eval_counts = tc.recommender.util.compare_models(test_data, models_w_counts, model_names=names_w_counts)
eval_dummy = tc.recommender.util.compare_models(test_data_dummy, models_w_dummy, model_names=names_w_dummy)
eval_norm = tc.recommender.util.compare_models(test_data_norm, models_w_norm, model_names=names_w_norm)

PROGRESS: Evaluate model Popularity Model on SalesOrderLineNumber



Precision and recall summary statistics by cutoff
+--------+----------------------+-----------------------+
| cutoff |    mean_precision    |      mean_recall      |
+--------+----------------------+-----------------------+
|   1    | 0.00943825335455992  | 0.0058316649069535395 |
|   2    | 0.010291107573345462 |  0.01359199219622101  |
|   3    | 0.010764915472670724 |  0.021405189424995717 |
|   4    | 0.00977939504207415  |   0.0259478872699451  |
|   5    | 0.011735274050489039 |  0.040210280361653576 |
|   6    | 0.010878629368508863 |   0.0451663109885962  |
|   7    | 0.011078982423080667 |  0.05399998546361707  |
|   8    | 0.010419035706163291 |  0.058253900470602325 |
|   9    | 0.010512217926364044 |  0.06651336899843568  |
|   10   | 0.010541278144189196 |  0.07357775863048506  |
+--------+----------------------+-----------------------+
[10 rows x 3 columns]


Overall RMSE: 0.7483050295334681

Per User RMSE (best)
+------------+------+-------+
| CustomerID | rmse | count 


Precision and recall summary statistics by cutoff
+--------+---------------------+---------------------+
| cutoff |    mean_precision   |     mean_recall     |
+--------+---------------------+---------------------+
|   1    |  0.2607459631566976 | 0.20317285803646803 |
|   2    | 0.19291562428928807 |  0.2948056204721845 |
|   3    | 0.15048138882571446 | 0.34218592501404027 |
|   4    | 0.12858198771889937 |  0.3892606658103521 |
|   5    | 0.11394132362974749 | 0.43136596204011884 |
|   6    | 0.10300583731331968 | 0.46798653765887493 |
|   7    | 0.09322914974495566 | 0.49351519072284566 |
|   8    | 0.08586820559472368 |  0.5186839155618006 |
|   9    | 0.07986505951027223 |  0.541385541129017  |
|   10   | 0.07464180122811012 |  0.5598354484349627 |
+--------+---------------------+---------------------+
[10 rows x 3 columns]


Overall RMSE: 1.9665072267508719

Per User RMSE (best)
+------------+---------------------+-------+
| CustomerID |         rmse        | count |
+---------


Precision and recall summary statistics by cutoff
+--------+----------------------+----------------------+
| cutoff |    mean_precision    |     mean_recall      |
+--------+----------------------+----------------------+
|   1    | 0.00943825335455992  | 0.005831664906953542 |
|   2    | 0.010347964521264512 | 0.013603363585804825 |
|   3    | 0.010158441361534369 | 0.02011643193883091  |
|   4    | 0.009779395042074132 | 0.025947887269945104 |
|   5    | 0.011894473504662361 | 0.04069356441896538  |
|   6    | 0.010859677052535857 | 0.04510945404067721  |
|   7    | 0.01093277884271743  | 0.05303341734899345  |
|   8    | 0.01030532181032522  | 0.05747685551570883  |
|   9    | 0.010524852803679404 | 0.06661083805201122  |
|   10   | 0.010654992040027247 | 0.07406970660367031  |
+--------+----------------------+----------------------+
[10 rows x 3 columns]


Overall RMSE: 0.7368681427362532

Per User RMSE (best)
+------------+------+-------+
| CustomerID | rmse | count |
+-----------


Precision and recall summary statistics by cutoff
+--------+-----------------------+-----------------------+
| cutoff |     mean_precision    |      mean_recall      |
+--------+-----------------------+-----------------------+
|   1    |  0.002941842045711699 |  0.002130949686957837 |
|   2    |  0.002376103190767145 | 0.0034227200724145745 |
|   3    | 0.0026023987327449647 |  0.006032661989892135 |
|   4    | 0.0025175379045032794 |  0.007795881421136009 |
|   5    | 0.0029644715999094743 |  0.011416610092781174 |
|   6    | 0.0034698649769932953 |  0.015936863543788184 |
|   7    |  0.003442925031519729 |  0.01815644565135402  |
|   8    | 0.0034085766010409596 |  0.020541977823036905 |
|   9    | 0.0035452968243192187 |  0.02427585426567097  |
|   10   |  0.003598099117447404 |  0.027198838349551176 |
+--------+-----------------------+-----------------------+
[10 rows x 3 columns]


Overall RMSE: 0.0

Per User RMSE (best)
+------------+------+-------+
| CustomerID | rmse | count |


Precision and recall summary statistics by cutoff
+--------+---------------------+---------------------+
| cutoff |    mean_precision   |     mean_recall     |
+--------+---------------------+---------------------+
|   1    | 0.26069246435845245 | 0.20520080911239372 |
|   2    | 0.18641095270423172 |  0.2894226744861702 |
|   3    | 0.15052425133891534 |  0.3492576852801088 |
|   4    | 0.12624462548087795 |  0.3875422474991681 |
|   5    | 0.11172210907445128 |  0.4289230826003023 |
|   6    | 0.10049407859998497 |  0.4612884118851835 |
|   7    |  0.0913910710244718 | 0.48830432800496854 |
|   8    | 0.08331918986195967 |  0.5072643922297977 |
|   9    | 0.07660104095949309 |  0.5232729650102654 |
|   10   | 0.07081918986195976 |  0.5368755651709095 |
+--------+---------------------+---------------------+
[10 rows x 3 columns]


Overall RMSE: 0.9147649335032153

Per User RMSE (best)
+------------+--------------------+-------+
| CustomerID |        rmse        | count |
+-----------


Precision and recall summary statistics by cutoff
+--------+-----------------------+-----------------------+
| cutoff |     mean_precision    |      mean_recall      |
+--------+-----------------------+-----------------------+
|   1    |  0.002941842045711698 |  0.002130949686957833 |
|   2    |  0.002376103190767142 | 0.0034227200724145698 |
|   3    |  0.002602398732744963 |  0.006032661989892132 |
|   4    |  0.002517537904503282 | 0.0077958814211359995 |
|   5    | 0.0029644715999094773 |  0.011416610092781174 |
|   6    | 0.0034698649769932923 |  0.015936863543788187 |
|   7    |  0.003442925031519729 |  0.018156445651353985 |
|   8    |  0.003408576601040961 |   0.0205419778230369  |
|   9    |  0.003545296824319218 |  0.02427585426567096  |
|   10   | 0.0035980991174474025 |  0.027198838349551165 |
+--------+-----------------------+-----------------------+
[10 rows x 3 columns]


Overall RMSE: 1.0

Per User RMSE (best)
+------------+------+-------+
| CustomerID | rmse | count |


Precision and recall summary statistics by cutoff
+--------+----------------------+----------------------+
| cutoff |    mean_precision    |     mean_recall      |
+--------+----------------------+----------------------+
|   1    | 0.01278256189451023  | 0.009809254702957614 |
|   2    | 0.013388051668460715 | 0.019566984041553465 |
|   3    | 0.01614639397201286  | 0.03671259632306456  |
|   4    | 0.01365715823466093  | 0.04108798076101628  |
|   5    | 0.011948331539289642 | 0.04504982743007498  |
|   6    | 0.011078220308575538 | 0.05023012882942915  |
|   7    | 0.012052129786252506 | 0.06388216122473372  |
|   8    | 0.012042518837459637 | 0.07264809404206601  |
|   9    | 0.011990192560698613 | 0.08069470073641224  |
|   10   | 0.011719590958019353 | 0.08732027525757342  |
+--------+----------------------+----------------------+
[10 rows x 3 columns]


Overall RMSE: 0.19775583281588435

Per User RMSE (best)
+------------+------+-------+
| CustomerID | rmse | count |
+----------


Precision and recall summary statistics by cutoff
+--------+---------------------+---------------------+
| cutoff |    mean_precision   |     mean_recall     |
+--------+---------------------+---------------------+
|   1    |  0.2568622174381055 |  0.2073619867753342 |
|   2    | 0.19227664155005378 | 0.30709731192441087 |
|   3    | 0.15217976318622176 |  0.3596051822235891 |
|   4    |  0.1283301937567275 |  0.4000577724810767 |
|   5    | 0.11550053821313225 | 0.44762134356792616 |
|   6    | 0.10557947613921767 |  0.4884155496608406 |
|   7    | 0.09609026603106244 |  0.5168744233430717 |
|   8    | 0.08919200753498391 |  0.5473373400310958 |
|   9    | 0.08349778734601132 |  0.5766373853093446 |
|   10   | 0.07835037674919271 |  0.6015992618791327 |
+--------+---------------------+---------------------+
[10 rows x 3 columns]


Overall RMSE: 0.2976084799554144

Per User RMSE (best)
+------------+------+-------+
| CustomerID | rmse | count |
+------------+------+-------+
|   23586 


Precision and recall summary statistics by cutoff
+--------+----------------------+----------------------+
| cutoff |    mean_precision    |     mean_recall      |
+--------+----------------------+----------------------+
|   1    | 0.012782561894510222 | 0.009809254702957607 |
|   2    | 0.014128094725511305 | 0.020430367608112475 |
|   3    | 0.014576605669178299 | 0.03257508286773638  |
|   4    | 0.013858988159311091 | 0.041525278931091626 |
|   5    | 0.012351991388589948 | 0.04618231756283424  |
|   6    | 0.011930391101542889 | 0.05406489739778222  |
|   7    | 0.011783023220052327 | 0.062267521827532556 |
|   8    | 0.011958423035522065 |  0.0719080509850155  |
|   9    | 0.01187058964238739  | 0.07987616826421995  |
|   10   | 0.011652314316469327 | 0.08657574709108623  |
+--------+----------------------+----------------------+
[10 rows x 3 columns]


Overall RMSE: 0.194982552259827

Per User RMSE (best)
+------------+------+-------+
| CustomerID | rmse | count |
+------------

# Evaluation Summary

### Popularity v. Collaborative Filtering: 
We can see that the collaborative filtering algorithms work better than popularity model for SalesOrderLineNumber. Indeed, popularity model doesn’t give any personalizations as it only gives the same list of recommended items to every user.

### Precision and recall:
Looking at the summary above, we see that the precision and recall for SalesOrderLineNumber > Purchase Dummy > Normalized SalesOrderLineNumber. However, because the recommendation scores for the normalized purchase data is zero and constant, we choose the dummy. In fact, the RMSE isn’t much different between models on the dummy and those on the normalized data.

### RMSE: 
Since RMSE is higher using pearson distance than cosine, we would choose model the smaller mean squared errors, which in this case would be cosine.

# Final Output

In [24]:
final_model = tc.item_similarity_recommender.create(tc.SFrame(data_norm), 
                                            user_id=user_id, 
                                            item_id=item_id, 
                                            target= target,
                                                    similarity_type ='cosine')

recom = final_model.recommend(users=users_to_recommend, k=n_rec)
recom.print_rows(n_display)

+------------+-----------+----------------------+------+
| CustomerID | ProductID |        score         | rank |
+------------+-----------+----------------------+------+
|   21768    |    477    | 0.024141215085983277 |  1   |
|   21768    |    528    | 0.017641897201538085 |  2   |
|   21768    |    478    | 0.017101408243179322 |  3   |
|   21768    |    485    | 0.017007005214691163 |  4   |
|   21768    |    225    | 0.015925196409225465 |  5   |
|   21768    |    480    | 0.015025767087936402 |  6   |
|   21768    |    479    | 0.014914838075637817 |  7   |
|   21768    |    214    | 0.014421181678771973 |  8   |
|   21768    |    222    | 0.014229209423065185 |  9   |
|   21768    |    217    | 0.014191017150878907 |  10  |
|   28389    |    477    | 0.024141215085983277 |  1   |
|   28389    |    528    | 0.017641897201538085 |  2   |
|   28389    |    478    | 0.017101408243179322 |  3   |
|   28389    |    485    | 0.017007005214691163 |  4   |
|   28389    |    225    | 0.01

# CSV output file

In [25]:
#Here we want to manipulate our result to a csv output. Let’s see what we have:

df_rec = recom.to_dataframe()
print(df_rec.shape)
df_rec.head()

(603980, 4)


Unnamed: 0,CustomerID,ProductID,score,rank
0,21768,477,0.024141,1
1,21768,528,0.017642,2
2,21768,478,0.017101,3
3,21768,485,0.017007,4
4,21768,225,0.015925,5


In [38]:
import os
os.getcwd() 

'/home/enearow'

In [55]:
#Let’s define a function to create a desired output:
def create_output(model, users_to_recommend, n_rec, print_csv=True):
    recomendation = model.recommend(users=users_to_recommend, k=n_rec)
    df_rec = recomendation.to_dataframe()
    df_rec['recommendedProducts'] = df_rec.groupby([user_id])[item_id] \
        .transform(lambda x: '|'.join(x.astype(str)))
    df_output = df_rec[['CustomerID', 'recommendedProducts']].drop_duplicates() \
        .sort_values('CustomerID').set_index('CustomerID')
    if print_csv:
        df_output.to_csv('C:\\Users\\Hp\output\Recommendation.csv')
        print("An output file can be found in 'output' folder with name 'Recommendation.csv'")
    return df_output

In [56]:
#Lets print the output below and setprint_csv to true, this way we could literally print out our output file in csv,

df_output = create_output(pear_norm, users_to_recommend, n_rec, print_csv=True)
print(df_output.shape)
df_output.head()

An output file can be found in 'output' folder with name 'Recommendation.csv'
(18484, 1)


Unnamed: 0_level_0,recommendedProducts
CustomerID,Unnamed: 1_level_1
11000,463|465|484|472|473|487|471|234|467|482|463|46...
11001,463|465|484|472|473|487|471|234|467|482|463|46...
11002,463|465|484|472|473|471|487|234|482|467|463|46...
11003,463|465|484|472|473|487|471|234|467|482|463|46...
11004,463|465|484|472|473|471|487|234|467|482|463|46...


# Customer Recommendation Function

In [28]:
def customer_recomendation(CustomerID):
    if CustomerID not in df_output.index:
        print('Customer not found.')
        return CustomerID
    return df_output.loc[CustomerID]

In [52]:
customer_recomendation(11004)

recommendedProducts    463|465|484|472|473|471|487|234|467|482|463|46...
Name: 11004, dtype: object