## Import modules
- **pandas** and **numpy** for data manipulation
- **turicreate** for building and evaluating the recommender models
- **sklearn** for splitting the data into train and test sets

In [32]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import turicreate as tc
%matplotlib inline


## Load Data

In [43]:
# Customers to generate recommendations for (their ids are used later as the
# target users of every model).
customers_df = pd.read_csv('recommend_1.csv')
# Raw purchase history: one row per transaction, with the purchased products
# stored as a '|'-separated string.
transactions_df = pd.read_csv('trx_data.csv')

In [44]:
# Sanity check: dimensions and first rows of the customers table.
print(customers_df.shape)
customers_df.head()

(1000, 1)


Unnamed: 0,customerId
0,1553
1,20400
2,19750
3,6334
4,27773


In [45]:
# Sanity check: dimensions and first rows of the raw transactions table.
print(transactions_df.shape)
transactions_df.head()

(62483, 2)


Unnamed: 0,customerId,products
0,0,20
1,1,2|2|23|68|68|111|29|86|107|152
2,2,111|107|29|11|11|11|33|23
3,3,164|227
4,5,2|2


## Create data with user, item, and target field

In [46]:
# Parse each '|'-separated product string into a list of integer product ids.
# Non-string values are passed through unchanged, so re-running this cell
# (when the column already holds lists) or encountering a missing value no
# longer raises AttributeError on `.split`.
transactions_df['products'] = transactions_df['products'].apply(
    lambda x: [int(i) for i in x.split('|')] if isinstance(x, str) else x
)

In [47]:
# Confirm that `products` now holds lists of integer product ids.
transactions_df.head()

Unnamed: 0,customerId,products
0,0,[20]
1,1,"[2, 2, 23, 68, 68, 111, 29, 86, 107, 152]"
2,2,"[111, 107, 29, 11, 11, 11, 33, 23]"
3,3,"[164, 227]"
4,5,"[2, 2]"


In [48]:
# Data preparation
# Build one row per (customerId, productId) pair with the number of times the
# customer bought that product.
#
# `explode` (pandas >= 0.25) turns each product list into one row per purchased
# item directly. This avoids `.apply(pd.Series)`, which constructs a Series per
# transaction and a wide NaN-padded float frame that then has to be melted.
purchases = (
    transactions_df[['customerId', 'products']]
    .explode('products')
    .dropna()                                   # empty baskets / missing values
    .rename(columns={'products': 'productId'})
    .astype({'productId': np.int64})
)
data = (
    purchases
    .groupby(['customerId', 'productId'])       # sorted by customer, then product
    .size()
    .reset_index(name='purchase_count')
)
data.head()

Unnamed: 0,customerId,productId,purchase_count
0,0,1,2
1,0,13,1
2,0,19,3
3,0,20,1
4,0,31,2


In [49]:
# Number of (customer, product) pairs and number of columns.
data.shape

(133585, 3)

In [50]:
# Preview the (customerId, productId, purchase_count) table used for modelling.
data.head()

Unnamed: 0,customerId,productId,purchase_count
0,0,1,2
1,0,13,1
2,0,19,3
3,0,20,1
4,0,31,2


## Train test split

In [60]:
# Hold out 20% of the (customer, product) rows for evaluation. A fixed
# random_state makes the split, and every metric computed from it,
# reproducible across kernel restarts.
train, test = train_test_split(data, test_size=.2, random_state=42)
print(train.shape, test.shape)



(106868, 3) (26717, 3)


In [61]:
# Convert both pandas splits to turicreate SFrames: the recommenders below are
# created from `train_data` and compared on `test_data`.

train_data = tc.SFrame(train)
test_data = tc.SFrame(test)

In [62]:
# Preview the training SFrame.
train_data.head()

customerId,productId,purchase_count
2488,5,1
8413,29,2
3336,196,1
2478,21,1
21799,135,1
19565,206,4
13155,14,1
8096,4,3
4712,10,3
4685,235,2


In [63]:
# Preview the held-out test SFrame.
test_data.head()

customerId,productId,purchase_count
25635,297,1
13169,218,1
1078,6,2
4862,41,3
19824,125,1
19492,121,2
10313,1,2
604,174,1
5117,285,1
18675,147,3


In [55]:
def model(train_data, name, user_id, item_id, target, users_to_recommend, n_rec, n_display):
    """Train a turicreate recommender, print sample recommendations, return the model.

    Parameters
    ----------
    train_data
        Training interactions, passed unchanged to the turicreate ``create`` call.
    name : str
        Which recommender to build: ``'popularity'`` (popularity recommender),
        ``'cosine'`` or ``'pearson'`` (item-similarity recommender with that
        similarity type).
    user_id, item_id, target : str
        Names of the user, item and target columns in ``train_data``.
    users_to_recommend
        Users passed to ``recommend(users=...)``.
    n_rec : int
        Number of recommendations requested per user (``k``).
    n_display : int
        Number of recommendation rows to print.

    Returns
    -------
    The fitted turicreate recommender.

    Raises
    ------
    ValueError
        If ``name`` is not one of the supported model names.
    """
    # The local is deliberately not called `model`, to avoid shadowing this function.
    if name == 'popularity':
        recommender = tc.popularity_recommender.create(train_data,
                                                       user_id=user_id,
                                                       item_id=item_id,
                                                       target=target)
    elif name in ('cosine', 'pearson'):
        # Both variants differ only in the similarity type, which equals `name`.
        recommender = tc.item_similarity_recommender.create(train_data,
                                                            user_id=user_id,
                                                            item_id=item_id,
                                                            target=target,
                                                            similarity_type=name)
    else:
        # Fail loudly instead of hitting an UnboundLocalError further down.
        raise ValueError(
            "Unknown model name %r; expected 'popularity', 'cosine' or 'pearson'" % (name,)
        )

    recom = recommender.recommend(users=users_to_recommend, k=n_rec)
    recom.print_rows(n_display)
    return recommender

## Popularity model

In [54]:
# Constants shared by every model call below: column names, target users and
# output sizes.
user_id = 'customerId'
item_id = 'productId'
# Recommend for the customers listed in `customers_df`.
users_to_recommend = list(customers_df[user_id])
n_rec = 10           # number of items to recommend
n_display = 30       # to display the first few rows in an output dataset

In [67]:
# `name` and `target` are set explicitly here: on a fresh kernel (Restart &
# Run All) they would otherwise be undefined at this point, because their only
# other assignments live in the later collaborative-filtering cells.
name = 'popularity'
target = 'purchase_count'
popularity_model = model(train_data, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+-----------+--------------------+------+
| customerId | productId |       score        | rank |
+------------+-----------+--------------------+------+
|    1553    |    132    | 3.132075471698114  |  1   |
|    1553    |     37    | 3.026869787005928  |  2   |
|    1553    |     0     |  2.99531301079582  |  3   |
|    1553    |     34    | 2.9865962935216497 |  4   |
|    1553    |    248    | 2.9756782460212703 |  5   |
|    1553    |     3     | 2.887990766087498  |  6   |
|    1553    |     27    | 2.7462686567164187 |  7   |
|    1553    |    230    | 2.663540114585618  |  8   |
|    1553    |     10    |  2.64264264264264  |  9   |
|    1553    |    110    | 2.6418561810392087 |  10  |
|   20400    |    132    | 3.132075471698114  |  1   |
|   20400    |     37    | 3.0275590551181106 |  2   |
|   20400    |     34    | 3.0075757575757573 |  3   |
|   20400    |     0     | 2.997274212225187  |  4   |
|   20400    |    248    | 2.9799999999999995 |  5   |
|   20400 

## Collaborative filtering (cosine similarity)

In [68]:
# Item-similarity recommender using cosine similarity, with purchase counts as
# the target column.
name = 'cosine'
target = 'purchase_count'
cos = model(train_data, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+-----------+----------------------+------+
| customerId | productId |        score         | rank |
+------------+-----------+----------------------+------+
|    1553    |     2     | 0.12802904844284058  |  1   |
|    1553    |     1     | 0.07729625701904297  |  2   |
|    1553    |     5     | 0.07147923111915588  |  3   |
|    1553    |     17    | 0.06917351484298706  |  4   |
|    1553    |     33    | 0.06127279996871948  |  5   |
|    1553    |     61    | 0.059559136629104614 |  6   |
|    1553    |     47    | 0.05598253011703491  |  7   |
|    1553    |     8     | 0.05152064561843872  |  8   |
|    1553    |     41    | 0.049761444330215454 |  9   |
|    1553    |    233    | 0.04658660292625427  |  10  |
|   20400    |    280    | 0.09378987550735474  |  1   |
|   20400    |    122    | 0.04982936382293701  |  2   |
|   20400    |     1     | 0.040702879428863525 |  3   |
|   20400    |     31    |  0.0369907021522522  |  4   |
|   20400    |    265    | 0.03

## Collaborative filtering (Pearson similarity)

In [69]:
# Item-similarity recommender using Pearson similarity, with purchase counts as
# the target column.
name = 'pearson'
target = 'purchase_count'
pear = model(train_data, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+-----------+--------------------+------+
| customerId | productId |       score        | rank |
+------------+-----------+--------------------+------+
|    1553    |    132    | 3.1320754716981143 |  1   |
|    1553    |     37    | 3.0268697870059262 |  2   |
|    1553    |     0     | 2.9953130107958246 |  3   |
|    1553    |     34    | 2.986596293521648  |  4   |
|    1553    |    248    | 2.975678246021271  |  5   |
|    1553    |     3     | 2.888655462184876  |  6   |
|    1553    |     27    | 2.7462686567164187 |  7   |
|    1553    |    230    | 2.6635401145856186 |  8   |
|    1553    |    110    | 2.6436781609195386 |  9   |
|    1553    |     10    | 2.642642642642643  |  10  |
|   20400    |    132    | 3.1320754716981143 |  1   |
|   20400    |     37    | 3.027559055118109  |  2   |
|   20400    |     34    | 3.0075757575757556 |  3   |
|   20400    |     0     | 2.9972742122251916 |  4   |
|   20400    |    248    | 2.9800000000000004 |  5   |
|   20400 

## Model Evaluation

#### i. RMSE (Root Mean Squared Errors)
- Measures the error of the predicted values
- The lower the RMSE, the better the predictions

#### ii. Recall
- What percentage of the products that a user buys are actually recommended?
- If a customer buys 5 products and the recommender showed 3 of them, then the recall is 0.6

#### iii. Precision
- Out of all the recommended items, how many did the user actually like?
- If 5 products were recommended to the customer and they buy 4 of them, then the precision is 0.8

### So our aim is to optimize both recall and precision (to be as close to 1 as possible).

In [70]:
# Models to compare and their display labels. Both lists are passed together to
# compare_models (as the models and `model_names`), so keep them in the same order.
models_w_counts = [popularity_model, cos, pear]
names_w_counts = ['Popularity Model on Purchase Counts', 'Cosine Similarity on Purchase Counts', 'Pearson Similarity on Purchase Counts']

Let's compare all the models we have built based on RMSE and precision-recall characteristics:

In [71]:
# Evaluate every model on the held-out test set. The printed report shows, per
# model, precision/recall by cutoff and the overall and per-user RMSE.
eval_counts = tc.recommender.util.compare_models(test_data, models_w_counts, model_names=names_w_counts)

PROGRESS: Evaluate model Popularity Model on Purchase Counts



Precision and recall summary statistics by cutoff
+--------+-----------------------+-----------------------+
| cutoff |     mean_precision    |      mean_recall      |
+--------+-----------------------+-----------------------+
|   1    | 0.0013712471131639716 | 0.0008371824480369517 |
|   2    |  0.00339203233256351  | 0.0035378013230568076 |
|   3    |  0.004041570438799063 |  0.006460541766365961 |
|   4    |  0.00737947459584296  |  0.01626057783160041  |
|   5    |  0.006177829099307202 |  0.017015708838968687 |
|   6    |  0.006507409545804493 |  0.021129424142782117 |
|   7    |  0.005887083470801723 |  0.02230220127904079  |
|   8    |  0.005566180715935327 |  0.02444835002594008  |
|   9    |  0.005396779574031374 |  0.026756100493332585 |
|   10   |  0.00530456120092379  |  0.02948424645726087  |
+--------+-----------------------+-----------------------+
[10 rows x 3 columns]


Overall RMSE: 1.0369712951742305

Per User RMSE (best)
+------------+-----------------------+------


Precision and recall summary statistics by cutoff
+--------+----------------------+---------------------+
| cutoff |    mean_precision    |     mean_recall     |
+--------+----------------------+---------------------+
|   1    | 0.11670034642032333  | 0.06809790693527619 |
|   2    | 0.09425519630484988  | 0.10792096530954597 |
|   3    | 0.07909930715935329  | 0.13423637464715402 |
|   4    |  0.0694644919168593  | 0.15586842002324514 |
|   5    | 0.062312355658198655 | 0.17292322426121956 |
|   6    | 0.05643764434180127  | 0.18692194011621346 |
|   7    | 0.05194242824150488  | 0.20013130821155176 |
|   8    |  0.0480928839491917  | 0.21123092263070914 |
|   9    | 0.04496247113163932  | 0.22120120880516705 |
|   10   | 0.042407621247113114 | 0.23100280730209594 |
+--------+----------------------+---------------------+
[10 rows x 3 columns]


Overall RMSE: 1.8819845023017845

Per User RMSE (best)
+------------+---------------------+-------+
| customerId |         rmse        | coun


Precision and recall summary statistics by cutoff
+--------+-----------------------+-----------------------+
| cutoff |     mean_precision    |      mean_recall      |
+--------+-----------------------+-----------------------+
|   1    | 0.0013712471131639718 | 0.0008371824480369508 |
|   2    |  0.00335594688221709  | 0.0034656304223639705 |
|   3    |  0.004041570438799083 | 0.0064605417663659605 |
|   4    |  0.007379474595842952 |  0.016260577831600346 |
|   5    |  0.006177829099307199 |  0.017015708838968704 |
|   6    |  0.006507409545804479 |  0.021129424142782086 |
|   7    |  0.005887083470801724 |  0.022302201279040743 |
|   8    |  0.005602266166281751 |  0.024580663343876936 |
|   9    |  0.005404798562997241 |  0.026792185943679013 |
|   10   | 0.0053045612009237995 |  0.029484246457260877 |
+--------+-----------------------+-----------------------+
[10 rows x 3 columns]


Overall RMSE: 1.0369648458587792

Per User RMSE (best)
+------------+------------------------+-----

## Final Output

In [72]:
# Refit the cosine item-similarity recommender on the full dataset (all rows of
# `data`, not just the training split), then produce the top `n_rec`
# recommendations for each target customer and print the first `n_display` rows.
final_model = tc.item_similarity_recommender.create(tc.SFrame(data), 
                                            user_id=user_id, 
                                            item_id=item_id, 
                                            target='purchase_count', similarity_type='cosine')
recom = final_model.recommend(users=users_to_recommend, k=n_rec)
recom.print_rows(n_display)

+------------+-----------+----------------------+------+
| customerId | productId |        score         | rank |
+------------+-----------+----------------------+------+
|    1553    |     2     | 0.14465173482894897  |  1   |
|    1553    |     5     | 0.09291791915893555  |  2   |
|    1553    |     1     | 0.09233839511871338  |  3   |
|    1553    |     35    | 0.08991665840148926  |  4   |
|    1553    |     17    | 0.07552688121795655  |  5   |
|    1553    |     33    |  0.0724305510520935  |  6   |
|    1553    |     61    | 0.06554533243179321  |  7   |
|    1553    |     21    | 0.056928062438964845 |  8   |
|    1553    |    167    |  0.0536272406578064  |  9   |
|    1553    |     15    | 0.05270164012908936  |  10  |
|   20400    |    280    | 0.07982701063156128  |  1   |
|   20400    |     1     | 0.05004173517227173  |  2   |
|   20400    |    122    | 0.04513084888458252  |  3   |
|   20400    |     6     | 0.043053507804870605 |  4   |
|   20400    |    160    | 0.04

## Output DataFrame

In [73]:
# Convert the recommendations to a pandas DataFrame, then check its dimensions
# and preview the first rows.
df_rec = recom.to_dataframe()
print(df_rec.shape)
df_rec.head()

(10000, 4)


Unnamed: 0,customerId,productId,score,rank
0,1553,2,0.144652,1
1,1553,5,0.092918,2
2,1553,1,0.092338,3
3,1553,35,0.089917,4
4,1553,17,0.075527,5
