In [1]:
%load_ext autoreload
%autoreload 2

In [3]:
import pandas as pd
import numpy as np
import time
import turicreate as tc
from sklearn.model_selection import train_test_split
import sys
sys.path.append("..")

In [78]:
!pwd

/Users/liangkuang/GitHub/medium/items-recommender/notebooks


# Load the data

In [4]:
# two datasets in csv format
# ** recommend_1.csv
# ** trx_data.csv
customers = pd.read_csv("../data/recommend_1.csv")
transactions = pd.read_csv("../data/trx_data.csv")

In [5]:
# preview the data
print(customers.shape)
print(transactions.shape)

(1000, 1)
(62483, 2)


In [6]:
customers.head()

Unnamed: 0,customerId
0,1553
1,20400
2,19750
3,6334
4,27773


In [7]:
transactions.head()

Unnamed: 0,customerId,products
0,0,20
1,1,2|2|23|68|68|111|29|86|107|152
2,2,111|107|29|11|11|11|33|23
3,3,164|227
4,5,2|2


## Data preparation

In [16]:
# Create data with user, item and target field
data = pd.melt(transactions.set_index('customerId')['products'].str.split("|", n = -1, expand = True).reset_index(),
              id_vars = ['customerId'],
              value_name = 'products')\
        .dropna().drop(['variable'], axis = 1) \
        .groupby(['customerId', 'products']) \
        .agg({'products': 'count'}) \
        .rename(columns = {'products': 'purchase_count'}) \
        .reset_index() \
        .rename(columns = {'products': 'productId'})
data['productId'] = data['productId'].astype(np.int64)

In [18]:
print(data.shape)
data.head()

(133585, 3)


Unnamed: 0,customerId,productId,purchase_count
0,0,1,2
1,0,13,1
2,0,136,2
3,0,157,1
4,0,19,3


## Create dummy 
If one buys an item, then purchase_dummy are set to 1.

In [19]:
def create_data_dummy(data):
    data_dummy = data.copy()
    data_dummy['purchase_dummy'] = 1
    return data_dummy
data_dummy = create_data_dummy(data)

## Normalize item values across users
We normalize purchase frequency of each item across users by first creating a user-item matrix as follows

In [30]:
def normalize_data(data):
    df_matrix = pd.pivot_table(data, values = 'purchase_count',
                              index = 'customerId', columns = 'productId')
    df_matrix_norm = (df_matrix - df_matrix.min())/(df_matrix.max()-df_matrix.min())
    d = df_matrix_norm.reset_index()
    d.index.names = ['scaled_purchase_freq']
    return pd.melt(d, id_vars = ['customerId'],
                  value_name = 'scaled_purchase_freq').dropna()

In [24]:
df_matrix = pd.pivot_table(data, values = 'purchase_count', 
                          index = 'customerId', columns = 'productId')
print(df_matrix.shape)
df_matrix.head()

(24429, 300)


productId,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
customerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,,2.0,,,,,,,,,...,,,,,,,,,,
1,,,6.0,,,,,,,,...,,,,1.0,,,1.0,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,


In [26]:
df_matrix_norm = (df_matrix - df_matrix.min())/(df_matrix.max() - df_matrix.min())
df_matrix_norm.head()

productId,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
customerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,,0.1,,,,,,,,,...,,,,,,,,,,
1,,,0.166667,,,,,,,,...,,,,0.0,,,0.0,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,


In [28]:
# create a table for input to the modeling
d = df_matrix_norm.reset_index()
d.index.names = ['scaled_purchase_freq']
data_norm = pd.melt(d, id_vars=['customerId'], value_name='scaled_purchase_freq').dropna()
print(data_norm.shape)
data_norm.head()

(133585, 3)


Unnamed: 0,customerId,productId,scaled_purchase_freq
9,9,0,0.133333
25,25,0,0.133333
32,33,0,0.133333
35,36,0,0.133333
43,44,0,0.133333


# Split train and test set
We use 80:20 ratio for our train_test set size

In [31]:
def split_data(data):
    train, test = train_test_split(data, test_size = 0.2)
    train_data = tc.SFrame(train)
    test_data = tc.SFrame(test)
    return train_data, test_data

In [32]:
train_data, test_data = split_data(data)
train_data_dummy, test_data_dummy = split_data(data_dummy)
train_data_norm, test_data_norm = split_data(data_norm)

# Define models using Turicreate library

## Baseline model: most popular items

In [33]:
# constant variables to define filed names
user_id = 'customerId'
item_id = 'productId'
users_to_recommend = list(customers[user_id])
n_rec = 10
n_display = 30 # to display the first few rows in an output dataset

In [48]:
# Turicreate is a great library
def model(train_data, name, user_id, item_id, target,
         users_to_recommend, n_rec, n_display):
    if name == 'popularity':
        model = tc.popularity_recommender.create(train_data,
                                                user_id = user_id,
                                                item_id = item_id,
                                                target = target)
    elif name == 'cosine':
        model = tc.item_similarity_recommender.create(train_data,
                                                     user_id = user_id,
                                                     item_id = item_id,
                                                     target = target,
                                                     similarity_type = 'cosine')
    elif name == 'pearson':
        model = tc.item_similarity_recommender.create(train_data,
                                                     user_id = user_id,
                                                     item_id = item_id,
                                                     target = target,
                                                     similarity_type = 'pearson')
#     recom = model.recommend(users = users_to_recommend, k = n_rec)
    return model

# Popularity model as baseline

In [54]:
name = 'popularity'
target = 'purchase_count'
popularity_model = model(train_data, name, user_id, item_id, target,
                  users_to_recommend, n_rec, n_display)

In [57]:
popularity_model.recommend(users_to_recommend, k = n_rec).print_rows(30)

+------------+-----------+--------------------+------+
| customerId | productId |       score        | rank |
+------------+-----------+--------------------+------+
|    1553    |    132    | 3.1403508771929824 |  1   |
|    1553    |     37    | 3.022727272727273  |  2   |
|    1553    |     0     | 2.9611273080660836 |  3   |
|    1553    |     34    | 2.9243027888446216 |  4   |
|    1553    |    248    | 2.909090909090909  |  5   |
|    1553    |     3     | 2.8410041841004183 |  6   |
|    1553    |    230    | 2.746376811594203  |  7   |
|    1553    |    110    | 2.694610778443114  |  8   |
|    1553    |     27    | 2.671641791044776  |  9   |
|    1553    |     32    |       2.635        |  10  |
|   20400    |    132    | 3.1403508771929824 |  1   |
|   20400    |     37    | 3.022727272727273  |  2   |
|   20400    |     0     | 2.9611273080660836 |  3   |
|   20400    |     34    | 2.9243027888446216 |  4   |
|   20400    |    248    | 2.909090909090909  |  5   |
|   20400 

## use collaborative filter

In [58]:
name = 'cosine'
target = 'purchase_count'
cos = model(train_data, name, user_id, item_id, target,
           users_to_recommend, n_rec, n_display)

In [59]:
cos.recommend(users_to_recommend, k = n_rec).print_rows(n_display)

+------------+-----------+----------------------+------+
| customerId | productId |        score         | rank |
+------------+-----------+----------------------+------+
|    1553    |     2     | 0.13774506747722626  |  1   |
|    1553    |     1     | 0.07800829410552979  |  2   |
|    1553    |     5     | 0.07741294801235199  |  3   |
|    1553    |     35    | 0.06270940601825714  |  4   |
|    1553    |    167    | 0.05358444154262543  |  5   |
|    1553    |     17    | 0.04725289344787598  |  6   |
|    1553    |    148    | 0.046237096190452576 |  7   |
|    1553    |     31    | 0.04357762634754181  |  8   |
|    1553    |     13    | 0.04239402711391449  |  9   |
|    1553    |    152    |  0.0413091778755188  |  10  |
|   20400    |    280    | 0.08716535568237305  |  1   |
|   20400    |    122    | 0.04911607503890991  |  2   |
|   20400    |     74    | 0.04235422611236572  |  3   |
|   20400    |     1     | 0.037989258766174316 |  4   |
|   20400    |    284    | 0.03

In [61]:
name = 'pearson'
target = 'purchase_count'
pear = model(train_data, name, user_id, item_id, target,
           users_to_recommend, n_rec, n_display)
pear.recommend(users_to_recommend, k = n_rec).print_rows(n_display)

+------------+-----------+--------------------+------+
| customerId | productId |       score        | rank |
+------------+-----------+--------------------+------+
|    1553    |    132    | 3.1403508771929824 |  1   |
|    1553    |     37    |  3.02168523452499  |  2   |
|    1553    |     0     | 2.9589822561099046 |  3   |
|    1553    |    248    | 2.9080011871728026 |  4   |
|    1553    |     34    | 2.9008225225119944 |  5   |
|    1553    |     3     | 2.8410041841004188 |  6   |
|    1553    |    230    | 2.740934192270472  |  7   |
|    1553    |    110    | 2.6946107784431135 |  8   |
|    1553    |     27    | 2.6716417910447756 |  9   |
|    1553    |     32    | 2.635000000000001  |  10  |
|   20400    |    132    | 3.1297424425158584 |  1   |
|   20400    |     37    | 3.0227272727272756 |  2   |
|   20400    |     0     | 2.9592742592646593 |  3   |
|   20400    |     34    | 2.9243027888446207 |  4   |
|   20400    |    248    | 2.9090909090909087 |  5   |
|   20400 

In [62]:
models_w_counts = [popularity_model, cos, pear]

names_w_counts = ['Popularity Model on Purchase Counts', 'Cosine Similarity on Purchase Counts', 'Pearson Similarity on Purchase Counts']

# Evaluate the models

In [63]:
eval_counts = tc.recommender.util.compare_models(test_data,
                                                models_w_counts, model_names=names_w_counts)

PROGRESS: Evaluate model Popularity Model on Purchase Counts



Precision and recall summary statistics by cutoff
+--------+-----------------------+-----------------------+
| cutoff |     mean_precision    |      mean_recall      |
+--------+-----------------------+-----------------------+
|   1    | 0.0010849909584086847 | 0.0005271104222279648 |
|   2    |  0.002893309222423152 | 0.0028978256021655645 |
|   3    |  0.008896925858951181 |  0.01452505579612273  |
|   4    |  0.007703435804701618 |  0.01641552024427251  |
|   5    |  0.006336347197106737 |  0.016723537121909566 |
|   6    |  0.006690777576853542 |  0.021060228760590438 |
|   7    | 0.0060449496254197975 |   0.0223231237920208  |
|   8    |  0.005687160940325492 |  0.02392843867526623  |
|   9    |  0.005368696001607366 |  0.025243803639817493 |
|   10   |  0.005236889692585906 |  0.02783968314708945  |
+--------+-----------------------+-----------------------+
[10 rows x 3 columns]


Overall RMSE: 1.2190371185061946

Per User RMSE (best)
+------------+------+-------+
| customerId |


Precision and recall summary statistics by cutoff
+--------+----------------------+---------------------+
| cutoff |    mean_precision    |     mean_recall     |
+--------+----------------------+---------------------+
|   1    | 0.11522603978300278  | 0.06793969262440941 |
|   2    | 0.09634719710669101  | 0.11183088106542127 |
|   3    | 0.08081977094635319  | 0.13715332404338088 |
|   4    | 0.07052441229656382  | 0.15819889717946523 |
|   5    | 0.06343580470162737  | 0.17632855072864723 |
|   6    |  0.0573960216998189  | 0.19022280142470693 |
|   7    |  0.0526065616119867  | 0.20256732858731685 |
|   8    | 0.04868896925858949  | 0.21415273000526858 |
|   9    | 0.045473176612417135 | 0.22366008257355333 |
|   10   | 0.04302350813743229  | 0.23481439761756073 |
+--------+----------------------+---------------------+
[10 rows x 3 columns]


Overall RMSE: 1.9980972390960676

Per User RMSE (best)
+------------+---------------------+-------+
| customerId |         rmse        | coun


Precision and recall summary statistics by cutoff
+--------+-----------------------+-----------------------+
| cutoff |     mean_precision    |      mean_recall      |
+--------+-----------------------+-----------------------+
|   1    | 0.0011573236889692578 | 0.0005632767875082575 |
|   2    | 0.0029656419529837086 |  0.002952075150086007 |
|   3    |  0.00877637130801691  |  0.014211613963693534 |
|   4    |  0.007703435804701583 |  0.016427575699365884 |
|   5    |  0.006350813743218766 |  0.016759703487190004 |
|   6    | 0.0067028330319469755 |   0.0210963951258708  |
|   7    |  0.006055282872642705 |  0.022359290157301027 |
|   8    |  0.005714285714285736 |  0.02407310413638747  |
|   9    |  0.005400843881856538 |  0.025412580011125365 |
|   10   |  0.005273056057866194 |  0.02805668133877112  |
+--------+-----------------------+-----------------------+
[10 rows x 3 columns]


Overall RMSE: 1.216139435011238

Per User RMSE (best)
+------------+-----------------------+-------

# final model

In [67]:
final_model = tc.item_similarity_recommender.create(tc.SFrame(data_dummy), 
                                            user_id=user_id, 
                                            item_id=item_id, 
                                            target='purchase_dummy', similarity_type='cosine')
recom = final_model.recommend(users=users_to_recommend, k=n_rec)
recom.print_rows(n_display)


+------------+-----------+----------------------+------+
| customerId | productId |        score         | rank |
+------------+-----------+----------------------+------+
|    1553    |     2     | 0.12324784994125366  |  1   |
|    1553    |     35    | 0.10447167158126831  |  2   |
|    1553    |     1     | 0.10348175764083863  |  3   |
|    1553    |     5     |  0.0906752586364746  |  4   |
|    1553    |     17    | 0.07659814357757569  |  5   |
|    1553    |     21    | 0.07491707801818848  |  6   |
|    1553    |     8     | 0.06811234951019288  |  7   |
|    1553    |     33    |  0.0668614387512207  |  8   |
|    1553    |     47    | 0.06058878898620605  |  9   |
|    1553    |     61    | 0.060317397117614746 |  10  |
|   20400    |     26    | 0.05812269449234009  |  1   |
|   20400    |     6     | 0.05361741781234741  |  2   |
|   20400    |    113    | 0.05312788486480713  |  3   |
|   20400    |     1     | 0.05210459232330322  |  4   |
|   20400    |     15    | 0.04

In [69]:
recom.to_dataframe().head()

Unnamed: 0,customerId,productId,score,rank
0,1553,2,0.123248,1
1,1553,35,0.104472,2
2,1553,1,0.103482,3
3,1553,5,0.090675,4
4,1553,17,0.076598,5


# create a beautiful output

In [70]:
def create_output(model, users_to_recommend, n_rec, print_csv=True):
    recomendation = model.recommend(users=users_to_recommend, k=n_rec)
    df_rec = recomendation.to_dataframe()
    df_rec['recommendedProducts'] = df_rec.groupby([user_id])[item_id] \
        .transform(lambda x: '|'.join(x.astype(str)))
    df_output = df_rec[['customerId', 'recommendedProducts']].drop_duplicates() \
        .sort_values('customerId').set_index('customerId')
    if print_csv:
        df_output.to_csv('../output/option1_recommendation.csv')
        print("An output file can be found in 'output' folder with name 'option1_recommendation.csv'")
    return df_output

In [72]:
df_output = create_output(pear, users_to_recommend, n_rec, print_csv=True)
print(df_output.shape)
df_output.head()

An output file can be found in 'output' folder with name 'option1_recommendation.csv'
(1000, 1)


Unnamed: 0_level_0,recommendedProducts
customerId,Unnamed: 1_level_1
4,132|37|0|34|248|3|230|110|27|32
11,132|37|0|34|248|3|230|110|27|32
12,132|37|0|34|248|3|230|110|27|32
16,132|37|0|34|248|3|230|110|27|32
21,132|37|0|34|248|3|230|110|27|32
