### USER-USER 協同過濾：轉換為最相似的顧客族群(USER-USER Similarity Matrix)，查看他們經常購買的商品，推薦給目前鎖定的顧客。 
- https://medium.datadriveninvestor.com/how-to-build-a-recommendation-system-for-purchase-data-step-by-step-d6d7a78800b6

In [1]:
import pandas as pd
import numpy as np
import time
import turicreate as tc
from sklearn.model_selection import train_test_split

In [34]:
# Load the raw transactions and the product catalogue.
df = pd.read_csv('transaction_data_version3.csv')
product_df = pd.read_csv('product.csv')

# Attach each product's sub-commodity description to its transactions.
product_df1 = product_df.loc[:, ['PRODUCT_ID', 'SUB_COMMODITY_DESC']]
df_merge = pd.merge(df, product_df1, on='PRODUCT_ID')
df = df_merge.copy()

# Sum the purchased quantity per (household, sub-commodity) pair and give the
# result the column names used by the recommender cells.
group_keys = ['household_key', 'SUB_COMMODITY_DESC']
df_new = df.loc[:, group_keys + ['QUANTITY']]
df_group = df_new.groupby(group_keys, as_index=False).sum()
df_group = df_group.rename(columns={
    'household_key': 'customerId',
    'SUB_COMMODITY_DESC': 'productId',
    'QUANTITY': 'purchase_count',
})

In [13]:
# Work on a copy of the aggregated purchase counts; `df_group` is left untouched.
data = df_group.copy()

In [14]:
# Preview the first rows of the purchase-count table.
data.head()

Unnamed: 0,customerId,productId,purchase_count
0,1,ADULT ANALGESICS,3
1,1,ADULT CEREAL,2
2,1,AEROSOL TOPPINGS,1
3,1,AIR CARE - AEROSOLS,1
4,1,AIR CARE - CANDLES,6


In [15]:
def create_data_dummy(data):
    """Return a copy of ``data`` with a constant ``purchase_dummy`` column.

    Every row receives the value 1 in ``purchase_dummy``. The input frame
    itself is not modified.
    """
    return data.assign(purchase_dummy=1)

In [16]:
# Purchase table with an extra constant `purchase_dummy` column (always 1).
data_dummy = create_data_dummy(data)

In [17]:
# Customer x product matrix of purchase counts; pairs that never occur are NaN.
# No aggfunc is passed, so pandas' default aggregation applies; each
# (customerId, productId) pair should already be unique after the groupby above.
df_matrix = pd.pivot_table(data, values='purchase_count', index='customerId', columns='productId')

In [18]:
# Min-max scale every product column to the [0, 1] range.
# NOTE(review): when a product's max equals its min the denominator is 0 and the
# whole column becomes NaN, so those rows would be removed by the later
# `.dropna()` - confirm this loss is intended.
df_matrix_norm = (df_matrix-df_matrix.min())/(df_matrix.max()-df_matrix.min())

In [20]:
# Reshape the normalised customer x product matrix back into long format
# (one row per customer/product pair) to use as model input.
d = df_matrix_norm.reset_index() 
# NOTE(review): this only renames d's row index; the melted column names come
# from `id_vars` / `value_name` below - confirm whether this line is needed.
d.index.names = ['scaled_purchase_freq'] 
# Rows whose scaled value is NaN are dropped.
data_norm = pd.melt(d, id_vars=['customerId'], value_name='scaled_purchase_freq').dropna()
print(data_norm.shape)
data_norm.head()

(607269, 3)


Unnamed: 0,customerId,productId,scaled_purchase_freq
5184,185,*BOYS/GIRLS MISC TOYS,1.0
5370,371,*BOYS/GIRLS MISC TOYS,0.0
5499,500,*BOYS/GIRLS MISC TOYS,0.0
5540,541,*BOYS/GIRLS MISC TOYS,0.0
5613,614,*BOYS/GIRLS MISC TOYS,0.0


In [25]:
def split_data(data, test_size=.2, random_state=None):
    '''
    Split a dataset into a training set and a test set.

    Args:
        data (pandas.DataFrame): rows to split.
        test_size (float): value forwarded to ``train_test_split`` as
            ``test_size``. Defaults to 0.2, the value that was previously
            hard-coded.
        random_state (int or None): seed forwarded to ``train_test_split``.
            The default ``None`` keeps the previous unseeded behaviour; pass
            an integer to obtain the same split on every run.

    Returns:
        train_data (tc.SFrame)
        test_data (tc.SFrame)
    '''
    train, test = train_test_split(data, test_size=test_size, random_state=random_state)
    # The recommenders are fitted on SFrames, so convert both parts.
    return tc.SFrame(train), tc.SFrame(test)

In [26]:
# split_data is called without a seed here, so train_test_split draws from
# NumPy's global random state. Re-seeding before every call makes the splits
# reproducible across kernel restarts; `data` and `data_dummy` have the same
# rows in the same order, so they also receive the same train/test partition.
SPLIT_SEED = 42
np.random.seed(SPLIT_SEED)
train_data, test_data = split_data(data)
np.random.seed(SPLIT_SEED)
train_data_dummy, test_data_dummy = split_data(data_dummy)
np.random.seed(SPLIT_SEED)
train_data_norm, test_data_norm = split_data(data_norm)

In [49]:
# One row per distinct household key, in ascending order.
customers = (
    df[['household_key']]
    .dropna()
    .drop_duplicates()
    .sort_values('household_key')
    .reset_index(drop=True)
)
customers

Unnamed: 0,household_key
0,1
1,2
2,3
3,4
4,5
...,...
2495,2496
2496,2497
2497,2498
2498,2499


In [50]:
# Field names and shared settings used by the model cells below.
user_id = 'customerId'  # column holding the customer identifier
item_id = 'productId'  # column holding the product identifier
users_to_recommend = list(customers['household_key'])  # every household key collected in `customers`
n_rec = 10 # number of items to recommend
n_display = 30 # number of rows to print from a recommendation table

In [53]:
def model(train_data, name, user_id, item_id, target, users_to_recommend, n_rec, n_display):
    """Fit a Turi Create recommender, print its recommendations and return it.

    Args:
        train_data: training interactions passed to the recommender's ``create``.
        name (str): which recommender to build - ``'popularity'`` for
            ``tc.popularity_recommender``; ``'cosine'`` or ``'pearson'`` for
            ``tc.item_similarity_recommender`` with that ``similarity_type``.
        user_id (str): name of the user column.
        item_id (str): name of the item column.
        target (str): name of the target column.
        users_to_recommend: users passed to ``recommend``.
        n_rec (int): number of recommendations requested per user (``k``).
        n_display (int): number of recommendation rows to print.

    Returns:
        The fitted recommender.

    Raises:
        ValueError: if ``name`` is not one of the supported model names.
    """
    # `fitted` (not `model`) so the local does not shadow this function's name.
    if name == 'popularity':
        fitted = tc.popularity_recommender.create(train_data,
                                                  user_id=user_id,
                                                  item_id=item_id,
                                                  target=target)
    elif name in ('cosine', 'pearson'):
        # Both variants differ only in the similarity measure.
        fitted = tc.item_similarity_recommender.create(train_data,
                                                       user_id=user_id,
                                                       item_id=item_id,
                                                       target=target,
                                                       similarity_type=name)
    else:
        # Previously an unknown name fell through and failed later with an
        # unbound-local error; fail early with a clear message instead.
        raise ValueError(
            "Unknown model name {!r}; expected 'popularity', 'cosine' or 'pearson'".format(name)
        )

    recom = fitted.recommend(users=users_to_recommend, k=n_rec)
    recom.print_rows(n_display)
    return fitted

### Popularity Model as Baseline
- The popularity model recommends the most popular items, i.e. the products with the highest number of sales across customers.
- Training data is used for model selection

#### Using purchase count

In [54]:
# Popularity baseline fitted on the raw purchase counts of the training split.
name = 'popularity'
target = 'purchase_count'
popularity = model(train_data, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+--------------------------------+--------------------+------+
| customerId |           productId            |       score        | rank |
+------------+--------------------------------+--------------------+------+
|     1      | CANNED CAT FOOD (9 LIVES/FRISK | 52.81782945736434  |  1   |
|     1      |      BABY FOOD - BEGINNER      | 43.20054945054945  |  2   |
|     1      |     YOGURT NOT MULTI-PACKS     | 43.06858924395947  |  3   |
|     1      | CAN CATFD GOURMET/SUP PREM (GR | 38.58563535911602  |  4   |
|     1      | CAN DOG FOOD (SKIPPY/PEDIGREE/ | 35.24096385542169  |  5   |
|     1      |           BABY FOOD            | 30.11764705882353  |  6   |
|     1      | CAN DOGFD GOURMET/SUPER PREM ( | 29.291666666666668 |  7   |
|     1      | SFT DRNK 2 LITER BTL CARB INCL | 27.76474278544542  |  8   |
|     1      |   SOFT DRINK POWDER POUCHES    | 25.387024608501118 |  9   |
|     1      | CANDY BARS (SINGLES)(INCLUDING | 24.499020248203788 |  10  |
|     2     

#### Using purchase dummy

In [55]:
# Popularity baseline fitted on the constant purchase dummy of the training split.
name = 'popularity'
target = 'purchase_dummy'
pop_dummy = model(train_data_dummy, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+--------------------------------+-------+------+
| customerId |           productId            | score | rank |
+------------+--------------------------------+-------+------+
|     1      | SFT DRNK SNGL SRV BTL CARB (EX |  1.0  |  1   |
|     1      | APPLE JUICE & CIDER (OVER 50%  |  1.0  |  2   |
|     1      |       YOGURT MULTI-PACKS       |  1.0  |  3   |
|     1      |      DELI TRAY:SANDWICHES      |  1.0  |  4   |
|     1      |          WINGS (IQF)           |  1.0  |  5   |
|     1      |      SAL:SALSA/DPS-PRPCK       |  1.0  |  6   |
|     1      |         PEANUT BUTTER          |  1.0  |  7   |
|     1      |          DIECAST MINI          |  1.0  |  8   |
|     1      |             BEANS              |  1.0  |  9   |
|     1      |              MUMS              |  1.0  |  10  |
|     2      |             BUTTER             |  1.0  |  1   |
|     2      |       YOGURT MULTI-PACKS       |  1.0  |  2   |
|     2      |      DELI TRAY:SANDWICHES      |  1.0  |

#### Using scaled purchase count

In [56]:
# Popularity baseline fitted on the min-max scaled purchase counts of the training split.
name = 'popularity'
target = 'scaled_purchase_freq'
pop_norm = model(train_data_norm, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+--------------------------------+--------------------+------+
| customerId |           productId            |       score        | rank |
+------------+--------------------------------+--------------------+------+
|     1      | TEA (CANNED/BOTTLED) W/SWEETEN |        1.0         |  1   |
|     1      |  FOLIAGE PLANTS (5IN & DOWN)   |        1.0         |  2   |
|     1      |         POLY SHEETING          |        1.0         |  3   |
|     1      |        PATIO FURNITURE         |        1.0         |  4   |
|     1      |      HARD GOODS ALL OTHER      |        1.0         |  5   |
|     1      |         TASK LIGHTING          |        1.0         |  6   |
|     1      |          VITAMIN MISC          | 0.6666666666666666 |  7   |
|     1      |     DECORATOR ACCESSORIES      | 0.6666666666666666 |  8   |
|     1      | HALLOWEEN COSTUMES/MASKS/ACCES | 0.6666666666666666 |  9   |
|     1      |     TRUESCENTS/BAIN DELUXE     | 0.6666666666666666 |  10  |
|     2     

### Cosine similarity

#### Using purchase count

In [57]:
# Item-similarity recommender (cosine) fitted on the raw purchase counts.
name = 'cosine'
target = 'purchase_count'
cos = model(train_data, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+--------------------------------+---------------------+------+
| customerId |           productId            |        score        | rank |
+------------+--------------------------------+---------------------+------+
|     1      |        SHREDDED CHEESE         |  1.9370655469957427 |  1   |
|     1      |    POURABLE SALAD DRESSINGS    |  1.769642070264018  |  2   |
|     1      |      TORTILLA/NACHO CHIPS      |  1.656494073405665  |  3   |
|     1      |          SOUR CREAMS           |  1.5588818048065454 |  4   |
|     1      |   MARGARINE: TUBS AND BOWLS    |  1.5025010001291788 |  5   |
|     1      |          EGGS - LARGE          |  1.4333985647441008 |  6   |
|     1      |           MAINSTREAM           |  1.3014755690150324 |  7   |
|     1      |             PRIMAL             |  1.2999044924580578 |  8   |
|     1      | SOUP CRACKERS (SALTINE/OYSTER) |  1.1831314710793517 |  9   |
|     1      | FRZN GARLIC BREAD/TOAST/STICKS |  1.173970659136247  |  10  |

#### Using purchase dummy

In [58]:
# Item-similarity recommender (cosine) fitted on the constant purchase dummy.
name = 'cosine'
target = 'purchase_dummy'
cos_dummy = model(train_data_dummy, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+--------------------------------+---------------------+------+
| customerId |           productId            |        score        | rank |
+------------+--------------------------------+---------------------+------+
|     1      |       ALL FAMILY CEREAL        |  0.4026975781880608 |  1   |
|     1      |             PRIMAL             |  0.4000129593435184 |  2   |
|     1      |         TOILET TISSUE          |  0.3973616816738779 |  3   |
|     1      |      TORTILLA/NACHO CHIPS      |  0.396321201971335  |  4   |
|     1      |         PEANUT BUTTER          |  0.3846841560777768 |  5   |
|     1      |   MARGARINE: TUBS AND BOWLS    |  0.3752635904522829 |  6   |
|     1      | SFT DRNK 2 LITER BTL CARB INCL | 0.37197578438492707 |  7   |
|     1      |         EGGS - X-LARGE         |  0.3674825587013895 |  8   |
|     1      | CANDY BARS (SINGLES)(INCLUDING |  0.3648358253083488 |  9   |
|     1      |     PAPER TOWELS & HOLDERS     | 0.35660117217736653 |  10  |

#### Using scaled purchase count

In [59]:
# Item-similarity recommender (cosine) fitted on the min-max scaled purchase counts.
name = 'cosine' 
target = 'scaled_purchase_freq' 
cos_norm = model(train_data_norm, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+--------------------------------+-----------------------+------+
| customerId |           productId            |         score         | rank |
+------------+--------------------------------+-----------------------+------+
|     1      |       ALL FAMILY CEREAL        |  0.01328815338088245  |  1   |
|     1      |      TORTILLA/NACHO CHIPS      |  0.011770867962178177 |  2   |
|     1      |          EGGS - LARGE          |  0.011744011708391391 |  3   |
|     1      |   POTATOES RUSSET (BULK&BAG)   |  0.011592877589590182 |  4   |
|     1      |          SOUR CREAMS           |  0.011457141821946555 |  5   |
|     1      |             PRIMAL             |  0.01074170654382163  |  6   |
|     1      | CHEESE CRACKERS (CHEEZ-ITS/GOL |  0.008980888661330308 |  7   |
|     1      |         CARDS EVERYDAY         |  0.008696002931129642 |  8   |
|     1      |          STRAWBERRIES          |  0.008566525408892127 |  9   |
|     1      |     PAPER TOWELS & HOLDERS     |  0.0

### Pearson

#### Using purchase count

In [62]:
# Item-similarity recommender (Pearson) fitted on the raw purchase counts.
name = 'pearson'
target = 'purchase_count'
pear = model(train_data, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+--------------------------------+--------------------+------+
| customerId |           productId            |       score        | rank |
+------------+--------------------------------+--------------------+------+
|     1      | CANNED CAT FOOD (9 LIVES/FRISK | 52.817766933142295 |  1   |
|     1      |      BABY FOOD - BEGINNER      | 43.199588168632275 |  2   |
|     1      |     YOGURT NOT MULTI-PACKS     | 43.127979110880794 |  3   |
|     1      | CAN CATFD GOURMET/SUP PREM (GR | 38.57894915898671  |  4   |
|     1      | CAN DOG FOOD (SKIPPY/PEDIGREE/ | 35.24202177826183  |  5   |
|     1      |           BABY FOOD            | 30.119067123730773 |  6   |
|     1      | CAN DOGFD GOURMET/SUPER PREM ( | 29.286122514900494 |  7   |
|     1      | SFT DRNK 2 LITER BTL CARB INCL | 27.81336656576736  |  8   |
|     1      |   SOFT DRINK POWDER POUCHES    | 25.37379352140256  |  9   |
|     1      | CANDY BARS (SINGLES)(INCLUDING | 24.543146838121725 |  10  |
|     2     

#### Using purchase dummy

In [63]:
# Item-similarity recommender (Pearson) fitted on the constant purchase dummy.
name = 'pearson'
target = 'purchase_dummy'
pear_dummy = model(train_data_dummy, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+--------------------------------+-------+------+
| customerId |           productId            | score | rank |
+------------+--------------------------------+-------+------+
|     1      | SFT DRNK SNGL SRV BTL CARB (EX |  0.0  |  1   |
|     1      | APPLE JUICE & CIDER (OVER 50%  |  0.0  |  2   |
|     1      |       YOGURT MULTI-PACKS       |  0.0  |  3   |
|     1      |      DELI TRAY:SANDWICHES      |  0.0  |  4   |
|     1      |          WINGS (IQF)           |  0.0  |  5   |
|     1      |      SAL:SALSA/DPS-PRPCK       |  0.0  |  6   |
|     1      |         PEANUT BUTTER          |  0.0  |  7   |
|     1      |          DIECAST MINI          |  0.0  |  8   |
|     1      |             BEANS              |  0.0  |  9   |
|     1      |              MUMS              |  0.0  |  10  |
|     2      |             BUTTER             |  0.0  |  1   |
|     2      |       YOGURT MULTI-PACKS       |  0.0  |  2   |
|     2      |      DELI TRAY:SANDWICHES      |  0.0  |

#### Using scaled purchase count

In [64]:
# Item-similarity recommender (Pearson) fitted on the min-max scaled purchase counts.
name = 'pearson'
target = 'scaled_purchase_freq'
pear_norm = model(train_data_norm, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+--------------------------------+--------------------+------+
| customerId |           productId            |       score        | rank |
+------------+--------------------------------+--------------------+------+
|     1      | TEA (CANNED/BOTTLED) W/SWEETEN |        1.0         |  1   |
|     1      |  FOLIAGE PLANTS (5IN & DOWN)   |        1.0         |  2   |
|     1      |         POLY SHEETING          |        1.0         |  3   |
|     1      |        PATIO FURNITURE         |        1.0         |  4   |
|     1      |      HARD GOODS ALL OTHER      |        1.0         |  5   |
|     1      |         TASK LIGHTING          |        1.0         |  6   |
|     1      |     TRUESCENTS/BAIN DELUXE     | 0.6667159870872652 |  7   |
|     1      |          VITAMIN MISC          | 0.6666666666666666 |  8   |
|     1      | HALLOWEEN COSTUMES/MASKS/ACCES | 0.666656546718706  |  9   |
|     1      |     DECORATOR ACCESSORIES      | 0.6665965892919681 |  10  |
|     2     

### Model Evaluation

In [85]:
# Group the fitted models by the target column they were trained on, with a
# display name for each model in the same order.
models_w_counts = [popularity, cos, pear]
models_w_dummy = [pop_dummy, cos_dummy, pear_dummy]
models_w_norm = [pop_norm, cos_norm, pear_norm]
names_w_counts = ['Popularity Model on Purchase Counts', 'Cosine Similarity on Purchase Counts', 'Pearson Similarity on Purchase Counts']
names_w_dummy = ['Popularity Model on Purchase Dummy', 'Cosine Similarity on Purchase Dummy', 'Pearson Similarity on Purchase Dummy']
names_w_norm = ['Popularity Model on Scaled Purchase Counts', 'Cosine Similarity on Scaled Purchase Counts', 'Pearson Similarity on Scaled Purchase Counts']

In [86]:
# Compare each group of models on the test split that matches its target column.
eval_counts = tc.recommender.util.compare_models(test_data, models_w_counts, model_names=names_w_counts)
eval_dummy = tc.recommender.util.compare_models(test_data_dummy, models_w_dummy, model_names=names_w_dummy)
eval_norm = tc.recommender.util.compare_models(test_data_norm, models_w_norm, model_names=names_w_norm)

PROGRESS: Evaluate model Popularity Model on Purchase Counts



Precision and recall summary statistics by cutoff
+--------+----------------------+-----------------------+
| cutoff |    mean_precision    |      mean_recall      |
+--------+----------------------+-----------------------+
|   1    | 0.027254509018036072 | 0.0004891859504976197 |
|   2    | 0.04849699398797596  |  0.001924333037205202 |
|   3    |  0.0753507014028056  |  0.004899318797957038 |
|   4    | 0.08276553106212425  | 0.0076012345693128635 |
|   5    | 0.08697394789579156  |  0.011216196184111002 |
|   6    | 0.08623914495657985  |  0.012533387670661022 |
|   7    | 0.09928428285141713  |  0.016073969203899918 |
|   8    | 0.11202404809619243  |   0.0213457192179516  |
|   9    | 0.11440659095969705  |  0.025307473592445503 |
|   10   | 0.10989979959919846  |  0.027564569672184916 |
+--------+----------------------+-----------------------+
[10 rows x 3 columns]


Overall RMSE: 12.624688985731384

Per User RMSE (best)
+------------+-------------------+-------+
| customerId | 


Precision and recall summary statistics by cutoff
+--------+---------------------+----------------------+
| cutoff |    mean_precision   |     mean_recall      |
+--------+---------------------+----------------------+
|   1    |  0.6296593186372748 | 0.014538382007181617 |
|   2    |  0.5861723446893788 | 0.02633108688022113  |
|   3    |  0.5529726118904471 | 0.036813333176115105 |
|   4    |  0.5313627254509024 | 0.04641893105789371  |
|   5    |  0.507575150300601  | 0.05483853658985713  |
|   6    | 0.49038076152304616 | 0.06303666723265128  |
|   7    |  0.4763813340967649 | 0.07083913399894906  |
|   8    | 0.46362725450901815 | 0.07822471241364416  |
|   9    | 0.45268314406590976 | 0.08555311496522126  |
|   10   |  0.4419639278557117 | 0.09234053833116687  |
+--------+---------------------+----------------------+
[10 rows x 3 columns]


Overall RMSE: 14.426240950487239

Per User RMSE (best)
+------------+--------------------+-------+
| customerId |        rmse        | count 


Precision and recall summary statistics by cutoff
+--------+----------------------+-----------------------+
| cutoff |    mean_precision    |      mean_recall      |
+--------+----------------------+-----------------------+
|   1    | 0.028056112224448898 | 0.0005010615535555877 |
|   2    | 0.051903807615230474 | 0.0020115710647539246 |
|   3    | 0.08136272545090185  |  0.00513911866111127  |
|   4    | 0.08466933867735467  |  0.007763226938871088 |
|   5    | 0.08697394789579156  |  0.011216196184111007 |
|   6    | 0.08637274549098191  |  0.012555274951856842 |
|   7    | 0.09899799599198403  |  0.016043756040480462 |
|   8    | 0.11117234468937874  |  0.021167878695722408 |
|   9    | 0.11342685370741472  |  0.024950551676909346 |
|   10   | 0.10917835671342697  |  0.027187449804378844 |
+--------+----------------------+-----------------------+
[10 rows x 3 columns]


Overall RMSE: 12.597216237019296

Per User RMSE (best)
+------------+--------------------+-------+
| customerId |


Precision and recall summary statistics by cutoff
+--------+-----------------------+------------------------+
| cutoff |     mean_precision    |      mean_recall       |
+--------+-----------------------+------------------------+
|   1    | 0.0032038446135362435 |  6.93656767305255e-05  |
|   2    |  0.002803364036844213 | 0.00010727003142591445 |
|   3    |  0.003070351087972232 | 0.00022425898743060667 |
|   4    |  0.002603123748498197 | 0.0002620236032151623  |
|   5    |  0.002643171806167402 | 0.0003124000440518645  |
|   6    |  0.002469630222934188 | 0.0003490510331148452  |
|   7    |  0.002345671949196178 | 0.00036375271696211664 |
|   8    | 0.0023027633159791736 | 0.0003985957361495065  |
|   9    |  0.00258087482757086  | 0.00048029524407063604 |
|   10   |  0.00252302763315979  | 0.0005583635668327486  |
+--------+-----------------------+------------------------+
[10 rows x 3 columns]


Overall RMSE: 0.0

Per User RMSE (best)
+------------+------+-------+
| customerId | 


Precision and recall summary statistics by cutoff
+--------+---------------------+----------------------+
| cutoff |    mean_precision   |     mean_recall      |
+--------+---------------------+----------------------+
|   1    |  0.5722867440929117 | 0.01202029311205713  |
|   2    |  0.5540648778534241 | 0.024237003644818483 |
|   3    |  0.5402482979575489 | 0.03509438053589034  |
|   4    |  0.529835802963556  |  0.0456493101843391  |
|   5    |  0.5196635963155795 | 0.05569245035867948  |
|   6    |  0.5098785208917365 | 0.06497664387830857  |
|   7    | 0.49882716402540184 | 0.07410655183051232  |
|   8    | 0.48868642370844995 | 0.08335535604857926  |
|   9    |  0.4785297913051212 | 0.09233558361686288  |
|   10   | 0.46852222667200727 | 0.10084049085141432  |
+--------+---------------------+----------------------+
[10 rows x 3 columns]


Overall RMSE: 0.9389874190212619

Per User RMSE (best)
+------------+--------------------+-------+
| customerId |        rmse        | count 


Precision and recall summary statistics by cutoff
+--------+-----------------------+------------------------+
| cutoff |     mean_precision    |      mean_recall       |
+--------+-----------------------+------------------------+
|   1    | 0.0012014417300760907 | 1.973540040048517e-05  |
|   2    | 0.0020024028834601556 |  6.96810245037791e-05  |
|   3    | 0.0033373381391002487 | 0.0001875679047772527  |
|   4    |  0.003504205046055267 | 0.0002613636621126005  |
|   5    | 0.0032839407288746507 | 0.00029768745134659284 |
|   6    | 0.0030036043251902285 | 0.00033051210598157067 |
|   7    |  0.003375479146404258 | 0.0004119214983661506  |
|   8    |  0.003354024829795755 | 0.00046106526174254943 |
|   9    | 0.0032928402972455806 | 0.0005069749929348379  |
|   10   |  0.00348418101722066  | 0.0005769281631816204  |
+--------+-----------------------+------------------------+
[10 rows x 3 columns]


Overall RMSE: 1.0

Per User RMSE (best)
+------------+------+-------+
| customerId | 


Precision and recall summary statistics by cutoff
+--------+------------------------+------------------------+
| cutoff |     mean_precision     |      mean_recall       |
+--------+------------------------+------------------------+
|   1    |          0.0           |          0.0           |
|   2    | 0.0006004803843074457  | 1.6450553791993982e-05 |
|   3    | 0.0005337603416066185  | 2.116020386499356e-05  |
|   4    | 0.0005004003202562051  | 2.5925921200766947e-05 |
|   5    | 0.00040032025620496405 | 2.5925921200766933e-05 |
|   6    | 0.00046704029890579123 | 3.498230027362581e-05  |
|   7    | 0.0004575088642342445  | 3.849388146840615e-05  |
|   8    |  0.000500400320256205  | 5.013180672489753e-05  |
|   9    | 0.0004448002846721823  | 5.013180672489755e-05  |
|   10   | 0.0004403522818254603  | 5.5135809927459545e-05 |
+--------+------------------------+------------------------+
[10 rows x 3 columns]


Overall RMSE: 0.11307266699682494

Per User RMSE (best)
+------------+-


Precision and recall summary statistics by cutoff
+--------+---------------------+---------------------+
| cutoff |    mean_precision   |     mean_recall     |
+--------+---------------------+---------------------+
|   1    |  0.6309047237790232 | 0.01445847005708118 |
|   2    |  0.5816653322658123 | 0.02525194866314324 |
|   3    |  0.5528422738190557 | 0.03558912772200469 |
|   4    |  0.5282225780624501 | 0.04471075297917851 |
|   5    |  0.5120896717373901 | 0.05358667024998455 |
|   6    |  0.4957966373098477 | 0.06196729910156928 |
|   7    | 0.48330092645545025 | 0.07027787775941599 |
|   8    | 0.47252802241793435 |  0.0781753499045683 |
|   9    | 0.46014589449337245 | 0.08530315026090476 |
|   10   | 0.44883907125700573 | 0.09207012244311377 |
+--------+---------------------+---------------------+
[10 rows x 3 columns]


Overall RMSE: 0.1264016683645031

Per User RMSE (best)
+------------+------+-------+
| customerId | rmse | count |
+------------+------+-------+
|    1504 


Precision and recall summary statistics by cutoff
+--------+------------------------+------------------------+
| cutoff |     mean_precision     |      mean_recall       |
+--------+------------------------+------------------------+
|   1    |          0.0           |          0.0           |
|   2    | 0.0006004803843074464  | 1.8366152472974286e-05 |
|   3    | 0.0006672004270082732  | 2.7216770382267722e-05 |
|   4    |  0.000500400320256205  | 2.7216770382267712e-05 |
|   5    | 0.0005604483586869497  | 3.498230027362578e-05  |
|   6    | 0.00046704029890579134 | 3.4982300273625786e-05 |
|   7    | 0.00045750886423424436 |  3.77242198366735e-05  |
|   8    | 0.00040032025620496416 | 3.7724219836673476e-05 |
|   9    | 0.0003558402277377457  |  3.77242198366735e-05  |
|   10   | 0.0003602882305844675  | 4.662022553011715e-05  |
+--------+------------------------+------------------------+
[10 rows x 3 columns]


Overall RMSE: 0.11290983670448318

Per User RMSE (best)
+------------+-

In [132]:
# Final model: Pearson item similarity fitted on all of `data_norm` (no train/test split).
# `target` is deliberately not passed here (it was previously target='purchase_dummy').

final_model = tc.item_similarity_recommender.create(tc.SFrame(data_norm), 
                                            user_id=user_id, 
                                            item_id=item_id, 
                                            similarity_type="pearson")
# Recommend `n_rec` products for every customer and print the first `n_display` rows.
recom = final_model.recommend(users=users_to_recommend, k=n_rec)
recom.print_rows(n_display)

+------------+--------------------------------+--------------------+------+
| customerId |           productId            |       score        | rank |
+------------+--------------------------------+--------------------+------+
|     1      | SFT DRNK 2 LITER BTL CARB INCL | 0.8259414225941423 |  1   |
|     1      |     MAINSTREAM WHITE BREAD     | 0.8188284518828451 |  2   |
|     1      | CANDY BARS (SINGLES)(INCLUDING | 0.796652719665272  |  3   |
|     1      |      TORTILLA/NACHO CHIPS      | 0.7861924686192469 |  4   |
|     1      |             PRIMAL             | 0.7410041841004185 |  5   |
|     1      |         EGGS - X-LARGE         | 0.702928870292887  |  6   |
|     1      |          KIDS CEREAL           | 0.700836820083682  |  7   |
|     1      |      BEERALEMALT LIQUORS       | 0.6794979079497908 |  8   |
|     1      |     PAPER TOWELS & HOLDERS     | 0.6665271966527196 |  9   |
|     1      |     YOGURT NOT MULTI-PACKS     | 0.6598326359832636 |  10  |
|     2     

In [140]:
# Convert the recommendations to a pandas DataFrame for inspection.
df_rec = recom.to_dataframe()
print(df_rec.shape)
df_rec

(25000, 4)


Unnamed: 0,customerId,productId,score,rank
0,1,SFT DRNK 2 LITER BTL CARB INCL,0.825941,1
1,1,MAINSTREAM WHITE BREAD,0.818828,2
2,1,CANDY BARS (SINGLES)(INCLUDING,0.796653,3
3,1,TORTILLA/NACHO CHIPS,0.786192,4
4,1,PRIMAL,0.741004,5
...,...,...,...,...
24995,2500,GRAPES RED,0.548536,6
24996,2500,CANDY BARS (MULTI PACK),0.541004,7
24997,2500,SANDWICH COOKIES,0.511715,8
24998,2500,FRZN SS PREMIUM ENTREES/DNRS/T,0.499582,9


In [137]:
def create_output(model, users_to_recommend, n_rec, print_csv=True,
                  output_path='./option1_recommendation.csv'):
    """Build one row per customer listing their recommended products.

    Relies on the notebook-level ``user_id`` and ``item_id`` column names.

    Args:
        model: fitted recommender exposing ``recommend(users=..., k=...)``
            whose result has ``to_dataframe()``.
        users_to_recommend: users passed to ``model.recommend``.
        n_rec (int): number of recommendations requested per user (``k``).
        print_csv (bool): when true, write the result to ``output_path`` and
            print where it was written.
        output_path (str): CSV destination; defaults to the path that was
            previously hard-coded.

    Returns:
        pandas.DataFrame indexed by ``user_id`` (ascending) with a single
        ``recommendedProducts`` column: the user's recommended items joined
        with ``'|'`` in the order the model returned them.
    """
    recommendation = model.recommend(users=users_to_recommend, k=n_rec)
    df_rec = recommendation.to_dataframe()
    # Aggregate directly to one row per user. The groupby sorts by user, and
    # using `user_id` throughout avoids the earlier mix of the `user_id`
    # variable and a hard-coded 'customerId' column name.
    joined = (
        df_rec[item_id].astype(str)
        .groupby(df_rec[user_id])
        .agg('|'.join)
    )
    df_output = joined.rename('recommendedProducts').to_frame()
    if print_csv:
        df_output.to_csv(output_path)
        # Report the real destination; the old message named an 'output'
        # folder that the file was never written to.
        print("An output file has been written to '{}'".format(output_path))
    return df_output

In [142]:
# Build the per-customer output from `final_model` (this cell previously used `pear_norm`).

df_output = create_output(final_model, users_to_recommend, n_rec, print_csv=True)
print(df_output.shape)
df_output.head()

An output file can be found in 'output' folder with name 'option1_recommendation.csv'
(2500, 1)


Unnamed: 0_level_0,recommendedProducts
customerId,Unnamed: 1_level_1
1,SFT DRNK 2 LITER BTL CARB INCL|MAINSTREAM WHIT...
2,CANDY BARS (SINGLES)(INCLUDING|CONDENSED SOUP|...
3,PREMIUM|EGGS - LARGE|EGGS - X-LARGE|MAINSTREAM...
4,BANANAS|SFT DRNK 2 LITER BTL CARB INCL|DAIRY C...
5,BANANAS|SOFT DRINKS 12/18&15PK CAN CAR|POTATO ...


In [143]:
def customer_recomendation(customer_id):
    """Look up the recommendation row for one customer in ``df_output``.

    Returns the matching ``df_output`` row when ``customer_id`` is in its
    index. Otherwise prints 'Customer not found.' and returns ``customer_id``
    unchanged.
    """
    is_known = customer_id in df_output.index
    if is_known:
        return df_output.loc[customer_id]
    print('Customer not found.')
    return customer_id

In [146]:
# Example lookup: recommendations for customer 400.
customer_recomendation(400)

recommendedProducts    STICKS/ENROBED|FRZN SS PREMIUM ENTREES/DNRS/T|...
Name: 400, dtype: object