In [1]:
import graphlab as gl

In [2]:
rawData = gl.SFrame.read_csv('train.csv',sep='|')

[INFO] GraphLab Create v1.8.2 started. Logging: C:\Users\Shetty\AppData\Local\Temp\graphlab_server_1455847120.log.0


------------------------------------------------------
Inferred types from first line of file as 
column_type_hints=[long,long,long]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


In [3]:
rawData.head()

user_id,product_id,purchase_count
98635,160841,1
98635,53757,2
98635,77040,1
98635,4073,1
98635,6597,1
98635,29220,1
98635,23804,1
98635,67146,1
98635,82698,1
98635,122831,1


In [4]:
rawData.sort('purchase_count',ascending=False).head()

user_id,product_id,purchase_count
14932,150240,923
69470,100951,646
19713,3935,597
72249,39156,537
67387,35250,535
107537,16945,500
24384,112329,478
108151,26185,476
3924,64555,463
47333,135390,454


We are going to ignore the `purchase_count` completely and just focus on `user_id` and `product_id` since we are using item based recommender system. Although `purchase_count` tells us how popular an item is, the goal of this recommender is to recommend items based on the product's presence in other user's basket and not on how many times it was purchased.

### Training & Test data

In [5]:
train_raw,test_data = gl.recommender.util.random_split_by_user(rawData,user_id='user_id',item_id='product_id')

# Table of product_id and count (total count of how many times the products appeared in the training set)
productId_count = train_raw.groupby(key_columns = 'product_id',operations={'count':gl.aggregate.COUNT()})

### Measuring accuracy via the metric 'precision and recall'

In [5]:
#An item based recommender system works best when data is not sparse. Lets experiment with the variable k, where k represents
# how many users purchased this product as a minimum. 

for k in range(0,101,10):
    print 100*'o'
    productId_count_trunc = productId_count[productId_count['count']>=k]
    train_data_trunc = train_raw.join(productId_count_trunc)    
    print 'Data set where users >= {}(k) have purchased a single product'.format(k) 
    print 'Shape of train data: {} (should vary with k)'.format(train_data_trunc.shape)
    print 'Shape of test data: {} (should remain constant)'.format(test_data.shape) 
    print 'Number of unique users: {} (should vary with k)'.format(len(train_data_trunc['user_id'].unique()))
    print 'Number of unique products: {} (should vary with k)'.format(len(train_data_trunc['product_id'].unique()))
    # Model Building
    model = gl.item_similarity_recommender.create(train_data_trunc,user_id='user_id',item_id='product_id',similarity_type='jaccard',
                                                  verbose=False)
    # Model evaluation
    model_pre_rec_eval = gl.recommender.util.compare_models(test_data,[model],metric='precision_recall',
                                                exclude_known_for_precision_recall=True,verbose=False)
    print model_pre_rec_eval[0]

oooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo
Data set where users >= 0(k) have purchased a single product
Shape of train data: (1338458, 4) (should vary with k)
Shape of test data: (2475, 3) (should remain constant)
Number of unique users: 110000 (should vary with k)
Number of unique products: 158208 (should vary with k)


{'precision_recall_overall': Columns:
	cutoff	int
	precision	float
	recall	float

Rows: 18

Data:
+--------+-----------------+------------------+
| cutoff |    precision    |      recall      |
+--------+-----------------+------------------+
|   1    | 0.0285035629454 | 0.00789409192022 |
|   2    | 0.0267220902613 |  0.016223654941  |
|   3    | 0.0269200316706 | 0.0267838291294  |
|   4    | 0.0296912114014 |  0.042201579761  |
|   5    | 0.0273159144893 | 0.0470841345248  |
|   6    | 0.0255344418052 | 0.0545762168684  |
|   7    | 0.0244316253817 | 0.0626767334012  |
|   8    | 0.0234560570071 | 0.0703483768804  |
|   9    | 0.0224333597255 | 0.0780794027825  |
|   10   | 0.0214964370546 | 0.0821136372205  |
+--------+-----------------+------------------+
[18 rows x 3 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns., 'precision_recall_by_user': Columns:
	user_id	int
	cutoff	int
	precision	float


{'precision_recall_overall': Columns:
	cutoff	int
	precision	float
	recall	float

Rows: 18

Data:
+--------+-----------------+-----------------+
| cutoff |    precision    |      recall     |
+--------+-----------------+-----------------+
|   1    | 0.0391923990499 | 0.0129877841873 |
|   2    | 0.0385985748219 | 0.0285855672435 |
|   3    | 0.0372129849565 | 0.0426954907062 |
|   4    | 0.0359263657957 | 0.0520345549146 |
|   5    | 0.0356294536817 | 0.0639963427968 |
|   6    | 0.0336500395883 | 0.0745598160087 |
|   7    | 0.0322361723787 | 0.0828747690684 |
|   8    | 0.0308788598575 | 0.0910709572824 |
|   9    | 0.0296912114014 | 0.0991021943219 |
|   10   | 0.0277909738717 |  0.103804716661 |
+--------+-----------------+-----------------+
[18 rows x 3 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns., 'precision_recall_by_user': Columns:
	user_id	int
	cutoff	int
	precision	float
	recall	float


{'precision_recall_overall': Columns:
	cutoff	int
	precision	float
	recall	float

Rows: 18

Data:
+--------+-----------------+-----------------+
| cutoff |    precision    |      recall     |
+--------+-----------------+-----------------+
|   1    | 0.0403800475059 |  0.013181012706 |
|   2    | 0.0362232779097 | 0.0291581834634 |
|   3    | 0.0368171021378 | 0.0409408089931 |
|   4    | 0.0338479809976 | 0.0486567536449 |
|   5    |  0.033729216152 | 0.0598173501083 |
|   6    | 0.0326603325416 | 0.0697389276071 |
|   7    | 0.0307091957923 | 0.0759952900095 |
|   8    | 0.0301365795724 | 0.0840562182605 |
|   9    | 0.0285035629454 | 0.0886663679419 |
|   10   | 0.0269596199525 | 0.0933095078582 |
+--------+-----------------+-----------------+
[18 rows x 3 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns., 'precision_recall_by_user': Columns:
	user_id	int
	cutoff	int
	precision	float
	recall	float


{'precision_recall_overall': Columns:
	cutoff	int
	precision	float
	recall	float

Rows: 18

Data:
+--------+-----------------+-----------------+
| cutoff |    precision    |      recall     |
+--------+-----------------+-----------------+
|   1    | 0.0344418052257 | 0.0120131583908 |
|   2    | 0.0308788598575 | 0.0249717226558 |
|   3    | 0.0336500395883 | 0.0388742064217 |
|   4    | 0.0317695961995 | 0.0473404432727 |
|   5    | 0.0332541567696 | 0.0620112007285 |
|   6    | 0.0322644497229 | 0.0729008059768 |
|   7    | 0.0296912114014 | 0.0772305386617 |
|   8    | 0.0280581947743 | 0.0818326764289 |
|   9    | 0.0263921879124 | 0.0848743760858 |
|   10   | 0.0251781472684 | 0.0896777542858 |
+--------+-----------------+-----------------+
[18 rows x 3 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns., 'precision_recall_by_user': Columns:
	user_id	int
	cutoff	int
	precision	float
	recall	float


{'precision_recall_overall': Columns:
	cutoff	int
	precision	float
	recall	float

Rows: 18

Data:
+--------+-----------------+-----------------+
| cutoff |    precision    |      recall     |
+--------+-----------------+-----------------+
|   1    | 0.0380047505938 | 0.0135241111488 |
|   2    | 0.0290973871734 | 0.0243034347547 |
|   3    | 0.0304829770388 | 0.0349841864391 |
|   4    | 0.0302850356295 | 0.0434391123524 |
|   5    | 0.0296912114014 | 0.0539728943529 |
|   6    | 0.0275138558987 | 0.0613476257182 |
|   7    | 0.0261282660333 | 0.0664281218913 |
|   8    | 0.0250890736342 | 0.0716867653328 |
|   9    | 0.0237529691211 | 0.0749641095247 |
|   10   | 0.0226840855107 | 0.0792509549042 |
+--------+-----------------+-----------------+
[18 rows x 3 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns., 'precision_recall_by_user': Columns:
	user_id	int
	cutoff	int
	precision	float
	recall	float


{'precision_recall_overall': Columns:
	cutoff	int
	precision	float
	recall	float

Rows: 18

Data:
+--------+-----------------+-----------------+
| cutoff |    precision    |      recall     |
+--------+-----------------+-----------------+
|   1    |  0.041567695962 | 0.0163942615843 |
|   2    | 0.0285035629454 | 0.0245607585869 |
|   3    | 0.0285035629454 | 0.0337701457951 |
|   4    | 0.0288004750594 | 0.0433203475068 |
|   5    | 0.0270783847981 | 0.0513501706791 |
|   6    | 0.0257323832146 | 0.0575815547609 |
|   7    | 0.0240922972514 | 0.0602089913254 |
|   8    | 0.0228622327791 |  0.064246996076 |
|   9    | 0.0220374769068 | 0.0685140473145 |
|   10   | 0.0214964370546 | 0.0743137306083 |
+--------+-----------------+-----------------+
[18 rows x 3 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns., 'precision_recall_by_user': Columns:
	user_id	int
	cutoff	int
	precision	float
	recall	float


{'precision_recall_overall': Columns:
	cutoff	int
	precision	float
	recall	float

Rows: 18

Data:
+--------+-----------------+-----------------+
| cutoff |    precision    |      recall     |
+--------+-----------------+-----------------+
|   1    | 0.0332541567696 |  0.014988877578 |
|   2    | 0.0267220902613 | 0.0223273196848 |
|   3    | 0.0253365003959 | 0.0297321410445 |
|   4    | 0.0252375296912 | 0.0364319864617 |
|   5    | 0.0249406175772 | 0.0459364731336 |
|   6    | 0.0243467933492 | 0.0523738105389 |
|   7    | 0.0222259925348 | 0.0543763177967 |
|   8    |  0.021229216152 |  0.058958661423 |
|   9    | 0.0209817893903 | 0.0661288533319 |
|   10   | 0.0200712589074 | 0.0703365221477 |
+--------+-----------------+-----------------+
[18 rows x 3 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns., 'precision_recall_by_user': Columns:
	user_id	int
	cutoff	int
	precision	float
	recall	float


{'precision_recall_overall': Columns:
	cutoff	int
	precision	float
	recall	float

Rows: 18

Data:
+--------+-----------------+-----------------+
| cutoff |    precision    |      recall     |
+--------+-----------------+-----------------+
|   1    | 0.0308788598575 | 0.0129104927799 |
|   2    | 0.0255344418052 | 0.0196551106587 |
|   3    | 0.0229612034838 | 0.0251333023008 |
|   4    | 0.0225653206651 | 0.0315626277918 |
|   5    | 0.0237529691211 |  0.042572883042 |
|   6    | 0.0235550277118 | 0.0498816339375 |
|   7    |  0.021377672209 | 0.0518016656081 |
|   8    |  0.020190023753 | 0.0555738633233 |
|   9    | 0.0191343362365 | 0.0598296036241 |
|   10   | 0.0179334916865 | 0.0625315038617 |
+--------+-----------------+-----------------+
[18 rows x 3 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns., 'precision_recall_by_user': Columns:
	user_id	int
	cutoff	int
	precision	float
	recall	float


{'precision_recall_overall': Columns:
	cutoff	int
	precision	float
	recall	float

Rows: 18

Data:
+--------+-----------------+-----------------+
| cutoff |    precision    |      recall     |
+--------+-----------------+-----------------+
|   1    | 0.0296912114014 | 0.0133063755985 |
|   2    | 0.0231591448931 | 0.0178010594578 |
|   3    | 0.0233570863025 | 0.0258920777032 |
|   4    |  0.021674584323 | 0.0319057262347 |
|   5    | 0.0220902612827 | 0.0404252187506 |
|   6    | 0.0225653206651 | 0.0484220516881 |
|   7    | 0.0203596878181 | 0.0496822786609 |
|   8    | 0.0187054631829 | 0.0528889294923 |
|   9    | 0.0184745315387 | 0.0592923340845 |
|   10   |  0.016864608076 | 0.0604799825406 |
+--------+-----------------+-----------------+
[18 rows x 3 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns., 'precision_recall_by_user': Columns:
	user_id	int
	cutoff	int
	precision	float
	recall	float


{'precision_recall_overall': Columns:
	cutoff	int
	precision	float
	recall	float

Rows: 18

Data:
+--------+-----------------+------------------+
| cutoff |    precision    |      recall      |
+--------+-----------------+------------------+
|   1    | 0.0249406175772 | 0.00808072239189 |
|   2    |  0.020190023753 | 0.0125688082042  |
|   3    | 0.0205859065717 | 0.0188057752488  |
|   4    |  0.020190023753 | 0.0254132480083  |
|   5    |  0.020190023753 | 0.0332102543801  |
|   6    |  0.020190023753 | 0.0397720120998  |
|   7    | 0.0186630471666 | 0.0435065066894  |
|   8    | 0.0173693586698 | 0.0461589215746  |
|   9    | 0.0164951174452 | 0.0509095153988  |
|   10   |  0.01567695962  | 0.0526528136683  |
+--------+-----------------+------------------+
[18 rows x 3 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns., 'precision_recall_by_user': Columns:
	user_id	int
	cutoff	int
	precision	float


{'precision_recall_overall': Columns:
	cutoff	int
	precision	float
	recall	float

Rows: 18

Data:
+--------+-----------------+------------------+
| cutoff |    precision    |      recall      |
+--------+-----------------+------------------+
|   1    | 0.0249406175772 | 0.00842570599103 |
|   2    |  0.020190023753 | 0.0128855144591  |
|   3    | 0.0209817893903 | 0.0202111592551  |
|   4    | 0.0192992874109 | 0.0245112007285  |
|   5    | 0.0190023752969 | 0.0328426489056  |
|   6    | 0.0195961995249 | 0.0409879379001  |
|   7    | 0.0183237190363 | 0.0437327254429  |
|   8    | 0.0167755344418 |  0.046889890922  |
|   9    | 0.0158353127474 | 0.0500965417533  |
|   10   | 0.0153206650831 | 0.0529384148446  |
+--------+-----------------+------------------+
[18 rows x 3 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns., 'precision_recall_by_user': Columns:
	user_id	int
	cutoff	int
	precision	float

From the above you can see that as k increases, the values in precision_recall table of the test data also increases before reaching a plateau and then drop. Accordingly based on the above output, we will set k = 50 i.e. we will build a recommendation model where at least 50 users have purchased a product.

### Building the recommendation model

In [6]:
k = 50
# Table of product_id and count (total count of how many times the products appeared in the original raw data set)
raw_productId_count = rawData.groupby(key_columns = 'product_id',operations={'count':gl.aggregate.COUNT()})
raw_productId_count_trunc = raw_productId_count[raw_productId_count['count']>=k]
rawData_trunc = rawData.join(raw_productId_count_trunc)   
final_model = gl.item_similarity_recommender.create(rawData_trunc,user_id='user_id',item_id='product_id',similarity_type='jaccard',
                                                  verbose=False)

### Creating the recommendation list

In [10]:
recommended_products = final_model.recommend(users=rawData['user_id'].unique(),k=1, verbose=False)

In [14]:
recommended_products.head()

user_id,product_id,score,rank
21855,91410,0.0283252231764,1
88004,153231,0.00828025477707,1
79732,10547,0.0191628529172,1
63664,38282,0.0254545454545,1
7899,51862,0.0429483476676,1
25263,132364,0.0175530672907,1
87629,14766,0.115994489909,1
30621,3263,0.0208543558695,1
43116,4886,0.0424757621556,1
82163,9855,0.0588791565237,1


In [16]:
del recommended_products['score']
del recommended_products['rank']

In [17]:
recommended_products = recommended_products.sort('user_id')

In [28]:
# Saving from pandas data frame to csv is easier.
recommended_products = recommended_products.to_dataframe()

In [30]:
recommended_products.shape

(110000, 2)

In [29]:
recommended_products.head()

Unnamed: 0,user_id,product_id
0,1,52928
1,2,82238
2,3,115771
3,4,102612
4,5,70312


In [32]:
recommended_products.to_csv('Full-Recommended-List.csv',index=False)