# Category-Wise Popularity Based Recommender System

___

In [89]:
import numpy as np
import pandas as pd
from pandas import DataFrame as df

## Load Training and Test Set

In [41]:
ads_data = pd.read_csv('./Dataset/ads_data.csv')
userdata = pd.read_csv('./Dataset/user_data.csv')
user_messages = pd.read_csv('./Dataset/user_messages.csv')

In [42]:
test = pd.read_csv('./Dataset/user_messages_test.csv')

### 1.1 Exploring Ads Data

In [43]:
ads_data.sort_values('category_id',inplace = True)
#ads_data.head(10)

In [44]:
ads_data.drop('description',inplace = True, axis=1)
ads_data.drop('lat',inplace = True,axis =1)
ads_data.drop('long',inplace = True, axis = 1)

In [45]:
ads_data.head()

Unnamed: 0,ad_id,category_id,seller_id,creation_time,title,price,source,enabled
401718,2275225,362,522712,2017-04-05 00:38:25,Casco HTL nuevo,800.0,android,1
619697,863742,362,353414,2016-10-06 16:10:23,Volkswagen gol trend,135000.0,android,1
23115,411940,362,237803,2016-08-04 12:55:05,Siena 2010 Gnc,100000.0,android,0
92976,1436629,362,466535,2016-12-15 22:42:56,Palio Fire,100000.0,android,0
23111,407672,362,236959,2016-08-03 21:10:05,Repuesto Monoshock fz 16,400.0,android,0


In [46]:
print "Number of Ads:", len(ads_data)

Number of Ads: 645168


In [47]:
print "Number of Unique Ad_ID:", len(ads_data['ad_id'].unique())

Number of Unique Ad_ID: 645168


In [48]:
print "Number of Unique Category_ID:", len(ads_data['category_id'].unique())

Number of Unique Category_ID: 10


In [49]:
print "Number of Seller_ID:", len(ads_data['seller_id'].unique())

Number of Seller_ID: 108112


In [50]:
ads_data['price'].describe()

count    6.451660e+05
mean     5.291704e+03
std      9.827501e+05
min      0.000000e+00
25%      1.500000e+02
50%      3.400000e+02
75%      8.500000e+02
max      7.777778e+08
Name: price, dtype: float64

In [51]:
ads_data['source'].unique()  # NAN available

array(['android', 'web', 'apple', nan, 'i2', 'none'], dtype=object)

### Distribution of Each Category ID

In [52]:
category = ads_data['category_id'].unique()

In [53]:
cat_len = [len(ads_data[ads_data['category_id'] == i]) for i in category]

In [54]:
for i in range(len(category)):
    print "Category : ", category[i] ,"Number : " ,cat_len[i]

Category :  362 Number :  25804
Category :  800 Number :  72561
Category :  806 Number :  79110
Category :  811 Number :  11408
Category :  815 Number :  267871
Category :  853 Number :  70571
Category :  859 Number :  13754
Category :  881 Number :  19640
Category :  887 Number :  19813
Category :  888 Number :  64636


## 1.2 Exploring User Data

In [55]:
userdata.sort_values('user_id',inplace = True)
#userdata.head(10)

In [56]:
userdata.columns

Index([u'event_time', u'user_id', u'event', u'channel', u'user_lat',
       u'user_long', u'origin', u'ad_id', u'images_count', u'ad_impressions',
       u'ad_views', u'ad_messages'],
      dtype='object')

In [57]:
userdata.drop(u'user_lat',inplace = True,axis = 1)
userdata.drop(u'user_long', inplace = True, axis = 1)

In [162]:
userdata.head(10)

Unnamed: 0,event_time,user_id,event,channel,origin,ad_id,images_count,ad_impressions,ad_views,ad_messages
544306,2017-06-08 15:10:05,1,view,android,notification_center,2116768,3.0,1317.0,19.0,0.0
709950,2017-06-11 16:39:48,1,view,android,search,2784220,3.0,264.0,62.0,2.0
596723,2017-06-11 16:40:16,1,view,android,search,1232646,4.0,1090.0,224.0,7.0
90092,2017-06-11 16:43:24,1,view,android,search,337350,4.0,42.0,24.0,0.0
642116,2017-06-11 16:39:24,1,view,android,search,2325381,3.0,795.0,177.0,0.0
1064533,2017-06-09 16:29:41,1,view,android,home,2817385,4.0,1363.0,21.0,0.0
329740,2017-06-11 16:40:53,1,view,android,search,2510107,7.0,120.0,95.0,1.0
564486,2017-06-10 16:49:25,1,view,android,notification_center,2818118,2.0,310.0,1.0,0.0
600709,2017-06-11 16:38:54,1,view,android,search,2815681,4.0,1755.0,77.0,2.0
517409,2017-06-11 16:44:50,1,view,android,search,1564512,1.0,76.0,83.0,6.0


In [163]:
userdata.shape

(2143889, 10)

In [59]:
print "Number of User Data Transactions:", len(userdata)

Number of User Data Transactions: 2143889


In [60]:
print "Number of Unique Users:", len(userdata['user_id'].unique())

Number of Unique Users: 12365


In [61]:
userdata['event'].unique()

array(['view', 'first_message'], dtype=object)

In [62]:
userdata['channel'].unique()

array(['android', 'desktop', 'ios'], dtype=object)

In [63]:
userdata['origin'].unique() ## NAN Present

array(['notification_center', 'search', 'home', 'push', 'browse',
       'browse_search', 'drawer', nan, 'deeplink'], dtype=object)

In [64]:
print "Number of Unique Ad_ID: ", len(userdata['ad_id'].unique())

Number of Unique Ad_ID:  535651


In [65]:
userdata['images_count'].describe()

count    2.077478e+06
mean     2.955274e+00
std      1.798261e+00
min      0.000000e+00
25%      2.000000e+00
50%      3.000000e+00
75%      4.000000e+00
max      1.500000e+01
Name: images_count, dtype: float64

In [66]:
userdata['ad_impressions'].describe()

count    2.095669e+06
mean     1.745520e+03
std      2.097932e+04
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      2.130000e+02
max      6.940120e+05
Name: ad_impressions, dtype: float64

In [67]:
userdata['ad_views'].describe()

count    2.095669e+06
mean     6.360186e+01
std      2.624783e+02
min      0.000000e+00
25%      1.100000e+01
50%      2.600000e+01
75%      5.500000e+01
max      7.725000e+03
Name: ad_views, dtype: float64

In [68]:
userdata['ad_messages'].describe()

count    2.095669e+06
mean     1.940905e+00
std      6.053958e+00
min      0.000000e+00
25%      0.000000e+00
50%      1.000000e+00
75%      2.000000e+00
max      1.890000e+02
Name: ad_messages, dtype: float64

## 1.3 Exploring User Messages

In [69]:
user_messages.head(10)

Unnamed: 0,user_id,category_id,ad_id
0,1,859,1806476
1,3,800,2131700
2,3,800,2734107
3,3,800,2877209
4,3,800,2877209
5,4,815,2883211
6,7,815,2429412
7,7,815,2886810
8,7,815,2886804
9,7,362,2909301


In [70]:
print "Number of Users:", len(user_messages['user_id'].unique())

Number of Users: 7161


In [71]:
print " Number of Categories:", len(user_messages['category_id'].unique())

 Number of Categories: 10


In [72]:
user_messages['user_id'].unique()

array([    1,     3,     4, ..., 15056, 15061, 15066])

In [73]:
user_messages.shape

(21153, 3)

In [74]:
type(user_messages['user_id'][0])

numpy.int64

In [77]:
user_messages.sort_values('category_id',inplace=True)
user_messages.head(5)

Unnamed: 0,user_id,category_id,ad_id
12167,7597,362,1730320
19091,11938,362,2388804
11279,6452,362,1238103
11280,6452,362,2187575
11281,6452,362,2863136


## 1.4 Exploring Sample Prediction File

In [75]:
test.head()

Unnamed: 0,user_id,category_id
0,2,800
1,6,815
2,14,811
3,14,800
4,17,815


In [76]:
print "No. of Test Users:",len(test['user_id'])
print "No. of Unique Categories:", len(test['category_id'].unique())

No. of Test Users: 10507
No. of Unique Categories: 10


## 2.0 Naive Submission

Popular Recommendation

In [78]:
category

array([362, 800, 806, 811, 815, 853, 859, 881, 887, 888])

In [108]:
def result_entry(x):
    val = '['+str(x[0])+', '+str(x[1])+', '+str(x[2])+', '+str(x[3])+', '+\
              str(x[4])+', '+str(x[5])+', '+str(x[6])+', '+str(x[7])+', '+\
              str(x[7])+', '+str(x[8])+', '+str(x[9])+']'
    return val

In [109]:
pred = list()
for i in xrange(10): 
    df1 = user_messages[user_messages['category_id']== category[i]]
    ads_all = list(df1['ad_id'])
    ads_uni = df1['ad_id'].unique()

    add_freq = list()

    for i in xrange(len(ads_uni)):
        c = ads_all.count(ads_uni[i])
        add_freq.append(c)
    
    d = {'Frequency': add_freq,'ad_id': ads_uni}
    d1 = df(data = d)

    d1.sort_values('Frequency',ascending = False,inplace = True)

    d1 = d1.head(10)

    ads_pred = list(d1['ad_id'])
    ads_entry = result_entry(ads_pred)
    
    pred.append(ads_entry)

In [110]:
pred

['[2406697, 2853312, 2598159, 2437215, 2137557, 2839263, 2705180, 2574889, 2574889, 2873173, 2900594]',
 '[2891500, 2898379, 2911000, 2898476, 2031170, 2869844, 2877849, 2874589, 2874589, 2887420, 2732959]',
 '[1720784, 2886467, 2895717, 2835598, 2896045, 2825149, 2208312, 2742600, 2742600, 2870851, 2917387]',
 '[2624021, 2753896, 2881080, 2899362, 2869986, 2370688, 2899968, 2586092, 2586092, 2861011, 2887602]',
 '[2882036, 2827310, 2888041, 2892684, 2831841, 2897623, 2885081, 2880011, 2880011, 2920781, 2887862]',
 '[2871638, 2893130, 1998679, 2874121, 2535492, 2892871, 2438365, 724136, 724136, 2841165, 272573]',
 '[2567368, 2880807, 2270156, 2721400, 1149112, 2799525, 2848893, 2303094, 2303094, 2883995, 1998464]',
 '[2658874, 2519991, 2880278, 2648807, 2662331, 2794828, 2889904, 395414, 395414, 2119965, 1700433]',
 '[2870873, 2677927, 2906282, 2851216, 2512096, 854802, 2873966, 2887773, 2887773, 2041846, 2238881]',
 '[2882472, 2873541, 2246591, 2880088, 2895915, 2839650, 2895649, 2117

In [113]:
category = list(category)
pred_test = list()
for i in xrange(len(test)):
    ind = category.index(test['category_id'][i])
    pred_test.append(pred[ind])

In [115]:
len(pred_test)

10507

In [116]:
len(test)

10507

In [153]:
sol = df({'user_id': list(test['user_id']), 'category_id': list(test['category_id']),'ads':(pred_test)})  
sol.head(5)

Unnamed: 0,ads,category_id,user_id
0,"[2891500, 2898379, 2911000, 2898476, 2031170, ...",800,2
1,"[2882036, 2827310, 2888041, 2892684, 2831841, ...",815,6
2,"[2624021, 2753896, 2881080, 2899362, 2869986, ...",811,14
3,"[2891500, 2898379, 2911000, 2898476, 2031170, ...",800,14
4,"[2882036, 2827310, 2888041, 2892684, 2831841, ...",815,17
5,"[2624021, 2753896, 2881080, 2899362, 2869986, ...",811,20
6,"[1720784, 2886467, 2895717, 2835598, 2896045, ...",806,20
7,"[2891500, 2898379, 2911000, 2898476, 2031170, ...",800,20
8,"[2891500, 2898379, 2911000, 2898476, 2031170, ...",800,24
9,"[2882472, 2873541, 2246591, 2880088, 2895915, ...",888,24


In [154]:
cols = sol.columns.tolist()
cols = cols[2:3]+cols[1:2]+cols[0:1] 
sol = sol[cols]

In [155]:
sol.head(5)

Unnamed: 0,user_id,category_id,ads
0,2,800,"[2891500, 2898379, 2911000, 2898476, 2031170, ..."
1,6,815,"[2882036, 2827310, 2888041, 2892684, 2831841, ..."
2,14,811,"[2624021, 2753896, 2881080, 2899362, 2869986, ..."
3,14,800,"[2891500, 2898379, 2911000, 2898476, 2031170, ..."
4,17,815,"[2882036, 2827310, 2888041, 2892684, 2831841, ..."
5,20,811,"[2624021, 2753896, 2881080, 2899362, 2869986, ..."
6,20,806,"[1720784, 2886467, 2895717, 2835598, 2896045, ..."
7,20,800,"[2891500, 2898379, 2911000, 2898476, 2031170, ..."
8,24,800,"[2891500, 2898379, 2911000, 2898476, 2031170, ..."
9,24,888,"[2882472, 2873541, 2246591, 2880088, 2895915, ..."


In [156]:
sol.to_csv("01-Naive-Category_ID-.csv")

In [157]:
sol.columns

Index([u'user_id', u'category_id', u'ads'], dtype='object')