# Recommendation System : Boosting the K-Nearest-Neighborhood based incremental collaborative filtering

In [1]:
from operator import itemgetter
from scipy.stats import norm
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv( 'test_data.txt', sep = ',', header = None )
data.head(4)

Unnamed: 0,0,1,2
0,1,1,1
1,1,2,2
2,1,3,5
3,2,1,2


In [3]:
from BIKNN import BIKNN
model1 = BIKNN()
model1.fit(data)

<BIKNN.BIKNN at 0x11028be90>

In [4]:
model1.sim_w_

array([[ 1.        ,  0.09590002,  0.06762181],
       [ 0.09590002,  1.        ,  0.28254745],
       [ 0.06762181,  0.28254745,  1.        ]])

In [18]:
item_id = 1
user_id = 6
B1 = 10
B2 = 10
k = 2

In [24]:
def get( item1, item2 ):
    """returns the similarity score given two item ids"""
    sim = model1.sim_w_[ model1.item_id_dict[item1] ][ model1.item_id_dict[item2] ]
    return sim
# get(1,2)
 
def predict_rating( item_id, user_id ): 

    item, user = data[ data['item_id'] == item_id ], data[ data['user_id'] == user_id ]
    item_count, user_count = item['ratings'].count(), user['ratings'].count()
    item_ratings, user_ratings = item['ratings'].values, user['ratings'].values
    global_avg = data['ratings'].mean()
 
    item_bias = float( np.sum( item_ratings - global_avg ) ) / ( B1 + item_count )
    user_bias = float( np.sum( user_ratings - global_avg - item_bias ) ) / ( B2 + user_count )

    baseline  = global_avg + item_bias + user_bias
    
    numerator   = 0.
    denominator = 0.
    nearest = knearest_amongst_user_rated( item_id, user_id )
    
    for nearest_id, sim in nearest:
        nearest_rating = user[ user['item_id'] == nearest_id ]['ratings'].values[0]
        nearest_item_count = data[ data['item_id'] == nearest_id ]['ratings'].count()
        nearest_item_bias  = ( nearest_rating - global_avg ) / ( B1 + nearest_item_count )
        numerator += ( sim * ( nearest_rating - global_avg - user_bias - nearest_item_bias ) )
        denominator += sim

    if denominator > 0.:
        score = baseline + ( numerator / denominator )
    else:
        score = baseline
    return score

def knearest_amongst_user_rated( item_id, user_id ):
    user_rated = data[ data['user_id'] == user_id ]['item_id'].unique()
    similars = []
    for other_item_id in user_rated:
        if other_item_id != item_id:
            similarity = get( other_item_id, item_id )
            similars.append( ( other_item_id, similarity ) )

    similars_sorted = sorted( similars, key = itemgetter(1), reverse = True )
    return similars_sorted[0:k] 


In [46]:
dict1 = { 
    'user_id' : [6, 5],
    'item_id' : [1, 2],
    'ratings' : [5, 2]
}
test1 = pd.DataFrame( dict1, columns = [ 'user_id', 'item_id', 'ratings' ] )
test1

Unnamed: 0,user_id,item_id,ratings
0,6,1,5
1,5,2,2


In [74]:
test1.iloc[ 1:2 ]

Unnamed: 0,user_id,item_id,ratings
1,5,2,2


In [79]:
def get_common_users( data, item1, item2 ):
    item1_users  = data[ data['item_id'] == item1 ]['user_id'].unique()
    item2_users  = data[ data['item_id'] == item2 ]['user_id'].unique()
    common_users = set(item1_users).intersection(item2_users)
    return common_users

def update( data ):
    
    for index1, user_id1, item1, rating1 in test1.itertuples():
        
        # predict the rating and store the MAE (mean absolute error) 
        predicted = predict_rating( item_id = item1, user_id = user_id1 )
        print "MAE: ", abs(predicted - rating1)
        
        # update the F and G array
        other_user = data[ data['user_id'] == user_id1 ]
        data = pd.concat( [ data, test1.iloc[ index1:index1 + 1 ] ], ignore_index = True )
        print data
        
        for _, _, item2, rating2 in other_user.itertuples():
            print "F: ", rating1 * rating2
            print "G: ", rating1 ** 2 + rating2 ** 2
            
            sup1 = model1.sup_[ model1.item_id_dict[item1] ][ model1.item_id_dict[item2] ]
            print "old support: ", sup1
           
            common_users = get_common_users( data, item1, item2 )
            print "new support: ", len(common_users)

In [80]:
update(data)

MAE:  1.90981574816
    user_id  item_id  ratings
0         1        1        1
1         1        2        2
2         1        3        5
3         2        1        2
4         2        2        1
5         3        3        2
6         4        2        4
7         4        3        2
8         5        1        2
9         5        3        3
10        6        2        5
11        6        3        1
12        6        1        5
F:  25
G:  50
old support:  2
new support:  3
F:  5
G:  26
old support:  2
new support:  3
MAE:  0.874313114954
    user_id  item_id  ratings
0         1        1        1
1         1        2        2
2         1        3        5
3         2        1        2
4         2        2        1
5         3        3        2
6         4        2        4
7         4        3        2
8         5        1        2
9         5        3        3
10        6        2        5
11        6        3        1
12        6        1        5
13        5        2        

In [13]:
model1.mean

2.3333333333333335

In [14]:
model1.sup_

array([[1, 2, 2],
       [2, 1, 3],
       [2, 3, 1]])

In [None]:
# for every row of the test set e.g. ( 6, 1, 5 )
# obtain all the user's other rating
other_user = data[ data['user_id'] == 6 ]
other_user

In [None]:
for _, _, item2, rating2 in other_ratings.itertuples():
    print item2, rating2

In [None]:
for _, user_id, item_id, rating in test.iloc[ :5, 0:3 ].itertuples():
    print user_id, item_id, rating

In [None]:
train = pd.read_csv( 'data/u1.base', sep = '\t', header = None )
train = train.iloc[ :, 0:3 ]
test = pd.read_csv( 'data/u1.test', sep = '\t', header = None )
test = train.iloc[ :, 0:3 ]

In [None]:
# movie_lens = BIKNN( K = 500, B1 = 25, B2 = 25 )
# movie_lens.fit(train)

In [None]:
# movie_lens.predict_rating( 1, 4 )