# Recommendation System : Boosting the K-Nearest-Neighborhood based incremental collaborative filtering

In [1]:
from operator import itemgetter
from scipy.stats import norm
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv( 'test_data.txt', sep = ',', header = None )
data.head(4)

Unnamed: 0,0,1,2
0,1,1,1
1,1,2,2
2,1,3,5
3,2,1,2


In [3]:
from BIKNN import BIKNN
model1 = BIKNN()
model1.fit(data)

<BIKNN.BIKNN at 0x106d71b90>

In [4]:
model1.sim_w_

array([[ 1.        ,  0.09590002,  0.06762181],
       [ 0.09590002,  1.        ,  0.28254745],
       [ 0.06762181,  0.28254745,  1.        ]])

In [33]:
item_id = 1
user_id = 6
B1 = 25
B2 = 25

In [34]:
def get( item1, item2 ):
    """returns the similarity score given two item ids"""
    sim = model1.sim_w_[ model1.item_id_dict[item1] ][ model1.item_id_dict[item2] ]
    return sim
# get(1,2)
 
def predict_rating( item_id, user_id, k ): 

    item, user = data[ data['item_id'] == item_id ], data[ data['user_id'] == user_id ]
    item_count, user_count = item['ratings'].count(), user['ratings'].count()
    item_ratings, user_ratings = item['ratings'].values, user['ratings'].values
    global_avg = data['ratings'].mean()
 
    item_bias = float( np.sum( item_ratings - global_avg ) ) / ( B1 + item_count )
    user_bias = float( np.sum( user_ratings - global_avg ) ) / ( B2 + user_count )

    baseline  = global_avg + item_bias + user_bias
    
    score_numerator   = []
    score_denominator = []
    nearest = knearest_amongst_user_rated( item_id, user_id, k = 2 )
    
    for nearest_id, sim in nearest:
        nearest_rating = user[ user['item_id'] == nearest_id ]['ratings'].values[0]
        nearest_item_count = data[ data['item_id'] == nearest_id ]['ratings'].count()
        nearest_item_bias  = ( nearest_rating - global_avg ) / ( B1 + nearest_item_count )
        score_numerator.append( sim * ( nearest_rating - global_avg - user_bias - nearest_item_bias ) )
        score_denominator.append(sim)   

    score = baseline + sum(score_numerator) / sum(score_denominator)
    return score, item_bias, user_bias

def knearest_amongst_user_rated( item_id, user_id, k ):
    user_rated = data[ data['user_id'] == user_id ]['item_id'].unique()
    similars = []
    for other_item_id in user_rated:
        if other_item_id != item_id:
            similarity = get( other_item_id, item_id )
            similars.append( ( other_item_id, similarity ) )

    similars_sorted = sorted( similars, key = itemgetter(1), reverse = True )
    return similars_sorted[0:k] 


In [35]:
predict_rating( 1, 6, 2 )

(3.2266982110448534, -0.08928571428571429, 0.037037037037037035)

In [12]:
train = pd.read_csv( 'data/u1.base', sep = '\t', header = None )
train = train.iloc[ :, 0:3 ]

In [13]:
movie_lens = BIKNN( K = 500, B1 = 25, B2 = 25 )
movie_lens.fit(train)

<BIKNN.BIKNN at 0x106d71ad0>

In [19]:
movie_lens.predict_rating( 1, 4 )

4.4460863202859393

In [14]:
test = pd.read_csv( 'data/u1.test', sep = '\t', header = None )
test = train.iloc[ :, 0:3 ]

In [20]:
test.shape

(80000, 3)