# Recommendation System : Boosting the K-Nearest-Neighborhood based incremental collaborative filtering

In [1]:
from operator import itemgetter
from scipy.stats import norm
import pandas as pd
import numpy as np

In [2]:
# load the raw ratings; the file has no header row ( header = None ),
# so pandas labels the columns with the integers 0, 1, 2
data = pd.read_csv('test_data.txt', sep=',', header=None)
data.head(4)

Unnamed: 0,0,1,2
0,1,1,1
1,1,2,2
2,1,3,5
3,2,1,2


In [3]:
# BIKNN comes from a project-local module that is not shown here;
# build an instance and call its fit method on the ratings frame
# (the value displayed under the cell is whatever fit returns)
from BIKNN import BIKNN
model1 = BIKNN()
model1.fit(data)

<BIKNN.BIKNN at 0x110171b50>

In [4]:
# sample item / user ids for ad-hoc calls; the functions below take
# their own item_id / user_id arguments
item_id, user_id = 1, 6

# B1 / B2 are added to the rating counts in the denominators of the
# item bias / user bias computed by predict_rating
B1 = B2 = 10

# number of most-similar items kept by knearest_amongst_user_rated
k = 2

In [5]:
def get( item1, item2 ):
    """look up the score stored in model1.sim_w_ for a pair of item ids"""
    # translate the raw item ids into the positions used by the model's arrays
    row = model1.item_id_dict[item1]
    col = model1.item_id_dict[item2]
    return model1.sim_w_[row][col]
# get(1,2)
 
def predict_rating( item_id, user_id ):
    """
    Predict the rating `user_id` would give `item_id`.

    The prediction is a baseline ( global average + item bias + user bias )
    plus, when the user has rated other items, the similarity-weighted
    average of how far the user's ratings of the most similar of those
    items sit from their own baselines.

    Reads the module-level `data`, `B1` and `B2`, and picks the neighbours
    through `knearest_amongst_user_rated`.

    Parameters
    ----------
    item_id : id of the item to predict for, matched against data['item_id']
    user_id : id of the user to predict for, matched against data['user_id']

    Returns
    -------
    float
        baseline + weighted deviation; the baseline alone when the
        similarities of the neighbours do not sum to a positive number
        ( which includes the case of no neighbours at all )
    """
    global_avg = data['ratings'].mean()

    def shrunk_item_bias( some_item_id ):
        """sum of ( rating - global_avg ) over ALL ratings of the item, divided by ( B1 + count )"""
        ratings_of_item = data[ data['item_id'] == some_item_id ]['ratings']
        return float( np.sum( ratings_of_item.values - global_avg ) ) / ( B1 + ratings_of_item.count() )

    user = data[ data['user_id'] == user_id ]
    user_ratings = user['ratings']

    item_bias = shrunk_item_bias(item_id)

    # NOTE(review): every rating of the user is corrected with the bias of the
    # *target* item, not with the bias of the item that was actually rated;
    # left unchanged here - confirm against the reference formula
    user_bias = float( np.sum( user_ratings.values - global_avg - item_bias ) ) / ( B2 + user_ratings.count() )

    baseline = global_avg + item_bias + user_bias

    numerator   = 0.
    denominator = 0.
    for nearest_id, sim in knearest_amongst_user_rated( item_id, user_id ):
        # the neighbour always comes from the user's rated items, so a row exists
        nearest_rating = user[ user['item_id'] == nearest_id ]['ratings'].values[0]

        # the neighbour's bias must be built from all of its ratings, exactly like
        # the target item's bias; using only this user's rating in the numerator
        # while dividing by the full rating count shrinks it inconsistently
        nearest_item_bias = shrunk_item_bias(nearest_id)

        numerator += sim * ( nearest_rating - global_avg - user_bias - nearest_item_bias )
        denominator += sim

    # without a positive total similarity the weighted average is undefined
    if denominator > 0.:
        return baseline + ( numerator / denominator )
    return baseline

def knearest_amongst_user_rated( item_id, user_id ):
    """
    Among the items `user_id` has rated ( `item_id` itself excluded ), return
    at most `k` ( other_item_id, similarity ) tuples, highest similarity to
    `item_id` first. Reads the module-level `data` and `k`.
    """
    rated_items = data[ data['user_id'] == user_id ]['item_id'].unique()

    # pair every other rated item with its similarity to the target item
    candidates = [
        ( rated, get( rated, item_id ) )
        for rated in rated_items
        if rated != item_id
    ]

    # stable descending sort on the similarity, then keep the top k
    candidates.sort( key = itemgetter(1), reverse = True )
    return candidates[:k]


In [6]:
# test incoming new ratings
# two incoming ( user_id, item_id, ratings ) rows used to try out the update
dict1 = dict(
    user_id = [6, 5],
    item_id = [1, 2],
    ratings = [5, 2]
)

# fix the column order explicitly, a plain dict does not guarantee one
column_order = [ 'user_id', 'item_id', 'ratings' ]
test1 = pd.DataFrame( dict1, columns = column_order )
test1

Unnamed: 0,user_id,item_id,ratings
0,6,1,5
1,5,2,2


In [7]:
def get_common_users( data, item1, item2 ):
    """return the set of user ids that have a row in `data` for both item1 and item2"""
    def raters( item ):
        # distinct user ids found on the rows of this item
        return set( data.loc[ data['item_id'] == item, 'user_id' ].unique() )

    return raters(item1) & raters(item2)

def update( data ):
    """
    Dry run of the incremental update for every row of the module-level `test1`.

    For each incoming ( user_id, item_id, ratings ) row the function prints
    the absolute error of `predict_rating` against the incoming rating,
    appends the row to a local copy of `data` and then, for every rating the
    user had before, prints the F / G increments, the old and the new
    support of the item pair and the mean / variance of the support that
    this single change would produce.

    Nothing is written back to `model1` ( see the TODO notes ), so every
    printed mean / variance is relative to the unchanged `model1.mean` and
    `model1.variance`. `predict_rating` keeps reading the module-level
    `data`, not the local copy grown here.

    Parameters
    ----------
    data : DataFrame
        existing ratings with 'user_id', 'item_id' and 'ratings' columns;
        the caller's frame is left untouched, rows are only appended to a
        new frame bound to the local name

    Returns
    -------
    None
    """
    # read the incoming columns by name and number the rows by position, so
    # the loop depends neither on the column order nor on the index labels
    incoming = zip( test1['user_id'], test1['item_id'], test1['ratings'] )
    for position, ( user_id1, item1, rating1 ) in enumerate(incoming):

        # absolute error of the prediction made before the rating is added;
        # single-argument print calls emit the same text on Python 2 and 3
        predicted = predict_rating( item_id = item1, user_id = user_id1 )
        print( "MAE:  %s" % ( abs(predicted - rating1), ) )

        # remember what the user had rated so far, after that
        # append the incoming row to the local ratings
        other_user = data[ data['user_id'] == user_id1 ]
        data = pd.concat( [ data, test1.iloc[ position:position + 1 ] ], ignore_index = True )

        # loop through all the user's earlier ratings
        for item2, rating2 in zip( other_user['item_id'], other_user['ratings'] ):
            # increments of the F and G arrays for this item pair
            print( "F:  %s" % ( rating1 * rating2, ) )
            print( "G:  %s" % ( rating1 ** 2 + rating2 ** 2, ) )

            sup_old = model1.sup_[ model1.item_id_dict[item1] ][ model1.item_id_dict[item2] ]
            print( "old support:  %s" % ( sup_old, ) )

            # the new support is the number of users that rated both items,
            # counted on the ratings that already include the incoming row
            common_users = get_common_users( data, item1, item2 )
            sup_new = len(common_users)
            print( "new support:  %s" % ( sup_new, ) )

            sup_delta = sup_new - sup_old

            # one support entry moves from sup_old to sup_old + sup_delta, so
            # the mean moves by sup_delta / N and the mean of squares by
            # ( 2 * sup_delta * sup_old + sup_delta ** 2 ) / N
            # ( assumes model1.variance is the population variance of the
            #   model1.N support entries - TODO confirm in BIKNN )
            mean_new = model1.mean + float(sup_delta) / model1.N
            print( "new mean %s" % ( mean_new, ) )
            variance_new = model1.variance + ( float( 2 * sup_delta * sup_old + sup_delta ** 2 ) / model1.N ) \
                           + model1.mean ** 2 - mean_new ** 2
            print( "new variance %s" % ( variance_new, ) )

            # TODO : permanent update support array, mean, variance

            # TODO : permanent update the support weight array and the weighted similarity score array
            # if position % parameter == 0:
            # update
            
# update(data)

In [9]:
# call the BIKNN class's own update method ( defined in the BIKNN module,
# not shown here ) with the incoming ratings; the number under the cell
# is the value that call returned
model1.update(test1)

1.455278018764861

In [11]:
# display the model's global_avg attribute as it stands after the
# update call in the previous cell
model1.global_avg

2.642857142857143

In [None]:
"""
train = pd.read_csv( 'data/u1.base', sep = '\t', header = None )
train = train.iloc[ :, 0:3 ]
test  = pd.read_csv( 'data/u1.test', sep = '\t', header = None )
test  = test.iloc[ :, 0:3 ]

from BIKNN import BIKNN
movie_lens = BIKNN()
movie_lens.fit(train)
movie_lens.update(test)
"""