# Recommendation System : Boosting the K-Nearest-Neighborhood based incremental collaborative filtering

In [1]:
from scipy.stats import norm
import pandas as pd
import numpy as np

In [2]:
# Load the ratings file: comma-separated with no header row, so pandas
# labels the columns 0, 1 and 2.
data = pd.read_csv( 'test_data.txt', sep = ',', header = None )

In [3]:
# Preview the first four rows of the ratings table.
data.head(4)

Unnamed: 0,0,1,2
0,1,1,1
1,1,2,2
2,1,3,5
3,2,1,2


In [4]:
def fit( data ):
    """
    Build the item-item support, similarity, numerator and denominator matrices.

    Parameters
    ----------
    data : pandas.DataFrame
        Three-column ratings table. Its columns are renamed IN PLACE to
        'user_id', 'item_id' and 'stars'.

    Returns
    -------
    sup, sim, F, G : numpy.ndarray
        Symmetric ( n_items x n_items ) matrices, indexed by the order in which
        each item first appears in `data['item_id']`. For every pair of distinct
        items they hold the support, similarity, numerator and denominator
        returned by `calculate_similarity`; every diagonal entry is 1.
        `sup`, `F` and `G` are integer matrices, `sim` is float.

    Notes
    -----
    `calculate_similarity` is called with item ids only and reads the
    module-level `data`, so this function gives meaningful results only when the
    argument is that same DataFrame (the in-place column rename above is what
    makes the lookups by column name work).
    """
    data.columns = [ 'user_id', 'item_id', 'stars' ]
    items = data['item_id'].unique()
    size  = len(items)

    # builtin `int` instead of `np.int`: the alias was removed from NumPy
    F   = np.zeros( [ size, size ], dtype = int )
    G   = np.zeros( [ size, size ], dtype = int )
    sup = np.zeros( [ size, size ], dtype = int )
    sim = np.zeros( [ size, size ] )
    for matrix in ( F, G, sup, sim ):
        np.fill_diagonal( matrix, 1 )

    # visit every unordered pair once ( upper triangle ) and mirror the result
    supports = []
    for i1, item1 in enumerate(items):
        for i2 in range( i1 + 1, size ):
            similarity, numerator, denominator, support = \
            calculate_similarity( item1, items[i2] )
            sup[i1, i2] = sup[i2, i1] = support
            sim[i1, i2] = sim[i2, i1] = similarity
            F[i1, i2]   = F[i2, i1]   = numerator
            G[i1, i2]   = G[i2, i1]   = denominator
            supports.append(support)

    # Support-based weights: normal CDF of each pair's support, using the mean
    # and ( population ) standard deviation of all pair supports.
    # With fewer than two items there are no pairs, so only the diagonal exists.
    w_ = np.eye(size)
    if supports:
        supports = np.array(supports)
        # one frozen distribution for all pairs instead of one per pair
        w_ = norm( supports.mean(), supports.std() ).cdf(sup)
        np.fill_diagonal( w_, 1 )

    # NOTE: the weighted similarity is computed but not returned; the
    # four-value return is kept because callers unpack exactly four values.
    sim_w_ = sim * w_
    return sup, sim, F, G

In [5]:
def calculate_similarity( item1, item2 ):
    """
    Compare two items using the users that rated both of them.

    Reads the module-level `data` DataFrame ( columns 'user_id', 'item_id',
    'stars' ).

    Returns a tuple ( similarity, numerator, denominator, support ):
      - support     : number of users that rated both items
      - numerator   : dot product of the two items' star vectors
      - denominator : sum of squared stars of item1 plus that of item2
      - similarity  : numerator / denominator
    All four are 0 when the items share no user.
    """
    raters1 = set( data.loc[ data['item_id'] == item1, 'user_id' ].unique() )
    raters2 = data.loc[ data['item_id'] == item2, 'user_id' ].unique()
    shared  = raters1.intersection(raters2)

    support = len(shared)
    if support == 0:
        # nothing to compare
        return 0, 0, 0, 0

    stars1 = get_item_stars( item_id = item1, set_of_users = shared )
    stars2 = get_item_stars( item_id = item2, set_of_users = shared )

    numerator = np.dot( stars1, stars2 )
    # NOTE(review): sum of squared ratings, not a product of vector norms as in
    # plain cosine similarity -- confirm this is the intended formula.
    denominator = ( stars1 ** 2 ).sum() + ( stars2 ** 2 ).sum()
    return float(numerator) / denominator, numerator, denominator, support

def get_item_stars( item_id, set_of_users ):
    """
    Return the stars given to `item_id` by the users in `set_of_users`.

    Reads the module-level `data` DataFrame ( columns 'user_id', 'item_id',
    'stars' ).

    Parameters
    ----------
    item_id : the item whose ratings are wanted
    set_of_users : collection of user ids to keep

    Returns
    -------
    numpy.ndarray
        One rating per matching user ( the first one found in `data` when a user
        rated the item more than once ), ordered by ascending user id. Empty
        when no row matches.
    """
    condition = ( ( data['item_id'] == item_id ) &
                  ( data['user_id'].isin(set_of_users) ) )
    reviews = data[condition].drop_duplicates( subset = 'user_id', keep = 'first' )
    # Order by user so that the vectors of two different items line up user by
    # user; plain row order only lines up when the file happens to be sorted.
    reviews = reviews.sort_values('user_id')
    return reviews['stars'].values

# item1 = item_id_dict.items()[0][0]
# item2 = item_id_dict.items()[1][0]
# calculate_similarity( item1, item2 )

In [6]:
# Run the prototype fit on the loaded ratings and display the support matrix.
sup, sim, F, G = fit( data )
sup

array([[1, 2, 2],
       [2, 1, 3],
       [2, 3, 1]])

In [7]:
# Fit the BIKNN class ( from the BIKNN module, not shown in this file ) on the
# same ratings table.
from BIKNN import BIKNN
model1 = BIKNN()
model1.fit(data)

<BIKNN.BIKNN at 0x110120e50>

In [9]:
# Inspect the fitted model's sim_w_ attribute ( presumably the support-weighted
# similarity, cf. `sim * w_` in fit above -- confirm against the BIKNN class ).
model1.sim_w_

array([[ 1.        ,  0.09590002,  0.06762181],
       [ 0.09590002,  1.        ,  0.28254745],
       [ 0.06762181,  0.28254745,  1.        ]])