In [1]:
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
import matplotlib.pyplot as plt
import warnings 

%matplotlib inline
warnings.filterwarnings('ignore')

In [3]:
from pandarallel import pandarallel
pandarallel.initialize(nb_workers=7, progress_bar=False)
# for parallel computing the lambda function

ModuleNotFoundError: No module named 'pyarrow._plasma'

### Data Preprocessing

In [159]:
data = pd.read_csv('C:/Users/HanWang/Documents/Columbia/Fall 2019/personalization/project/ml-20m/ml-20m/ratings.csv')

In [160]:
print("NaNs cells: ", data.isnull().values.sum())
data.head()

NaNs cells:  0


Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580


In [161]:
data.drop("timestamp", inplace=True, axis=1)

In [164]:
popularMovie = data[['rating', 'movieId']].groupby("movieId").count()
popularMovie = popularMovie.loc[popularMovie['rating'] > 8000].index

popularUser = data[['rating', 'userId']].groupby("userId").count()
popularUser = popularUser.loc[popularUser['rating'] > 2000].index

print("the number of popular movies: ", len(popularMovie))
print("the number of active users: ", len(popularUser))

the number of popular movies:  620
the number of active users:  255


In [165]:
data = data.loc[ 
    (data['movieId'].isin(popularMovie)) & (data['userId'].isin(popularUser)) 
]

In [166]:
train, test = train_test_split(data, test_size=0.4)
test, valid = train_test_split(test, test_size=0.2)

In [167]:
print(train.shape)
print(valid.shape)
print(test.shape)
train.head(5)

(74868, 3)
(9983, 3)
(39930, 3)


Unnamed: 0,userId,movieId,rating
3614560,24688,441,3.0
13792910,95301,1272,4.0
15420269,106676,1375,3.0
4543828,31122,357,3.5
14895374,102911,590,0.5


In [168]:
'''
this is the same as 
train.pivot(
    index="movieId", 
    columns="userId",
    values="rating"
)
'''
movieCat = pd.api.types.CategoricalDtype(categories=sorted(train['movieId'].unique()))
userCat = pd.api.types.CategoricalDtype(categories=sorted(train['userId'].unique()))

rowIndex = train['movieId'].astype(movieCat).cat.codes
colIndex = train['userId'].astype(userCat).cat.codes
sparse_matrix = csr_matrix(
    (train["rating"], (rowIndex, colIndex)), 
    shape=(movieCat.categories.size, userCat.categories.size)
)

UserItemDF = pd.SparseDataFrame(
    sparse_matrix,
    index=movieCat.categories,
    columns=userCat.categories,
    default_fill_value=0
)

In [169]:
UserItemDF

Unnamed: 0,156,741,903,982,2261,3318,3907,4222,4358,6636,...,134773,135090,135425,136268,136599,137202,137277,137343,137686,138208
1,0.0,5.0,4.0,3.0,0.0,4.5,0.0,5.0,0.0,4.0,...,5.0,2.5,3.0,4.0,0.0,4.5,4.0,0.0,5.0,0.0
2,5.0,3.0,4.0,0.0,3.5,0.0,0.0,0.0,3.0,0.0,...,0.0,2.5,0.0,2.0,0.0,2.5,2.5,0.0,0.0,2.0
3,2.0,0.0,2.0,0.0,2.5,3.0,0.0,3.5,0.0,0.0,...,0.0,2.0,0.0,0.0,0.0,3.5,3.0,0.0,0.0,2.0
5,0.0,4.0,3.0,2.5,1.0,2.5,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5,0.0,2.0
6,0.0,3.5,0.0,3.5,0.0,0.0,4.0,0.0,5.0,4.0,...,0.0,3.5,3.5,5.0,3.5,3.0,4.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68157,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,4.0,0.0,4.5,4.0,0.0,4.5
68954,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.5,0.0,0.0,0.0,4.0,4.0,4.0,0.0,0.0,0.0
70286,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.5,4.0,0.0,...,0.0,0.0,0.0,3.5,4.0,0.0,0.0,4.0,0.0,0.0
72998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.5,0.0,0.0,4.0,4.0,0.0,0.0,0.0,0.0,2.0


### Model

In [170]:
model = NearestNeighbors(
    metric='cosine', 
    n_neighbors=20, 
    n_jobs=-1,
    algorithm='brute'
)

In [171]:
print(model)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
                 metric_params=None, n_jobs=-1, n_neighbors=20, p=2,
                 radius=1.0)


In [172]:
model.fit(UserItemDF)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
                 metric_params=None, n_jobs=-1, n_neighbors=20, p=2,
                 radius=1.0)

<bound method UnsupervisedMixin.fit of NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
                 metric_params=None, n_jobs=-1, n_neighbors=20, p=2,
                 radius=1.0)>

In [173]:
dis, ind = model.kneighbors(return_distance=True)

In [174]:
print(dis)
dis.shape

[[0.34924009 0.36089549 0.36298398 ... 0.38858902 0.38964293 0.39097519]
 [0.39095377 0.39470524 0.39756013 ... 0.4155606  0.41663584 0.41678822]
 [0.46418059 0.47977464 0.48117914 ... 0.50975948 0.51037082 0.51112757]
 ...
 [0.39857596 0.46317931 0.46403133 ... 0.49394469 0.49410613 0.49486147]
 [0.40865344 0.43796242 0.43827731 ... 0.50068731 0.50069782 0.5009032 ]
 [0.39850273 0.41097447 0.41389819 ... 0.47308728 0.47369313 0.47547128]]


(620, 20)

In [175]:
print(ind)
ind.shape

[[534 554 576 ... 408 503 352]
 [123 212 315 ... 460 497  93]
 [316 261 471 ... 514 159 233]
 ...
 [615 607 618 ... 239 570 230]
 [612 615 619 ... 447 234 596]
 [612 615 608 ... 598 119 590]]


(620, 20)

In [176]:
'''
dis: distance in high dimension, smaller distance means high similarity
ind: Indices of the nearest points in the population matrix.
'''
# movieId = 1
dis[0], ind[0]

(array([0.34924009, 0.36089549, 0.36298398, 0.36860526, 0.37417465,
        0.37830888, 0.3787048 , 0.37972517, 0.38181949, 0.38211905,
        0.38439633, 0.38493773, 0.38511452, 0.38522739, 0.38548535,
        0.38609596, 0.38800148, 0.38858902, 0.38964293, 0.39097519]),
 array([534, 554, 576, 323, 246, 141, 223, 527,  95, 584, 313, 272,  93,
        236, 314, 225, 419, 408, 503, 352], dtype=int64))

In [177]:
# key: movieId
# value: the row index in User-Item matrix
movieIdIndexMap = {movId: idx for idx, movId in enumerate(UserItemDF.index)}
showMovieIdIndexMap = list(movieIdIndexMap.items())
showMovieIdIndexMap[:5] + showMovieIdIndexMap[-5:]

[(1, 0),
 (2, 1),
 (3, 2),
 (5, 3),
 (6, 4),
 (68157, 615),
 (68954, 616),
 (70286, 617),
 (72998, 618),
 (79132, 619)]

In [178]:
def predict(movieId, userId, dis, ind, UserItemMatrix, returnAvg=False):
    movieIdIndexMap = {movId: idx for idx, movId in enumerate(UserItemMatrix.index)}
    movieIdIndexMapReversed = {idx: movId for idx, movId in enumerate(UserItemMatrix.index)}
    
    # since we used cosine similarity so the closest neighbor distance should be 1
    nonZeroUserIndex = np.nonzero(UserItemMatrix.loc[movieId])[0]
    sim = dis[ movieIdIndexMap[movieId] ]
    mu_movie = UserItemMatrix.loc[movieId]\
                             .iloc[nonZeroUserIndex]\
                             .mean()
    
    nonZeroMovieIndex = np.nonzero(UserItemMatrix.iloc[ind[ movieIdIndexMap[movieId] ]].loc[:, userId])[0]
    denom = np.sum( abs(sim[nonZeroMovieIndex]) )
    nom = (UserItemMatrix.iloc[ind[ movieIdIndexMap[movieId] ]]
                     .loc[:, userId]\
                     .iloc[nonZeroMovieIndex]
           - UserItemMatrix.iloc[ind[ movieIdIndexMap[movieId] ]]\
                           .iloc[nonZeroMovieIndex]
                           .to_dense()\
                           .replace({0: np.nan})\
                           .mean(axis=1, skipna=True)
    ).values

    if len(nonZeroMovieIndex) == 0:
        prediction = mu_movie
    else:
        prediction = mu_movie + np.sum(nom * sim[nonZeroMovieIndex]) / denom

    if returnAvg:
        return prediction, mu_movie
    else:
        return prediction

In [179]:
def func(s):
    return predict(
        movieId=s['movieId'],
        userId=s['userId'], 
        dis=dis, ind=ind, 
        UserItemMatrix=UserItemDF
    )

In [None]:
%%time
train['predict'] = train.apply(func, axis=1)

In [141]:
ratings=UserItemDF.loc[:,54465]
ratings

1        4.5
2        4.0
6        5.0
10       0.0
11       3.0
        ... 
8636     4.5
8961     5.0
32587    5.0
33794    5.0
58559    5.0
Name: 54465, Length: 252, dtype: Sparse[float64, 0]
BlockIndex
Block locations: array([  0,   4,   6,  10,  15,  23,  30,  34,  37,  39,  44,  47,  52,
        57,  59,  62,  67,  69,  71,  75,  78,  84,  89,  91,  94,  96,
        99, 106, 109, 112, 114, 117, 120, 128, 134, 136, 140, 143, 149,
       151, 153, 156, 159, 163, 167, 172, 182, 185, 189, 191, 194, 196,
       205, 213, 217, 224, 230, 233, 235, 239, 246])
Block lengths: array([3, 1, 1, 3, 7, 4, 3, 2, 1, 2, 2, 3, 3, 1, 2, 3, 1, 1, 3, 2, 2, 4,
       1, 2, 1, 2, 4, 1, 1, 1, 1, 1, 6, 4, 1, 1, 1, 4, 1, 1, 1, 2, 3, 3,
       4, 7, 2, 1, 1, 2, 1, 5, 7, 1, 5, 5, 2, 1, 3, 5, 6])

In [142]:
ratings.index

Int64Index([    1,     2,     6,    10,    11,    16,    17,    19,    21,
               25,
            ...
             6711,  6874,  7153,  7361,  7438,  8636,  8961, 32587, 33794,
            58559],
           dtype='int64', length=252)

In [143]:
df=pd.DataFrame(ratings)
df

Unnamed: 0,54465
1,4.5
2,4.0
6,5.0
10,0.0
11,3.0
...,...
8636,4.5
8961,5.0
32587,5.0
33794,5.0


In [145]:
%%time
test['predict'] = test.apply(func, axis=1)

Wall time: 3min 25s


In [146]:
train.head()

Unnamed: 0,userId,movieId,rating,predict
17086475,118205,1101,4.0,3.309521
12032544,83090,590,5.0,3.447571
15274231,105580,4878,3.0,3.185575
10405289,71975,788,3.5,3.113853
6750377,46470,454,0.5,2.78869


In [156]:
test.head()

Unnamed: 0,userId,movieId,rating,predict,difpre
13315247,92011,1617,5.0,4.429014,0.570986
11458492,79159,235,4.0,3.786187,0.213813
10717968,74142,111,0.5,3.542949,3.042949
5989165,41267,1097,4.0,3.60044,0.39956
12858492,88820,5418,3.0,4.21583,1.21583


In [None]:
print("-"*10 + "model performance" + "-"*10)
print("RMSE")
print("on training set: ", mean_squared_error(train['rating'], train['predict']))
print("on testing set: ", mean_squared_error(test['rating'], test['predict']))
print("\n\nMAE")
print("on training set: ", mean_absolute_error(train['rating'], train['predict']))
print("on testing set: ", mean_absolute_error(test['rating'], test['predict']))

### Coverage

In [147]:
train['difpre']=abs(train['predict']-train['rating'])

In [148]:
test['difpre']=abs(test['predict']-test['rating'])

In [149]:
def coverage(dataset,k):
    userct=0
    for id in dataset.userId.unique():
        userdata=dataset[dataset.userId==id]
        userdata['avg']=userdata.loc[:,"rating"].mean()
        userdata['difavg']=userdata['avg']-userdata['rating']
        if len(userdata[userdata.difavg>userdata.difpre])>k:
            userct+=1
    
    t=dataset.userId.nunique()
    r=userct/t
    return r

In [152]:
rtrain=coverage(train,10)

In [153]:
rtrain

1.0

In [154]:
rtest=coverage(test,10)

In [155]:
rtest

0.9107142857142857

### Choosing Hyperparameter

In [25]:
modelOne = NearestNeighbors(
    metric='cosine', 
    n_neighbors=30, 
    n_jobs=-1,
    algorithm='brute'
)
modelOne.fit(UserItemDF)
disOne, indOne = modelOne.kneighbors(return_distance=True)


modelTwo = NearestNeighbors(
    metric='cosine', 
    n_neighbors=20, 
    n_jobs=-1,
    algorithm='brute'
)
modelTwo.fit(UserItemDF)
disTwo, indTwo = modelTwo.kneighbors(return_distance=True)


modelThree = NearestNeighbors(
    metric='cosine', 
    n_neighbors=10, 
    n_jobs=-1,
    algorithm='brute'
)
modelThree.fit(UserItemDF)
disThree, indThree = modelThree.kneighbors(return_distance=True)

In [26]:
def func(s, dis, ind, UserItemMatrix):
    return predict(
        movieId=s['movieId'],
        userId=s['userId'], 
        dis=dis, ind=ind, 
        UserItemMatrix=UserItemMatrix
    )

In [27]:
train['predictOne'] = train.parallel_apply(
    func, axis=1, dis=disOne, ind=indOne, UserItemMatrix=UserItemDF
)
train['predictTwo'] = train.parallel_apply(
    func, axis=1, dis=disTwo, ind=indTwo, UserItemMatrix=UserItemDF
)
train['predictThree'] = train.parallel_apply(
    func, axis=1, dis=disThree, ind=indThree, UserItemMatrix=UserItemDF
)

In [28]:
valid['predictOne'] = valid.parallel_apply(
    func, axis=1, dis=disOne, ind=indOne, UserItemMatrix=UserItemDF
)

In [29]:
valid['predictTwo'] = valid.parallel_apply(
    func, axis=1, dis=disTwo, ind=indTwo, UserItemMatrix=UserItemDF
)

In [30]:
valid['predictThree'] = valid.parallel_apply(
    func, axis=1, dis=disThree, ind=indThree, UserItemMatrix=UserItemDF
)

In [31]:
print('model performance on training set')
print('neighbor size - 30: ', mean_squared_error(train['rating'], train['predictOne']))
print('neighbor size - 20: ', mean_squared_error(train['rating'], train['predictTwo']))
print('neighbor size - 10: ', mean_squared_error(train['rating'], train['predictThree']))

model performance on training set
neighbor size - 30:  0.5712795722551185
neighbor size - 20:  0.5821331574117482
neighbor size - 10:  0.6216109022473322


In [32]:
print('model performance on validation set')
print('neighbor size - 30: ', mean_squared_error(valid['rating'], valid['predictOne']))
print('neighbor size - 20: ', mean_squared_error(valid['rating'], valid['predictTwo']))
print('neighbor size - 10: ', mean_squared_error(valid['rating'], valid['predictThree']))

model performance on validation set
neighbor size - 30:  0.64033783953412
neighbor size - 20:  0.6847783249081223
neighbor size - 10:  0.8196170756782438


##### according to the validation set RMSE result, we choose neighbor size = 30

In [34]:
print("model performance on training set")
print('RMSE: ', mean_squared_error(train['rating'], train['predictOne']))
print('MAE: ', mean_absolute_error(train['rating'], train['predictOne']))

print("\n\nmodel performance on validation set")
print('RMSE: ', mean_squared_error(valid['rating'], valid['predictOne']))
print('MAE: ', mean_absolute_error(valid['rating'], valid['predictOne']))

model performance on training set
RMSE:  0.5712795722551185
MAE:  0.5795319505512077


model performance on validation set
RMSE:  0.64033783953412
MAE:  0.6155764213949892


In [35]:
test['predict'] = test.parallel_apply(
    func, axis=1, dis=disOne, ind=indOne, UserItemMatrix=UserItemDF
)
print("model performance on testing set")
print('RMSE: ', mean_squared_error(test['rating'], test['predict']))
print('MAE: ', mean_absolute_error(test['rating'], test['predict']))

model performance on testing set
RMSE:  0.7058404931062546
MAE:  0.6478214709289709
