In [166]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import math

In [58]:
df = pd.read_csv('movies-utf8.csv')
df.head()

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [59]:
listGen = set()
for (columnName, columnData) in df.iteritems():
    if columnName != 'genres':
        continue
    for genres in columnData.values:
        if isinstance(genres, str):
            genres = genres.split('|')
            for genre in genres:
                listGen.add(genre)
print(listGen)

{'Animation', 'Adventure', "Children's", 'Drama', 'Romance', 'Action', 'Documentary', 'Musical', 'Comedy', 'War', 'Horror', 'Thriller', 'Sci-Fi', 'Fantasy', 'Crime'}


In [62]:
gen_list = []
df['genres'] = df['genres'].replace(np.nan, '', regex=True)

def split_gen(gen_string):
    if isinstance(gen_string, str):
        return gen_string.split('|')
    else:
        return gen_string
    
df['genres'].map(lambda x: gen_list.append(split_gen(x)))
df['gen_list'] = gen_list

In [74]:
M = gen_list

M_G = [[1 if genre in sublist else 0 for genre in listGen] for sublist in M]

print(M_G[:4])

print(M_G[0])

[[1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0], [0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0], [0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]]
[1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]


In [68]:
df_user = pd.read_csv('ratings.csv')
df_user.head()

Unnamed: 0,user_id,movie_id,rating
0,1,1,3
1,1,3,5
2,1,4,4
3,1,5,3
4,1,6,3


In [82]:
# Initialize each user with list of zeros for categories
U_G = [[0] * len(listGen)] * df_user['user_id'].max()

for index, row in df_user.iterrows():
    # Perform or operator on users current rated categories and the movies categories
    U_G[row['user_id']-1] = [a or b for a, b in zip(U_G[row['user_id']-1], M_G[row['movie_id']-1])]

print(U_G[:4])

[[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1]]


In [97]:
sim = cosine_similarity(M_G, U_G)

print(sim)

[[0.46291005 0.46291005 0.4472136  ... 0.46291005 0.4472136  0.4472136 ]
 [0.46291005 0.46291005 0.4472136  ... 0.46291005 0.4472136  0.4472136 ]
 [0.37796447 0.37796447 0.36514837 ... 0.37796447 0.36514837 0.36514837]
 ...
 [0.37796447 0.37796447 0.36514837 ... 0.37796447 0.36514837 0.36514837]
 [0.26726124 0.26726124 0.25819889 ... 0.26726124 0.25819889 0.25819889]
 [0.37796447 0.37796447 0.36514837 ... 0.37796447 0.36514837 0.36514837]]


# b)

In [107]:
# Split into training and testing data

train_data, test_data = train_test_split(df_user, test_size=0.5)

train = train_data.pivot_table(index='movie_id', columns='user_id', values='rating').astype('float64')
test = test_data.pivot_table(index='movie_id', columns='user_id', values='rating').astype('float64')

In [182]:
import math
def cosine_similarity(v1,v2, metric='cosine'):
    if metric == 'correlation':
        v1 = v1 - np.nanmean(v1)
        v2 = v2 - np.nanmean(v2)
    sumxx, sumxy, sumyy = 0, 0, 0
    for i in range(len(v1)):
        x = v1[i+1]; y = v2[i+1]
        if np.isnan(x) or np.isnan(y): continue
        sumxx += x*x
        sumyy += y*y
        sumxy += x*y
    return sumxy/math.sqrt(sumxx*sumyy)

def sim_matrix(M, dimension='user', metric='cosine'):
    N = M.shape[0] if dimension == 'user' else M.shape[1]
    sim = np.zeros([N,N])
    for i in range(N):
        for j in range(N):
            if i == j:
                sim[i,j] = 0
                continue
            if dimension == 'user':
                v1, v2 = M[i+1], M[j+1]
            else:
                v1, v2 = M.ix[i+1], M.ix[j+1]
            sim[i][j] = cosine_similarity(v1,v2,metric)
    return sim

## User CF

In [143]:
user_sim = sim_matrix(train, 'user', 'correlation')

In [148]:
print(user_sim)

[[ 0.         -0.17536381 -0.40171391 ... -0.04813454 -0.01207952
   0.07862194]
 [-0.17536381  0.         -0.26189816 ... -0.10993577 -0.29245681
   0.12463326]
 [-0.40171391 -0.26189816  0.         ... -0.15712101  0.21855797
   0.08706135]
 ...
 [-0.04813454 -0.10993577 -0.15712101 ...  0.         -0.59750263
  -0.1164841 ]
 [-0.01207952 -0.29245681  0.21855797 ... -0.59750263  0.
   0.2299324 ]
 [ 0.07862194  0.12463326  0.08706135 ... -0.1164841   0.2299324
   0.        ]]


In [157]:
def predict_user(M, u, i, simularity, avg, n_users):
    total_sum = 0
    bot_sum = 0
    for v in range(n_users):
        if (u != v):
            bot_sum += simularity[u, v]
    for v in range(n_users):
        if (u != v and not np.isnan(M.loc[v+1,i+1])):
            total_sum += simularity[u, v]*(M.loc[v+1, i+1]-avg[v])/bot_sum
    return avg[u]+total_sum

def user_cf(M, sim_users):
    pred = np.copy(M)
    n_users, n_items = M.shape
    avg_ratings = np.nanmean(M, axis=1)
    for i in range(n_users):
        for j in range(n_items):
            if np.isnan(M.loc[i+1,j+1]):
                pred[i,j] = predict_user(M, i, j, sim_users, avg_ratings, n_users)
    return pred

In [159]:
pred = user_cf(train, user_sim)

In [160]:
print(pred)

[[ 3.84648358  3.          3.5720679  ...  3.22494     4.
   4.        ]
 [ 7.23064305  0.26920817 -1.34546389 ...  0.74592987  3.24976089
   4.        ]
 [ 4.25727106  3.96595136  4.         ...  5.          3.35063816
   4.        ]
 ...
 [ 3.09645188  3.2514601   3.23065361 ...  4.          2.95885415
   5.        ]
 [ 4.05940517  3.79128936  3.64439874 ...  3.86076089  3.29313481
   3.67088947]
 [ 1.          2.01896465  4.         ...  4.88831557  3.
   5.        ]]


In [173]:
# Fill NA values so the RMSE works
test = test.fillna(test.mean())

MSE = mean_squared_error(pred, test)
RMSE = round(math.sqrt(MSE),3)
print(RMSE)

29.6


In [179]:
N = pred.shape
print(N)
total_sum = 0
n = 0
for i in range(N[0]):
    for j in range(N[1]):
        if (test.loc[i+1, j+1]):
            total_sum += (pred[i,j] - test.loc[i+1, j+1])**2
            n += 1
rmse = math.sqrt(total_sum/n)
print(rmse)

(100, 100)
29.599633414458673


## Item CF

In [184]:
item_sim = sim_matrix(train, 'item', 'correlation')
print(item_sim)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated


[[ 0.         -0.33776883 -0.12447196 ... -0.32467362  0.36219885
   0.08233208]
 [-0.33776883  0.         -0.12531477 ... -0.05765317 -0.44981749
   0.10799486]
 [-0.12447196 -0.12531477  0.         ...  0.24097388 -0.25151784
  -0.2345426 ]
 ...
 [-0.32467362 -0.05765317  0.24097388 ...  0.         -0.39953212
  -0.23093259]
 [ 0.36219885 -0.44981749 -0.25151784 ... -0.39953212  0.
  -0.21924347]
 [ 0.08233208  0.10799486 -0.2345426  ... -0.23093259 -0.21924347
   0.        ]]


In [187]:
def predict_item(M, u, i, simularity, avg, n_items):
    total_sum = 0
    bot_sum = 0
    for j in range(n_items):
        if (i != j):
            bot_sum += simularity[i, j]
    for j in range(n_items):
        if (i != j and not np.isnan(M.loc[u+1, j+1])):
            total_sum += simularity[i, j]*(M.loc[u+1, j+1]-avg[j])/bot_sum
    return avg[i]+total_sum

def item_cf(M, sim_items):
    pred = np.copy(M)
    n_users, n_items = M.shape
    avg_ratings = np.nanmean(M, axis=0)
    for u in range(n_users):
        for i in range(n_items):
            if np.isnan(M.loc[u+1,i+1]):
                pred[u,i] = predict_item(M, u, i, sim_items, avg_ratings, n_items)
    return pred

In [188]:
pred_item = item_cf(train, item_sim)
print(pred_item)

[[  3.25287567   3.           6.7058905  ...   2.73881672   4.
    4.        ]
 [  2.4492314    3.42849243   7.22808501 ...   3.98575055   5.7841872
    4.        ]
 [  4.66178895   3.52199643   4.         ...   5.           3.68314077
    4.        ]
 ...
 [  4.81789217   3.94046756  -1.78467406 ...   4.           2.29972811
    5.        ]
 [  3.66585823   3.58351806   1.30849617 ...   3.45662713   3.20922453
  -35.0536043 ]
 [  1.           3.16096606   4.         ...   4.14444606   3.
    5.        ]]


In [189]:
N = pred_item.shape
print(N)
total_sum = 0
n = 0
for i in range(N[0]):
    for j in range(N[1]):
        if (test.loc[i+1, j+1]):
            total_sum += (pred_item[i,j] - test.loc[i+1, j+1])**2
            n += 1
rmse = math.sqrt(total_sum/n)
print(rmse)

(100, 100)
5.2832697027905455


# Conclusion
In the above example item based CF performs significantly better than user based CF because its RMSE of 5.28 is lower than 29.60. This sugests that similar movies are a better indication of how a user will rate a movie rather than similar users.