# Data Preprocessing and Transformation - Eric

Below, we turn the dataset from a size that we can not manage or process into a managable form. This is done by removing the Date column, reducing the data types to a smaller form, and only using data on Movies and Customers that are frequently rated or rating.

In [245]:
import numpy as np
import pandas as pd
import os
from sklearn.neighbors import NearestNeighbors
from math import sqrt
from sklearn.metrics import mean_squared_error
import math

In [246]:
import os
cwd = os.getcwd()

In [247]:
movie_title = pd.read_csv("/Users/xjshen/data/DSC478_Project/movie_titles.csv", encoding='unicode_escape', usecols=[2], header=None)
movie_title.columns = ['title']
movie_title

Unnamed: 0,title
0,Dinosaur Planet
1,Isle of Man TT 2004 Review
2,Character
3,Paula Abdul's Get Up & Dance
4,The Rise and Fall of ECW
...,...
17765,Where the Wild Things Are and Other Maurice Se...
17766,Fidel Castro: American Experience
17767,Epoch
17768,The Company


In [248]:
movie = pd.read_csv("/Users/xjshen/DSC478/Project/final.csv")
movie.describe()

Unnamed: 0,MovieID,CustomerID,Rating
count,100480500.0,100480500.0,100480500.0
mean,9070.915,1322489.0,3.60429
std,5131.891,764536.8,1.085219
min,1.0,6.0,1.0
25%,4677.0,661198.0,3.0
50%,9051.0,1319012.0,4.0
75%,13635.0,1984455.0,4.0
max,17770.0,2649429.0,5.0


In [249]:
movie_freq = pd.DataFrame(movie.groupby('MovieID').size(),columns=['count'])
threshold = 100

popular_movies = list(set(movie_freq.query('count>=@threshold').index))

# ratings df after dropping non popular movies
data_popular_movies = movie[movie.MovieID.isin(popular_movies)]

print('shape of original data:', movie.shape)
print('shape of data_popular_movies', data_popular_movies.shape)
print("No. of movies which are rated more than 100 times:", len(popular_movies))

shape of original data: (100480507, 4)
shape of data_popular_movies (100400918, 4)
No. of movies which are rated more than 100 times: 16795


In [250]:
user_freq = pd.DataFrame(movie.groupby('CustomerID').size(),columns=['count'])
# A large number of users are rated very rarely, so we can remove those users which are rated less than 1000 times.
threshold = 1000
active_user = list(set(user_freq.query('count>=@threshold').index))
data_popular_movies_active_user = data_popular_movies[data_popular_movies.CustomerID.isin(active_user)]

print('shape of original data:', movie.shape)
print('shape of data_popular_movies', data_popular_movies.shape)
print('shape of data_popular_movies_active_user', data_popular_movies_active_user.shape)
print('No. of users who rated more than 1000 times:', len(active_user))

print('user number of new matrix', len(active_user))
print('movie number of new matrix', len(popular_movies))

shape of original data: (100480507, 4)
shape of data_popular_movies (100400918, 4)
shape of data_popular_movies_active_user (18757426, 4)
No. of users who rated more than 1000 times: 13141
user number of new matrix 13141
movie number of new matrix 16795


In [251]:
print(data_popular_movies_active_user.memory_usage(), '\n')
print("Memory Usage: ", data_popular_movies_active_user.memory_usage().sum() / (1024**2), " MB")

Index         150059408
MovieID       150059408
CustomerID    150059408
Rating        150059408
Date          150059408
dtype: int64 

Memory Usage:  715.5390167236328  MB


In [252]:
data_popular_movies_active_user['MovieID'] = data_popular_movies_active_user['MovieID'].astype('int16')
data_popular_movies_active_user['CustomerID'] = data_popular_movies_active_user['CustomerID'].astype('int32')
data_popular_movies_active_user['Rating'] = data_popular_movies_active_user['Rating'].astype('int8')

cleanedMovie = data_popular_movies_active_user.drop(columns=['Date']).iloc[:20000000]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_popular_movies_active_user['MovieID'] = data_popular_movies_active_user['MovieID'].astype('int16')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_popular_movies_active_user['CustomerID'] = data_popular_movies_active_user['CustomerID'].astype('int32')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-

In [253]:
print("Memory Usage: ", cleanedMovie.memory_usage().sum() / (1024**2), " MB")

Memory Usage:  268.3271312713623  MB


In [254]:
cleanedMovie.to_pickle("cleanedMovie.pkl")

In [255]:
df = pd.read_pickle('/Users/xjshen/data/DSC478_Project/cleanedMovie.pkl')

In [256]:
df

Unnamed: 0,MovieID,CustomerID,Rating
0,1,1488844,3
3,1,30878,4
7,1,1248029,3
19,1,372233,5
20,1,1080361,3
...,...,...,...
100480501,17770,311124,3
100480502,17770,1790158,4
100480504,17770,234275,1
100480505,17770,255278,4


# InitialRatings Result - Rob

In [257]:
user_ratings = [[11283, 5],[4306, 1],[1905, 3],[14691, 5],[14410, 4],[12918, 3],[2862, 4],[15124, 2],[14312, 4],[13728, 3],[6971, 4],[15107, 2],[10042, 3]]

In [258]:
user_ratings

[[11283, 5],
 [4306, 1],
 [1905, 3],
 [14691, 5],
 [14410, 4],
 [12918, 3],
 [2862, 4],
 [15124, 2],
 [14312, 4],
 [13728, 3],
 [6971, 4],
 [15107, 2],
 [10042, 3]]

## user based recommender - KNN
### find best key

In [259]:
# pivot data to user-movie matrix
user_movie_df = df.pivot(index='CustomerID',columns ='MovieID' ,values='Rating').fillna(0)
user_movie_df

MovieID,1,2,3,4,5,6,8,10,11,12,...,17761,17762,17763,17764,17765,17766,17767,17768,17769,17770
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
769,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
1333,0.0,0.0,4.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,...,4.0,3.0,1.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0
1442,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0
2213,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,4.0,0.0,4.0,0.0,0.0,0.0,3.0,0.0,4.0
2455,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,3.0,0.0,3.0,0.0,0.0,0.0,3.0,0.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2648589,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,4.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0
2648734,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,4.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0
2648869,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,4.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0
2648885,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [260]:
movie_names = list(user_movie_df.columns) 
customer_names = list(user_movie_df.index.values) 

In [261]:
user_movie_df_rename = user_movie_df

In [262]:
user_movie_df_rename.columns = range(16795)

In [263]:
user_movie_df_rename = user_movie_df_rename.reset_index()

In [264]:
user_movie_df_rename = user_movie_df_rename.drop(columns=['CustomerID'])
user_movie_df_rename

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16785,16786,16787,16788,16789,16790,16791,16792,16793,16794
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,4.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,...,4.0,3.0,1.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,4.0,0.0,4.0,0.0,0.0,0.0,3.0,0.0,4.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,3.0,0.0,3.0,0.0,0.0,0.0,3.0,0.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13136,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,4.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0
13137,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,4.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0
13138,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,4.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0
13139,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [265]:
users_movie_array = user_movie_df_rename.values
users_movie_array

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 4., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [266]:
from sklearn.neighbors import NearestNeighbors

In [267]:
def cross_validate_user(dataMat, movie, test_ratio, k):

    number_of_users = np.shape(dataMat)[0] 
    rated_items_by_user = np.array([i for i in range(number_of_users) if dataMat[i,movie]>0])
    test_size = math.ceil(test_ratio * len(rated_items_by_user))  # round up the test_size
    test_indices = np.random.randint(0, len(rated_items_by_user), test_size)
    withheld_users = rated_items_by_user[test_indices]
    original_movie_profile = np.copy(dataMat[:, movie])
    dataMat[withheld_users, movie] = 0 # So that the withheld test items is not used in the rating estimation below
    error_u = 0.0
    count_u = len(withheld_users)
    # Compute absolute error for user u over all test items
    knn = NearestNeighbors(metric='cosine',algorithm = 'brute', n_neighbors=k)
    knn.fit(dataMat)
    for user in withheld_users:
        sum_predict = 0
        count_predict = 0 

        # Estimate rating on the withheld item
        neigh_dist, neigh_ind= knn.kneighbors(dataMat[user, :].reshape(1, 16795), n_neighbors=k+1)
        distLst = neigh_dist.tolist()[0][1:]
        indLst = neigh_ind.tolist()[0][1:]

        for j in indLst:
            if original_movie_profile[j] != 0:
                sum_predict += original_movie_profile[j]
                count_predict += 1
        if count_predict == 0:
            continue
        else:
            estimatedScore = sum_predict/count_predict
            error_u = error_u + abs(estimatedScore - original_movie_profile[user])

    # Now restore ratings of the withheld items to the user profile
    for user in withheld_users:
        dataMat[user, movie] = original_movie_profile[user]

    # Return sum of absolute errors and the count of test cases for this user
    # Note that these will have to be accumulated for each user to compute MAE
    return error_u, count_u

In [268]:
def test(dataMat, test_ratio, k):
# Write this function to iterate over all users and for each perform evaluation by calling
# the above cross_validate_user function on each user. MAE will be the ratio of total error 
# across all test cases to the total number of test cases, for all users
    total_error = 0
    cases_number = 0
    movies_number = np.shape(dataMat)[1]
    test_size = math.ceil(test_ratio * movies_number)
    test_indices = np.random.randint(0, movies_number, test_size)
    for movie in test_indices:
        error_user, count_user = cross_validate_user(dataMat, movie, test_ratio, k)
        total_error += error_user
        cases_number += count_user
    MAE = total_error/cases_number
    return MAE

In [269]:
from tqdm import tqdm

KLst = [10, 30, 50]
for k in tqdm(KLst):
    MAE = test(users_movie_array, 0.005, k)
    print(MAE)
    print('-'*40)

 33%|███▎      | 1/3 [18:45<37:31, 1125.62s/it]

0.681615935775738
----------------------------------------


 67%|██████▋   | 2/3 [34:03<16:43, 1003.37s/it]

0.73067419331194
----------------------------------------


100%|██████████| 3/3 [47:22<00:00, 947.55s/it] 

0.7178979223200956
----------------------------------------





k = 10 turns out to be the best result.

## user based recommender - KNN
### make recommendation

In [270]:
# pivot data to user-movie matrix
# redo it to keep column name
user_movie_df = df.pivot(index='CustomerID',columns ='MovieID' ,values='Rating').fillna(0)
user_movie_df

MovieID,1,2,3,4,5,6,8,10,11,12,...,17761,17762,17763,17764,17765,17766,17767,17768,17769,17770
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
769,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
1333,0.0,0.0,4.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,...,4.0,3.0,1.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0
1442,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0
2213,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,4.0,0.0,4.0,0.0,0.0,0.0,3.0,0.0,4.0
2455,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,3.0,0.0,3.0,0.0,0.0,0.0,3.0,0.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2648589,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,4.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0
2648734,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,4.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0
2648869,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,4.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0
2648885,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [271]:
# define the model
from sklearn.neighbors import NearestNeighbors
knn = NearestNeighbors(metric='cosine',algorithm = 'brute', n_neighbors=10)

In [272]:
# create a function which takes a movie name and make recommedation for it
recommendation = {}
def make_recommendation(input_user,data,model,n_recommendation):
    model.fit(data)
    input_user_array = input_user.to_numpy()
    similar_users_list = (model.kneighbors(input_user_array,n_neighbors=n_recommendation+1,return_distance=False)).tolist()
    for i in similar_users_list[0][1:]:
        for j in data.columns:
            ratingLst = []
            if int(input_user[j]) == 0 and data.iloc[i][j] > 3:
                if j not in recommendation:
                    ratingLst.append(data.iloc[i][j])
                    recommendation[j] = ratingLst
                else:
                    recommendation.get(j).append(data.iloc[i][j])
    print("The new user who will like following movies.")
    number = 0
    for k in sorted(recommendation, key=lambda k: len(recommendation[k]), reverse=True):
        if number < 5:
            res = movie_title.loc[k-1]['title']
            print(res)
            number += 1 #recommend top 5 movies

In [273]:
#consert user_ratings from list to dataframe
# create a empty dataframe
column_names = movie_names
df_newUser = pd.DataFrame(columns = column_names)

In [274]:
for i in user_ratings:
    df_newUser.at[0 , i[0]] = i[1]
df_newUser

Unnamed: 0,1,2,3,4,5,6,8,10,11,12,...,17761,17762,17763,17764,17765,17766,17767,17768,17769,17770
0,,,,,,,,,,,...,,,,,,,,,,


In [275]:
#check input: 16795 - 16782 = 13
df_newUser.isna().sum().sum()

16782

In [276]:
newUser_df = df_newUser.fillna(0)
newUser_df

Unnamed: 0,1,2,3,4,5,6,8,10,11,12,...,17761,17762,17763,17764,17765,17766,17767,17768,17769,17770
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [277]:
make_recommendation(newUser_df, user_movie_df, knn, 10)

The new user who will like following movies.
Star Wars: Episode IV: A New Hope
Indiana Jones and the Last Crusade
Lord of the Rings: The Fellowship of the Ring
Finding Nemo (Widescreen)
Star Wars: Episode V: The Empire Strikes Back


## item based recommender - KNN
### find best key

In [278]:
# pivot data to movie-users matrix
movie_users_df = df.pivot(index='MovieID',columns = 'CustomerID',values='Rating').fillna(0)
movie_users_df
# 16795 movies 13141 users

CustomerID,769,1333,1442,2213,2455,2469,2787,2905,2976,3321,...,2647871,2647888,2648287,2648465,2648502,2648589,2648734,2648869,2648885,2649285
MovieID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,...,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17766,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17767,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17768,0.0,0.0,0.0,3.0,3.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17769,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [279]:
movie_users_df_rename = movie_users_df
movie_users_df_rename.columns = range(13141)

In [280]:
movie_users_df_rename = movie_users_df_rename.reset_index()

In [281]:
movie_users_df_rename

Unnamed: 0,MovieID,0,1,2,3,4,5,6,7,8,...,13131,13132,13133,13134,13135,13136,13137,13138,13139,13140
0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16790,17766,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16791,17767,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16792,17768,0.0,0.0,0.0,3.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16793,17769,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [282]:
movie_users_df_rename = movie_users_df_rename.drop(columns=['MovieID'])
movie_users_df_rename

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13131,13132,13133,13134,13135,13136,13137,13138,13139,13140
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,...,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16790,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16791,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16792,0.0,0.0,0.0,3.0,3.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16793,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [283]:
movie_users_array = movie_users_df_rename.values
movie_users_array

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 4., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [284]:
from sklearn.neighbors import NearestNeighbors

In [285]:
def cross_validate_user(dataMat, user, test_ratio, k):

    number_of_items = np.shape(dataMat)[0] 
    rated_items_by_user = np.array([i for i in range(number_of_items) if dataMat[i,user]>0])
    test_size = math.ceil(test_ratio * len(rated_items_by_user))  # round up the test_size
    test_indices = np.random.randint(0, len(rated_items_by_user), test_size)
    withheld_items = rated_items_by_user[test_indices]
    original_user_profile = np.copy(dataMat[:,user])
    dataMat[withheld_items, user] = 0 # So that the withheld test items is not used in the rating estimation below
    error_u = 0.0
    count_u = len(withheld_items)
    # Compute absolute error for user u over all test items
    knn = NearestNeighbors(metric='cosine',algorithm = 'brute', n_neighbors=k)
    knn.fit(dataMat)
    for item in withheld_items:
        sum_predict = 0
        count_predict = 0
        # Estimate rating on the withheld item
        neigh_dist, neigh_ind= knn.kneighbors(dataMat[item, :].reshape(1, 13141), n_neighbors=k+1)
        distLst = neigh_dist.tolist()[0][1:]
        indLst = neigh_ind.tolist()[0][1:]
        for j in indLst:
            if original_user_profile[j] != 0:
                sum_predict += original_user_profile[j]
                count_predict += 1
        if count_predict == 0:
            continue
        else:
            estimatedScore = sum_predict/count_predict   
            error_u = error_u + abs(estimatedScore - original_user_profile[item])

    # Now restore ratings of the withheld items to the user profile
    for item in withheld_items:
        dataMat[item, user] = original_user_profile[item]

    # Return sum of absolute errors and the count of test cases for this user
    # Note that these will have to be accumulated for each user to compute MAE
    return error_u, count_u

In [286]:
def test(dataMat, test_ratio, k):
# Write this function to iterate over all users and for each perform evaluation by calling
# the above cross_validate_user function on each user. MAE will be the ratio of total error 
# across all test cases to the total number of test cases, for all users
    total_error = 0
    cases_number = 0
    users_number = np.shape(dataMat)[1]
    test_size = math.ceil(test_ratio * users_number)
    test_indices = np.random.randint(0, users_number, test_size)
    for user in test_indices:
        error_user, count_user = cross_validate_user(dataMat, user, test_ratio, k)
        total_error += error_user
        cases_number += count_user
    MAE = total_error/cases_number
    return MAE

In [287]:
KLst = [10, 30, 50]
for k in KLst:
    MAE = test(movie_users_array, 0.005, k)
    print(MAE)
    print('-'*40)

0.7159477427334573
----------------------------------------
0.7008209024522387
----------------------------------------
0.7106937697992747
----------------------------------------


k = 30 turns out to be the best result.

## user based recommender - KNN
### make recommendation

In [289]:
# pivot data to movie-users matrix
# redo it to keep column name
movie_users_df = df.pivot(index='MovieID',columns = 'CustomerID',values='Rating').fillna(0)
movie_users_df

CustomerID,769,1333,1442,2213,2455,2469,2787,2905,2976,3321,...,2647871,2647888,2648287,2648465,2648502,2648589,2648734,2648869,2648885,2649285
MovieID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,...,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17766,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17767,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17768,0.0,0.0,0.0,3.0,3.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17769,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [290]:
#convert new user rating to dictionary
newUser_dict = {}
for i in user_ratings:
    newUser_dict[i[0]] = i[1]
newUser_dict    

{11283: 5,
 4306: 1,
 1905: 3,
 14691: 5,
 14410: 4,
 12918: 3,
 2862: 4,
 15124: 2,
 14312: 4,
 13728: 3,
 6971: 4,
 15107: 2,
 10042: 3}

In [291]:
#find top 3 favorite movies of the new user
newUser_dict_sort = dict(sorted(newUser_dict.items(), key=lambda item: item[1],reverse=True))
top3movies = {k: newUser_dict_sort[k] for k in list(newUser_dict_sort)[:3]}
top3movies

{11283: 5, 14691: 5, 14410: 4}

In [292]:
from sklearn.neighbors import NearestNeighbors
knn = NearestNeighbors(metric='cosine',algorithm = 'brute', n_neighbors=k)

In [293]:
def find_similarity(topMovies,data,model,n_recommendation):
    dict_similarity = {}
    model.fit(data)
    for index in topMovies:
        array_index = data.iloc[index].to_numpy()
        neigh_dist, neigh_ind = model.kneighbors(array_index.reshape(1, 13141),n_neighbors=n_recommendation+1)
        neigh_dist_Lst = neigh_dist.tolist()[0][1:] #ignore itself
        neigh_ind_Lst = neigh_ind.tolist()[0][1:]
        for i in range(len(neigh_dist_Lst)):
            if neigh_ind_Lst[i] not in dict_similarity:
                dict_similarity[neigh_ind_Lst[i]] = (1-neigh_dist_Lst[i]) * topMovies.get(index)
            else:
                ratio = dict_similarity.get(neigh_ind_Lst[i])
                new_ratio = (ratio + (1-neigh_dist_Lst[i]) * topMovies.get(index))/2
                dict_similarity[neigh_ind_Lst[i]] = new_ratio
            
    sort_dict = dict(sorted(dict_similarity.items(), key=lambda item: item[1],reverse=True))
    return sort_dict
    

In [294]:
def make_recommendation(topMovies, newUser_rating, data, model, n_recommendation):    
    count = 0
    sort_dict = find_similarity(topMovies,data,model,n_recommendation)
    print("The new user who will like following movies:")
    for movie_ind in sort_dict:
        if count < 5:
            if int(newUser_rating[movie_ind]) == 0:
                res = movie_title.loc[movie_ind-1]['title']
            print(res)
            count += 1
    return

In [295]:
make_recommendation(top3movies, newUser_df, movie_users_df,knn,10)

The new user who will like following movies:
Between Heaven and Hell
Star Trek: The Next Generation: Season 2
Cadillac Man
Snowy River: The McGregor Saga "The Race"
Vampire in Brooklyn
