### Load and Process Training Data

In [2]:
import math

import numpy as np
import pandas as pd

In [249]:
case = 1 #similarity used by the user-based loop: 1 = cosine similarity, 2 = Pearson correlation (vectors are normalized first)

In [365]:
is_IUF = False #set to True to build and use the IUF-weighted rating matrices (IUF Pearson correlation); checked before is_std

In [364]:
is_std = True #set to True to build and use the std-weighted rating matrices (custom "movie controversy" algorithm); only consulted when is_IUF is False

In [303]:
case_modification = True  # set to True to apply case modification: each similarity weight w becomes w * |w|**2.5 before prediction

In [7]:
# Load the whitespace-separated training ratings (no header row in the file).
# sep=r'\s+' replaces delim_whitespace=True, which pandas 2.2 deprecates.
trainData = pd.read_csv('train.txt', sep=r'\s+', header=None)

In [8]:
trainData.head()


Unnamed: 0,0,1,2
0,1,1,5
1,1,2,3
2,1,4,3
3,1,5,3
4,1,6,5


In [9]:
trainData.columns = ["userId", "movieId", "rating"]


In [10]:
trainData.head()

Unnamed: 0,userId,movieId,rating
0,1,1,5
1,1,2,3
2,1,4,3
3,1,5,3
4,1,6,5


In [11]:
# Reshape the long (userId, movieId, rating) table into a user x movie matrix.
# Cells without a rating are NaN; pivot_table's default aggregation (mean) would
# average duplicate (userId, movieId) rows if any exist.
traindf = trainData.pivot_table(index=["userId"],columns=["movieId"],values="rating")

In [12]:
traindf.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,991,992,993,994,995,996,997,998,999,1000
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,,3.0,3.0,5.0,,1.0,5.0,3.0,...,,,,,,,,,,
2,4.0,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,3.0,,,,,,,,,...,,,,,,,,,,


In [318]:
#Run this block when modifying Pearson Correlation with IUF in 1.2
#create a movie map to store IUF for each movie
import math
# IUF weight per movie, indexed by movie id (slot 0 is unused).
# Movies missing from traindf, or rated by nobody, keep the default weight 0.
movieMap = [0] * 1001
for movie in range(1, 1001):
    if movie not in traindf.columns:
        continue
    # number of training users (ids 1..200) holding a positive rating for this movie
    raters = sum(1 for user in range(1, 201) if traindf.at[user, movie] > 0)
    if raters > 0:
        movieMap[movie] = math.log10(200 / raters)


In [346]:
if is_IUF:  # IUF experiment only: training matrix with every positive rating scaled by its movie's IUF weight
    traindf_IUF = traindf.copy()
    for movie in range(1, 1001):
        if movie not in traindf.columns:
            continue
        iuf_weight = movieMap[movie]
        for user in range(1, 201):
            if traindf_IUF.at[user, movie] > 0:
                traindf_IUF.at[user, movie] *= iuf_weight
    

### Load and Process Test Data

In [351]:
# Load the whitespace-separated test ratings (no header row in the file).
# sep=r'\s+' replaces delim_whitespace=True, which pandas 2.2 deprecates.
test5 = pd.read_csv('test5.txt', sep=r'\s+', header=None) #when loading different test data, just change the file name

In [352]:
test5.head()

Unnamed: 0,0,1,2
0,201,237,4
1,201,268,5
2,201,306,5
3,201,331,5
4,201,934,5


In [353]:
test5.columns = ["userId", "movieId", "rating"]

In [354]:
test5.head()

Unnamed: 0,userId,movieId,rating
0,201,237,4
1,201,268,5
2,201,306,5
3,201,331,5
4,201,934,5


In [355]:
# Reshape the test ratings into a user x movie matrix. In this matrix a value of 0
# marks a rating that must be predicted, a positive value is a known rating, and
# NaN means the (user, movie) pair does not occur in the test file.
test5_df = test5.pivot_table(index=["userId"],columns=["movieId"],values="rating")

In [356]:
test5_df.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,989,990,991,993,994,995,996,997,999,1000
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
201,0.0,,,,,,,,,,...,,,,,,,,,,
202,,,,,,,,,,,...,,,,,,,,,,
203,,,,,,,0.0,,0.0,,...,,,,,,,,,,
204,,,,,,,,,,,...,,,,,,,,,,
205,,0.0,,,0.0,,,0.0,,,...,,,,,0.0,,0.0,0.0,0.0,


In [357]:
#one independent, initially empty list of known-rating movie ids per user id (index = userId, 0..500)
test_user_known_movie = [[] for _ in range(501)]

In [358]:
for user in range(201, 301): #for different test data file, could change the range userID
    # collect, in ascending movie-id order, every movie this test user has a positive rating for
    test_user_known_movie[user].extend(
        movie
        for movie in range(1, 1001)
        if movie in test5_df.columns and test5_df.at[user, movie] > 0
    )
                    

In [359]:
test_user_known_movie[201] #use a sample userId to check the array is created correctly

[237, 268, 306, 331, 934]

In [360]:
if is_IUF:  # IUF experiment only: test matrix with every positive rating scaled by its movie's IUF weight
    testdf_IUF = test5_df.copy()
    for movie in range(1, 1001):
        if movie not in testdf_IUF.columns:
            continue
        iuf_weight = movieMap[movie]
        for user in range(201, 301): #change the user range when testing different files
            if testdf_IUF.at[user, movie] > 0:
                testdf_IUF.at[user, movie] *= iuf_weight

In [305]:
# Per-movie population standard deviation (ddof=0) of the training ratings, one entry
# per traindf column. Used as the "controversy" weight of each movie further below.
movie_std = traindf.std(ddof = 0) #used when implementing my own algorithm based on movie controversy
movie_std.to_dict() #displayed for inspection only; the result is not assigned, so movie_std is unchanged

{1: 0.9545008871650057,
 2: 0.6169463812765599,
 3: 0.9568466729604881,
 4: 0.9938079899999065,
 5: 0.924211375534118,
 6: 1.3966450099973928,
 7: 1.0478077005860698,
 8: 0.8647283400240888,
 9: 0.8255803051409876,
 10: 1.135755620017954,
 11: 1.1792326187656943,
 12: 0.8024653371230127,
 13: 0.98752549920002,
 14: 0.9264979425233496,
 15: 0.9687397135727748,
 16: 1.3564659966250538,
 17: 0.9166442529086911,
 18: 1.2000000000000002,
 19: 1.067940011315521,
 20: 1.1357556200179537,
 21: 0.9761703679469163,
 22: 0.8595146491472763,
 23: 0.6376980003071353,
 24: 1.107061848994897,
 25: 1.053058852678366,
 26: 0.7810249675906654,
 27: 0.9162456945817024,
 28: 0.8244707794189572,
 29: 0.8360394355030526,
 30: 0.8975274678557507,
 31: 0.6740919818541088,
 32: 1.2031209415515964,
 33: 1.0198039027185568,
 34: 0.4714045207910317,
 35: 0.0,
 36: 0.5,
 37: 0.5,
 38: 0.9674453024451255,
 39: 0.890448992522325,
 40: 0.9433981132056604,
 41: 1.2453996981544782,
 42: 1.0679400113155209,
 43: 0.68718

In [366]:
if is_std:  # training matrix weighted by movie controversy: the larger the std, the larger the weight
    traindf_std = traindf.copy()
    for movie in range(1, 1001):
        if movie not in traindf.columns:
            continue
        controversy_weight = math.log10(1 + movie_std[movie])
        for user in range(1, 201):
            if traindf_std.at[user, movie] > 0:
                traindf_std.at[user, movie] *= controversy_weight

In [367]:
if is_std:  # test matrix weighted by movie controversy: the larger the std, the larger the weight
    testdf_std = test5_df.copy()
    for movie in range(1, 1001):
        # the weight comes from the training data, so the movie must exist in both matrices
        if movie not in testdf_std.columns or movie not in traindf.columns:
            continue
        controversy_weight = math.log10(1 + movie_std[movie])
        for user in range(201, 301): #change the user range when using different test files
            if testdf_std.at[user, movie] > 0:
                testdf_std.at[user, movie] *= controversy_weight
    

### Some helper functions

In [29]:
#calculate cosine similarity
def cal_cosine(vector1, vector2):
    """Return the cosine similarity of two equal-length numeric vectors.

    Single-dimension edge case: two 1-element vectors always have cosine +/-1,
    so a second dimension is added based on how far apart the two values are:
    identical values return 1 directly, a gap of 3 or more pads with (1, -1),
    any other gap pads with (1, 1).

    The inputs are never modified; padding is applied to private copies.
    If either vector has zero norm the similarity is undefined and 0.0 is
    returned instead of NaN.
    """
    # Copies keep the padding below from leaking into the caller's lists.
    left = list(vector1)
    right = list(vector2)
    if len(left) == 1:
        gap = abs(left[0] - right[0])
        if gap == 0:
            return 1
        left.append(1)
        right.append(-1 if gap >= 3 else 1)
    numerator = np.dot(left, right)
    denominator = np.sqrt(np.dot(left, left)) * np.sqrt(np.dot(right, right))
    if denominator == 0:
        # zero-norm vector: avoid 0/0 -> NaN, which would poison sorting and prediction
        return 0.0
    return numerator / denominator

In [30]:
def getK(similarity): #define k for top k similar user
    """Return how many of the most similar neighbours to use for a prediction.

    similarity is a sequence of (id, similarity) pairs. With fewer than 15
    entries all of them are used; otherwise k is the number of entries whose
    absolute similarity exceeds 0.9, but never less than 15.
    """
    available = len(similarity)
    if available < 15:
        return available
    strong = sum(1 for entry in similarity if abs(entry[1]) > 0.9)
    return strong if strong > 15 else 15

In [300]:
def predict_rating(similarity, k, movie): #predict rating based on weighted average in basic cosine similarity
    """Predict a rating as the similarity-weighted average of neighbour ratings.

    similarity: (train_user, weight) pairs; only the first k entries are used.
    movie: movie id whose rating is read from the global traindf.
    When the global case_modification flag is set, every weight w is replaced
    by w * |w|**2.5 first. The caller's similarity list is not modified.
    Returns the weighted average passed through round().
    """
    if case_modification: #apply case_modification to similarity/weight
        weights = [entry[1] * math.pow(abs(entry[1]), 2.5) for entry in similarity]
    else:
        weights = [entry[1] for entry in similarity]

    weighted_total = 0
    weight_total = 0
    for idx in range(k):
        neighbour = similarity[idx][0]
        weighted_total += weights[idx] * traindf.at[neighbour, movie]
        weight_total += weights[idx]
    return round(weighted_total / weight_total)

In [32]:
def get_avg_rating(test_user_known_movie, user): #get test user's average rating with rounded value
    """Return the rounded mean of the user's known test ratings.

    test_user_known_movie[user] lists the movie ids the user has rated; the
    ratings themselves are read from the global test5_df.
    """
    known_movies = test_user_known_movie[user]
    rating_total = 0
    for movie_id in known_movies:
        rating_total = rating_total + test5_df.at[user, movie_id]
    return round(rating_total / len(known_movies))

In [33]:
def get_test_avg(test_user, testdf): # get test user's average rating without round 
    """Return the unrounded mean of the positive ratings in testdf's row for test_user.

    Entries that are 0 or NaN are ignored. Raises ZeroDivisionError when the
    row holds no positive rating.
    """
    rating_total, rated = 0, 0
    for value in testdf.loc[test_user]:
        if not value > 0:  # skips 0 placeholders and NaN alike
            continue
        rating_total = rating_total + value
        rated = rated + 1
    return rating_total / rated

In [34]:
def normalize_vector(train_vector, test_vector, traindf, train_user, testdf, test_user): #normalize rating for Pearson Correlation
    """Mean-center both rating vectors in place for the Pearson correlation.

    train_vector / test_vector: lists of ratings on the co-rated movies; they
        are modified in place and nothing is returned.
    train_user / test_user: row labels used to look up each user's mean rating.
    traindf / testdf: plain rating matrices, used only when neither the global
        is_IUF nor is_std flag is set; otherwise the matching global weighted
        matrices (traindf_IUF/testdf_IUF or traindf_std/testdf_std) are used.
    """
    if is_IUF:
        train_avg = get_train_avg(train_user, traindf_IUF)
        test_avg = get_test_avg(test_user, testdf_IUF)
    elif is_std:
        train_avg = get_train_avg(train_user, traindf_std)
        test_avg = get_test_avg(test_user, testdf_std)
    else:
        train_avg = get_train_avg(train_user, traindf)
        test_avg = get_test_avg(test_user, testdf)

    # Slice assignment rewrites the caller's lists. Subtracting from the loop
    # variable (`for rating in v: rating -= avg`) only rebinds a local name and
    # leaves the lists untouched.
    train_vector[:] = [rating - train_avg for rating in train_vector]
    test_vector[:] = [rating - test_avg for rating in test_vector]

In [35]:
def isZero(rating_vector): #check number in vector are all 0s or not
    """Return True when every entry of rating_vector equals 0 (also for an empty vector).

    Negative entries count as non-zero: a mean-centered vector such as
    [-1.0, -0.5] is a valid direction and must not be reported as all zeros.
    """
    return not any(rating != 0 for rating in rating_vector)

In [36]:
def get_train_avg(train_user, traindf): #get the average rating for user in trainning data 
    """Return the mean of the positive ratings in traindf's row for train_user.

    NaN and non-positive entries are ignored. Raises ZeroDivisionError when
    the row holds no positive rating.
    """
    rated_values = [value for value in traindf.loc[train_user] if value > 0]
    running_total = 0
    for value in rated_values:
        running_total += value
    return running_total / len(rated_values)

In [37]:
def predict_pearson_rating(similarity, k, movie, test_user,test5_df, traindf): #predict rating on Pearson Correlation
    """Predict a rating with the Pearson (mean-offset) weighted formula.

    prediction = active user's mean
                 + sum(w * (neighbour rating - neighbour mean)) / sum(|w|)
    over the first k (train_user, w) pairs of similarity.

    When the global case_modification flag is set, each weight w is replaced
    by w * |w|**2.5. If the used weights sum to zero in absolute value the
    neighbours carry no information and the active user's rounded mean is
    used instead of dividing by zero. The result is clamped to 1..5.
    The caller's similarity list is not modified.
    """
    active_avg = get_test_avg(test_user, test5_df)
    deviation_sum = 0
    weight_sum = 0
    for idx in range(k):
        neighbour = similarity[idx][0]
        weight = similarity[idx][1]
        if case_modification: #apply case_modification to similarity/weight
            weight = weight * math.pow(abs(weight), 2.5)
        deviation_sum += weight * (traindf.at[neighbour, movie] - get_train_avg(neighbour, traindf))
        weight_sum += abs(weight)

    if weight_sum == 0:
        # every neighbour has weight 0: dividing would give NaN and round() would fail
        result = round(active_avg)
    else:
        result = round(active_avg + deviation_sum / weight_sum)

    #edge case: keep the prediction inside the valid rating range 1..5
    return min(5, max(1, result))
    

### Algorithms -- User-based Collaborative Filtering

In [368]:
#user-based collaborative filtering based on basic cosine similarity (case 1) or Pearson correlation (case 2)
# For every (user, movie) cell of the test matrix holding the placeholder 0, gather the
# training users who rated that movie and share at least one rated movie with the test
# user, score their similarity, and predict from the k most similar ones.
# Predictions accumulate in result5 as (userId, movieId, rating) tuples.
result5 = []
for user in range(201, 301): #change the range when testing different files
    for movie in range(1, 1001):
        if movie in test5_df.columns and test5_df.at[user, movie] == 0:#a rating of 0 marks a movie to predict
            similarity = [] #(train_user, similarity) pairs for this (user, movie)
            for train_user in range(1, 201):
                # ratings of the two users on the movies both of them have rated
                test_user_vector = []
                train_user_vector = []
                if movie in traindf.columns and traindf.at[train_user, movie] > 0:#only training users who rated the movie to predict
                    for m in test_user_known_movie[user]:
                        if m in traindf.columns and traindf.at[train_user, m] > 0:
                            # the weighted matrices are used for either value of `case`; is_IUF takes precedence over is_std
                            if (is_IUF):#use IUF rating dataframe when required
                                train_user_vector.append(traindf_IUF.at[train_user, m])
                                test_user_vector.append(testdf_IUF.at[user, m])
                            elif (is_std):#use std rating dataframe when calculate based on movie controversy
                                train_user_vector.append(traindf_std.at[train_user, m])
                                test_user_vector.append(testdf_std.at[user, m])
                            else:
                                train_user_vector.append(traindf.at[train_user, m])
                                test_user_vector.append(test5_df.at[user, m])
                    if len(train_user_vector) > 0: #only keep training users with at least one co-rated movie
                        if case == 2: 
                            # intended to mean-center both vectors in place before the cosine (Pearson correlation)
                            normalize_vector(train_user_vector, test_user_vector, traindf, train_user, test5_df, user)
                            if isZero(train_user_vector) or isZero(test_user_vector): #edge case: skip this neighbour when isZero reports a zero vector
                                continue
                            else:
                                similarity.append((train_user, cal_cosine(train_user_vector, test_user_vector)))
                        elif case == 1:
                            similarity.append((train_user, cal_cosine(train_user_vector, test_user_vector)))
            if len(similarity) > 0:
                # strongest neighbours first, ranked by absolute similarity so strong negative correlations count too
                similarity.sort(key=lambda x:abs(x[1]), reverse = True)
                k = getK(similarity)
                if case == 1:
                    result5.append((user, movie, predict_rating(similarity, k, movie)))
                if case == 2:
                    result5.append((user, movie, predict_pearson_rating(similarity, k, movie, user, test5_df, traindf)))
            else: #if can't find any eligible similar user, use average rating for the user
                result5.append((user, movie, get_avg_rating(test_user_known_movie, user)))
            

### Algorithms -- Item-based Collaborative Filtering

In [835]:
def cal_adjust_cosine(movie, m, similarity):
    """Append the adjusted-cosine similarity between movie and m to similarity.

    For every training user (ids 1..200) who rated both movies in the global
    traindf, both ratings are offset by that user's mean rating. If at least
    one such user exists and isZero accepts both offset vectors, the pair
    (m, cosine) is appended. The same list object is returned.
    """
    #offset ratings of the movie to predict / of the known movie m, aligned by user
    target_offsets = []
    known_offsets = []
    if movie in traindf.columns and m in traindf.columns:
        for train_user in range(1, 201):
            target_rating = traindf.at[train_user, movie]
            known_rating = traindf.at[train_user, m]
            if target_rating > 0 and known_rating > 0:
                user_avg = get_train_avg(train_user, traindf)
                target_offsets.append(target_rating - user_avg)
                known_offsets.append(known_rating - user_avg)
    if target_offsets and not isZero(known_offsets) and not isZero(target_offsets):
        similarity.append((m, cal_cosine(known_offsets, target_offsets)))
    return similarity

In [836]:
def predict_item_rating(similarity, movie, user):
    """Predict the user's rating from item-item similarities.

    similarity: (known_movie, weight) pairs for movies the user has rated.
    movie: id of the movie being predicted (not used in the computation).
    user: test user id; known ratings are read from the global test5_df.

    prediction = user's mean + sum(w * (rating - mean)) / sum(|w|).
    If the weights sum to zero in absolute value (including an empty list)
    the user's rounded mean is used instead of dividing by zero. The result
    is clamped to 1..5.
    """
    active_avg = get_test_avg(user, test5_df)
    deviation_sum = 0
    weight_sum = 0
    for entry in similarity:
        known_movie, weight = entry[0], entry[1]
        deviation_sum += weight * (test5_df.at[user, known_movie] - active_avg)
        weight_sum += abs(weight)

    if weight_sum == 0:
        # no informative neighbour: dividing would give NaN (or raise) and round() would fail
        result = round(active_avg)
    else:
        result = round(active_avg + deviation_sum / weight_sum)

    #edge case: keep the prediction inside the valid rating range 1..5
    return min(5, max(1, result))

In [866]:
result5 = []
for user in range(201, 301): #change the range when testing different files
    for movie in range(1, 1001):
        # only cells holding the placeholder 0 need a prediction
        if movie not in test5_df.columns or test5_df.at[user, movie] != 0:
            continue
        #(known movie id, similarity to the movie being predicted) for this test user
        similarity = []
        for m in test_user_known_movie[user]:
            #only calculate similarity with known rating movie for the test user
            similarity = cal_adjust_cosine(movie, m, similarity)
        if similarity:
            predicted = predict_item_rating(similarity, movie, user)
        else:
            #if can't find any eligible similar movie, use user's average rating for the movie
            predicted = get_avg_rating(test_user_known_movie, user)
        result5.append((user, movie, predicted))

### Algorithms -- The Slope One Algorithm

In [143]:
def get_avg_deviation(movie, user): #get avg deviation for current movie to predict with all the rating movie from test user 
    """Return per-movie average deviations of movie against the user's known movies.

    The result is a list of 1001 lists indexed by movie id. For each known
    movie m of the test user that shares at least one training user (ids
    1..200 in the global traindf) with movie, entry m is
    [mean(rating(movie) - rating(m)), number of such users]; every other
    entry stays empty.
    """
    movie_dev_map = [[] for _ in range(1001)]
    columns = traindf.columns
    for m in test_user_known_movie[user]:
        if m not in columns or movie not in columns:
            continue
        pair_count, dev_total = 0, 0
        for train_user in range(1, 201):
            target_rating = traindf.at[train_user, movie]
            if not target_rating > 0:
                continue
            known_rating = traindf.at[train_user, m]
            if known_rating > 0:
                pair_count += 1
                dev_total += target_rating - known_rating
        if pair_count != 0:
            movie_dev_map[m] += [dev_total / pair_count, pair_count]
    return movie_dev_map
                

In [178]:
def predict_weighted_slope_one(movie, user): #weighted slope one
    """Predict the user's rating for movie with the weighted Slope One scheme.

    Each known movie m of the user contributes (rating(m) + avg deviation)
    weighted by the number of training users behind that deviation. A known
    movie without any deviation data contributes its raw rating with weight 1.
    The rounded weighted mean is clamped to 1..5.
    """
    movie_dev_map = get_avg_deviation(movie, user)
    weighted_sum = 0
    num_user = 0

    for m in test_user_known_movie[user]:
        own_rating = test5_df.at[user, m]
        stats = movie_dev_map[m]
        if not stats:
            # no training user rated both m and movie: deviation 0, weight 1
            weighted_sum += own_rating
            num_user += 1
            continue
        avg_dev, support = stats[0], stats[1]
        weighted_sum += (own_rating + avg_dev) * support
        num_user += support

    result = round(weighted_sum / num_user)
    return min(5, max(1, result))

In [203]:
# one (userId, movieId, predicted rating) tuple per test cell holding the placeholder 0
result5 = [
    (user, movie, predict_weighted_slope_one(movie, user))
    for user in range(201, 301) #change the range when testing different files
    for movie in range(1, 1001)
    if movie in test5_df.columns and test5_df.at[user, movie] == 0
]

### Check Result and Write to Output File

In [369]:
result5

[(201, 1, 4),
 (201, 111, 4),
 (201, 283, 4),
 (201, 291, 3),
 (201, 305, 4),
 (201, 361, 5),
 (201, 475, 4),
 (201, 740, 3),
 (202, 259, 3),
 (202, 288, 3),
 (202, 294, 3),
 (202, 682, 3),
 (202, 876, 2),
 (202, 880, 3),
 (202, 887, 4),
 (202, 895, 3),
 (202, 948, 2),
 (203, 7, 4),
 (203, 9, 4),
 (203, 111, 4),
 (203, 118, 3),
 (203, 121, 4),
 (203, 123, 3),
 (203, 127, 4),
 (203, 129, 4),
 (203, 148, 3),
 (203, 181, 4),
 (203, 222, 4),
 (203, 235, 3),
 (203, 240, 3),
 (203, 258, 4),
 (203, 274, 4),
 (203, 276, 4),
 (203, 284, 3),
 (203, 291, 3),
 (203, 370, 2),
 (203, 410, 3),
 (203, 471, 4),
 (203, 472, 3),
 (203, 477, 3),
 (203, 515, 4),
 (203, 546, 3),
 (203, 597, 3),
 (203, 748, 3),
 (203, 845, 4),
 (203, 864, 3),
 (203, 866, 3),
 (203, 925, 3),
 (203, 928, 3),
 (204, 22, 4),
 (204, 66, 4),
 (204, 243, 3),
 (204, 245, 3),
 (204, 258, 4),
 (204, 259, 2),
 (204, 269, 4),
 (204, 270, 4),
 (204, 289, 3),
 (204, 294, 3),
 (204, 300, 4),
 (204, 302, 4),
 (204, 307, 4),
 (204, 310, 4),


In [370]:
#write result to output txt files, one "userId movieId rating" line per prediction
# The with-block closes the file even when a write fails part-way through.
with open('output.txt', 'w') as f:
    for ele in result5:
        line = ' '.join(str(x) for x in ele)
        f.write(line + '\n')

### Validate Result Files

In [408]:
# Reload a saved prediction file for validation. sep=r'\s+' replaces
# delim_whitespace=True, which pandas 2.2 deprecates.
resultData = pd.read_csv('result5.txt', sep=r'\s+', header=None)
resultData.columns = ["userId", "movieId", "rating"]
resultData.head()

Unnamed: 0,userId,movieId,rating
0,201,1,5
1,201,111,5
2,201,283,5
3,201,291,4
4,201,305,5


In [409]:

resultData.info() #check if missing some rows|

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7997 entries, 0 to 7996
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   userId   7997 non-null   int64
 1   movieId  7997 non-null   int64
 2   rating   7997 non-null   int64
dtypes: int64(3)
memory usage: 187.6 KB


In [410]:
boollist = resultData['rating'].isnull() #boolean mask of rows whose rating is missing
resultData[boollist] #an empty frame means no rating is null

Unnamed: 0,userId,movieId,rating


In [411]:
resultData.loc[resultData['rating'] > 5] #validate rating

Unnamed: 0,userId,movieId,rating


In [412]:
resultData.loc[resultData['rating'] <= 0] #vaidate rating

Unnamed: 0,userId,movieId,rating


### Combining multiple algorithms' results to get an average rating

In [434]:
# sep=r'\s+' replaces delim_whitespace=True, which pandas 2.2 deprecates.
result5_my_alogrithm = pd.read_csv('result5.txt', sep=r'\s+', header=None)#change file name when combining different files
result5_my_alogrithm.columns = ["userId", "movieId", "rating"]
result5_my_alogrithm.head()

Unnamed: 0,userId,movieId,rating
0,201,1,5
1,201,111,5
2,201,283,5
3,201,291,4
4,201,305,5


In [435]:
# sep=r'\s+' replaces delim_whitespace=True, which pandas 2.2 deprecates.
result5_slope = pd.read_csv('result5_slope.txt', sep=r'\s+', header=None)
result5_slope.columns = ["userId", "movieId", "rating"]
result5_slope.head()

Unnamed: 0,userId,movieId,rating
0,201,1,5
1,201,111,4
2,201,283,5
3,201,291,4
4,201,305,5


In [436]:
# sep=r'\s+' replaces delim_whitespace=True, which pandas 2.2 deprecates.
result5_cosine = pd.read_csv('result5_cosine.txt', sep=r'\s+', header=None)
result5_cosine.columns = ["userId", "movieId", "rating"]
result5_cosine.head()

Unnamed: 0,userId,movieId,rating
0,201,1,4
1,201,111,4
2,201,283,4
3,201,291,3
4,201,305,4


In [439]:
# Average the three algorithms' predictions row by row. Rows are matched purely by
# position; userId and movieId are taken from the first file.
result5 = []
for row in range(len(result5_my_alogrithm)): # each
    base_row = result5_my_alogrithm.iloc[row]
    rating_total = base_row["rating"] + result5_slope.iloc[row]["rating"] + result5_cosine.iloc[row]["rating"]
    combine_rating = round(rating_total / 3)
    result5.append((base_row["userId"], base_row["movieId"], combine_rating))

In [440]:
result5

[(201, 1, 5),
 (201, 111, 4),
 (201, 283, 5),
 (201, 291, 4),
 (201, 305, 5),
 (201, 361, 5),
 (201, 475, 5),
 (201, 740, 4),
 (202, 259, 3),
 (202, 288, 3),
 (202, 294, 3),
 (202, 682, 3),
 (202, 876, 3),
 (202, 880, 3),
 (202, 887, 4),
 (202, 895, 3),
 (202, 948, 2),
 (203, 7, 4),
 (203, 9, 4),
 (203, 111, 4),
 (203, 118, 3),
 (203, 121, 4),
 (203, 123, 3),
 (203, 127, 4),
 (203, 129, 4),
 (203, 148, 4),
 (203, 181, 4),
 (203, 222, 4),
 (203, 235, 3),
 (203, 240, 3),
 (203, 258, 4),
 (203, 274, 4),
 (203, 276, 4),
 (203, 284, 3),
 (203, 291, 3),
 (203, 370, 2),
 (203, 410, 3),
 (203, 471, 4),
 (203, 472, 3),
 (203, 477, 3),
 (203, 515, 4),
 (203, 546, 3),
 (203, 597, 3),
 (203, 748, 3),
 (203, 845, 4),
 (203, 864, 3),
 (203, 866, 4),
 (203, 925, 4),
 (203, 928, 3),
 (204, 22, 4),
 (204, 66, 4),
 (204, 243, 3),
 (204, 245, 3),
 (204, 258, 4),
 (204, 259, 3),
 (204, 269, 4),
 (204, 270, 4),
 (204, 289, 3),
 (204, 294, 3),
 (204, 300, 4),
 (204, 302, 4),
 (204, 307, 4),
 (204, 310, 4),


In [441]:
#write result to output txt files, one "userId movieId rating" line per prediction
# The with-block closes the file even when a write fails part-way through.
with open('output.txt', 'w') as f:
    for ele in result5:
        line = ' '.join(str(x) for x in ele)
        f.write(line + '\n')