In [75]:
import pandas as pd
import math
import numpy as np
import math
from sklearn.cross_validation import train_test_split # to split the dataset for training and testing

In [76]:
# getting the ratings data in pandas dataframe
ratings = pd.read_csv('../Yahoo_movies_multi-criteria/data_movies.txt', sep='\t',names=['user_id', 'criterion1', 'criterion2', 'criterion3', 'criterion4', 'overall', 'movie_id', 'num'])
# train, test = sample_split(ratings)

# print(train)
# print(test)

In [77]:
ratings.user_id.drop_duplicates().count() #number of users

6078

In [78]:
# split the datasets into smaller datasets
def split_dataset(ratings):
        df_group = ratings.groupby(['user_id'])
        list_datasets = list() # list containing all the small datasets
        temp = pd.DataFrame()
        count=0
        flag=0
        for key, item in df_group:
            count+=1
            temp = temp.append(item)
            if count==1000:
                if(len(list_datasets)==5 and flag!=1):
                    flag=1
                    count = count - 78
                    continue
                list_datasets.append(temp)
                temp = pd.DataFrame()
                count=0
        return list_datasets

In [80]:
# Spliting a sample into groups userwise
def sample_split(dataFrame):
    df_group = dataFrame.groupby('user_id')
    train = pd.DataFrame()
    test = pd.DataFrame()
    
    for key, item in df_group:
        train, test = split_train_test(item, train, test)
    
    return train, test

In [81]:
# spliting sample in 70% training data and 30% testing data
def split_train_test(dataFrame, train, test):
    temp_train, temp_test = train_test_split(dataFrame, test_size = 0.3, random_state=1212)# in this our main data is split into train and test
    # the attribute test_size=0.3 splits the data into 70% and 30% ratio. train=70% and test=30%
    train = train.append(temp_train)
    test = test.append(temp_test)
    
    return train, test

In [82]:
# list conatining all the datasets
list_datasets = split_dataset(ratings)

In [83]:
# splitting into train and test data
train, test = sample_split(list_datasets[0])

In [84]:
# function to find similarity between two users based on Manhattan distance
def manhattan_similarity(df, user1, user2):
    s = pd.merge(df[df['user_id']==user1], df[df['user_id']==user2], how="inner", on="movie_id")
#     print(s)
    sum1 = 0
    similarity=0
    for index, row in s.iterrows():
        sum1 += abs(row.criterion1_x - row.criterion1_y) + \
                abs(row.criterion2_x - row.criterion2_y) + \
                abs(row.criterion3_x - row.criterion3_y) + \
                abs(row.criterion4_x - row.criterion4_y) + \
                abs(row.overall_x - row.overall_y)
        distance = sum1/s.shape[0]
        similarity = 1/(1+distance)
    return similarity
        
manhattan_similarity(train, 1, 3)

0.06666666666666667

In [131]:
# function to find simialrity between two users based on Euclidean distance
def euclidean_similarity(df, user1, user2):
    s = pd.merge(df[df['user_id']==user1], df[df['user_id']==user2], how="inner", on="movie_id")
#     print(s)
    sum1 = 0
    similarity=0
    for index, row in s.iterrows():
        sum1 += math.sqrt((row.criterion1_x - row.criterion1_y)**2 + \
                (row.criterion2_x - row.criterion2_y)**2 + \
                (row.criterion3_x - row.criterion3_y)**2 + \
                (row.criterion4_x - row.criterion4_y)**2) + \
                (row.overall_x - row.overall_y)**2
        distance = sum1/s.shape[0]
        similarity = 1/(1+distance)
    return similarity
euclidean_similarity(train, 1, 3)

0.06319010128182817

In [132]:
# function to find simialrity between two users based on Chebyshev distance
def chebyshev_similarity(df, user1, user2):
    s = pd.merge(df[df['user_id']==user1], df[df['user_id']==user2], how="inner", on="movie_id")
#     print(s)
    sum1 = 0
    similarity=0
    for index, row in s.iterrows():
        sum1 += max(abs(row.criterion1_x - row.criterion1_y), \
                abs(row.criterion2_x - row.criterion2_y), \
                abs(row.criterion3_x - row.criterion3_y), \
                abs(row.criterion4_x - row.criterion4_y), \
                abs(row.overall_x - row.overall_y))
        distance = sum1/s.shape[0]
        similarity = 1/(1+distance)
    return similarity
        
chebyshev_similarity(train, 1, 3)

0.2222222222222222

In [172]:
# function to predict the rating given by user to item
# neighbours = -1 implies that consider similarity with all possible users
def predict(df, user, item, similarity, neighbours = -1): # df is the train dataset
    neighbours_data_list = list()
    for v in df.user_id.unique():
        if(v==user): # not including the user itself
            continue
        temp = df[df['user_id']==v]
        temp = temp[temp['movie_id']==item]
        if(temp.empty): # user 'v' has not rated the item
            continue
        else:
            rate = temp.iloc[0].overall
        sim = similarity(df, user, v) # find appropriate similarity measure between the two users
        neighbours_data_list.append((sim, rate))
        
    # sort the neighbours_data_list in descending order based on rate
    neighbours_data_list.sort(reverse=True)
    
    # crop the list to the number of neighbours given in the argument
    if(neighbours!=-1):
        length = len(neighbours_data_list)
        neighbours_data_list = neighbours_data_list[:min(neighbours, length)]
    
    # predict the rating using collaborative filtering formula
    numerator = 0
    denominator = 0
    for tup in neighbours_data_list:
        numerator += tup[0]*tup[1]
        denominator += tup[0]
    predicted_rating = numerator/denominator
    return predicted_rating
        

In [96]:
print(test.head(10))

    user_id  criterion1  criterion2  criterion3  criterion4  overall  \
3         1           6           6           6           5        5   
4         1          10          11          10           9       10   
24        1           3          10           6           6        3   
45        1           1           3           3           1        1   
7         1          12          12          10          12       11   
43        1           8          11          10          11       11   
39        1           4           8           6          12        3   
29        1           7           8           6           9        6   
9         1          12           8           8          10       10   
28        1          12          11          13          11       12   

    movie_id  num  
3         86    4  
4        132    5  
24       581   25  
45       919   46  
7        191    8  
43       879   44  
39       842   40  
29       685   30  
9        232   10  
28     

In [179]:
# print(train)
print(predict(train, 1, 879, euclidean_similarity))
print(predict(train, 1, 879, manhattan_similarity))
print(predict(train, 1, 879, chebyshev_similarity))

10.37336757999115
9.959976730657361
9.569398389388915
