In [1]:
import pandas as pd
import numpy as np
import os
import sys
import pickle
import time

from sklearn.metrics import mean_squared_error
import math
import statistics as stat

# Loading Train and test sets 

In [2]:
def load_csv(filename):

    df = pd.read_csv(filename)
    df['User'] = df['Id'].apply(lambda x: int(x.split('_')[0][1:]))
    df['Movie'] = df['Id'].apply(lambda x: int(x.split('_')[1][1:]))
    df['Rating'] = df['Prediction']
    df = df.drop(['Id', 'Prediction'], axis=1)
    return df

In [3]:
train = load_csv('../data/data/data_train.csv')
test = load_csv('../data/data/sampleSubmission.csv')

In [4]:
train.head()

Unnamed: 0,User,Movie,Rating
0,44,1,4
1,61,1,3
2,67,1,4
3,72,1,3
4,86,1,5


In [5]:
list(train.columns.values)

['User', 'Movie', 'Rating']

### Useful global variables :

In [6]:
median = train['Rating'].median()
mean = train['Rating'].mean()

# Creating users and movies dataframe

To be filled with interesting informations (such as mean, median, ...)

In [7]:
users = pd.DataFrame(index = range(10000), columns=['User', 'Mean'])
movies = pd.DataFrame(index = range(1000), columns=['Movie', 'Mean'])

In [8]:
for i in range(0, 10000):
    users.set_value(i,'User',i+1)
    
for i in range(0, 1000):
    movies.set_value(i,'Movie',i+1)

### Set of useful functions to get intersting informations about users and movies

In [9]:
def dict_mean_user(df):
    """ dictionary with key UserID and value User Mean """
    return dict(df.groupby('User').mean().Rating)

def dict_mean_movie(df):
    """ dictionary with key UserID and value Movie Mean """
    return dict(df.groupby('Movie').mean().Rating)

def dict_median_user(df):
    """ dictionary with key UserID and value User Mean """
    return dict(df.groupby('User').median().Rating)

def dict_median_movie(df):
    """ dictionary with key UserID and value Movie Mean """
    return dict(df.groupby('Movie').median().Rating)

In [10]:
mean_u = dict_mean_user(train)

for i in range(0,10000):
    users.set_value(i,'Mean',list(mean_u.values())[i])

In [11]:
mean_m = dict_mean_movie(train)

for i in range(1000):
    movies.set_value(i,'Mean',list(mean_m.values())[i])

In [12]:
median_u = dict_median_user(train)

for i in range(0,10000):
    users.set_value(i,'Median',list(median_u.values())[i])

In [13]:
median_m = dict_median_movie(train)

for i in range(1000):
    movies.set_value(i,'Median',list(median_m.values())[i])

### Reg Lin for Movie Ratings

Investigated as there was a high correlation between movie mean and number of rating

In [14]:
def dict_nbrating_movie(df):
    """ dictionary with key UserID and value Movie Mean """
    return dict(df.groupby('Movie').count().Rating)

In [15]:
nbrating_m = dict_nbrating_movie(train)

for i in range(1000):
    movies.set_value(i,'Nb Ratings',list(nbrating_m.values())[i])

In [16]:
m, b = np.polyfit(movies['Nb Ratings'],movies['Mean'], 1)

In [17]:
for i in range(1000):
    movies.set_value(i,'Reg Lin Value',m * movies['Nb Ratings'][i] + b)

# Updating train dataframe and computing RMSE

** / ! \ Next window takes time to execute **

In [18]:
for index, row in train.iterrows():
    train.set_value(index, 'User_Mean', users.iloc[int(train.iloc[index]['User'])-1]['Mean'])
    train.set_value(index, 'Movie_Mean', movies.iloc[int(train.iloc[index]['Movie'])-1]['Mean'])
    
    train.set_value(index, 'Global_Mean', mean)
    train.set_value(index, 'Global_Median', median)

    train.set_value(index, 'User_Median', users.iloc[int(train.iloc[index]['User'])-1]['Median'])
    train.set_value(index, 'Movie_Median', movies.iloc[int(train.iloc[index]['Movie'])-1]['Median'])
    
    train.set_value(index, 'Movie_RegLin', movies.iloc[int(train.iloc[index]['Movie'])-1]['Reg Lin Value'])

In [19]:
rmse_user_mean = math.sqrt(mean_squared_error(train['User_Mean'], train['Rating']))
rmse_movie_mean = math.sqrt(mean_squared_error(train['Movie_Mean'], train['Rating']))

rmse_global_mean = math.sqrt(mean_squared_error(train['Global_Mean'], train['Rating']))
rmse_global_median =  math.sqrt(mean_squared_error(train['Global_Median'], train['Rating']))

rmse_user_median = math.sqrt(mean_squared_error(train['User_Median'], train['Rating']))
rmse_movie_median = math.sqrt(mean_squared_error(train['Movie_Median'], train['Rating']))

rmse_movie_reg_lin = math.sqrt(mean_squared_error(train['Movie_RegLin'], train['Rating']))

In [20]:
train.head()

Unnamed: 0,User,Movie,Rating,User_Mean,Movie_Mean,Global_Mean,Global_Median,User_Median,Movie_Median,Movie_RegLin
0,44,1,4,3.893701,3.379412,3.857281,4.0,4.0,3.0,3.328395
1,61,1,3,4.233696,3.379412,3.857281,4.0,4.5,3.0,3.328395
2,67,1,4,3.580645,3.379412,3.857281,4.0,4.0,3.0,3.328395
3,72,1,3,3.861423,3.379412,3.857281,4.0,4.0,3.0,3.328395
4,86,1,5,4.103321,3.379412,3.857281,4.0,4.0,3.0,3.328395


### Correction for users ratings :

In [21]:
users['Difference_To_Mean'] = users['Mean'].mean() - users['Mean']
users.head()

Unnamed: 0,User,Mean,Median,Difference_To_Mean
0,1,4.04348,4.0,-0.211353
1,2,3.77181,4.0,0.0603135
2,3,3.52326,3.5,0.30887
3,4,3.8,4.0,0.0321256
4,5,3.98507,4.0,-0.152949


** / ! \ Next window takes time to execute **

In [22]:
for index, row in train.iterrows():
    train.set_value(index, 'Movie_Mean_Corrected', users.iloc[int(train.iloc[index]['Movie'])-1]['Mean'] + users.iloc[int(train.iloc[index]['User'])-1]['Difference_To_Mean'])
    train.set_value(index, 'Movie_Median_Corrected', users.iloc[int(train.iloc[index]['Movie'])-1]['Median'] + users.iloc[int(train.iloc[index]['User'])-1]['Difference_To_Mean'])

In [23]:
rmse_movie_mean_c = math.sqrt(mean_squared_error(train['Movie_Mean_Corrected'], train['Rating']))
rmse_movie_median_c = math.sqrt(mean_squared_error(train['Movie_Median_Corrected'], train['Rating']))

# Keeping track of baselines found :

In [24]:
Baselines = ['Global Mean', 'User Mean', 'Movie Mean', 'Movie Mean Corrected', 
             'Global Median', 'User Median', 'Movie Median', 'Movie Median Corrected', 'Movie RegLin']

RMSE = [rmse_global_mean, rmse_user_mean, rmse_movie_mean, rmse_movie_mean_c,
        rmse_global_median, rmse_user_median, rmse_movie_median, rmse_movie_median_c, rmse_movie_reg_lin]

for i in range(len(Baselines)):
    print('RMSE for {} is : {}'.format(Baselines[i], RMSE[i]))

RMSE for Global Mean is : 1.1190567337624968
RMSE for User Mean is : 1.0850291595233115
RMSE for Movie Mean is : 1.029420674583946
RMSE for Movie Mean Corrected is : 1.2401999682815303
RMSE for Global Median is : 1.1281209282810263
RMSE for User Median is : 1.13975762984132
RMSE for Movie Median is : 1.0978514017684773
RMSE for Movie Median Corrected is : 1.2868866674862935
RMSE for Movie RegLin is : 1.0678666500565257


Final dataframe states :

In [25]:
train.to_pickle('./train_baseline.p')
train.head()

# Can be reopened with train = pd.read_pickle(train_baseline.p)

Unnamed: 0,User,Movie,Rating,User_Mean,Movie_Mean,Global_Mean,Global_Median,User_Median,Movie_Median,Movie_RegLin,Movie_Mean_Corrected,Movie_Median_Corrected
0,44,1,4,3.893701,3.379412,3.857281,4.0,4.0,3.0,3.328395,3.981903,3.938425
1,61,1,3,4.233696,3.379412,3.857281,4.0,4.5,3.0,3.328395,3.641908,3.59843
2,67,1,4,3.580645,3.379412,3.857281,4.0,4.0,3.0,3.328395,4.294959,4.25148
3,72,1,3,3.861423,3.379412,3.857281,4.0,4.0,3.0,3.328395,4.014181,3.970702
4,86,1,5,4.103321,3.379412,3.857281,4.0,4.0,3.0,3.328395,3.772283,3.728805


In [28]:
movies.to_pickle('./movies_baseline.p')
movies.head()

Unnamed: 0,Movie,Mean,Median,Nb Ratings,Reg Lin Value
0,1,3.37941,3.0,340.0,3.328395
1,2,3.50094,4.0,531.0,3.391218
2,3,3.48359,3.0,792.0,3.477066
3,4,3.93647,4.0,3164.0,4.257262
4,5,3.55913,4.0,1243.0,3.625409


In [29]:
movies.to_pickle('./users_baseline.p')
users.head()

Unnamed: 0,User,Mean,Median,Difference_To_Mean
0,1,4.04348,4.0,-0.211353
1,2,3.77181,4.0,0.0603135
2,3,3.52326,3.5,0.30887
3,4,3.8,4.0,0.0321256
4,5,3.98507,4.0,-0.152949
