# Recommender Systems - Project 1
## Alvaro Bueno

### Recommender system dataset

I have created my own dataset based on cooking books, there are 6 users and 6 books to be reviewed, the idea is that the system will take readers input ratings for some cooking books and predict the reader's rating for other similar books

In [62]:
#import packages to set working environment:
import pandas as pd
from matplotlib import pyplot as plt
books = pd.read_csv('https://raw.githubusercontent.com/delagroove/data612/master/book_ratings.csv')
books = books[sorted(books.columns)]
books.head()


Unnamed: 0.1,Delish,Plenty,Salt Fat Acid Heat,The VietNamese Cookbook,Thug Kitchen,Unnamed: 0
0,3.0,4.0,4,3.0,2.0,0
1,4.0,3.0,5,3.0,,1
2,5.0,3.0,4,,3.0,2
3,5.0,,4,3.0,,3
4,2.0,2.0,5,2.0,3.0,4


In [63]:
books.mean(axis=1,skipna = True)

0    2.666667
1    3.200000
2    3.400000
3    3.750000
4    3.000000
5    3.400000
dtype: float64

In [64]:
books_mean_col = books.mean(axis=0, skipna = True)
books_mean_row = books.mean(axis=1, skipna = True)
print(books_mean_col)
print(books_mean_row)

Delish                     3.800000
Plenty                     3.200000
Salt Fat Acid Heat         4.166667
The VietNamese Cookbook    2.600000
Thug Kitchen               2.750000
Unnamed: 0                 2.500000
dtype: float64
0    2.666667
1    3.200000
2    3.400000
3    3.750000
4    3.000000
5    3.400000
dtype: float64


### creating 2 datasets: train & test

In [65]:
from sklearn.model_selection import train_test_split

books_col = books.stack()
books_col.head(30)
books_train, books_test = train_test_split(books_col, test_size =0.2)

books_train = books_train.sort_index()
books_test = books_test.sort_index()

books_train_mean = books_train.mean()
books_test_mean = books_test.mean()

print("Average of training set: " + str(books_train_mean) + "\n")
print("Average of test set: " + str(books_test_mean) + "\n")

Average of training set: 3.1666666666666665

Average of test set: 3.2857142857142856



### Now, let's calculate the row and column averages to calculate the biases

In [66]:
books_train_wide = books_train.unstack().sort_index()
books_train_column_means = books_train_wide.mean(axis=0,skipna = True)

print(str(books_train_column_means))

books_train_row_means = books_train_wide.mean(axis=1,skipna = True)

print(str(books_train_row_means))

Delish                     3.800000
Plenty                     3.000000
Salt Fat Acid Heat         4.500000
The VietNamese Cookbook    2.600000
Thug Kitchen               3.000000
Unnamed: 0                 1.666667
dtype: float64
0    2.50
1    3.20
2    3.75
3    4.00
4    3.00
5    2.50
dtype: float64


In [67]:
books_test_all = books_test.unstack().sort_index()
books_test_all
books_test_column_means = books_test_all.mean(axis=0,skipna = True)
print(str(books_test_column_means))
books_test_row_means = books_test_all.mean(axis=1,skipna = True)
print(str(books_test_row_means))

Salt Fat Acid Heat    3.500000
Thug Kitchen          2.000000
Unnamed: 0            3.333333
Plenty                4.000000
dtype: float64
0    3.0
2    2.0
3    3.0
5    4.0
dtype: float64


### Calculate the RMSE 

In [68]:
from sklearn.metrics import mean_squared_error

train_pred = [books_train.mean()]*len(books_train)
train_rmse = mean_squared_error(books_train, train_pred)**0.5
print("RMSE training set: " + str(train_rmse) + "\n")
test_pred = [books_train.mean()]*len(books_test) 
test_rmse = mean_squared_error(books_test, test_pred)**0.5
print("RMSE test set: " + str(test_rmse) + "\n")

RMSE training set: 1.247219128924647

RMSE test set: 1.0370134162086937



### calculate bias using training data

In [69]:
# Calculate the bias for each user and each item.

show_bias = books_train_column_means - books_train_mean 
user_bias = books_train_row_means - books_train_mean 

#calculate the baseline predictors -> every user-item combination. 

books_prediction = pd.DataFrame(index=books.index, columns=books.columns.values)
for i in range(0,len(user_bias)):
    for j in range(0,len(show_bias)):
            books_prediction.iloc[i,j] = books_train_mean + user_bias[i] + show_bias[j]
            if books_prediction.iloc[i,j] < 1.0:
                books_prediction.iloc[i,j] = 1.0
            elif books_prediction.iloc[i,j] > 5.0:
                books_prediction.iloc[i,j] = 5.0
books_prediction.head()

Unnamed: 0.1,Delish,Plenty,Salt Fat Acid Heat,The VietNamese Cookbook,Thug Kitchen,Unnamed: 0
0,3.13333,2.33333,3.83333,1.93333,2.33333,1.0
1,3.83333,3.03333,4.53333,2.63333,3.03333,1.7
2,4.38333,3.58333,5.0,3.18333,3.58333,2.25
3,4.63333,3.83333,5.0,3.43333,3.83333,2.5
4,3.63333,2.83333,4.33333,2.43333,2.83333,1.5


In [70]:
import math
books_prediction_train = pd.DataFrame(index=books.index, columns=books.columns.values)
books_prediction_train = books_prediction_train[sorted(books_prediction_train.columns)]
books_train_wide = books_train_wide[sorted(books_train_wide.columns)]
for i in range(0,len(user_bias)):
    for j in range(0,len(show_bias)):
        if math.isnan(books_train_wide.iloc[i,j]):
            continue
        else:
            books_prediction_train.iloc[i,j] = books_train_mean + user_bias[i] + show_bias[j]
            if books_prediction_train.iloc[i,j] < 1.0:
                books_prediction_train.iloc[i,j] = 1.0
            elif books_prediction_train.iloc[i,j] > 5.0:
                books_prediction_train.iloc[i,j] = 5.0
books_prediction_train.head()

Unnamed: 0.1,Delish,Plenty,Salt Fat Acid Heat,The VietNamese Cookbook,Thug Kitchen,Unnamed: 0
0,3.13333,2.33333,,1.93333,,1.0
1,3.83333,3.03333,4.53333,2.63333,,1.7
2,4.38333,3.58333,5.0,,3.58333,
3,4.63333,,5.0,3.43333,,
4,3.63333,2.83333,4.33333,2.43333,2.83333,1.5


In [71]:
train_base_rmse = mean_squared_error(books_train, books_prediction_train.stack().sort_index())**0.5
print("RMSE training set: " + str(train_base_rmse) + "\n")

RMSE training set: 0.9107828846453819



In [72]:
books_prediction_test = pd.DataFrame(index=books.index, columns=books.columns.values)
books_prediction_test = books_prediction_test[sorted(books_prediction_test.columns)]
books_test_all = books_test_all[sorted(books_test_all.columns)]
books_test_all = books_test_all.reindex(books.index)






In [73]:
for l in range(0,books_test_all.shape[0]):
    for k in range(0,books_test_all.shape[1]):
        if math.isnan(books_test_all.iloc[l,k]):
            continue
        else:
            books_prediction_test.iloc[l,k] = books_test_mean + user_bias[l] + show_bias[k]
            if books_prediction_test.iloc[l,k] < 1.0:
                books_prediction_test.iloc[l,k] = 1.0
            elif books_prediction_test.iloc[l,k] > 5.0:
                books_prediction_test.iloc[l,k] = 5.0
books_prediction_test.head()

Unnamed: 0.1,Delish,Plenty,Salt Fat Acid Heat,The VietNamese Cookbook,Thug Kitchen,Unnamed: 0
0,,2.45238,3.95238,,,
1,,,,,,
2,,,,3.30238,,
3,,,,3.55238,,
4,,,,,,


In [74]:
test_base_rmse = mean_squared_error(books_test, books_prediction_test.stack().sort_index())**0.5
print("RMSE test set: " + str(test_base_rmse) + "\n")

RMSE test set: 1.5926400307582675



## Conclusions


My results show 26.97% of improvement for the rating prediction for the training set but it shows issues with the test dues probably due to the fact that it was broken down incorrectly or there are too many missing values.

In [80]:
print("Train: "+str((1-(train_base_rmse/train_rmse))*100) )
print("test: "+str((1-(test_base_rmse/test_rmse))*100) )




Train: 26.974910541128448
test: -53.57950108118532
