# Homework - Recommendations

## Libraries

In [1]:
import matplotlib.pyplot as plt # To plot some data if you want to.

import pandas as pd # Library to handle dataframes.
import numpy as np  # Useful library for matrix manipulation.

## 0.2 - Load Dataset

In [2]:
# Import the ratings from a whitespace-separated file into a dataframe.
# The file has no header row, so the column names are supplied explicitly.
# `sep=r"\s+"` is the documented replacement for `delim_whitespace=True`,
# which is deprecated since pandas 2.2.
ratings = pd.read_csv("u.data",
                      sep=r"\s+",
                      header=None,
                      names=["userId", "movieId", "rating", "timestamp"])

## Basic exploration

In [3]:
ratings.head() # Displays the first five rows (head()'s default) as a quick check of the parsed columns.

Unnamed: 0,userId,movieId,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [4]:
ratings.describe() # Per-column count, mean, std, min, quartiles and max; similar to summary() in R.

Unnamed: 0,userId,movieId,rating,timestamp
count,100000.0,100000.0,100000.0,100000.0
mean,462.48475,425.53013,3.52986,883528900.0
std,266.61442,330.798356,1.125674,5343856.0
min,1.0,1.0,1.0,874724700.0
25%,254.0,175.0,3.0,879448700.0
50%,447.0,322.0,4.0,882826900.0
75%,682.0,631.0,4.0,888260000.0
max,943.0,1682.0,5.0,893286600.0


### Standardization

In [16]:
# Z-score the "rating" column (subtract its mean, divide by its sample standard
# deviation) and store the result in a new "std_rating" column of `ratings`.
ratings["std_rating"] = ratings["rating"].pipe(lambda col: (col - col.mean()) / col.std())

ratings["std_rating"]

0       -0.470705
1       -0.470705
2       -2.247419
3       -1.359062
4       -2.247419
5        0.417652
6       -1.359062
7        1.306009
8       -0.470705
9       -0.470705
10      -1.359062
11       1.306009
12       1.306009
13      -0.470705
14      -0.470705
15      -0.470705
16       1.306009
17      -1.359062
18       0.417652
19      -1.359062
20       0.417652
21       0.417652
22       0.417652
23      -1.359062
24       0.417652
25      -1.359062
26       1.306009
27      -1.359062
28       0.417652
29       1.306009
           ...   
99970   -2.247419
99971   -1.359062
99972   -0.470705
99973    0.417652
99974   -0.470705
99975   -0.470705
99976   -1.359062
99977   -0.470705
99978    1.306009
99979   -1.359062
99980    0.417652
99981   -0.470705
99982   -2.247419
99983   -0.470705
99984   -1.359062
99985    0.417652
99986   -0.470705
99987   -1.359062
99988    0.417652
99989    0.417652
99990    0.417652
99991    0.417652
99992   -0.470705
99993   -1.359062
99994   -0

## 0.3 - Create a separate validation set

In [10]:
from numpy.random import RandomState

In [11]:
# Generator seeded with a fixed value (58); it is used below to draw the train/validation mask.
prng = RandomState(58)

In [12]:
# Preview of the uniform draws in [0, 1), one per row of `ratings`.
# This call advances `prng`, so the values shown here are NOT the ones used
# to build the mask in the next cell.
prng.rand(len(ratings))

array([ 0.36510558,  0.45120592,  0.49606035, ...,  0.35743007,
        0.68356079,  0.84245802])

In [13]:
# Draw one uniform number per row: rows whose draw is below 0.8 go to the
# training set, the remaining rows form the validation set.
# NOTE(review): `prng` is shared with the preview cell above, so the mask
# depends on how many times `prng.rand` has been called since seeding;
# re-running this cell on its own produces a different split.
msk = prng.rand(len(ratings)) < 0.8
trn, val = ratings[msk], ratings[~msk]
len(trn), len(val)

(79948, 20052)

In [14]:
# First rows of the training split (same columns as `ratings`).
trn.head()

Unnamed: 0,userId,movieId,rating,timestamp,std_rating
0,196,242,3,881250949,-0.470705
1,186,302,3,891717742,-0.470705
2,22,377,1,878887116,-2.247419
3,244,51,2,880606923,-1.359062
4,166,346,1,886397596,-2.247419


## User averages

"You should compute a numpy array (or python list) user_average, of size
n_users+1, such that user_average[i] is the average rating given by user i. The value of
user_average[0] does not matter."

In [25]:
# Group the "std_rating" and "rating" columns by "userId" (one group per user).
std_ratings_user_grouped = ratings.groupby("userId")["std_rating"]
ratings_user_grouped = ratings.groupby("userId")["rating"]

# Aggregate each group with the mean, then sort users from highest to lowest average.
std_user_mean_ratings = std_ratings_user_grouped.mean().sort_values(ascending=False)
user_mean_ratings = ratings_user_grouped.mean().sort_values(ascending=False)

# Array requested by the assignment: user_average[i] is the mean raw rating
# given by user i. User ids start at 1, so slot 0 is unused and left at 0.
# NOTE(review): the averages use every row of `ratings`, including the rows
# assigned to `val`; switch to `trn` if validation rows must stay unseen.
n_users = int(ratings["userId"].max())
user_average = np.zeros(n_users + 1)
user_average[user_mean_ratings.index.values] = user_mean_ratings.values

In [26]:
# Per-user mean of the standardized ratings, highest first.
std_user_mean_ratings

userId
849    1.190136
688    1.157950
507    1.060945
628    1.042792
928    1.028398
118    1.005719
907    0.925285
686    0.918135
427    0.904816
565    0.899903
469    0.892820
850    0.887959
225    0.878282
330    0.858809
477    0.823758
242    0.817413
636    0.817413
583    0.812477
767    0.801807
252    0.798377
810    0.793495
366    0.767611
12     0.766027
312    0.756264
383    0.755478
862    0.740691
513    0.740691
523    0.737461
324    0.727231
909    0.725160
         ...   
570   -0.753364
21    -0.763515
104   -0.766824
578   -0.766824
797   -0.778213
933   -0.784527
155   -0.793744
617   -0.807951
102   -0.812064
824   -0.826048
129   -0.826048
660   -0.839532
900   -0.865530
161   -0.868934
255   -0.877422
656   -0.877868
509   -0.901423
637   -0.923425
702   -0.957868
302   -0.978337
609   -1.010064
626   -1.053689
865   -1.103323
206   -1.206375
724   -1.212744
774   -1.307505
685   -1.314644
445   -1.372223
405   -1.506117
181   -1.810388
Name: std_rating,

In [27]:
# Per-user mean of the raw ratings, highest first.
user_mean_ratings

userId
849    4.869565
688    4.833333
507    4.724138
628    4.703704
928    4.687500
118    4.661972
907    4.571429
686    4.563380
427    4.548387
565    4.542857
469    4.534884
850    4.529412
225    4.518519
330    4.496599
477    4.457143
242    4.450000
636    4.450000
583    4.444444
767    4.432432
252    4.428571
810    4.423077
366    4.393939
12     4.392157
312    4.381166
383    4.380282
513    4.363636
862    4.363636
523    4.360000
324    4.348485
909    4.346154
         ...   
570    2.681818
21     2.670391
578    2.666667
104    2.666667
797    2.653846
933    2.646739
155    2.636364
617    2.620370
102    2.615741
129    2.600000
824    2.600000
660    2.584821
900    2.555556
161    2.551724
255    2.542169
656    2.541667
509    2.515152
637    2.490385
702    2.451613
302    2.428571
609    2.392857
626    2.343750
865    2.287879
206    2.171875
724    2.164706
774    2.058036
685    2.050000
445    1.985185
405    1.834464
181    1.491954
Name: rating, Len

From now on, results are computed from the raw ratings, not the standardized version. Feel free to do as you please.

## Movie averages

You should compute a numpy array (or Python list) `movie_average`, of size
`n_movies + 1`, such that `movie_average[i]` is the average rating for movie i. If movie i has
not been rated, `movie_average[i]` should contain the global average, `global_average`.

In [29]:
# Mean raw rating per movie, sorted from highest to lowest average.
ratings_movie_grouped = ratings.groupby("movieId")["rating"]

movie_mean_ratings = ratings_movie_grouped.mean().sort_values(ascending=False)

# Array requested by the assignment: movie_average[i] is the mean rating of
# movie i. Every slot starts at the global average, so a movie id without any
# rating (and the unused slot 0) keeps that fallback value.
# NOTE(review): the averages use every row of `ratings`, including the rows
# assigned to `val`; switch to `trn` if validation rows must stay unseen.
global_average = ratings["rating"].mean()
n_movies = int(ratings["movieId"].max())
movie_average = np.full(n_movies + 1, global_average)
movie_average[movie_mean_ratings.index.values] = movie_mean_ratings.values

movie_mean_ratings

movieId
1293    5.000000
1467    5.000000
1653    5.000000
814     5.000000
1122    5.000000
1599    5.000000
1201    5.000000
1189    5.000000
1500    5.000000
1536    5.000000
1449    4.625000
1642    4.500000
119     4.500000
1398    4.500000
1594    4.500000
408     4.491071
318     4.466443
169     4.466102
483     4.456790
114     4.447761
64      4.445230
603     4.387560
12      4.385768
50      4.358491
178     4.344000
513     4.333333
1639    4.333333
1191    4.333333
134     4.292929
963     4.292683
          ...   
1576    1.000000
1575    1.000000
1574    1.000000
1566    1.000000
1572    1.000000
1571    1.000000
1570    1.000000
1569    1.000000
1586    1.000000
1563    1.000000
858     1.000000
1562    1.000000
852     1.000000
1568    1.000000
1626    1.000000
830     1.000000
1363    1.000000
1621    1.000000
1618    1.000000
439     1.000000
1546    1.000000
1374    1.000000
1548    1.000000
1373    1.000000
437     1.000000
1601    1.000000
1557    1.000000
1559  

## 0.7 Define RMSE and MAE

### RMSE

Define a function `evalrmse(pred, ground)` which, given predicted ratings **pred** and
ground-truth ratings **ground**, returns the root-mean-square error of the prediction.

Try to define this function in plain Python, then using numpy, then using sklearn (install it first if necessary).

### [RMSE] Classic implementation

In [72]:
def evalrmse_classic(pred, grnd):
    """
    Compute the root-mean-square error (RMSE) of a prediction with a plain loop.

    The two inputs are paired by position, so any equally long sequences
    work (lists, tuples, numpy arrays, pandas Series regardless of index).

    Args:
        pred (sequence of numbers): The predictions.
        grnd (sequence of numbers): The ground-truth values, same length as pred.

    Returns:
        rmse (float): sqrt(mean((pred[i] - grnd[i]) ** 2)), as a numpy float.

    Raises:
        ValueError: If pred and grnd have different lengths, or are empty.
    """
    n = len(grnd)
    if len(pred) != n:
        # Previously a longer `pred` was silently truncated to len(grnd).
        raise ValueError(
            "pred and grnd must have the same length (got %d and %d)" % (len(pred), n)
        )
    if n == 0:
        raise ValueError("cannot compute the RMSE of empty inputs")

    square_sum = 0
    # zip() pairs values by position; indexing with [i] would be label-based
    # on a pandas Series and fail on a non-default index.
    for p, g in zip(pred, grnd):
        square_sum += (p - g) ** 2

    return np.sqrt(square_sum / n)

In [73]:
# Sanity check: only the last pair differs (4 vs 3), so RMSE = sqrt(1 / 4) = 0.5.
A = [1, 2, 3, 4]
B = [1, 2, 3, 3]

evalrmse_classic(A, B)

0.5

### [RMSE] Numpy implementation

In [74]:
def evalrmse_np(pred, grnd):
    """
    Compute the root-mean-square error (RMSE) of a prediction with numpy.

    Inputs are converted to float arrays and compared by position; the usual
    numpy broadcasting rules apply (e.g. a scalar prediction against an array).

    Args:
        pred (array-like): The predictions.
        grnd (array-like): The ground-truth values.

    Returns:
        rmse (float): sqrt(mean((pred - grnd) ** 2)), as a numpy float.
    """
    # Casting to float lets plain lists through and avoids wrap-around when
    # the inputs are unsigned or narrow integer arrays.
    diff = np.asarray(pred, dtype=float) - np.asarray(grnd, dtype=float)
    return np.sqrt(np.mean(diff ** 2))

In [75]:
# Same sanity check with numpy arrays: one unit error out of four values gives 0.5.
A = np.array([1, 2, 3, 4])
B = np.array([1, 2, 3, 3])

evalrmse_np(A, B)

0.5

### [RMSE] Sklearn implementation

In [78]:
from sklearn.metrics import mean_squared_error

def evalrmse_sklearn(pred, grnd):
    """
    Compute the root-mean-square error (RMSE) from sklearn's mean squared error.

    mean_squared_error receives the ground truth first, so the arguments are
    passed in the opposite order to this function's (pred, grnd) signature.

    Args:
        pred (array-like): The predictions.
        grnd (array-like): The ground-truth values.

    Returns:
        rmse (float): Square root of mean_squared_error(grnd, pred).
    """
    mse = mean_squared_error(grnd, pred)
    return np.sqrt(mse)

In [79]:
# Uses whatever A and B are currently bound to (the numpy arrays from the cell above when run in order).
evalrmse_sklearn(A, B)

0.5

### MAE

Same question for the mean absolute error.

### [MAE] Classic implementation

In [82]:
def evalmae_classic(pred, grnd):
    """
    Compute the mean absolute error (MAE) of a prediction with a plain loop.

    The two inputs are paired by position, so any equally long sequences
    work (lists, tuples, numpy arrays, pandas Series regardless of index).

    Args:
        pred (sequence of numbers): The predictions.
        grnd (sequence of numbers): The ground-truth values, same length as pred.

    Returns:
        mae (float): mean(abs(pred[i] - grnd[i])).

    Raises:
        ValueError: If pred and grnd have different lengths, or are empty.
    """
    n = len(grnd)
    if len(pred) != n:
        # Previously a longer `pred` was silently truncated to len(grnd).
        raise ValueError(
            "pred and grnd must have the same length (got %d and %d)" % (len(pred), n)
        )
    if n == 0:
        raise ValueError("cannot compute the MAE of empty inputs")

    abs_sum = 0
    # zip() pairs values by position; indexing with [i] would be label-based
    # on a pandas Series and fail on a non-default index.
    for p, g in zip(pred, grnd):
        abs_sum += abs(p - g)

    return abs_sum / n

In [84]:
# Sanity check: only the last pair differs (4 vs 3), so MAE = 1 / 4 = 0.25.
A = [1, 2, 3, 4]
B = [1, 2, 3, 3]

evalmae_classic(A, B)

0.25

### [MAE] Numpy implementation

In [87]:
def evalmae_np(pred, grnd):
    """
    Compute the mean absolute error (MAE) of a prediction with numpy.

    Inputs are converted to float arrays and compared by position; the usual
    numpy broadcasting rules apply (e.g. a scalar prediction against an array).

    Args:
        pred (array-like): The predictions.
        grnd (array-like): The ground-truth values.

    Returns:
        mae (float): mean(abs(pred - grnd)), as a numpy float.
    """
    # Casting to float lets plain lists through and avoids wrap-around when
    # the inputs are unsigned integer arrays (e.g. 0 - 20 in uint8 is 236).
    diff = np.asarray(pred, dtype=float) - np.asarray(grnd, dtype=float)
    return np.abs(diff).mean()

In [88]:
# Same sanity check with numpy arrays: one unit error out of four values gives 0.25.
A = np.array([1, 2, 3, 4])
B = np.array([1, 2, 3, 3])

evalmae_np(A, B)

0.25

### [MAE] Sklearn implementation

In [97]:
from sklearn.metrics import mean_absolute_error

def evalmae_sklearn(pred, grnd):
    """
    Compute the mean absolute error (MAE) with sklearn.

    mean_absolute_error receives the ground truth first, so the arguments are
    passed in the opposite order to this function's (pred, grnd) signature.

    Args:
        pred (array-like): The predictions.
        grnd (array-like): The ground-truth values.

    Returns:
        mae (float): The value of mean_absolute_error(grnd, pred).
    """
    mae = mean_absolute_error(grnd, pred)
    return mae

In [98]:
# Uses whatever A and B are currently bound to (the numpy arrays from the evalmae_np check when run in order).
evalmae_sklearn(A, B)

0.25