In [1]:
%matplotlib inline

In [2]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.sparse import csgraph

In [3]:
# load data into a table by extracting user and item numbers
ratings = pd.read_csv('data/data_train.csv', dtype={'Prediction': np.int})

How many ratings ?

In [4]:
len(ratings)

1176952

Data structure :

In [5]:
ratings.head()

Unnamed: 0,Id,Prediction
0,r44_c1,4
1,r61_c1,3
2,r67_c1,4
3,r72_c1,3
4,r86_c1,5


In [6]:
ratings.Prediction.max()

5

In [7]:
ratings.Prediction.min()

1

* `Id` is structured as:
    * ri_cj where i is the row number, j is the column number
* Prediction is the rating between 1 and 5

Extract the matrix indices :

In [8]:
idx = ratings.Id.str.extract('r([0-9]+)_c([0-9]+)', expand=True)

In [179]:
idx.head()

Unnamed: 0,0,1
0,44,1
1,61,1
2,67,1
3,72,1
4,86,1


We can now buid the occupancy matrix (it will be sparse !)

In [188]:
user_idx = idx[0].astype(int)
film_idx = idx[1].astype(int)

In [192]:
matrix = np.zeros((max(user_idx), max(film_idx)), dtype=np.int)

In [193]:
for i in range(len(user_idx)):
    matrix[user_idx[i]-1, film_idx[i]-1] = ratings.Prediction[i]

In [194]:
matrix[1].nonzero()

(array([  3,   5,   7,  21,  31,  33,  43,  44,  47,  59,  60,  69,  71,
         80,  88,  92,  94, 101, 134, 137, 143, 160, 168, 172, 176, 181,
        187, 190, 212, 214, 219, 221, 225, 227, 239, 256, 257, 282, 284,
        287, 294, 303, 305, 309, 313, 316, 336, 337, 349, 362, 364, 365,
        367, 375, 384, 387, 395, 396, 401, 433, 439, 442, 443, 456, 457,
        458, 464, 474, 476, 477, 484, 491, 494, 507, 521, 524, 533, 548,
        555, 570, 575, 582, 583, 591, 593, 596, 605, 607, 608, 610, 611,
        617, 618, 619, 623, 626, 631, 632, 639, 641, 642, 655, 657, 658,
        659, 667, 670, 672, 689, 693, 694, 696, 699, 715, 724, 726, 747,
        768, 775, 778, 784, 788, 790, 792, 803, 804, 807, 814, 818, 821,
        824, 826, 840, 847, 849, 863, 866, 870, 881, 889, 949, 966, 973,
        980, 982, 984, 997, 998, 999]),)

In [195]:
occupancy_stat = np.count_nonzero(matrix) / matrix.size
str(occupancy_stat*100) + ' % occupancy'

'11.76952 % occupancy'

In [186]:
"The data is {} % sparse !".format((1-occupancy_stat)*100)

'The data is 88.24341342089568 % sparse !'

Let's take a subset (1000 movies) to get started prototyping

In [71]:
matrix_subset = matrix[:,:1000]

In [73]:
occupancy_stat = np.count_nonzero(matrix_subset) / matrix_subset.size
str(occupancy_stat*100) + ' % occupancy'

'11.318978102189782 % occupancy'

In [117]:
# Leave one out test / train split
# Adapted from https://gist.github.com/Wann-Jiun/d91f7ccbd20659e9725052a9ac5aed10#file-nycdsa_p5_split-py
train_matrix = matrix_subset.copy()
test_matrix = np.zeros(matrix_subset.shape)
np.random.seed(42)

for i in range(1,len(matrix_subset[0])):
    rating_idx = np.random.choice(
        matrix_subset[i, :].nonzero()[0], 
        size=1)
    train_matrix[i, rating_idx] = 0.0
    test_matrix[i, rating_idx] = matrix_subset[i, rating_idx]
    


---

Trying out some stuff :)

# Cosine Similarity

The basic idea is to find the similarity between pairs of users and pairs of movies, so when we query for a user/movie pair, it will take information from both classes of similarities.

In more technical terms, we use the similarities as the latent space for users and movies.

In [152]:
from sklearn.metrics.pairwise import cosine_similarity

user_similarities = cosine_similarity(train_matrix) # row wise similarities
movie_similarities = cosine_similarity(train_matrix.transpose()) # column wise similarities

For a user u and a movie i, the prediction is then the weighted sum of similarities with other users * the rating they gave movie i (normalized)

In [153]:
from sklearn.metrics import mean_squared_error # to compute metrics

In [154]:
predictions = user_similarities.dot(train_matrix) / np.array([np.abs(user_similarities).sum(axis=1)]).T

The accuracy metrics only make sense on ratings we know were given (i.e. non zero ratings for the test)

In [155]:
predictions = predictions[test_matrix.nonzero()].flatten()

In [156]:
test_predictions = test_matrix[test_matrix.nonzero()].flatten()

In [157]:
mse = mean_squared_error(predictions, test_predictions)

In [164]:
np.sqrt(mse)

2.4285423739668959

We can also do this the other way around : using movie similarities

In [159]:
predictions_movies = movie_similarities.dot(train_matrix.transpose()) / np.array([np.abs(movie_similarities).sum(axis=1)]).T

In [160]:
predictions_movies = predictions_movies[test_matrix.nonzero()].flatten()

In [161]:
mse_movies = mean_squared_error(predictions_movies, test_predictions)

In [165]:
np.sqrt(mse_movies)

2.7335775486596536

Not as good as with users :) 

---