In [23]:
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook as tqdm
import numpy as np
import pandas as pd
import sklearn.cluster
import seaborn
import time

def timer():
    elapsed_time = (time.time() - start)
    return " {0:.2f}".format(elapsed_time) + " seconds" if elapsed_time <= 120 else " {0:.2f}".format(elapsed_time/60) + " minutes"

def create_movie_genre_dict():
    movie_keys = mov_genres['movieID'].unique()
    movie_genres = dict.fromkeys(movie_keys)
    for mov_id in tqdm(movie_keys):
        movie_genres[mov_id] = get_genres(mov_id)
    return movie_genres

def get_genres(movieID):
    return pd.Series( [1 if (genre in mov_genres[mov_genres['movieID'] == movieID]['genre'].unique()) else 0 for genre in genres], index=genres)

my_test = pd.read_csv('output/test_movies_with_genres.dat', '\t')
my_train = pd.read_csv('output/train_movies_with_genres.dat', '\t')
mov_genres = pd.read_csv('movie_training_data/movie_genres.dat','\t')
predictions = pd.read_csv('predictions.dat', '\t')
genres = mov_genres['genre'].unique()

movie_genres = create_movie_genre_dict()




In [2]:
train_data = my_train.drop(my_train.columns[:4], axis=1)

start = time.time()
M = 10
mat_train = train_data.as_matrix()
print("Starting clusters")
km = sklearn.cluster.KMeans(n_clusters=M)
km.fit(mat_train)
print("Created clusters ", timer())
labels = km.labels_
results = pd.DataFrame([train_data.index,labels]).T
print("Have results")
print(results)

Starting clusters
Created clusters   45.76 seconds
Have results
             0  1
0            0  1
1            1  1
2            2  6
3            3  3
4            4  2
5            5  8
6            6  2
7            7  0
8            8  3
9            9  7
10          10  5
11          11  2
12          12  5
13          13  7
14          14  7
15          15  8
16          16  3
17          17  5
18          18  6
19          19  1
20          20  6
21          21  5
22          22  8
23          23  5
24          24  9
25          25  5
26          26  2
27          27  5
28          28  7
29          29  3
...        ... ..
685568  685568  3
685569  685569  9
685570  685570  0
685571  685571  0
685572  685572  4
685573  685573  7
685574  685574  7
685575  685575  2
685576  685576  5
685577  685577  7
685578  685578  3
685579  685579  7
685580  685580  0
685581  685581  2
685582  685582  6
685583  685583  0
685584  685584  7
685585  685585  7
685586  685586  5
685587  685587  4


In [3]:
def classify(row):
    row_genres = row[genres].T
    row_genres.columns = range(len(row_genres.columns))
    row_genres = row_genres.loc[row_genres[0] == 1].T
    possible_clusters = []
    for col in row_genres.columns:
        possible_clusters.append(np.random.choice(range(M), p=cond_classifier[col]))
    return np.random.choice(possible_clusters)

def round_of_rating(number):
    return round(number * 2) / 2

def rate(row):
    clust = classify(row)
    return round_of_rating(np.random.normal(means[clust], stds[clust]))

In [24]:
results.columns = ['index', 'cluster']

clusters = dict.fromkeys(range(0,M))
means = dict.fromkeys(range(0,M))
stds = dict.fromkeys(range(0,M)) 
for cluster_idx in tqdm(range(0,M)):
    clusters[cluster_idx] = my_train[results['cluster'] == cluster_idx]
    means[cluster_idx] = clusters[cluster_idx]['rating'].mean()
    stds[cluster_idx] = clusters[cluster_idx]['rating'].std()

# Create Conditional Classifier
cond_classifier = dict.fromkeys(genres)
for genre in genres:
    cond_classifier[genre] = []
    for cluster_idx, cluster in clusters.items():
        cond_classifier[genre].append(cluster[genre].mean())
    cond_classifier[genre] /= np.sum(cond_classifier[genre])

test_ratings = []
for index, row in tqdm(my_test.iterrows()):
    test_ratings.append(rate(pd.DataFrame(row).T))

my_test["pred_ratings"] = pd.Series( test_ratings, index=my_test.index )






In [14]:
success_rate = 0
rmse = []
for index, row in tqdm(my_test.iterrows()):
    success_rate += 1 if (np.abs(row['rating'] - row['pred_ratings']) <=1) else 0
    rmse.append(row['rating'] - row['pred_ratings'])
print((success_rate / len(my_test.index)) * 100, '%')
print("RMSE is: ", np.std(rmse))


63.44 %
RMSE is:  1.44741811914


In [33]:
rating_predictions = []
for index, row in tqdm(predictions.iterrows()):
    rating_predictions.append(rate(pd.DataFrame(movie_genres[row['movieID']]).T))

predictions['predicted rating'] = pd.Series( rating_predictions, index=predictions.index )

predictions.to_csv('output/predictions.dat', sep='\t')




Project Write-up

1. Exploratory data analysis
   In order to better understand the data that I was dealing with I first began to graph a number of histograms along different axes. I saw that there was a strong correlation between the number of ratings and the months of December and January. But honestly it wasn't a ton to go on so I began to search through a few different ideas for how I could create my recommender system. I found a few good tutorials but most of them didn't have exactly what I needed so eventually I decided to create my own hybrid system. It's an Item-Item Conditional Classifier System based exclusively on the genre.

2. Description of technical approach 
   The reason that I went with a Item-Item recommender system is because it's very fast after the preliminary training and you don't have to know anything about the users. While exploring the data I realized that I know almost nothing about the users who created these ratings while I know a whole lot more about the movies themselves. So what my system is doing is trying to find patterns in the data that I can then use to create a Classifier for my incoming data so that whenever I get a new movie I can guess what it's rating would be. 
   Obviously there are a few erroneous assumptions that this system jumps to out of the gate. First and most concerning of which is that it assumes that all movies of a specific genre are given similar ratings. This is obviously a pretty bad assumption to make but it turns out that it didn't matter a whole lot in the end.
   The Process - In order to create my recommender system I first had to combine all of the data for genres into the original dataset. This was the most computationally intensive part of my recommender because for each item in my training data I had to create a new column for each genre and mark it according to which movie it was. There were probably a number of ways to optomize but I just brute forced it and ended up taking 3 hours but after it was all said and done I just saved the newly manipulated data into .dat files so that I didn't have to ever do it again.
   After I had the seperate genres recorded I was able to utilize the scipy clustering library to cluster my data according to the matrix or genres and their corresponding rating. I started with 10 clusters and it ran relatively quickly. That is one area that I noted I would have to adjust later to see what kind of changes I could get to these results.
   After I created the clusters I gathered them together and found their mean rating and with their standard deviation. Once I was able to get that data I still needed to create the actual recommender system itself. What I ended up doing was I created distribution over genres for every cluster that I had created. I then used those distributions to create a conditional classifier that would use a normalized version of those distributions to pick a random cluster to be a part of weighted by the distribution of the genres that that movie had in it. I then randomly selected one of the clusters from that list and used it's mean and standard deviation to sample from a gaussian to get it's rating. So for example movie 2 is an Children Adventure Fantasy movie. So for each of those genres I would look at the distribution over Children movies and randomly select a cluster according to the weights for Children movies. Then I would do the same for each of the other two genres. Say the result was an array of [0,1,0] for the cluster that those movies belonged to. Then I randomly select one of those to be the official cluster that it belongs to and use that clusters mean and standard deviation to select it's rating.

3. Analysis of performance of method
   At first I was and honestly still am worried about the amount of randomness that is inherent in this solution. The first time I ran this my algorithm all the way through I only had a 14% success rate for guessing the correct rating for the movie. It was a little discouraging but not all that surprising given that there are a lot of random samples in this method. I was however very surprised by my next discovery. Because of the randomness I wanted to see if I was even in the ballpark of getting the correct ratings. So I increased my check on percentage to a .5 error. When I increased my check to have that small margin or error my successful guesses increased to 42%! It was pretty awesome to see that despite all the randomness this algorithm was able to get almost half of it's guesses correct within a half a point. when I increase the error to 1 I get a 63% success rate, which isn't as drastic of a change as before but still pretty impressive for a simple recommender system. I actually later saw that I needed to calculate the RMSE and I have an RMSE of 1.44 which isn't too bad, but it's not great considering completely random gets around 1.99. However, because of the data that I got from the percentage I think that a majority of my guesses are either very close to spot on or they are very far off, which I think leads to the highish RMSE. This makes sense because many of my clusters had means that were centered around different numbers so if a movie is accidentally assigned to a wrong cluster then the number will be very far off.