In [54]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from time import time
import os
import json
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Project root = the part of the working directory that precedes "Code".
# str.find returns -1 when "Code" is missing, which would silently chop the
# last character off the path, so fail loudly instead.
cwd = os.getcwd()
code_pos = cwd.find("Code")
if code_pos == -1:
    raise FileNotFoundError('expected "Code" in the working directory path, got ' + repr(cwd))
project_root = cwd[:code_pos]

# Reads the downsampled dataframe
df = pd.read_csv(
    os.path.join(project_root, "Data", "netflix-prize", "downsampled-csv", "few_samples.csv"),
    index_col=0,
)
# Reads the json with all the clusters for each user
# (each value is a whitespace-separated string of user ids, see get_predictions_for_user)
path = os.path.join(project_root, "Data", "user-clusters", "clusters.json")
with open(path, "r") as s:
    clusters = json.load(s)

def get_predictions_for_user(user):
    """Fit a per-user linear model from cluster-average ratings to the user's own ratings.

    For every movie the user rated, the single feature is the mean rating given
    to that movie by the users listed in ``clusters[user]``; the target is the
    user's own rating. The rows are split 70/30 (stratified on the rating), a
    ``LinearRegression`` is fit on the training part and scored on the rest.

    :param user: user id; looked up in ``clusters`` as a string and in
        ``df["user_id"]`` as an integer (e.g. "729846" or 729846)
    :return: tuple ``(lr, predictions, r_squared)``: the fitted model, a
        DataFrame with columns ``movie_id`` and ``user_predicted_score`` for the
        test movies, and ``lr.score`` on the test split
    :raises KeyError: if ``user`` has no entry in ``clusters``
    :raises ValueError: if the user has no usable ratings, or a rating value
        occurs only once so the stratified split cannot be made
    """
    # clusters is keyed by string ids, while df["user_id"] holds integers
    # (the cluster members are converted with int() for the same reason).
    user_key = str(user)
    user_id = int(user)

    # Gets the cluster of users for user
    cluster_user_ids = [int(member) for member in clusters[user_key].split()]
    # Gets the data for the users in user cluster, and for the user alone
    cluster_ratings = df[df["user_id"].isin(cluster_user_ids)]
    user_ratings = df[df["user_id"] == user_id]

    # Mean rating per movie inside the cluster, named explicitly so the merge
    # below does not depend on positional column renaming.
    cluster_avg = (
        cluster_ratings.groupby("movie_id")["rating"]
        .mean()
        .rename("cluster_avg_rating")
        .reset_index()
    )
    own_ratings = user_ratings[["movie_id", "rating"]].rename(columns={"rating": "user_rating"})
    features = (
        cluster_avg.merge(own_ratings, on="movie_id")
        .replace("?", np.nan)
        .dropna()
    )

    if features.empty:
        raise ValueError("no usable ratings found for user " + repr(user))
    # train_test_split(stratify=...) needs at least 2 members in every class.
    if features["user_rating"].value_counts().min() < 2:
        raise ValueError(
            "user " + repr(user) + " has a rating value that occurs only once; "
            "a stratified train/test split is not possible"
        )

    X = features[["cluster_avg_rating"]]
    y = features["user_rating"]
    # Split the movie ids together with X and y so the test rows can be labelled
    # without running the split a second time.
    X_train, X_test, y_train, y_test, _, movie_ids_test = train_test_split(
        X, y, features["movie_id"], test_size=0.3, random_state=42, stratify=y
    )

    lr = LinearRegression()
    lr.fit(X_train, y_train)
    y_pred = lr.predict(X_test)
    r_squared = lr.score(X_test, y_test)

    predictions = pd.DataFrame({"movie_id": movie_ids_test, "user_predicted_score": y_pred})
    return lr, predictions, r_squared

# Fit the model for user 729846. Note that `y_pred` receives the predictions
# DataFrame (movie_id, user_predicted_score), not a raw array of predictions.
lr, y_pred, r_squared = get_predictions_for_user("729846")

In [39]:
# First rows of the per-movie predicted scores for the test split
y_pred.head()

Unnamed: 0,movie_id,user_predicted_score
19,1144,3.612079
222,12870,4.747988
236,13650,4.747988
221,12785,4.14217
181,9886,3.612079


In [55]:
# Score returned by lr.score on the held-out test split
r_squared

0.8502632013835584

In [173]:
# All ratings of user 729846 in the downsampled data
df_729846 = df.loc[df["user_id"].eq(729846)]
# movie_id of the row at position 5 among that user's five-star ratings
movie_id = df_729846.loc[df_729846["rating"].eq(5)].iloc[5]["movie_id"]

In [174]:
# Data directory: the working-directory prefix that precedes "Code", plus "Data".
# NOTE(review): if "Code" is not part of the cwd, str.find returns -1 and the
# slice only drops the last character -- confirm the notebook always runs under Code/.
path = os.getcwd()[:os.getcwd().find("Code")] + "Data"
def default_progress_handler(percentage):
    """
    Default progress callback: prints the parsing progress
    :param percentage: progress value to report
    """
    print('parsing metadata: ' + str(percentage))


def load_from_txt(data_dir, progress_handler=default_progress_handler):
    """
    Function to load the movie metadata provided by netflix
    :param data_dir: path to the Data directory (must contain netflix-prize/movie_titles.csv)
    :param progress_handler: function responsible for feeding progress updates back to gui
    :return: pandas dataframe with movie metadata, string columns 'id', 'year', 'title'
    """
    path = os.path.join(data_dir, "netflix-prize")
    # Expected number of movies in the file; used only to scale progress reports.
    num_movies = 17770
    progress_step = int(num_movies*0.01)
    # Rows are collected in a list and turned into a DataFrame once at the end:
    # DataFrame.append in a loop copies the frame on every row and was removed in pandas 2.0.
    records = []
    with open(os.path.join(path, 'movie_titles.csv'), "r", encoding = "ISO-8859-1") as s:
        for raw_line in s:
            line = raw_line.strip()
            if not line:
                # Skip blank lines instead of treating the first one as end of file.
                continue
            # Titles may themselves contain commas, so split off only id and year.
            tokens = line.split(",", 2)
            records.append(
                {'id': tokens[0], #movie id
                'year': tokens[1], #year
                'title': tokens[2] if len(tokens) > 2 else ''}) # movie_title
            movie_count = len(records)
            if movie_count % progress_step == 0:
                progress_handler(movie_count/num_movies*100)
    progress_handler(100)

    return pd.DataFrame(records, columns=['id', 'year', 'title'])
# Load the movie metadata: the cells below read `movie_titles`, so it has to be
# defined here for the notebook to work on a fresh kernel.
movie_titles = load_from_txt(path)

In [175]:
# Metadata ids are strings, so the numeric movie_id is converted before matching
title = movie_titles.loc[movie_titles["id"].eq(str(movie_id)), "title"].iloc[0]
title

'Kill Bill: Vol. 2'

In [176]:
# Full ratings export (columns used below: user_id, movie_title, rating)
cwd = os.getcwd()
path = cwd[:cwd.find("Code")]
path += "aaron-gauthier-individual-project/Code/export_data_netflix_full_movies.csv"
data = pd.read_csv(path)

# One row per user, one column per movie title, cells hold the rating
movie_piv = data.pivot_table(index='user_id', columns='movie_title', values='rating')
movie_user_ratings = movie_piv[title]

# Correlation of every movie's rating column with the ratings of `title`
similar_to_movie = movie_piv.corrwith(movie_user_ratings)
corr_movie = pd.DataFrame(similar_to_movie, columns=['Correlation']).dropna()

# Mean rating and number of ratings per movie, computed in a single groupby
movie_rating_counts = (
    data.groupby('movie_title')['rating']
    .agg(['mean', 'count'])
    .rename(columns={'mean': 'rating', 'count': 'number_ratings'})
)

corr_movie = corr_movie.join(movie_rating_counts['number_ratings'])
# Show the most correlated movies among those with more than 100 ratings
corr_movie[corr_movie['number_ratings'] > 100].sort_values('Correlation', ascending=False).head()

Unnamed: 0_level_0,Correlation,number_ratings
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1
Kill Bill: Vol. 2,1.0,588
Kill Bill: Vol. 1,0.778418,721
The Princess Diaries (Fullscreen),0.538431,113
Being There,0.518336,113
Born on the Fourth of July,0.497244,181


In [177]:
# Numeric id of "The Color of Money" from the metadata table (ids are stored as strings)
mov_id = int(movie_titles.loc[movie_titles["title"].eq("The Color of Money"), "id"].iloc[0])

Let's see if the user has actually watched and rated this movie

In [178]:
# Rows of user 729846 for this movie; an empty frame means no rating is present
df_729846[df_729846["movie_id"] == mov_id]

Unnamed: 0,movie_id,user_id,rating


In [179]:
# Predicted score for this movie, if it is among the test-split predictions
y_pred[y_pred["movie_id"] == mov_id]

Unnamed: 0,movie_id,user_predicted_score


Let's try with another movie

In [180]:
# Same lookup for "Joy Ride": find its numeric id, then the user's rows for it
mov_id = int(movie_titles.loc[movie_titles["title"].eq("Joy Ride"), "id"].iloc[0])
df_729846.loc[df_729846["movie_id"].eq(mov_id)]

Unnamed: 0,movie_id,user_id,rating


In [181]:
# Predicted score for this movie, if it is among the test-split predictions
y_pred[y_pred["movie_id"] == mov_id]

Unnamed: 0,movie_id,user_predicted_score


Let's use a movie we have predictions for instead and see how it compares

In [186]:
# Pick the prediction at position 10 as an example of a movie that has a predicted score
y_pred.iloc[10]

movie_id                12034.000000
user_predicted_score        4.369351
Name: 208, dtype: float64

In [187]:
# Title of movie 12034 (the movie_id shown by y_pred.iloc[10]) and its correlation row
title2 = movie_titles.loc[movie_titles["id"].eq(str(12034)), "title"].iloc[0]
corr_movie.loc[title2]

Correlation         0.221494
number_ratings    421.000000
Name: The Shining, dtype: float64

Using this code, the correlations themselves can also be evaluated