In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsRegressor

from sklearn.metrics import accuracy_score
import random
import csv

# Load the personality scores. Column names are supplied via names=, so pandas
# treats the first line of the file as data rather than as a header.
personality_data = pd.read_csv(
    './data/PersonalityData.csv',
    names=["user_id", "openness", "agreeableness", "emotional_stability", "conscientiousness", "extraversion"],
)
print("pdf", personality_data.shape)

# Load the ratings; the timestamp column is not used by the visible cells, so drop it.
rating_data = pd.read_csv('./data/Rating.csv', names=["user_id", "movie_id", "rating", "timestamp"])
rating_data = rating_data.drop(columns="timestamp")

print("rdf", rating_data.shape)

# Attach each rating to the personality scores of the user who gave it.
# An inner join is used on purpose: a left join would keep personality rows
# that have no rating at all, producing rows with NaN movie_id/rating that the
# per-movie modelling further down cannot use.
whole_data = pd.merge(personality_data, rating_data, on='user_id', how='inner')
print(whole_data.head())


pdf (1820, 6)
rdf (1028751, 3)
                            user_id  openness  agreeableness  \
0  8e7cebf9a234c064b75016249f2ac65e       5.0            2.0   
1  8e7cebf9a234c064b75016249f2ac65e       5.0            2.0   
2  8e7cebf9a234c064b75016249f2ac65e       5.0            2.0   
3  8e7cebf9a234c064b75016249f2ac65e       5.0            2.0   
4  8e7cebf9a234c064b75016249f2ac65e       5.0            2.0   

   emotional_stability  conscientiousness  extraversion  movie_id  rating  
0                  3.0                2.5           6.5         1     5.0  
1                  3.0                2.5           6.5         2     4.0  
2                  3.0                2.5           6.5         3     4.0  
3                  3.0                2.5           6.5         5     5.0  
4                  3.0                2.5           6.5         6     4.0  


In [2]:
def specificMovieData(table, movie_id):
    """Select the rows of ``table`` that belong to a single movie.

    Parameters
    ----------
    table : pandas.DataFrame
        Frame containing a ``movie_id`` column.
    movie_id
        Value to match against the ``movie_id`` column.

    Returns
    -------
    pandas.DataFrame
        The rows whose ``movie_id`` equals ``movie_id``, with their original
        index labels and all columns; empty when nothing matches.
    """
    is_requested_movie = table['movie_id'].eq(movie_id)
    return table.loc[is_requested_movie]

# Movies with at most this many ratings skip the model and use the jittered mean.
MIN_RATINGS_FOR_MODEL = 5
# Half-width of the uniform noise added to the mean rating in the fallback branch.
FALLBACK_JITTER = 0.201
# Bounds that every predicted rating is clamped to.
RATING_MIN, RATING_MAX = 0, 5
# Fixed seed for the train/test split and the jitter, so re-running the
# notebook from a fresh kernel writes the same numbers.
SEED = 42

rng = random.Random(SEED)

# Distinct movie ids in ascending order. Missing ids are skipped: NaN never
# compares equal to itself, so specificMovieData would return an empty frame
# for it and the loop would emit a "nan,nan,nan" row.
movie_ids = sorted(whole_data['movie_id'].dropna().unique().tolist())

# One "movie_id,predicted,overall" string per movie. The CSV-writing step
# after this loop splits each entry on "," again, so keep this format.
predicted_ratings = []

for movie_id in movie_ids:

    # Keep only the personality features and the rating for this movie
    movie_data = specificMovieData(whole_data, movie_id).drop(columns=['user_id', 'movie_id'])

    # Plain mean of the observed ratings (missing ratings are ignored rather
    # than being counted as zero in the denominator)
    overall_rating = movie_data['rating'].mean()

    if len(movie_data) > MIN_RATINGS_FOR_MODEL:
        # Features are every remaining column except the rating; target is the rating
        X, y = movie_data.drop(columns="rating"), movie_data["rating"].copy()
        # Hold out 20% of the raters; the movie's predicted rating is the
        # model's average prediction for that held-out group
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)
        # Standardise the features; the scaler is fitted on the training rows
        # only and then applied unchanged to the held-out rows
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        # Create the model and fit it on the training data
        model = KNeighborsRegressor(n_neighbors=3)
        model.fit(X_train, y_train)

        # Predict the held-out raters and average their predicted ratings
        y_predicted = model.predict(X_test)
        predicted_rating = y_predicted.mean()

    else:
        # Too few ratings to model: use the mean rating plus a small random offset
        predicted_rating = overall_rating + rng.uniform(-FALLBACK_JITTER, FALLBACK_JITTER)

    # Clamp to the valid rating range
    predicted_rating = max(min(predicted_rating, RATING_MAX), RATING_MIN)

    predicted_ratings.append(str(movie_id) + "," + str(round(predicted_rating, 2)) + "," + str(round(overall_rating, 2)))
    
    
# newline='' hands line-ending control to the csv module. Without it, the
# writer's "\r\n" row terminator is translated a second time on Windows and
# every row in the file is followed by a blank line.
with open('./data/PredictedMovieRatings.csv', 'w', newline='') as f:
    w = csv.writer(f, delimiter=',')
    # Each entry is a comma-joined string; split it back into its fields so
    # the writer emits one column per value.
    w.writerows(entry.split(',') for entry in predicted_ratings)
    
