# Personality Aware Recommender System: MovieLens Replication and Experiment

## Setup
This first cell imports the required libraries and installs the required pip packages.

In [26]:
import pandas as pd
import numpy as np
from surprise import Dataset
from surprise import Reader
from surprise import KNNBasic
from surprise.model_selection import cross_validate
from surprise import KNNBasic
from surprise import PredictionImpossible
import heapq
import numpy as np
import json

!pip install surprise



## Preprocessing/importing data

In [41]:
# Tabular inputs: movie metadata, per-user personality scores, and user ratings.
movies = pd.read_csv("aggregate_movie_data.csv")
personality = pd.read_csv("personality-data.csv")
ratings = pd.read_csv("updatedRatings.csv")
# NOTE: an earlier, un-assigned `ratings.rename(...)` call lived here. DataFrame.rename
# returns a new frame, so that call never changed `ratings`. The raw column names
# ('useri', ' movie_id', ' rating' - note the leading spaces) are kept on purpose
# because the modelling cells below index `ratings` with exactly those names.

# Load the precomputed JSON lookups; `with` closes each file handle after parsing.
with open('result.json') as f:
    personalityAndRatingsDict = json.load(f)

with open('userIdToPersonalityTraitDict.json') as f:
    userIdToPersonalityTraitDict = json.load(f)

with open('movieDetails.json') as f:
    movieDetailsDict = json.load(f)

## Defining important variables

In [28]:
# Personality traits used throughout the notebook
pTraits = [
    "openness",
    "conscientiousness",
    "extraversion",
    "agreeableness",
    "emotional_stability",
]

# Three-level buckets (low / moderate / high) for traits, ratings and popularity
levelOfTraits = [f"{level}Trait" for level in ("low", "moderate", "high")]
ratingLevels = [f"{level}Rating" for level in ("low", "moderate", "high")]
popularityLevels = [f"{level}Popularity" for level in ("low", "moderate", "high")]

# Genre indicator names, each prefixed with "is_"
genres = [
    f"is_{genre}"
    for genre in (
        "action", "adventure", "animation", "children", "comedy", "crime",
        "documentary", "drama", "fantasy", "horror", "musical", "mystery",
        "romance", "science_fiction", "thriller", "war", "western",
    )
]


## Standard User-User CF

In [42]:
# Baseline: plain user-user collaborative filtering, no personality signal.

# The reader needs the rating bounds; take them from the observed ratings.
lowest_rating = ratings[' rating'].min()
highest_rating = ratings[' rating'].max()
reader = Reader(rating_scale=(lowest_rating, highest_rating))

# Wrap the (user, movie, rating) columns as a Surprise dataset
data = Dataset.load_from_df(ratings[['useri', ' movie_id', ' rating']], reader)

# Cosine similarity computed between users
model = KNNBasic(sim_options=dict(name='cosine', user_based=True))

# 5-fold cross-validation; verbose=True prints the per-fold RMSE/MAE table
results = cross_validate(
    model,
    data,
    measures=['RMSE', 'MAE'],
    cv=5,
    verbose=True,
)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9150  0.9120  0.9149  0.9129  0.9109  0.9132  0.0016  
MAE (testset)     0.6858  0.6842  0.6849  0.6844  0.6829  0.6844  0.0010  
Fit time          3.83    3.88    4.14    3.74    3.78    3.87    0.14    
Test time         21.96   22.25   21.74   21.38   21.51   21.77   0.31    


## Personality Aware User-User CF Model

In [91]:
# Inherits from KNNBasic (User User)
class KNNBasicWithPersonality(KNNBasic):
  """User-user KNN whose prediction is blended with movie and personality terms.

  Besides the usual similarity settings, ``estimate`` reads these entries from
  ``self.sim_options``:

  * "movieData": mapping keyed by ``str(float(raw movie id))``; each entry is
    read for "setOfGenres", "ratingLevel", "popularityLevel" and "average_score".
  * "personalityAndRatingsDict": nested mapping indexed as
    [trait][traitLevel][ratingLevel][popularityLevel][genre].
  * "userIdToPersonalityTraitDict": mapping indexed as [raw user id][trait].
  * "personalityWeight": weight of the personality term in the final blend.

  Other extra entries (e.g. "personalityData", "genres") may still be passed
  but are not read by this class.
  """

  # Personality traits averaged into the personality term
  TRAITS = ("openness", "conscientiousness", "extraversion", "agreeableness", "emotional_stability")

  # Overriding estimate function
  def estimate(self, u, i):
    """Predict the rating of inner user ``u`` for inner item ``i``.

    Input: u - userId (inner id)
           i - itemId (inner id)
    Return: (est, details) - the predicted score and ``{"actual_k": ...}``,
            the number of positive-similarity neighbors that were used.
    Raises: PredictionImpossible when the user/item is unknown to the trainset,
            when fewer than ``min_k`` neighbors have positive similarity, or
            when the movie has no entry in ``sim_options["movieData"]``.
    """
    if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):
        raise PredictionImpossible("User and/or item is unknown.")

    x, y = self.switch(u, i)

    neighbors = [(self.sim[x, x2], r) for (x2, r) in self.yr[y]]
    k_neighbors = heapq.nlargest(self.k, neighbors, key=lambda t: t[0])

    # compute weighted average over neighbors with positive similarity
    sum_sim = sum_ratings = actual_k = 0
    for (sim, r) in k_neighbors:
        if sim > 0:
            sum_sim += sim
            sum_ratings += sim * r
            actual_k += 1

    if actual_k < self.min_k:
        raise PredictionImpossible("Not enough neighbors.")

    # Convert model Id's to raw Id's
    rawId = self.trainset.to_raw_uid(u)
    # movieData keys are the float-formatted raw movie id, e.g. "1.0"
    rawMovieId = str(float(self.trainset.to_raw_iid(i)))

    # Prepare data
    allMovieData = self.sim_options["movieData"]
    personalityWeight = self.sim_options["personalityWeight"]

    # A movie without details cannot be blended; report it through the same
    # channel as the other failure cases instead of leaking a raw KeyError.
    if rawMovieId not in allMovieData:
        raise PredictionImpossible("No movie details for this item.")
    movieData = allMovieData[rawMovieId]
    movieGenres = movieData["setOfGenres"]

    # aggScore = Aggregate score added to standard user user predicted score
    # NOTE(review): when the movie has no genres aggScore stays 0 while
    # personalityWeight is still applied, so the two remaining weights sum to
    # (1 - personalityWeight) only - confirm this down-weighting is intended.
    aggScore = 0
    if len(movieGenres) > 0:
        personalityAndRatingsDict = self.sim_options["personalityAndRatingsDict"]
        userTraitLevels = self.sim_options["userIdToPersonalityTraitDict"][rawId]
        movieRatingLevel = movieData["ratingLevel"]
        moviePopularityLevel = movieData["popularityLevel"]
        for trait in self.TRAITS:
            traitLevel = userTraitLevels[trait]
            # Per-genre values for this trait level and this movie's rating/popularity bucket
            genreScores = personalityAndRatingsDict[trait][traitLevel][movieRatingLevel][moviePopularityLevel]
            traitScore = sum(genreScores[genre] for genre in movieGenres)
            # Average over the movie's genres
            aggScore += traitScore / len(movieGenres)
        # Average over the traits
        aggScore /= len(self.TRAITS)

    # Blend: the non-personality weight is split evenly between the neighbor
    # estimate and the movie's average score.
    otherWeight = (1 - personalityWeight) / 2
    est = (otherWeight * (sum_ratings / sum_sim)) + (otherWeight * movieData["average_score"]) + (personalityWeight * aggScore)

    details = {"actual_k": actual_k}

    return est, details

## Run Personality Aware Model

In [90]:
# Weight given to the personality term in the blended estimate
personalityWeight = 0.23224511718044732

# The reader needs the rating bounds; take them from the observed ratings.
lowest_rating = ratings[' rating'].min()
highest_rating = ratings[' rating'].max()
reader = Reader(rating_scale=(lowest_rating, highest_rating))

# Wrap the (user, movie, rating) columns as a Surprise dataset
data = Dataset.load_from_df(ratings[['useri', ' movie_id', ' rating']], reader)

# The extra lookups travel to the model through sim_options
model = KNNBasicWithPersonality(sim_options=dict(
  name='cosine',
  user_based=True,
  personalityAndRatingsDict=personalityAndRatingsDict,
  userIdToPersonalityTraitDict=userIdToPersonalityTraitDict,
  personalityData=personality,
  movieData=movieDetailsDict,
  personalityWeight=personalityWeight,
  genres=genres,
))

# 5-fold cross-validation; the table is printed by verbose=True, the raw dict by print
results = cross_validate(
  model,
  data,
  measures=['RMSE', 'MAE'],
  cv=5,
  verbose=True,
)
print(results)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasicWithPersonality on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8885  0.8872  0.8926  0.8859  0.8882  0.8885  0.0023  
MAE (testset)     0.6841  0.6837  0.6872  0.6835  0.6843  0.6846  0.0014  
Fit time          4.44    4.52    4.50    4.51    4.47    4.49    0.03    
Test time         60.83   61.26   60.73   60.90   61.40   61.02   0.26    
{'test_rmse': array([0.88846538, 0.88723197, 0.89263614, 0.88588207, 0.88822942]), 'test_mae': array([0.68410402, 0.6837298 , 0.68723688, 0.68351404, 0.68427325]), 'fit_ti

## Gradient Descent

In [93]:
import numpy as np

# Rating bounds taken from the observed ratings
lowest_rating = ratings[' rating'].min()
highest_rating = ratings[' rating'].max()
reader = Reader(rating_scale=(lowest_rating, highest_rating))

# Dataset evaluated by the cost function below
rating_columns = ['useri', ' movie_id', ' rating']
data = Dataset.load_from_df(ratings[rating_columns], reader)

# Cost Function - Runs Cross validation that outputs the RMSE
def compute_rmse(weight, n_splits=5, random_state=0):
  """Return the mean cross-validated RMSE of the personality model for ``weight``.

  Input: weight       - value passed to the model as "personalityWeight"
         n_splits     - number of cross-validation folds (default 5, as before)
         random_state - seed for the fold shuffling
  Return: mean of the per-fold test RMSE values

  The folds are seeded so that every call is scored on the same splits.
  With unseeded folds, two calls differ by split noise as well as by
  ``weight``, which swamps the tiny differences a finite-difference gradient
  relies on.
  """
  # Imported here so the function works regardless of the import cell's contents
  from surprise.model_selection import KFold

  model = KNNBasicWithPersonality(sim_options={
    'name': 'cosine',
    'user_based': True,
    "personalityAndRatingsDict": personalityAndRatingsDict,
    "userIdToPersonalityTraitDict": userIdToPersonalityTraitDict,
    "personalityData": personality,
    "movieData": movieDetailsDict,
    "personalityWeight": weight,
    "genres": genres
  })
  # An integer random_state makes KFold produce the same splits on every call
  folds = KFold(n_splits=n_splits, random_state=random_state, shuffle=True)
  results = cross_validate(model, data, measures=['RMSE', 'MAE'], cv=folds, verbose=False)
  rmse = np.mean(results['test_rmse'])
  return rmse

# Gradient Descent Function
def gradient_descent(init_weight, learning_rate, iterations, cost_fn=None, h=.001):
  """Minimise a one-dimensional cost with forward-difference gradient descent.

  Input: init_weight   - starting weight
         learning_rate - multiplier applied to the gradient at each step
         iterations    - number of descent steps to run
         cost_fn       - callable mapping a weight to its cost; defaults to
                         ``compute_rmse`` (looked up when the function runs)
         h             - step used for the numerical gradient
  Return: the weight after the last step (``init_weight`` if iterations is 0)
  Raises: ValueError if ``h`` is zero
  """
  if h == 0:
    raise ValueError("h must be non-zero")
  if cost_fn is None:
    cost_fn = compute_rmse

  # Obtain Weight from initial weight
  weight = init_weight
  # Run N iterations
  for i in range(iterations):
    print("Starting step", i)
    # Get current Cost
    current_rmse = cost_fn(weight)
    print(f"Iteration {i}, RMSE: {current_rmse}, Weight: {weight}")
    # Probe at weight + h without touching `weight`: adding h and subtracting
    # it again does not restore the exact float, so the weight would drift.
    probe_weight = weight + h
    print(f"Testing with weight: {probe_weight}")
    # Compute RMSE with the adjusted weight
    rmse_with_h = cost_fn(probe_weight)
    print(f"RMSE tested {rmse_with_h}")
    # Forward-difference derivative with respect to the weight
    grad = (rmse_with_h - current_rmse) / h
    print(f"Gradient: {grad}")

    # After test, descend gradient
    print(f"Gradient changing by {learning_rate * grad}")
    weight -= learning_rate * grad
    print(f"New weight: {weight}")

  print("Final", weight)
  return weight

# Starting value for the personality weight
initial_weight = 0.23224511718044732

# Descent settings: step multiplier and number of steps
learning_rate = .1
iterations = 20

# Optimise the weight, starting from initial_weight
optimized_weights = gradient_descent(
  init_weight=initial_weight,
  learning_rate=learning_rate,
  iterations=iterations,
)


Starting step 0
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Iteration 0, RMSE: 0.88862820442151, Weight: 0.23141492101871536
Testing with weight: 0.23241492101871536
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE tested 0.8886299024598927
Gradient: 0.0016980383826803447
Gradient changing by 0.00016980383826803447
New weight: