In [1]:
# Decision Trees
# Needed libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor

In [2]:
# Takes the csv file and converts it into a dataframe
df = pd.read_csv('/Users/dimitrishort/Documents/DataMiningFinal/ProcessedMovieData.csv')

# Displays the first few rows of the dataframe 
df.head()

Unnamed: 0,userId,movieId,rating,title,genres,datetime,year,month,day,dayofweek,...,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,release_year
0,1,1,0.777778,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,2000-07-30 18:45:03,2000,7,30,6,...,0,0,0,0,0,0,0,0,0,1995.0
1,5,1,0.777778,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1996-11-08 06:36:02,1996,11,8,4,...,0,0,0,0,0,0,0,0,0,1995.0
2,7,1,0.888889,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,2005-01-25 06:52:26,2005,1,25,1,...,0,0,0,0,0,0,0,0,0,1995.0
3,15,1,0.444444,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,2017-11-13 12:59:30,2017,11,13,0,...,0,0,0,0,0,0,0,0,0,1995.0
4,17,1,0.888889,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,2011-05-18 05:28:03,2011,5,18,2,...,0,0,0,0,0,0,0,0,0,1995.0


In [3]:
# Drop the title and genres columns as they aren't needed
df.drop(['datetime'], axis=1, inplace=True)

df = df.dropna() # Removes rows that has missing values 

# Prints info to ensure data is in right format 
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100818 entries, 0 to 100835
Data columns (total 33 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   userId              100818 non-null  int64  
 1   movieId             100818 non-null  int64  
 2   rating              100818 non-null  float64
 3   title               100818 non-null  object 
 4   genres              100818 non-null  object 
 5   year                100818 non-null  int64  
 6   month               100818 non-null  int64  
 7   day                 100818 non-null  int64  
 8   dayofweek           100818 non-null  int64  
 9   hour                100818 non-null  int64  
 10  genres_encoded      100818 non-null  int64  
 11  title_encoded       100818 non-null  int64  
 12  (no genres listed)  100818 non-null  int64  
 13  Action              100818 non-null  int64  
 14  Adventure           100818 non-null  int64  
 15  Animation           100818 non-nul

In [4]:
# Drops columns that aren't needed for the model 
X = df.drop(['userId', 'movieId', 'rating', 'title', 'genres'], axis=1)
y = df['rating'] # Sets the target to rating for the model 

# Splits the dataset into training and test sets with 20% for testing and 80% for training
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
model = DecisionTreeRegressor(random_state=42) # Initializes DecisionTreeRegressor 
model.fit(X_train, y_train) # Trains the model using the training dataset 

y_pred = model.predict(X_test) # Uses the trained model to predict ratings on the test dataset

mse = mean_squared_error(y_test, y_pred) # Calculates the MSE between the actual ratings and the predicted ratings
print("Mean Squared Error:", mse) # Prints the MSE 

Mean Squared Error: 0.0804015100864271


In [6]:
def recommend_movies(user_id, n_recommendations):
    # Gets the movies that the user already rated 
    rated_movies = df[df['userId'] == user_id]['movieId']
    
    # Gets the movies that haven't been rated by the user 
    unrated_movies = df[~df['movieId'].isin(rated_movies)]
    
    # Prepares the data for the unrated movies by dropping unnecessary columns
    X_unrated = unrated_movies.drop(['userId', 'movieId', 'rating', 'title', 'genres'], axis=1)
    
    # Predicts ratings for the unrated movies from the trained model 
    predicted_ratings = model.predict(X_unrated) 
    
    # Adds the predicted ratings to the unrated dataframe 
    unrated_movies['predicted_rating'] = predicted_ratings
    
    # Gets the needed columns, cleans it by removing duplicates and gets the top recommendations 
    recommendations = unrated_movies[['movieId', 'title', 'genres', 'predicted_rating']].drop_duplicates()
    
    # Sorts the recommended movies in descending order 
    recommendations = recommendations.sort_values('predicted_rating', ascending=False).head(n_recommendations)
    return recommendations # Returns the list of recommended movies 

# Calls the function to give recommendations 
print(recommend_movies(1, 5))

       movieId                                              title  \
89904     5088                      Going Places (Valseuses, Les)   
56211    74510  Girl Who Played with Fire, The (Flickan som le...   
88655   159811                          The Bremen Town Musicians   
88656   163072                                        Winnie Pooh   
88657   163112                      Winnie the Pooh Goes Visiting   

                                    genres  predicted_rating  
89904                   Comedy|Crime|Drama               1.0  
56211  Action|Crime|Drama|Mystery|Thriller               1.0  
88655              Animation|Drama|Fantasy               1.0  
88656                   Animation|Children               1.0  
88657                            Animation               1.0  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unrated_movies['predicted_rating'] = predicted_ratings


In [None]:
'''
The movie recommendations using Decision Tree modeling does produce recommendations
that are aligned with the user preferences. It captures genres such as Drama, Action, 
and Crime which are among the top genres preferred by the user. On the otherhand, the 
recommendations also include genres that are less preferred as well such as Children and
Fantasy. It appears that the model is also overfitting as the predicted ratings are 
all 1.0. Altogether, the Decision Tree model can work as an effective movie recommendation
model as most of the movies do align with the users preferred genres. 
'''