In [1]:
# Initial imports.
import numpy as np
import pandas as pd
from path import Path
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [2]:
# Loading data
file_path = Path("Resources/Movie_Data.csv")
movies_df = pd.read_csv(file_path)
movies_df.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,296,5.0,1147880044,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
1,1,306,3.5,1147868817,Three Colors: Red (Trois couleurs: Rouge) (1994),Drama
2,1,307,5.0,1147868828,Three Colors: Blue (Trois couleurs: Bleu) (1993),Drama
3,1,665,5.0,1147878820,Little Buddha (1993),Drama
4,1,899,3.5,1147868510,Singin' in the Rain (1952),Comedy|Musical|Romance


In [3]:
movies_df.dtypes

userId         int64
movieId        int64
rating       float64
timestamp      int64
title         object
genres        object
dtype: object

In [4]:
# Preprocess the data

# check which columns have empty values and see if they can be replaced, if they can not be replaced then drop the null rows

# Drop the null columns where all values are null
# This step is not necessary - movies_df = movies_df.dropna(axis='columns', how='all')

# Drop the null rows
movies_df = movies_df.dropna()





movies_df


Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,296,5.0,1147880044,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
1,1,306,3.5,1147868817,Three Colors: Red (Trois couleurs: Rouge) (1994),Drama
2,1,307,5.0,1147868828,Three Colors: Blue (Trois couleurs: Bleu) (1993),Drama
3,1,665,5.0,1147878820,Little Buddha (1993),Drama
4,1,899,3.5,1147868510,Singin' in the Rain (1952),Comedy|Musical|Romance
5,1,1088,4.0,1147868495,Dirty Dancing (1987),Drama|Musical|Romance


In [5]:
#convert timestamp column into datetime
# how is datetime send to model? 
# this column represents - time movie was rated or time movie was watched?
# google to see how other developers handled this column

In [6]:
# do we need to send title to model?
# pandas.DataFrame.corr
# run a pandas correlation function and create a heat map to see which all columns are connected to ratings, 
# research how corr is used to create heatmaps
# if there is correlation -1 to 1, if corr is high then send it to model, if low then delete the column. 
# closer to -1 means negativee cor, closer to 1 is positive corr

# to better understand importance of this feature, use heatmap graph on 

In [7]:
# if there is no cor then remove column, if keeping it then use LAbel Enconder
# Drop title column
movies_df = movies_df.drop(['title'], axis=1)

# Use label encoder on title column, if movie title is sent to model

In [8]:
# movie id - not necessary for training model - drop column

# user id is not necessary for training the model- drop column

In [9]:
movies_df.select_dtypes(include=['object']).columns

Index(['genres'], dtype='object')

In [10]:
movies_df.genres.value_counts()

Drama                          3
Comedy|Crime|Drama|Thriller    1
Comedy|Musical|Romance         1
Drama|Musical|Romance          1
Name: genres, dtype: int64

In [11]:
# convert genres into separate columns - using your own function instead of get Dummies

movies_df = pd.get_dummies(movies_df, columns = ['genres'])
movies_df.head()

# instead of using get_dummies, write a custom function to transform data into multiple new columns. 

Unnamed: 0,userId,movieId,rating,timestamp,genres_Comedy|Crime|Drama|Thriller,genres_Comedy|Musical|Romance,genres_Drama,genres_Drama|Musical|Romance
0,1,296,5.0,1147880044,1,0,0,0
1,1,306,3.5,1147868817,0,0,1,0
2,1,307,5.0,1147868828,0,0,1,0
3,1,665,5.0,1147878820,0,0,1,0
4,1,899,3.5,1147868510,0,1,0,0


In [12]:
# remove userid column
X = movies_df.drop(['userId'], axis =1)


y = movies_df[['rating']]

In [13]:
X.describe()

Unnamed: 0,movieId,rating,timestamp,genres_Comedy|Crime|Drama|Thriller,genres_Comedy|Musical|Romance,genres_Drama,genres_Drama|Musical|Romance
count,6.0,6.0,6.0,6.0,6.0,6.0,6.0
mean,593.5,4.333333,1147872000.0,0.166667,0.166667,0.5,0.166667
std,345.316521,0.752773,5576.639,0.408248,0.408248,0.547723,0.408248
min,296.0,3.5,1147868000.0,0.0,0.0,0.0,0.0
25%,306.25,3.625,1147869000.0,0.0,0.0,0.0,0.0
50%,486.0,4.5,1147869000.0,0.0,0.0,0.5,0.0
75%,840.5,5.0,1147876000.0,0.0,0.0,1.0,0.0
max,1088.0,5.0,1147880000.0,1.0,1.0,1.0,1.0


In [14]:
y['rating'].value_counts()

5.0    3
3.5    2
4.0    1
Name: rating, dtype: int64

In [15]:
# Splitting into Train and Test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [16]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [17]:
# Create a random forest Regressor
k = 5
rf_model = RandomForestRegressor(n_estimators=128, random_state=78, max_depth = k) 

In [18]:
# Fitting the model
# y_train.shape
# rf_model = rf_model.fit(X_train_scaled, y_train)
rf_model = rf_model.fit(X_train_scaled, y_train.values.ravel())
# np.ravel(y_train)
# y_train.values.ravel().shape

In [19]:
# Making predictions using the testing data.

predictions = rf_model.predict(X_test_scaled)

In [26]:
# X_train_scaled

array([[-0.56553735,  0.96225045, -0.55377331, -0.57735027,  0.        ,
         1.        , -0.57735027],
       [-0.56847902, -1.34715063, -0.55601448, -0.57735027,  0.        ,
         1.        , -0.57735027],
       [-0.59789579,  0.96225045,  1.73140751,  1.73205081,  0.        ,
        -1.        , -0.57735027],
       [ 1.73191216, -0.57735027, -0.62161971, -0.57735027,  0.        ,
        -1.        ,  1.73205081]])

In [29]:
print("Difference between y_test and predictions\n", y_test, " ", predictions)

Difference between y_test and predictions
    rating
3     5.0
4     3.5   [4.82421875 3.92578125]


In [24]:
# predictions

array([4.82421875, 3.92578125])

In [20]:
# Display performance metrics
print('MAE: ', mean_absolute_error(y_test, predictions))
print('MSE: ', mean_squared_error(y_test, predictions)) 

MAE:  0.30078125
MSE:  0.1060943603515625


In [21]:
# train_scores = []
# random_forest_models = []

# train_scores.append(rf_model.score(X_train_scaled, y_train))
rf_model.score(X_train_scaled, y_train)

# random_forest_models.append(rf_model)

0.8545464409722222

In [22]:
# Calculate feature importance in the Random Forest model.
importances = rf_model.feature_importances_
importances

array([0.09371593, 0.32247993, 0.42462087, 0.03270889, 0.        ,
       0.0415469 , 0.08492748])

In [23]:
# We can sort the features by their importance.
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.424620874219447, 'timestamp'),
 (0.32247992863514713, 'rating'),
 (0.09371592823867575, 'movieId'),
 (0.0849274787722602, 'genres_Drama|Musical|Romance'),
 (0.04154689926322393, 'genres_Drama'),
 (0.032708890871245955, 'genres_Comedy|Crime|Drama|Thriller'),
 (0.0, 'genres_Comedy|Musical|Romance')]

In [None]:
#  note:  handle genres differently or is it ok? The above indicates that movie that falls in multiple genres have a higher importance.


In [None]:
print("Difference between y_test and predictions\n", y_test, " ", predictions)

In [None]:
# compare ypred against ytest
# create graph 

In [None]:
# join predictions with dataframe based on index
make plots, compare ypred and ytest
