In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

# Load data
ratings_df = pd.read_csv('ratings.csv')
movies_df = pd.read_csv('movies.csv')

print("Load Dataset - Done")

# Merge data
df = pd.merge(ratings_df, movies_df, on='movieId')

# Create one-hot encoding of movie genres
genres = ['Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'IMAX', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
for genre in genres:
    df[genre] = df['genres'].apply(lambda x: 1 if genre in x else 0)

df = df.head(1000)
    
# Split data into train and test sets
train_df, test_df = train_test_split(df, test_size=0.2)

print("Splitting Dataset - Done")

# Train Decision Tree model
dt = DecisionTreeRegressor()
dt.fit(train_df[genres], train_df['rating'])

print("Train Decision Tree model - Done")

# Make predictions
preds = dt.predict(test_df[genres])

#Evaluate Model
from sklearn.metrics import mean_squared_error,mean_absolute_error, explained_variance_score, median_absolute_error

mae = mean_absolute_error(test_df['rating'], preds)
rmse = mean_squared_error(test_df['rating'], preds, squared=False)
mape = (abs(test_df['rating'] - preds) / test_df['rating']).mean() * 100
evs = explained_variance_score(test_df['rating'], preds)
medae = median_absolute_error(test_df['rating'], preds)

print("Decision Tree model - Evaluated")
print('MAE: ', mae)
print('RMSE: ', rmse)
print('MAPE: ', mape)
print('Explained Variance Score: ', evs)
print('Median Absolute Error: ', medae)


C:\Users\rithe\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\numpy\.libs\libopenblas.FB5AE2TYXYH2IJRDKGDGQ3XBKLKTF43H.gfortran-win_amd64.dll
C:\Users\rithe\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\numpy\.libs\libopenblas64__v0.3.21-gcc_10_3_0.dll


Load Dataset - Done
Splitting Dataset - Done
Train Decision Tree model - Done
Decision Tree model - Evaluated
MAE:  0.7264125
RMSE:  0.9103064387886092
MAPE:  25.1611498015873
Explained Variance Score:  1.1102230246251565e-16
Median Absolute Error:  0.82125
