In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MultiLabelBinarizer

from random_forest import DecisionTree, RandomForest


In [31]:
# data processing
# Loads the raw movie table, removes unused columns and incomplete rows, encodes
# the two categorical columns, and splits off the prediction target.
# Leaves two names for the following cells:
#   df     - feature DataFrame (imdb_score removed)
#   scores - NumPy array of imdb_score values, row-aligned with df

# Columns we consider not useful for predicting the score; removed up front.
UNUSED_COLUMNS = [
    "color",
    "director_name",
    "actor_1_name",
    "actor_2_name",
    "actor_3_name",
    "language",
    "country",
    "movie_imdb_link",
    "plot_keywords",
    "movie_title",
    "title_year",
    "cast_total_facebook_likes",
    "facenumber_in_poster",
    "aspect_ratio",
    "movie_facebook_likes",
]

df = pd.read_csv("movie_metadata.csv")
df = df.drop(columns=UNUSED_COLUMNS)
# Drop every row that still contains a missing (NaN) value.
df = df.dropna()
# Report how many movies (rows) are left. Printed explicitly because a bare
# expression in the middle of a cell is evaluated but never displayed.
print(f"Movies remaining after dropping NaN rows: {len(df)}")

# Encode genres (multi-label): "A|B" -> ["A", "B"] -> one 0/1 column per genre.
df["genres"] = df["genres"].str.split("|")
mlb = MultiLabelBinarizer()
genre_encoded = pd.DataFrame(
    mlb.fit_transform(df["genres"]),
    columns=mlb.classes_,
    # Reuse df's index: dropna() may leave gaps, and join() below aligns on index.
    index=df.index,
)
df = df.drop(columns=["genres"]).join(genre_encoded)

# Encode content_rating as integer codes.
# NOTE(review): scikit-learn documents LabelEncoder for target labels; the integer
# order it assigns to the ratings carries no meaning. Kept as-is here; consider
# OrdinalEncoder or one-hot columns if the model is order-sensitive.
le = LabelEncoder()
df["content_rating"] = le.fit_transform(df["content_rating"])

# The value we want to predict; removed from df so it is not used as a feature.
scores = df["imdb_score"].values
df = df.drop(columns=["imdb_score"])
print(scores)


[7.9 7.1 6.8 ... 6.9 6.4 6.6]


In [43]:
# Reference model: scikit-learn's RandomForestRegressor on an 80/20 split.
# Reads df (features) and scores (target) produced by the data-processing cell.
X_train, X_test, y_train, y_test = train_test_split(
    df, scores, test_size=0.2, shuffle=True, random_state=42
)
# Seed the forest too: a seeded split alone is not enough, because the forest's
# bootstrap sampling and feature selection are random and would otherwise make
# the metrics below change on every run.
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)
# predict
y_pred = model.predict(X_test)

# evaluate (mean_squared_error / r2_score are imported in the top import cell)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(y_pred[:10])  # Print first 10 predictions
print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")

[5.421 7.171 6.808 8.148 6.007 5.458 4.967 7.166 5.241 5.876]
Mean Squared Error: 0.44152238331160365
R^2 Score: 0.6070761701749023


In [45]:
# Base model: a single decision tree.
# DecisionTree is the project's own implementation from the local random_forest
# module (imported in the top import cell).
# Same seeded 80/20 split as the other model cells, so results are comparable.
X_train, X_test, y_train, y_test = train_test_split(
    df, scores, test_size=0.2, shuffle=True, random_state=42
)
# NOTE(review): None presumably means "no depth limit" / "consider all features";
# confirm against random_forest.py.
model = DecisionTree(
    max_depth=None,
    max_features=None,
)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# evaluate (mean_squared_error / r2_score are imported in the top import cell)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(y_pred[:10])  # Print first 10 predictions
print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")


[5.8 6.7 7.5 8.4 6.  5.8 5.5 6.6 4.8 6.2]
Mean Squared Error: 0.8696610169491525
R^2 Score: 0.22606293509679543


In [None]:
# Custom random forest.
# RandomForest is the project's own implementation from the local random_forest
# module (imported in the top import cell).

# train/test split: same seeded 80/20 split as the other model cells
X_train, X_test, y_train, y_test = train_test_split(
    df, scores, test_size=0.2, shuffle=True, random_state=42
)

# train model
# NOTE(review): no seed is passed to RandomForest here, so metrics may vary from
# run to run; check whether random_forest.py exposes a seed parameter.
model = RandomForest(
    n_trees=50,
    max_depth=None,
    max_features='sqrt'
)
model.fit(X_train, y_train)
# predict
y_pred = model.predict(X_test)

# evaluate (mean_squared_error / r2_score are imported in the top import cell)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(y_pred[:10])  # Print first 10 predictions
print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")



Training trees...: 100%|██████████| 50/50 [02:47<00:00,  3.35s/it]


(767, 33)
[5.62665952 6.837      6.81506641 7.6873155  5.95910476 5.72383622
 5.76999399 7.00005013 5.55366667 6.03416389]
Mean Squared Error: 0.4532623446109051
R^2 Score: 0.5966284313284114
