# In [None]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt

# Train a random-forest regressor to predict IMDb scores from genre
# indicators, premiere year and cast size, then report MAE / MSE / R^2 and
# plot actual vs. predicted scores.
#
# NOTE(review): the column names used below (IMDB, Genre, Premiere, Title,
# Runtime, Language, Director, Cast) are assumed to exist in the CSV --
# confirm against the actual file.
data = pd.read_csv('NetflixOriginals.csv')

# Rows without a rating carry no information about the target. Drop them
# rather than imputing the mean: a mean-filled label is a fabricated target,
# and a mean computed over the whole frame lets rows that later land in the
# test split influence the training labels.
data = data.dropna(subset=['IMDB'])

# One-hot encode the comma-separated genre tags. Whitespace around the
# separators is normalised first so that "Comedy, Drama" and "Comedy,Drama"
# yield the same indicator columns instead of both "Drama" and " Drama".
genre_tags = data['Genre'].str.replace(r'\s*,\s*', ',', regex=True).str.strip()
genres = genre_tags.str.get_dummies(',')
data = pd.concat([data, genres], axis=1)

# Keep only the year of the premiere date as a numeric feature.
data['PremiereYear'] = pd.to_datetime(data['Premiere']).dt.year

# Cast size = number of comma-separated names. A missing cast entry would
# otherwise propagate NaN into the feature matrix, so it is counted as 0.
data['NumActors'] = (data['Cast'].str.count(',') + 1).fillna(0).astype(int)

# Remove the raw text/date columns. 'Cast' must go too: it is a string column
# and the regressor cannot be fitted on non-numeric features.
data = data.drop(
    columns=['Title', 'Genre', 'Premiere', 'Runtime', 'Language', 'Director', 'Cast']
)

# Features are everything except the target; hold out 20% for evaluation.
X = data.drop(columns=['IMDB'])
y = data['IMDB']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fixed random_state keeps the forest (and therefore the metrics) reproducible.
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# Evaluate on the held-out split.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"R-squared (R^2): {r2:.2f}")

# Scatter of actual vs. predicted scores for the held-out rows.
plt.scatter(y_test, y_pred)
plt.xlabel("Actual IMDb Scores")
plt.ylabel("Predicted IMDb Scores")
plt.title("Actual vs. Predicted IMDb Scores")
plt.show()
