In [2]:
import pandas as pd
import numpy as np

# Load the movie dataset with the Python parser; on_bad_lines='skip' makes
# pandas skip lines it cannot fit into the header's columns instead of raising.
# NOTE(review): the missing values reported by df.info() below presumably come
# from rows with too few fields, which are still loaded — confirm against the file.
df = pd.read_csv("mymoviedb.csv", on_bad_lines='skip', engine='python')

In [3]:
# Preview the first rows to check that the columns parsed as expected.
df.head()

Unnamed: 0,Release_Date,Title,Overview,Popularity,Vote_Count,Vote_Average,Original_Language,Genre,Poster_Url
0,2021-12-15,Spider-Man: No Way Home,Peter Parker is unmasked and no longer able to...,5083.954,8940,8.3,en,"Action, Adventure, Science Fiction",https://image.tmdb.org/t/p/original/1g0dhYtq4i...
1,2022-03-01,The Batman,"In his second year of fighting crime, Batman u...",3827.658,1151,8.1,en,"Crime, Mystery, Thriller",https://image.tmdb.org/t/p/original/74xTEgt7R3...
2,2022-02-25,No Exit,Stranded at a rest stop in the mountains durin...,2618.087,122,6.3,en,Thriller,https://image.tmdb.org/t/p/original/vDHsLnOWKl...
3,2021-11-24,Encanto,"The tale of an extraordinary family, the Madri...",2402.201,5076,7.7,en,"Animation, Comedy, Family, Fantasy",https://image.tmdb.org/t/p/original/4j0PNHkMr5...
4,2021-12-22,The King's Man,As a collection of history's worst tyrants and...,1895.511,1793,7.0,en,"Action, Adventure, Thriller, War",https://image.tmdb.org/t/p/original/aq4Pwv5Xeu...


In [4]:
# Preview the last rows of the frame.
df.tail()

Unnamed: 0,Release_Date,Title,Overview,Popularity,Vote_Count,Vote_Average,Original_Language,Genre,Poster_Url
9832,1973-10-15,Badlands,A dramatization of the Starkweather-Fugate kil...,13.357,896,7.6,en,"Drama, Crime",https://image.tmdb.org/t/p/original/z81rBzHNgi...
9833,2020-10-01,Violent Delights,A female vampire falls in love with a man she ...,13.356,8,3.5,es,Horror,https://image.tmdb.org/t/p/original/4b6HY7rud6...
9834,2016-05-06,The Offering,When young and successful reporter Jamie finds...,13.355,94,5.0,en,"Mystery, Thriller, Horror",https://image.tmdb.org/t/p/original/h4uMM1wOhz...
9835,2021-03-31,The United States vs. Billie Holiday,Billie Holiday spent much of her career being ...,13.354,152,6.7,en,"Music, Drama, History",https://image.tmdb.org/t/p/original/vEzkxuE2sJ...
9836,1984-09-23,Threads,Documentary style account of a nuclear holocau...,13.354,186,7.8,en,"War, Drama, Science Fiction",https://image.tmdb.org/t/p/original/lBhU4U9Eeh...


In [5]:
# List the column labels exactly as loaded; later cells must use these spellings.
df.columns

Index(['Release_Date', 'Title', 'Overview', 'Popularity', 'Vote_Count',
       'Vote_Average', 'Original_Language', 'Genre', 'Poster_Url'],
      dtype='object')

In [6]:
# (rows, columns) of the loaded frame.
df.shape

(9837, 9)

In [7]:
# Dtype and non-null count for every column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9837 entries, 0 to 9836
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Release_Date       9837 non-null   object 
 1   Title              9828 non-null   object 
 2   Overview           9828 non-null   object 
 3   Popularity         9827 non-null   float64
 4   Vote_Count         9827 non-null   object 
 5   Vote_Average       9827 non-null   object 
 6   Original_Language  9827 non-null   object 
 7   Genre              9826 non-null   object 
 8   Poster_Url         9826 non-null   object 
dtypes: float64(1), object(8)
memory usage: 691.8+ KB


In [8]:
# Summary statistics for every column, numeric and non-numeric alike.
df.describe(include="all")

Unnamed: 0,Release_Date,Title,Overview,Popularity,Vote_Count,Vote_Average,Original_Language,Genre,Poster_Url
count,9837,9828,9828,9827.0,9827.0,9827.0,9827,9826,9826
unique,5903,9514,9823,,3267.0,75.0,44,2337,9826
top,2022-03-10,Beauty and the Beast,The Kingdom of the People of the Earth once ru...,,0.0,6.4,en,Drama,https://image.tmdb.org/t/p/original/lBhU4U9Eeh...
freq,16,4,2,,100.0,435.0,7569,466,1
mean,,,,40.32057,,,,,
std,,,,108.874308,,,,,
min,,,,7.1,,,,,
25%,,,,16.1275,,,,,
50%,,,,21.191,,,,,
75%,,,,35.1745,,,,,


In [9]:
# Number of missing values in each column.
df.isna().sum()

Release_Date          0
Title                 9
Overview              9
Popularity           10
Vote_Count           10
Vote_Average         10
Original_Language    10
Genre                11
Poster_Url           11
dtype: int64

In [3]:
# Vote_Count and Vote_Average are read as object dtype (see df.info()), so
# select_dtypes(np.number) would only pick up Popularity and the two vote
# columns would later be mode-filled as text. Convert them to numbers first;
# values that cannot be parsed become NaN and are imputed with the rest.
for vote_col in ['Vote_Count', 'Vote_Average']:
    df[vote_col] = pd.to_numeric(df[vote_col], errors='coerce')

# Mean-impute every numeric column.
num_cols = df.select_dtypes(include=np.number).columns
df[num_cols] = df[num_cols].fillna(df[num_cols].mean())

In [4]:
# Fill gaps in every object-dtype column with that column's most frequent value
# (the first one returned by mode() when several values tie).
cat_cols = df.select_dtypes(include='object').columns
most_frequent = {name: df[name].mode()[0] for name in cat_cols}
df[cat_cols] = df[cat_cols].fillna(value=most_frequent)

In [5]:
# Remove rows that are identical across all columns, keeping the first occurrence.
# The original index labels are retained (no reset).
df = df.drop_duplicates()

In [6]:
# Parse the release dates (values that cannot be parsed become NaT) and derive
# the calendar year and month as separate columns.
release_dates = pd.to_datetime(df['Release_Date'], errors='coerce')
df = df.assign(
    Release_Date=release_dates,
    release_year=release_dates.dt.year,
    release_month=release_dates.dt.month,
)

In [None]:
# Most popular titles first. The column is spelled 'Popularity' in this dataset;
# the lowercase key raised a KeyError.
df.sort_values(by='Popularity', ascending=False)

In [None]:
# Titles rated above 7.0. Vote_Average is loaded as object dtype, so it is
# converted to numbers before comparing; unparseable values become NaN and
# are excluded by the comparison.
df[pd.to_numeric(df['Vote_Average'], errors='coerce') > 7.0]

In [None]:
# Mean revenue per original language. The grouping column is spelled
# 'Original_Language' in this dataset.
# NOTE(review): 'revenue' is not among the columns printed by df.columns
# earlier in this notebook — confirm the data provides it before running.
df.groupby('Original_Language')['revenue'].mean()

In [None]:
# 1 for films running 120 minutes or longer, 0 otherwise (missing runtimes give 0).
# NOTE(review): 'runtime' is not among the columns printed by df.columns
# earlier in this notebook — confirm the data provides it.
df['long_movie'] = (df['runtime'] >= 120).astype('int64')

In [None]:
# Rank titles by rating, best first (ties share the average rank). The column
# is spelled 'Vote_Average' and is loaded as object dtype, so convert it to
# numbers before ranking; unparseable values stay unranked (NaN).
df['rating_rank'] = pd.to_numeric(df['Vote_Average'], errors='coerce').rank(ascending=False)

In [None]:
# Revenue earned per minute of runtime. Stored as 'revenue_per_min', the name
# the feature lists further down select. A runtime of 0 is treated as missing
# so the ratio is NaN rather than +/-inf, which later scaling and model
# fitting cannot handle.
# NOTE(review): 'revenue' and 'runtime' are not among the columns printed by
# df.columns earlier in this notebook — confirm the data provides them.
df['revenue_per_min'] = df['revenue'] / df['runtime'].replace(0, np.nan)

In [None]:
# Popularity per vote (+1 in the denominator avoids dividing by zero).
# Uses the dataset's actual column spellings and stores the result as
# 'popularity_vote_ratio', the name the feature lists further down select.
# Vote_Count is loaded as object dtype, so it is converted to numbers first.
vote_count = pd.to_numeric(df['Vote_Count'], errors='coerce')
df['popularity_vote_ratio'] = df['Popularity'] / (vote_count + 1)

In [None]:
# Replace revenue with log1p(revenue), i.e. log(1 + revenue).
# The column is overwritten in place, so re-running this cell applies the
# transform a second time; every later use of 'revenue' sees the logged values.
df['revenue'] = np.log1p(df['revenue'])

In [None]:
# One-hot encode the language column (spelled 'Original_Language' in this
# dataset; the lowercase key raised a KeyError). drop_first=True omits the
# first category level. The source column is replaced by the indicator columns.
df = pd.get_dummies(df, columns=['Original_Language'], drop_first=True)

In [None]:
# Descriptive statistics for the ratings. Only a cell's last expression is
# displayed, so the individual results are collected and printed together.
# The column is spelled 'Vote_Average' and is loaded as object dtype; convert
# it and drop unparseable values so the median/percentiles are not NaN.
ratings = pd.to_numeric(df['Vote_Average'], errors='coerce').dropna()
q25, q50, q75 = np.percentile(ratings, [25, 50, 75])
print(pd.Series({
    'mean': np.mean(ratings),
    'median': np.median(ratings),
    'std': np.std(ratings),  # population standard deviation (ddof=0)
    'p25': q25,
    'p50': q50,
    'p75': q75,
}))

# 2x2 Pearson correlation matrix between budget and revenue.
# NOTE(review): 'budget' and 'revenue' are not among the columns printed by
# df.columns earlier in this notebook — confirm the data provides them.
np.corrcoef(df['budget'], df['revenue'])

In [None]:
# Feature matrix and regression target. Column names follow the dataset's
# spelling ('Popularity', 'Vote_Count', 'Vote_Average'); object-dtype columns
# are converted to numbers so the matrix can be scaled.
# NOTE(review): 'budget' and 'runtime' are not among the columns printed by
# df.columns earlier in this notebook — confirm the data provides them.
features = ['budget', 'runtime', 'Popularity', 'Vote_Count', 'revenue_per_min', 'popularity_vote_ratio', 'release_year']
X = df[features].apply(pd.to_numeric, errors='coerce')
y = pd.to_numeric(df['Vote_Average'], errors='coerce')

In [None]:
# Z-score standardisation: (X - mean) / std, column by column. The original
# parenthesisation computed X - (mean / std), which neither centres nor
# scales the data. Zero-variance columns are divided by 1 instead of 0 so
# they become all zeros rather than NaN.
feature_mean = np.mean(X, axis=0)
feature_std = np.std(X, axis=0).replace(0, 1)
X_scaled = (X - feature_mean) / feature_std

In [None]:
import matplotlib.pyplot as plt

# Distribution of ratings. The column is spelled 'Vote_Average' and is loaded
# as object dtype; convert it and drop unparseable values so the histogram
# bins real numbers.
plt.hist(pd.to_numeric(df['Vote_Average'], errors='coerce').dropna(), bins=20)
plt.title('Movie Ratings Distribution')
plt.xlabel('Rating')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Scatter of budget against revenue, semi-transparent to show point density.
# NOTE(review): 'budget' and 'revenue' are not among the columns printed by
# df.columns earlier in this notebook — confirm the data provides them.
fig, ax = plt.subplots()
ax.scatter(df['budget'], df['revenue'], alpha=0.3)
ax.set_xlabel('Budget')
ax.set_ylabel('Revenue')
ax.set_title('Budget vs Revenue')
plt.show()

In [None]:
# Popularity against rating. Uses the dataset's column spellings
# ('Popularity', 'Vote_Average'); the rating is converted to numbers because
# it is loaded as object dtype.
plt.scatter(df['Popularity'], pd.to_numeric(df['Vote_Average'], errors='coerce'), alpha=0.3)
plt.xlabel('Popularity')
plt.ylabel('Rating')
plt.title('Popularity vs Rating')
plt.show()

In [None]:
# Normalise the genre strings into a lowercase, comma-separated form, e.g.
# "Action, Science Fiction" -> "action,sciencefiction".
# The source column is 'Genre' in this dataset. Commas are kept: the original
# pattern '[^a-zA-Z]' removed them too, which glued each film's genres into
# one token and left nothing for the comma-based split in the next cell.
df['genres'] = (
    df['Genre']
    .str.lower()
    .str.replace('[^a-z,]', '', regex=True)
)

In [None]:
# Split each genre string on ',' and build one 0/1 indicator column per distinct token.
genre_dummies = df['genres'].str.get_dummies(sep=',')
# Append the indicator columns to the main frame (rows are matched on the index).
# Re-running this cell appends the same columns again.
df = pd.concat([df, genre_dummies], axis=1)

In [None]:
# Binary "success" label: rated above 7.0 AND revenue at or above the median.
# The rating column is 'Vote_Average' ('voting_average' does not exist) and is
# loaded as object dtype, so it is converted to numbers before comparing.
# NOTE(review): 'revenue' is not among the columns printed by df.columns
# earlier in this notebook — confirm the data provides it.
revenue_threshold = df['revenue'].median()
rating = pd.to_numeric(df['Vote_Average'], errors='coerce')

df['success'] = np.where(
    (rating > 7.0) & (df['revenue'] >= revenue_threshold),
    1,
    0
)

In [None]:
# Final model inputs: numeric base features plus the genre indicator columns.
# Column names follow the dataset's spelling ('Popularity', 'Vote_Count',
# 'Vote_Average').
# NOTE(review): 'budget' and 'runtime' are not among the columns printed by
# df.columns earlier in this notebook — confirm the data provides them.
base_features = ['budget', 'runtime', 'Popularity', 'Vote_Count',
                 'revenue_per_min', 'popularity_vote_ratio', 'release_year']

genre_features = genre_dummies.columns.tolist()
features = base_features + genre_features

# Convert object-dtype columns to numbers, then treat +/-inf (from ratio
# features) like missing values: fillna alone leaves inf in place, which
# breaks scaling and estimator fitting.
X = (
    df[features]
    .apply(pd.to_numeric, errors='coerce')
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
)
y_ref = pd.to_numeric(df['Vote_Average'], errors='coerce')  # regression target
y_clif = df['success']                                      # classification target

In [None]:
# Z-score each feature column: subtract its mean, divide by its population
# standard deviation (ddof=0).
col_mean = X.mean(axis=0)
col_std = X.std(axis=0, ddof=0)
X_scaled = X.sub(col_mean, axis='columns').div(col_std, axis='columns')

In [None]:
from sklearn.model_selection import train_test_split

# Split the features and both targets in a single call so every piece shares
# exactly the same row partition (80% train / 20% test, fixed seed).
# The original unpacked the regression targets into `y_train_pred` twice, so
# that name ended up holding the *test* target and `y_train_reg` /
# `y_test_reg`, which the modelling and evaluation cells use, never existed.
(X_train, X_test,
 y_train_reg, y_test_reg,
 y_train_clf, y_test_clf) = train_test_split(
    X_scaled, y_ref, y_clif, test_size=0.2, random_state=42
)

# Alias kept for any cell that still refers to the regression training
# target as y_train_pred.
y_train_pred = y_train_reg

In [None]:
from sklearn.linear_model import LinearRegression

# Baseline linear model for the rating. Fit on y_train_reg, the training
# target name the other regression cells use.
lr = LinearRegression()
lr.fit(X_train, y_train_reg)
y_pred_lr = lr.predict(X_test)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random-forest regressor for the rating: 200 trees, fixed seed.
# fit() returns the estimator itself, so construction and fitting are chained.
rf_reg = RandomForestRegressor(n_estimators=200, random_state=42).fit(X_train, y_train_reg)
y_pred_rf = rf_reg.predict(X_test)

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic-regression classifier for the success label, with the iteration
# cap set to 1000. fit() returns the estimator, so the calls are chained.
log_clf = LogisticRegression(max_iter=1000).fit(X_train, y_train_clf)
y_pred_log = log_clf.predict(X_test)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random-forest classifier for the success label: 200 trees, fixed seed.
rf_clf = RandomForestClassifier(n_estimators=200, random_state=42).fit(X_train, y_train_clf)
y_pred_rf_clf = rf_clf.predict(X_test)

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Test-set error of the random-forest rating model.
mae = mean_absolute_error(y_test_reg, y_pred_rf)
rmse = np.sqrt(mean_squared_error(y_test_reg, y_pred_rf))
r2 = r2_score(y_test_reg, y_pred_rf)

# The metrics were computed but never shown; end the cell with a small
# summary so they appear in the notebook output.
pd.Series({'MAE': mae, 'RMSE': rmse, 'R2': r2}, name='random_forest_regressor')

In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Test-set performance of the random-forest success classifier.
accuracy = accuracy_score(y_test_clf, y_pred_rf_clf)
classification = classification_report(y_test_clf, y_pred_rf_clf)

# Both results were computed but never shown. classification_report returns
# pre-formatted text, so it is printed rather than left as a cell value.
print(f'Accuracy: {accuracy:.3f}')
print(classification)

In [None]:
# Actual test ratings against the random-forest predictions.
fig, ax = plt.subplots()
ax.scatter(y_test_reg, y_pred_rf, alpha=0.3)
ax.set_xlabel('Actual Ratings')
ax.set_ylabel('Predicted Ratings')
ax.set_title('Random Forest Rating Prediction')
plt.show()

In [None]:
# Combine the scaled features with both targets column-wise (rows matched on the index).
ml_df = pd.concat([X_scaled, y_ref, y_clif], axis=1)
# Write the modelling table to CSV without the index column.
ml_df.to_csv('movie_ml_data.csv', index=False)

In [None]:
# Budget against revenue. The y-axis is the actual 'revenue' column, not a
# model prediction, so the title no longer says "Predicted".
# NOTE(review): 'budget' and 'revenue' are not among the columns printed by
# df.columns earlier in this notebook — confirm the data provides them.
plt.scatter(df['budget'], df['revenue'], alpha=0.3)
plt.xlabel('Budget')
plt.ylabel('Revenue')
plt.title('Budget vs Revenue')
plt.show()

In [None]:
# Popularity against rating. Uses the dataset's column spellings
# ('Popularity', 'Vote_Average'); the rating is converted to numbers because
# it is loaded as object dtype.
plt.scatter(df['Popularity'], pd.to_numeric(df['Vote_Average'], errors='coerce'), alpha=0.3)
plt.xlabel('Popularity')
plt.ylabel('Rating')
plt.title('Popularity vs Rating')
plt.show()

In [None]:
# Export the modelling table. This cell writes the same file as the earlier
# export, so it must contain the same columns: the original used the stale `y`
# from the first feature-selection cell and silently dropped the 'success'
# label from the file on disk.
ml_df = pd.concat([X_scaled, y_ref, y_clif], axis=1)
ml_df.to_csv('movie_ml_data.csv', index=False)