## Regression Analysis & Preditction for Business Success Metrics

### Import Libraries

In [27]:
import pandas as pd
import numpy as np
import ast
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import ModelCheckpoint
import warnings
from tqdm.notebook import tqdm
warnings.filterwarnings('ignore')
tqdm.pandas()

In [2]:
%%html
<style>
.cell-output-ipywidget-background {
    background-color: transparent !important;
}
:root {
    --jp-widgets-color: var(--vscode-editor-foreground);
    --jp-widgets-font-size: var(--vscode-editor-font-size);
}  

In [3]:
df = pd.DataFrame(pd.read_csv('../data/imdb_tmdb_regression.csv'))
df.head()

Unnamed: 0,id,title,vote_average,vote_count,release_date,revenue,runtime,budget,original_language,popularity,production_companies,production_countries,keywords,overview_sentiment,actor_list,director,genre
0,27205,Inception,8.364,34495,2010-07-15,825532764,148,160000000,en,83.952,"['Legendary Pictures', 'Syncopy', 'Warner Bros...","['United Kingdom', 'United States of America']","['rescue', 'mission', 'dream', 'airplane', 'pa...",-0.011111,"['Leonardo DiCaprio', 'Joseph Gordon-Levitt', ...",['Christopher Nolan'],"['Action', 'Science Fiction', 'Adventure']"
1,157336,Interstellar,8.417,32571,2014-11-05,701729206,169,165000000,en,140.241,"['Legendary Pictures', 'Syncopy', 'Lynda Obst ...","['United Kingdom', 'United States of America']","['rescue', 'future', 'spacecraft', 'race again...",0.045455,"['Matthew McConaughey', 'Anne Hathaway', 'Jess...",['Christopher Nolan'],"['Adventure', 'Drama', 'Science Fiction']"
2,155,The Dark Knight,8.512,30619,2008-07-16,1004558444,152,185000000,en,130.643,"['DC Comics', 'Legendary Pictures', 'Syncopy',...","['United Kingdom', 'United States of America']","['joker', 'sadism', 'chaos', 'secret identity'...",0.025,"['Christian Bale', 'Heath Ledger', 'Aaron Eckh...",['Christopher Nolan'],"['Drama', 'Action', 'Crime', 'Thriller']"
3,19995,Avatar,7.573,29815,2009-12-15,2923706026,162,237000000,en,79.932,"['Dune Entertainment', 'Lightstorm Entertainme...","['United States of America', 'United Kingdom']","['future', 'society', 'culture clash', 'space ...",0.041667,"['Sam Worthington', 'Zoe Saldana', 'Sigourney ...",['James Cameron'],"['Action', 'Adventure', 'Fantasy', 'Science Fi..."
4,24428,The Avengers,7.71,29166,2012-04-25,1518815515,143,220000000,en,98.082,['Marvel Studios'],['United States of America'],"['new york city', 'superhero', 'shield', 'base...",0.025,"['Robert Downey Jr.', 'Chris Evans', 'Scarlett...",['Joss Whedon'],"['Science Fiction', 'Action', 'Adventure']"


In [4]:
df['production_companies'] = df['production_companies'].progress_apply(lambda x: ast.literal_eval(x))
df['production_countries'] = df['production_countries'].progress_apply(lambda x: ast.literal_eval(x))
df['keywords'] = df['keywords'].progress_apply(lambda x: ast.literal_eval(x))
df['actor_list'] = df['actor_list'].progress_apply(lambda x: ast.literal_eval(x))
df['director'] = df['director'].progress_apply(lambda x: ast.literal_eval(x))
df['genre'] = df['genre'].progress_apply(lambda x: ast.literal_eval(x))
df.head()

  0%|          | 0/709920 [00:00<?, ?it/s]

  0%|          | 0/709920 [00:00<?, ?it/s]

  0%|          | 0/709920 [00:00<?, ?it/s]

  0%|          | 0/709920 [00:00<?, ?it/s]

  0%|          | 0/709920 [00:00<?, ?it/s]

  0%|          | 0/709920 [00:00<?, ?it/s]

Unnamed: 0,id,title,vote_average,vote_count,release_date,revenue,runtime,budget,original_language,popularity,production_companies,production_countries,keywords,overview_sentiment,actor_list,director,genre
0,27205,Inception,8.364,34495,2010-07-15,825532764,148,160000000,en,83.952,"[Legendary Pictures, Syncopy, Warner Bros. Pic...","[United Kingdom, United States of America]","[rescue, mission, dream, airplane, paris, fran...",-0.011111,"[Leonardo DiCaprio, Joseph Gordon-Levitt, Elli...",[Christopher Nolan],"[Action, Science Fiction, Adventure]"
1,157336,Interstellar,8.417,32571,2014-11-05,701729206,169,165000000,en,140.241,"[Legendary Pictures, Syncopy, Lynda Obst Produ...","[United Kingdom, United States of America]","[rescue, future, spacecraft, race against time...",0.045455,"[Matthew McConaughey, Anne Hathaway, Jessica C...",[Christopher Nolan],"[Adventure, Drama, Science Fiction]"
2,155,The Dark Knight,8.512,30619,2008-07-16,1004558444,152,185000000,en,130.643,"[DC Comics, Legendary Pictures, Syncopy, Isobe...","[United Kingdom, United States of America]","[joker, sadism, chaos, secret identity, crime ...",0.025,"[Christian Bale, Heath Ledger, Aaron Eckhart, ...",[Christopher Nolan],"[Drama, Action, Crime, Thriller]"
3,19995,Avatar,7.573,29815,2009-12-15,2923706026,162,237000000,en,79.932,"[Dune Entertainment, Lightstorm Entertainment,...","[United States of America, United Kingdom]","[future, society, culture clash, space travel,...",0.041667,"[Sam Worthington, Zoe Saldana, Sigourney Weave...",[James Cameron],"[Action, Adventure, Fantasy, Science Fiction]"
4,24428,The Avengers,7.71,29166,2012-04-25,1518815515,143,220000000,en,98.082,[Marvel Studios],[United States of America],"[new york city, superhero, shield, based on co...",0.025,"[Robert Downey Jr., Chris Evans, Scarlett Joha...",[Joss Whedon],"[Science Fiction, Action, Adventure]"


In [5]:
df['month'] = df['release_date'].apply(lambda x: x.split('-')[1])
df['day'] = df['release_date'].apply(lambda x: x.split('-')[2])
df.drop(columns=['release_date'], inplace=True)
df.head()

Unnamed: 0,id,title,vote_average,vote_count,revenue,runtime,budget,original_language,popularity,production_companies,production_countries,keywords,overview_sentiment,actor_list,director,genre,month,day
0,27205,Inception,8.364,34495,825532764,148,160000000,en,83.952,"[Legendary Pictures, Syncopy, Warner Bros. Pic...","[United Kingdom, United States of America]","[rescue, mission, dream, airplane, paris, fran...",-0.011111,"[Leonardo DiCaprio, Joseph Gordon-Levitt, Elli...",[Christopher Nolan],"[Action, Science Fiction, Adventure]",7,15
1,157336,Interstellar,8.417,32571,701729206,169,165000000,en,140.241,"[Legendary Pictures, Syncopy, Lynda Obst Produ...","[United Kingdom, United States of America]","[rescue, future, spacecraft, race against time...",0.045455,"[Matthew McConaughey, Anne Hathaway, Jessica C...",[Christopher Nolan],"[Adventure, Drama, Science Fiction]",11,5
2,155,The Dark Knight,8.512,30619,1004558444,152,185000000,en,130.643,"[DC Comics, Legendary Pictures, Syncopy, Isobe...","[United Kingdom, United States of America]","[joker, sadism, chaos, secret identity, crime ...",0.025,"[Christian Bale, Heath Ledger, Aaron Eckhart, ...",[Christopher Nolan],"[Drama, Action, Crime, Thriller]",7,16
3,19995,Avatar,7.573,29815,2923706026,162,237000000,en,79.932,"[Dune Entertainment, Lightstorm Entertainment,...","[United States of America, United Kingdom]","[future, society, culture clash, space travel,...",0.041667,"[Sam Worthington, Zoe Saldana, Sigourney Weave...",[James Cameron],"[Action, Adventure, Fantasy, Science Fiction]",12,15
4,24428,The Avengers,7.71,29166,1518815515,143,220000000,en,98.082,[Marvel Studios],[United States of America],"[new york city, superhero, shield, based on co...",0.025,"[Robert Downey Jr., Chris Evans, Scarlett Joha...",[Joss Whedon],"[Science Fiction, Action, Adventure]",4,25


In [6]:
df = df[df['runtime'] <= 300]
df.shape

(708541, 18)

In [7]:
# Selecting the columns for PCA
pca_features = df[['vote_average', 'popularity', 'revenue']]

# Standardizing the features
scaler = MinMaxScaler()
pca_features_scaled = scaler.fit_transform(pca_features)
df['runtime'] = scaler.fit_transform(df[['runtime']])

# Performing PCA
pca = PCA()
pca_result = pca.fit_transform(pca_features_scaled)

# Showing the variance percentage from PCA
explained_variance = pca.explained_variance_ratio_
explained_variance

array([9.99434833e-01, 4.89825234e-04, 7.53414346e-05])

In [8]:
# Adding the PCA results to the dataframe
df['pca'] = pca_result[:, 0]
df.head()

Unnamed: 0,id,title,vote_average,vote_count,revenue,runtime,budget,original_language,popularity,production_companies,production_countries,keywords,overview_sentiment,actor_list,director,genre,month,day,pca
0,27205,Inception,8.364,34495,825532764,0.493333,160000000,en,83.952,"[Legendary Pictures, Syncopy, Warner Bros. Pic...","[United Kingdom, United States of America]","[rescue, mission, dream, airplane, paris, fran...",-0.011111,"[Leonardo DiCaprio, Joseph Gordon-Levitt, Elli...",[Christopher Nolan],"[Action, Science Fiction, Adventure]",7,15,0.595799
1,157336,Interstellar,8.417,32571,701729206,0.563333,165000000,en,140.241,"[Legendary Pictures, Syncopy, Lynda Obst Produ...","[United Kingdom, United States of America]","[rescue, future, spacecraft, race against time...",0.045455,"[Matthew McConaughey, Anne Hathaway, Jessica C...",[Christopher Nolan],"[Adventure, Drama, Science Fiction]",11,5,0.60106
2,155,The Dark Knight,8.512,30619,1004558444,0.506667,185000000,en,130.643,"[DC Comics, Legendary Pictures, Syncopy, Isobe...","[United Kingdom, United States of America]","[joker, sadism, chaos, secret identity, crime ...",0.025,"[Christian Bale, Heath Ledger, Aaron Eckhart, ...",[Christopher Nolan],"[Drama, Action, Crime, Thriller]",7,16,0.610701
3,19995,Avatar,7.573,29815,2923706026,0.54,237000000,en,79.932,"[Dune Entertainment, Lightstorm Entertainment,...","[United States of America, United Kingdom]","[future, society, culture clash, space travel,...",0.041667,"[Sam Worthington, Zoe Saldana, Sigourney Weave...",[James Cameron],"[Action, Adventure, Fantasy, Science Fiction]",12,15,0.517698
4,24428,The Avengers,7.71,29166,1518815515,0.476667,220000000,en,98.082,[Marvel Studios],[United States of America],"[new york city, superhero, shield, based on co...",0.025,"[Robert Downey Jr., Chris Evans, Scarlett Joha...",[Joss Whedon],"[Science Fiction, Action, Adventure]",4,25,0.530734


In [9]:
y_df = df[['vote_average', 'revenue', 'popularity']]
df.drop(columns=['vote_average', 'revenue', 'popularity'], inplace=True)
y_df.head()

Unnamed: 0,vote_average,revenue,popularity
0,8.364,825532764,83.952
1,8.417,701729206,140.241
2,8.512,1004558444,130.643
3,7.573,2923706026,79.932
4,7.71,1518815515,98.082


In [10]:
# Selecting the columns to scale
scale_features = df[['budget', 'overview_sentiment']]

# Applying MinMaxScaler
scaler = MinMaxScaler()
scaled_features = scaler.fit_transform(scale_features)

# Adding the scaled features back to the dataframe
df[['budget', 'overview_sentiment']] = scaled_features
df.head()

Unnamed: 0,id,title,vote_count,runtime,budget,original_language,production_companies,production_countries,keywords,overview_sentiment,actor_list,director,genre,month,day,pca
0,27205,Inception,34495,0.493333,0.18018,en,"[Legendary Pictures, Syncopy, Warner Bros. Pic...","[United Kingdom, United States of America]","[rescue, mission, dream, airplane, paris, fran...",0.494444,"[Leonardo DiCaprio, Joseph Gordon-Levitt, Elli...",[Christopher Nolan],"[Action, Science Fiction, Adventure]",7,15,0.595799
1,157336,Interstellar,32571,0.563333,0.185811,en,"[Legendary Pictures, Syncopy, Lynda Obst Produ...","[United Kingdom, United States of America]","[rescue, future, spacecraft, race against time...",0.522727,"[Matthew McConaughey, Anne Hathaway, Jessica C...",[Christopher Nolan],"[Adventure, Drama, Science Fiction]",11,5,0.60106
2,155,The Dark Knight,30619,0.506667,0.208333,en,"[DC Comics, Legendary Pictures, Syncopy, Isobe...","[United Kingdom, United States of America]","[joker, sadism, chaos, secret identity, crime ...",0.5125,"[Christian Bale, Heath Ledger, Aaron Eckhart, ...",[Christopher Nolan],"[Drama, Action, Crime, Thriller]",7,16,0.610701
3,19995,Avatar,29815,0.54,0.266892,en,"[Dune Entertainment, Lightstorm Entertainment,...","[United States of America, United Kingdom]","[future, society, culture clash, space travel,...",0.520833,"[Sam Worthington, Zoe Saldana, Sigourney Weave...",[James Cameron],"[Action, Adventure, Fantasy, Science Fiction]",12,15,0.517698
4,24428,The Avengers,29166,0.476667,0.247748,en,[Marvel Studios],[United States of America],"[new york city, superhero, shield, based on co...",0.5125,"[Robert Downey Jr., Chris Evans, Scarlett Joha...",[Joss Whedon],"[Science Fiction, Action, Adventure]",4,25,0.530734


In [11]:
# One-hot encode the 'original_language' column
encoded_languages = pd.get_dummies(df['original_language'], prefix='lang')
encoded_languages = encoded_languages.astype(int)

# Concatenate the new binary encoded columns to the original dataframe
df = pd.concat([df, encoded_languages], axis=1)

# Drop the original 'original_language' column
df.drop(columns=['original_language'], inplace=True)

df.head()

Unnamed: 0,id,title,vote_count,runtime,budget,production_companies,production_countries,keywords,overview_sentiment,actor_list,...,lang_uz,lang_vi,lang_wo,lang_xh,lang_xx,lang_yi,lang_yo,lang_za,lang_zh,lang_zu
0,27205,Inception,34495,0.493333,0.18018,"[Legendary Pictures, Syncopy, Warner Bros. Pic...","[United Kingdom, United States of America]","[rescue, mission, dream, airplane, paris, fran...",0.494444,"[Leonardo DiCaprio, Joseph Gordon-Levitt, Elli...",...,0,0,0,0,0,0,0,0,0,0
1,157336,Interstellar,32571,0.563333,0.185811,"[Legendary Pictures, Syncopy, Lynda Obst Produ...","[United Kingdom, United States of America]","[rescue, future, spacecraft, race against time...",0.522727,"[Matthew McConaughey, Anne Hathaway, Jessica C...",...,0,0,0,0,0,0,0,0,0,0
2,155,The Dark Knight,30619,0.506667,0.208333,"[DC Comics, Legendary Pictures, Syncopy, Isobe...","[United Kingdom, United States of America]","[joker, sadism, chaos, secret identity, crime ...",0.5125,"[Christian Bale, Heath Ledger, Aaron Eckhart, ...",...,0,0,0,0,0,0,0,0,0,0
3,19995,Avatar,29815,0.54,0.266892,"[Dune Entertainment, Lightstorm Entertainment,...","[United States of America, United Kingdom]","[future, society, culture clash, space travel,...",0.520833,"[Sam Worthington, Zoe Saldana, Sigourney Weave...",...,0,0,0,0,0,0,0,0,0,0
4,24428,The Avengers,29166,0.476667,0.247748,[Marvel Studios],[United States of America],"[new york city, superhero, shield, based on co...",0.5125,"[Robert Downey Jr., Chris Evans, Scarlett Joha...",...,0,0,0,0,0,0,0,0,0,0


In [12]:
# Initialize the MultiLabelBinarizer
mlb = MultiLabelBinarizer()

In [13]:
def encode_production_companies(df):
    # Fit MLBinarizer once on all unique values
    mlb = MultiLabelBinarizer(sparse_output=True)
    
    # Transform all rows at once
    encoded_sparse = mlb.fit_transform(df)
    
    # Convert to DataFrame
    encoded_df = pd.DataFrame.sparse.from_spmatrix(
        encoded_sparse,
        index=df.index,
        columns=mlb.classes_
    )
    
    return encoded_df

In [14]:
encoded_production_companies = encode_production_companies(df['production_companies'])
encoded_production_companies = encoded_production_companies.loc[:, encoded_production_companies.sum() > 1]
print(encoded_production_companies.shape)
encoded_production_companies.head()

(708541, 50395)


Unnamed: 0,"""A ME AND YOU MOTION PICTURE""","""G"" P.C. S.A.","""Hypnosis Mic - Division Rap Battle"" Rule the Stage Production Committee","""It Goes to 11"" Productions","""unknown"" estudios",#Sinning Works,&Bromet,((O))ECO,(Brothel movie),(Colossal) Pictures,...,닷 팩토리,동남아영화공사,미라클필름,선우엔터테인먼트,영화사 연두,영화사 장춘,웃음을 주는 영화,유비네트워크,프릭쇼,한국독립애니메이션협회
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
encoded_production_countries = encode_production_companies(df['production_countries'])
encoded_production_countries = encoded_production_countries.loc[:, encoded_production_countries.sum() > 1]
print(encoded_production_countries.shape)
encoded_production_countries.head()

(708541, 245)


Unnamed: 0,Afghanistan,Albania,Algeria,American Samoa,Andorra,Angola,Anguilla,Antarctica,Antigua and Barbuda,Argentina,...,Uzbekistan,Vanuatu,Venezuela,Vietnam,Wallis and Futuna Islands,Western Sahara,Yemen,Yugoslavia,Zambia,Zimbabwe
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
encoded_keywords = encode_production_companies(df['keywords'])
encoded_keywords = encoded_keywords.loc[:, encoded_keywords.sum() > 1]
print(encoded_keywords.shape)
encoded_keywords.head()

(708541, 28001)


Unnamed: 0,"""robin hood"" thieves",#finalclash,#tubeclash,00's,10th century,10th century bc,11th century,12th century,13th century,14th century,...,蔡依林,跨年,阅兵,陰陽座,音乐,音乐 演唱会,音乐会,音乐剧,魔幻,단편영화
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
encoded_actors = encode_production_companies(df['actor_list'])
encoded_actors = encoded_actors.loc[:, encoded_actors.sum() > 1]
print(encoded_actors.shape)
encoded_actors.head()

(708541, 525599)


Unnamed: 0,"""Fast"" Eddie Clarke","""Freeway"" Ricky Ross","""Hollywood Jade"" Anderson","""Mr. Wang Leizhan""",''Knife'' Sotelo,'ABS','Baby' Carmen De Rue,'Big' Dale Storm,'Big' LeRoy Mobley,'Chico' Hernandez,...,진양욱,최가람,최대진,최두영,최수형,춘화,하성훈,허혜진,﻿Ar﻿chie Lanfranc﻿o,ＭＡＳＡ
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
encoded_directors = encode_production_companies(df['director'])
encoded_directors = encoded_directors.loc[:, encoded_directors.sum() > 1]
print(encoded_directors.shape)
encoded_directors.head()

(708541, 83942)


Unnamed: 0,'Xiongzaixia' Tan Jiahao,(LA)HORDE,011668,3000cenahill,A Couple' A Cowboys,A Da,A K Dolven,A K Lohithadas,A K Sajan,A Nian,...,高书雷,高群书,黄微,강물결,권은지,김경재,이만흥,이상호,이석용,이현진
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
encoded_genres = encode_production_companies(df['genre'])
encoded_genres = encoded_genres.loc[:, encoded_genres.sum() > 1]
print(encoded_genres.shape)
encoded_genres.head()

(708541, 20)


Unnamed: 0,Action,Adventure,Animation,Comedy,Crime,Documentary,Drama,Family,Fantasy,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,Unknown,War,Western
0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0
2,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0
3,1,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0
4,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0


In [20]:
# Concatenate the new binary encoded columns to the original dataframe
df = pd.concat([df, encoded_production_companies, encoded_production_countries, encoded_keywords, encoded_actors, encoded_directors, encoded_genres], axis=1)

# Drop the original columns
df.drop(columns=['production_companies', 'production_countries', 'keywords', 'actor_list', 'director', 'genre'], inplace=True)

print(df.shape)
df.head()

(708541, 688381)


Unnamed: 0,id,title,vote_count,runtime,budget,overview_sentiment,month,day,pca,lang_aa,...,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,Unknown,War,Western
0,27205,Inception,34495,0.493333,0.18018,0.494444,7,15,0.595799,0,...,0,0,0,0,1,0,0,0,0,0
1,157336,Interstellar,32571,0.563333,0.185811,0.522727,11,5,0.60106,0,...,0,0,0,0,1,0,0,0,0,0
2,155,The Dark Knight,30619,0.506667,0.208333,0.5125,7,16,0.610701,0,...,0,0,0,0,0,0,1,0,0,0
3,19995,Avatar,29815,0.54,0.266892,0.520833,12,15,0.517698,0,...,0,0,0,0,1,0,0,0,0,0
4,24428,The Avengers,29166,0.476667,0.247748,0.5125,4,25,0.530734,0,...,0,0,0,0,1,0,0,0,0,0


In [21]:
scaler = MinMaxScaler()
X = df.drop(columns=['id', 'vote_count', 'pca'])
y = df['pca']

### Modelling

In [None]:
# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Display the shapes of the resulting datasets
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")

In [None]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
# Display the shapes of the resulting datasets
print(f"X_train shape: {X_train.shape}")
print(f"X_val shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_val shape: {y_test.shape}")

In [25]:
with tf.device('/GPU:0'):
    model = Sequential([
        Dense(64, activation='relu', input_dim=10), BatchNormalization(), Dropout(0.2),
        Dense(32, activation='relu'), BatchNormalization(), Dropout(0.2),
        Dense(16, activation='relu'), BatchNormalization(), Dropout(0.2),
        Dense(1)  # Output layer for regression
    ])
    
    model.compile(optimizer='adam', loss='mse', metrics=['mape'])
    
    # Setup simple checkpoint
    checkpoint = ModelCheckpoint('model_weights.h5', save_weights_only=True, save_best_only=True, monitor='val_loss'
    )
    
    # Train the model
    history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_val, y_val), callbacks=[checkpoint])