## Import necessary modules and libraries

In [31]:
import pandas as pd
from math import ceil
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error
import time

## Read the data

In [32]:
# Load the spreadsheet; the relative path is resolved against the current working directory
df = pd.read_excel("imdb_top_1000.xlsx")

## Get shape, head and describe

In [33]:
# Dimensions of the raw frame as (rows, columns)
df.shape

(1000, 16)

In [34]:
# Preview the first five rows
df.head()

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,The Shawshank Redemption,1994,A,142 min,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469
1,https://m.media-amazon.com/images/M/MV5BM2MyNj...,The Godfather,1972,A,175 min,Crime,9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411
2,https://m.media-amazon.com/images/M/MV5BMTMxNT...,The Dark Knight,2008,UA,152 min,Action,9.0,When the menace known as the Joker wreaks havo...,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444
3,https://m.media-amazon.com/images/M/MV5BMWMwMG...,The Godfather: Part II,1974,A,202 min,Crime,9.0,The early life and career of Vito Corleone in ...,90.0,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,1129952,57300000
4,https://m.media-amazon.com/images/M/MV5BMWU4N2...,12 Angry Men,1957,U,96 min,Crime,9.0,A jury holdout attempts to prevent a miscarria...,96.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,689845,4360000


In [35]:
# Summary statistics for the numeric columns
df.describe()

Unnamed: 0,IMDB_Rating,Meta_score,No_of_Votes,Gross
count,1000.0,843.0,1000.0,1000.0
mean,7.9493,77.97153,273692.9,62451650.0
std,0.275491,12.376099,327372.7,105114800.0
min,7.6,28.0,25088.0,1305.0
25%,7.7,70.0,55526.25,2351568.0
50%,7.9,79.0,138548.5,17917140.0
75%,8.1,87.0,374161.2,74103870.0
max,9.3,100.0,2343110.0,936662200.0


In [36]:
# Number of distinct values per column (shows the cardinality of the categorical columns)
df.nunique()

Poster_Link      1000
Series_Title      999
Released_Year     100
Certificate        15
Runtime           140
Genre              14
IMDB_Rating        17
Overview         1000
Meta_score         63
Director          548
Star1             660
Star2             841
Star3             891
Star4             939
No_of_Votes       999
Gross             990
dtype: int64

## Drop undesired columns

In [37]:
# Drop the columns that are not used as features or targets.
# Reassigning the result (instead of inplace=True) keeps the step explicit.
df = df.drop(columns=["Poster_Link", "Series_Title", "Released_Year", "Certificate",
                      "Runtime", "Overview", "Star1", "Star2", "Star3", "Star4", "Gross", "No_of_Votes"])

## Reorder the dataframe columns

In [38]:
# Select the four remaining columns in the order: features first, then the two targets
df = df.loc[:, ["Director", "Genre", "IMDB_Rating", "Meta_score"]]

---

# Dealing with missing data

In [39]:
# Count missing values per column
print(df.isnull().sum())

Director         0
Genre            0
IMDB_Rating      0
Meta_score     157
dtype: int64


## There are 157 missing values in the "Meta_score" column

## Finding the mean of the "Meta_score" column and rounding it to an integer

In [40]:
# Mean of the observed (non-missing) Meta_score values
mean = df["Meta_score"].mean()
print(mean)

# Round to the nearest integer rather than always upward: ceil() would turn
# e.g. 77.01 into 78 and bias the imputed value. int() keeps the result a
# plain integer, like the other scores in the column.
roundUp_mean = int(round(mean))
print(roundUp_mean)

77.97153024911032
78


## Mean value is 77.97. Since all other numbers in "Meta_score" are integers, round this number up to 78

## Fill in the missing values with the mean value

In [41]:
# Impute only "Meta_score". A frame-wide fillna would also write the rounded
# Meta_score mean into any other column that happened to contain gaps.
df = df.fillna(value = {"Meta_score": roundUp_mean})

# Sanity check: the column should have no missing values left
print(df["Meta_score"].isnull().sum())

0


## No more missing values in the "Meta_score" column

---

# Label and OneHot Encoding for converting categorical values/labels to numerical values
## Columns "Director" and "Genre" must be converted into numeric form for the machine learning algorithms

## Encode the "Director" column with LabelEncoder

In [42]:
# Map every director name to an integer code.
# NOTE(review): the codes follow the encoder's class ordering, so a linear model
# will treat this nominal column as if it were ordinal — confirm this is intended.
le = LabelEncoder()
le.fit(df["Director"])

df["Director"] = le.transform(df["Director"])

## Pandas get_dummies function will be used to perform One-Hot Encoding on column "Genre"

In [43]:
# One-hot encode "Genre". dtype=int keeps the indicator columns as 0/1 on every
# pandas version (the default dummy dtype is bool from pandas 2.0 onward).
dummy_genres = pd.get_dummies(df[['Genre']], prefix="Genre", dtype=int)

df = pd.concat([df, dummy_genres], axis = 1)
df = df.drop(['Genre'], axis = 1)

# Re-ordering columns: features first, the two targets last.
# The genre columns are taken from the dummies themselves instead of a
# hard-coded list, so a dataset with a different set of genres does not break.
genre_columns = list(dummy_genres.columns)
df = df[["Director", *genre_columns, "IMDB_Rating", "Meta_score"]]

## After Label and One-Hot Encoding:


In [44]:
# Display the frame after label and one-hot encoding
df

Unnamed: 0,Director,Genre_Action,Genre_Adventure,Genre_Animation,Genre_Biography,Genre_Comedy,Genre_Crime,Genre_Drama,Genre_Family,Genre_Fantasy,Genre_Film-Noir,Genre_Horror,Genre_Mystery,Genre_Thriller,Genre_Western,IMDB_Rating,Meta_score
0,141,0,0,0,0,0,0,1,0,0,0,0,0,0,0,9.3,80.0
1,137,0,0,0,0,0,1,0,0,0,0,0,0,0,0,9.2,100.0
2,83,1,0,0,0,0,0,0,0,0,0,0,0,0,0,9.0,84.0
3,137,0,0,0,0,0,1,0,0,0,0,0,0,0,0,9.0,90.0
4,456,0,0,0,0,0,1,0,0,0,0,0,0,0,0,9.0,96.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,50,0,0,0,0,1,0,0,0,0,0,0,0,0,0,7.6,76.0
996,164,0,0,0,0,0,0,1,0,0,0,0,0,0,0,7.6,84.0
997,145,0,0,0,0,0,0,1,0,0,0,0,0,0,0,7.6,85.0
998,22,0,0,0,0,0,0,1,0,0,0,0,0,0,0,7.6,78.0


---

# Scaling the data

## MinMaxScaler

In [45]:
# Rescale both rating columns to the [0, 1] range.
# These two columns are the regression targets, so the MAE values reported
# later are expressed in this scaled unit, not in rating points.
# NOTE(review): the scaler is fitted on the full frame before the train/test
# split, so test rows influence the min/max — consider fitting on the training
# rows only.
mms = MinMaxScaler()
mms.fit(df[['IMDB_Rating', 'Meta_score']])
df[['IMDB_Rating', 'Meta_score']] = mms.transform(df[['IMDB_Rating', 'Meta_score']])

df

Unnamed: 0,Director,Genre_Action,Genre_Adventure,Genre_Animation,Genre_Biography,Genre_Comedy,Genre_Crime,Genre_Drama,Genre_Family,Genre_Fantasy,Genre_Film-Noir,Genre_Horror,Genre_Mystery,Genre_Thriller,Genre_Western,IMDB_Rating,Meta_score
0,141,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1.000000,0.722222
1,137,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0.941176,1.000000
2,83,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0.823529,0.777778
3,137,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0.823529,0.861111
4,456,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0.823529,0.944444
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,50,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0.000000,0.666667
996,164,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0.000000,0.777778
997,145,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0.000000,0.791667
998,22,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0.000000,0.694444


---

# Defining inputs and outputs
## "Director" and "Genre" columns will be our inputs. The algorithm will predict the "IMDB_Rating" and "Meta_score" values according to these inputs

In [46]:
# Features: everything except the two target columns
X = df.drop(columns=['IMDB_Rating', 'Meta_score'])
# Targets: y is the IMDB rating, y2 is the Meta score
y, y2 = df['IMDB_Rating'], df['Meta_score']

# Train Test Split

In [47]:
# One 70/30 split shared by the features and both targets, so the rows stay
# aligned; the fixed random_state makes the split reproducible.
(X_train, X_test,
 y_train, y_test,
 y2_train, y2_test) = train_test_split(X, y, y2, test_size=0.3, random_state=42)

# First algorithm: Linear Regression

## Train the machine with LinearRegression and make predictions for "IMDB_Rating"

In [48]:
lr = LinearRegression()

# time.perf_counter() is a monotonic, high-resolution clock intended for timing
# short intervals; time.time() is wall-clock time and can jump or be too coarse
# for millisecond-scale measurements.
start_time = time.perf_counter()
lr.fit(X_train, y_train)
lr_training_time = "%.4f" % (time.perf_counter() - start_time)

start_time = time.perf_counter()
ypred = lr.predict(X_test)
lr_predict_time = "%.4f" % (time.perf_counter() - start_time)

## Use Mean Absolute Error to calculate the error

In [49]:
# Time the metric computation with the high-resolution monotonic clock
start_time = time.perf_counter()
lr_mae_y = mean_absolute_error(y_test, ypred)
lr_evaluation_time = "%.4f" % (time.perf_counter() - start_time)

# The error is in the unit of the min-max-scaled target
print(lr_mae_y, " Linear Regression MAE for IMDB_Rating")

0.1330227843388942  Linear Regression MAE for IMDB_Rating


## Now train the machine to make the predictions for "Meta_score"

In [50]:
# Refit the same estimator on the Meta_score target (this replaces the
# IMDB_Rating fit held in `lr`) and predict on the held-out rows.
ypred2 = lr.fit(X_train, y2_train).predict(X_test)

## Calculate the error

In [51]:
# Mean absolute error of the Meta_score predictions on the test split
lr_mae_y2 = mean_absolute_error(y_true=y2_test, y_pred=ypred2)
print(lr_mae_y2, " Linear Regression MAE for Meta_score")

0.12011013593659937  Linear Regression MAE for Meta_score


---

# Second algorithm: Decision Tree Regressor

## Train the machine with DecisionTreeRegressor and make predictions for "IMDB_Rating"

In [52]:
# A fixed random_state makes the fitted tree (and therefore the reported MAE)
# reproducible across runs; it uses the same seed as the train/test split.
dtr = DecisionTreeRegressor(random_state=42)

# perf_counter() is the high-resolution monotonic clock suited to short timings
start_time = time.perf_counter()
dtr.fit(X_train, y_train)
dtr_training_time = "%.4f" % (time.perf_counter() - start_time)

start_time = time.perf_counter()
ypred = dtr.predict(X_test)
dtr_predict_time = "%.4f" % (time.perf_counter() - start_time)

## Calculate the error

In [53]:
# Time the metric computation with the high-resolution monotonic clock
start_time = time.perf_counter()
dtr_mae_y = mean_absolute_error(y_test, ypred)
dtr_evaluation_time = "%.4f" % (time.perf_counter() - start_time)

# Label corrected to "DTR": the model is a DecisionTreeRegressor, not a classifier
print(dtr_mae_y, " DTR MAE for IMDB_Rating")

0.15996732026143778  DTC MAE for IMDB_Rating


## Now train for "Meta_score"

In [54]:
# Refit the tree on the Meta_score target (this replaces the IMDB_Rating fit
# held in `dtr`) and predict on the held-out rows.
ypred2 = dtr.fit(X_train, y2_train).predict(X_test)

## Calculate the error

In [55]:
dtr_mae_y2 = mean_absolute_error(y2_test, ypred2)
# Label corrected to "DTR": the model is a DecisionTreeRegressor, not a classifier
print(dtr_mae_y2, " DTR MAE for Meta_score")

0.14100077160493826  DTC MAE for Meta_score


---

# Comparison

In [56]:
# One column per model, one row per target; each MAE is rendered with four decimals
data = {
    'Linear Regression': [format(lr_mae_y, ".4f"), format(lr_mae_y2, ".4f")],
    'Decision Tree Regressor': [format(dtr_mae_y, ".4f"), format(dtr_mae_y2, ".4f")],
}
# Note: the name ends up bound to a captioned pandas Styler, not a plain DataFrame
error_comparison_df = pd.DataFrame(
    data=data,
    index=["IMDB_Rating", "Meta_score"],
).style.set_caption("Mean Absolute Error")

In [57]:
# Render the captioned MAE comparison table
error_comparison_df

Unnamed: 0,Linear Regression,Decision Tree Regressor
IMDB_Rating,0.133,0.16
Meta_score,0.1201,0.141


# The model built using Linear Regression made closer predictions (lower MAE, measured on the min-max-scaled targets) for both IMDB_Rating and Meta_score.

In [58]:
# One column per model; the values are the pre-formatted elapsed-seconds strings
# recorded while fitting, predicting and scoring the IMDB_Rating models
data = {
    'Linear Regression': [lr_training_time, lr_predict_time, lr_evaluation_time],
    'Decision Tree Regressor': [dtr_training_time, dtr_predict_time, dtr_evaluation_time],
}
# Note: the name ends up bound to a captioned pandas Styler, not a plain DataFrame
time_comparison_df = pd.DataFrame(
    data=data,
    index=["Training Time (s)", "Prediction Time (s)", "Evaluation Time (s)"],
).style.set_caption("Model Performances on IMDB_Rating")

In [59]:
# Render the captioned timing comparison table
display(time_comparison_df)

Unnamed: 0,Linear Regression,Decision Tree Regressor
Training Time (s),0.0052,0.0092
Prediction Time (s),0.003,0.0031
Evaluation Time (s),0.0009,0.0011


# Linear Regression training and prediction times are lower than DTR's (the prediction times differ only marginally).
# Evaluation times of both models are very close to each other.