In [15]:
# All library and packages import here

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier 
from sklearn.model_selection import train_test_split 
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

In [2]:
# Reading the dataset

movies = pd.read_csv('movie_metadata.csv')

In [3]:
# Getting to know all the column names

print(movies.columns)

Index(['color', 'director_name', 'num_critic_for_reviews', 'duration',
       'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_name',
       'actor_1_facebook_likes', 'gross', 'genres', 'actor_1_name',
       'movie_title', 'num_voted_users', 'cast_total_facebook_likes',
       'actor_3_name', 'facenumber_in_poster', 'plot_keywords',
       'movie_imdb_link', 'num_user_for_reviews', 'language', 'country',
       'content_rating', 'budget', 'title_year', 'actor_2_facebook_likes',
       'imdb_score', 'aspect_ratio', 'movie_facebook_likes'],
      dtype='object')


In [4]:
movies.shape

(5043, 28)

In [5]:
movies['profit'] = movies['gross'] - movies['budget']
movies['earning_rate'] = (movies['gross'] - movies['budget']) / movies['budget']
movies['y_profit'] = np.where(movies['earning_rate'] >= 2, 1, 0)
movies['y_rating'] = np.where(movies['imdb_score'] > 8, 1, 0)
movies['y_both'] = np.where((movies['y_rating'] == 1) & (movies['y_profit'] == 1), 1, 0)


In [6]:
movies.shape

(5043, 33)

In [7]:
movies = movies.dropna()

In [8]:
movies.shape

(3756, 33)

In [9]:
movies.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes,profit,earning_rate,y_profit,y_rating,y_both
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,2009.0,936.0,7.9,1.78,33000,523505847.0,2.208885,1,0,0
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,2007.0,5000.0,7.1,2.35,0,9404152.0,0.031347,0,0,0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,2015.0,393.0,6.8,2.35,85000,-44925825.0,-0.183371,0,0,0
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2012.0,23000.0,8.5,2.35,164000,198130642.0,0.792523,0,1,0
5,Color,Andrew Stanton,462.0,132.0,475.0,530.0,Samantha Morton,640.0,73058679.0,Action|Adventure|Sci-Fi,...,2012.0,632.0,6.6,2.35,24000,-190641321.0,-0.722948,0,0,0


In [10]:
# Column selection for feature and target variables for profit
feature_columns = ['num_critic_for_reviews','duration','director_facebook_likes','actor_3_facebook_likes','actor_1_facebook_likes','gross','num_voted_users','cast_total_facebook_likes','facenumber_in_poster','num_user_for_reviews','budget','actor_2_facebook_likes','movie_facebook_likes']

X = movies[feature_columns]
y = movies['y_profit']

# Train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

# Create Decision Tree Classifier object
clf = DecisionTreeClassifier(criterion="gini", max_depth=14, splitter="best")

# Train Decision Tree Classifer
clf = clf.fit(X_train,y_train)

# Predict the response for test dataset
y_pred = clf.predict(X_test)

# Model accuracy
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

# Confusion Matrix
print("Confusion Matrix:",metrics.confusion_matrix(y_test, y_pred))

# Report
print("Report:",metrics.classification_report(y_test, y_pred))

Accuracy: 0.9733806566104702
Confusion Matrix: [[923  10]
 [ 20 174]]
Report:               precision    recall  f1-score   support

           0       0.98      0.99      0.98       933
           1       0.95      0.90      0.92       194

    accuracy                           0.97      1127
   macro avg       0.96      0.94      0.95      1127
weighted avg       0.97      0.97      0.97      1127



In [11]:
# Column selection for feature and target variables for rating
feature_columns = ['num_critic_for_reviews','duration','director_facebook_likes','actor_3_facebook_likes','actor_1_facebook_likes','gross','num_voted_users','cast_total_facebook_likes','facenumber_in_poster','num_user_for_reviews','budget','actor_2_facebook_likes','movie_facebook_likes']

X = movies[feature_columns]
y = movies['y_rating']

# Train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

# Create Decision Tree Classifier object
clf = DecisionTreeClassifier(criterion="gini", max_depth=14, splitter="best")

# Train Decision Tree Classifer
clf = clf.fit(X_train,y_train)

# Predict the response for test dataset
y_pred = clf.predict(X_test)

# Model accuracy
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

# Confusion Matrix
print("Confusion Matrix:",metrics.confusion_matrix(y_test, y_pred))

# Report
print("Report:",metrics.classification_report(y_test, y_pred))

Accuracy: 0.9680567879325643
Confusion Matrix: [[1061   20]
 [  16   30]]
Report:               precision    recall  f1-score   support

           0       0.99      0.98      0.98      1081
           1       0.60      0.65      0.63        46

    accuracy                           0.97      1127
   macro avg       0.79      0.82      0.80      1127
weighted avg       0.97      0.97      0.97      1127



In [12]:
# Column selection for feature and target variables for profit and rating
feature_columns = ['num_critic_for_reviews','duration','director_facebook_likes','actor_3_facebook_likes','actor_1_facebook_likes','gross','num_voted_users','cast_total_facebook_likes','facenumber_in_poster','num_user_for_reviews','budget','actor_2_facebook_likes','movie_facebook_likes']

X = movies[feature_columns]
y = movies['y_both']

# Train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

# Create Decision Tree Classifier object
clf = DecisionTreeClassifier(criterion="gini", max_depth=14, splitter="best")

# Train Decision Tree Classifer
clf = clf.fit(X_train,y_train)

# Predict the response for test dataset
y_pred = clf.predict(X_test)

# Model accuracy
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

# Confusion Matrix
print("Confusion Matrix:",metrics.confusion_matrix(y_test, y_pred))

# Report
print("Report:",metrics.classification_report(y_test, y_pred))

Accuracy: 0.9840283939662822
Confusion Matrix: [[1100   10]
 [   8    9]]
Report:               precision    recall  f1-score   support

           0       0.99      0.99      0.99      1110
           1       0.47      0.53      0.50        17

    accuracy                           0.98      1127
   macro avg       0.73      0.76      0.75      1127
weighted avg       0.98      0.98      0.98      1127



In [20]:
# Column selection for feature and target variables for profit and rating
feature_columns = ['num_critic_for_reviews','duration','director_facebook_likes','actor_3_facebook_likes','actor_1_facebook_likes','gross','num_voted_users','cast_total_facebook_likes','facenumber_in_poster','num_user_for_reviews','budget','actor_2_facebook_likes','movie_facebook_likes']

X = movies[feature_columns]
y = movies['y_rating']

# Train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

model = LogisticRegression(solver='liblinear', random_state=0)

model = LogisticRegression(solver='liblinear', C=10.0, random_state=0)
model.fit(X, y)

# Step 4: Evaluate the model
p_pred = model.predict_proba(X)
y_pred = model.predict(X)
score_ = model.score(X, y)
conf_m = confusion_matrix(y, y_pred)
report = classification_report(y, y_pred)

In [21]:
print('score_:', score_, end='\n\n')
print('conf_m:', conf_m, sep='\n', end='\n\n')
print('report:', report, sep='\n')

score_: 0.9358359957401491

conf_m:
[[3432  166]
 [  75   83]]

report:
              precision    recall  f1-score   support

           0       0.98      0.95      0.97      3598
           1       0.33      0.53      0.41       158

    accuracy                           0.94      3756
   macro avg       0.66      0.74      0.69      3756
weighted avg       0.95      0.94      0.94      3756

