# Neural Network
Neural Network Author: Erin Brown  
Preprocessing Author: Alex Moore

## Preprocessing

In [2]:
# Author: Alex Moore
import pandas as pd
import numpy as np

# Read in data 
data = pd.read_csv('tmdb_movies_data.csv')

In [3]:
# Author: Alex Moore
# Fill in missing features
num_cols = ['budget_adj', 'revenue_adj']
for col in num_cols:
        mean = data[col].mean()
        data[col].fillna(mean, inplace=True)

# Calculate difference between revenue_adj and budget_adj to create profit column
data['profit'] = data['revenue_adj'] - data['budget_adj']

# Parse keywords column
data['keyword_list'] = data['keywords'].str.split('|')

# Replace missing values with empty string
data['keywords'].fillna('no_value', inplace=True)

# Save updata data to new CSV file
data.to_csv('updated_tmdb_movies_data_list.csv', index=False)


In [4]:
#Author: Erin Brown
data.head(2)

Unnamed: 0,id,imdb_id,popularity,budget,revenue,original_title,cast,homepage,director,tagline,...,genres,production_companies,release_date,vote_count,vote_average,release_year,budget_adj,revenue_adj,profit,keyword_list
0,135397,tt0369610,32.985763,150000000,1513528810,Jurassic World,Chris Pratt|Bryce Dallas Howard|Irrfan Khan|Vi...,http://www.jurassicworld.com/,Colin Trevorrow,The park is open.,...,Action|Adventure|Science Fiction|Thriller,Universal Studios|Amblin Entertainment|Legenda...,6/9/2015,5562,6.5,2015,137999939.3,1392446000.0,1254446000.0,"[monster, dna, tyrannosaurus rex, velociraptor..."
1,76341,tt1392190,28.419936,150000000,378436354,Mad Max: Fury Road,Tom Hardy|Charlize Theron|Hugh Keays-Byrne|Nic...,http://www.madmaxmovie.com/,George Miller,What a Lovely Day.,...,Action|Adventure|Science Fiction|Thriller,Village Roadshow Pictures|Kennedy Miller Produ...,5/13/2015,6185,7.1,2015,137999939.3,348161300.0,210161400.0,"[future, chase, post-apocalyptic, dystopia, au..."


## Neural Network

In [5]:
#Author: Erin Brown
#Loading necessary libraries & Modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
import pydotplus
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.metrics import r2_score

In [6]:
#Author: Erin Brown
#dropping uneccessary columns/features
data.drop(data.columns[[0, 1, 2, 3, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 22]], axis=1, inplace=True)
data.head()

Unnamed: 0,revenue,keywords,profit
0,1513528810,monster|dna|tyrannosaurus rex|velociraptor|island,1254446000.0
1,378436354,future|chase|post-apocalyptic|dystopia|australia,210161400.0
2,295238201,based on novel|revolution|dystopia|sequel|dyst...,170419100.0
3,2068178225,android|spaceship|jedi|space opera|3d,1718723000.0
4,1506249360,car race|speed|revenge|suspense|car,1210949000.0


In [7]:
#Author: Erin Brown
#Splitting keywords in keywords column by the delimiter '|' 
keywords_data = data['keywords'].str.split('|', expand=True)
keywords_data.head()

Unnamed: 0,0,1,2,3,4
0,monster,dna,tyrannosaurus rex,velociraptor,island
1,future,chase,post-apocalyptic,dystopia,australia
2,based on novel,revolution,dystopia,sequel,dystopic future
3,android,spaceship,jedi,space opera,3d
4,car race,speed,revenge,suspense,car


In [8]:
#Author: Erin Brown
#Applying label encoder to change keywords into numerical values
label_encoder = []
for col in keywords_data.columns:
    keywords_data[col] = LabelEncoder().fit_transform(keywords_data[col])
    label_encoder.append(LabelEncoder())

X = pd.concat([keywords_data], axis=1)
print (X)

          0     1     2     3     4
0      1347   777  3220  3253  1451
1       833   474  2362   972   152
2       164  2292   940  2703   832
3        61  2562  1627  2844     6
4       341  2568  2555  3003   410
...     ...   ...   ...   ...   ...
10861  2024  2670  3005  3410  3180
10862   341  2216  1204  3410  3180
10863   333  2844  2926  3410  3180
10864  1956  3066  3454  3410  3180
10865   782  1201   907  2618  1047

[10866 rows x 5 columns]


In [112]:
#Author: Erin Brown
# Neural Networks work well on binary classification
# So a threshold was made for the profit column where a profit above or equal to 10,000,000 is considered successful (1)
# Profits below the threshold will be considered unsuccessful (0) 
profit_threshold = 10000000 
data['successful'] = np.where(data['profit'] >= profit_threshold, 1, 0)
data.head()

Unnamed: 0,revenue,keywords,profit,successful
0,1513528810,monster|dna|tyrannosaurus rex|velociraptor|island,1254446000.0,1
1,378436354,future|chase|post-apocalyptic|dystopia|australia,210161400.0,1
2,295238201,based on novel|revolution|dystopia|sequel|dyst...,170419100.0,1
3,2068178225,android|spaceship|jedi|space opera|3d,1718723000.0,1
4,1506249360,car race|speed|revenge|suspense|car,1210949000.0,1


In [86]:
#Author: Erin Brown
y = data['successful']

print(X) #X is our keywords
print(y) #y is our binary classification of succesful or unsuccessful 
print(data.shape)

          0     1     2     3     4
0      1347   777  3220  3253  1451
1       833   474  2362   972   152
2       164  2292   940  2703   832
3        61  2562  1627  2844     6
4       341  2568  2555  3003   410
...     ...   ...   ...   ...   ...
10861  2024  2670  3005  3410  3180
10862   341  2216  1204  3410  3180
10863   333  2844  2926  3410  3180
10864  1956  3066  3454  3410  3180
10865   782  1201   907  2618  1047

[10866 rows x 5 columns]
0        1
1        1
2        1
3        1
4        1
        ..
10861    0
10862    0
10863    0
10864    0
10865    0
Name: successful, Length: 10866, dtype: int64
(10866, 4)


In [115]:
data.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
revenue,10866.0,39823320.0,117003500.0,0.0,0.0,0.0,24000000.0,2781506000.0
profit,10866.0,33813320.0,125215100.0,-413912431.0,0.0,0.0,12926170.0,2750137000.0
successful,10866.0,0.2620099,0.4397483,0.0,0.0,0.0,1.0,1.0


In [116]:
#Author: Erin Brown
#Training the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)
print(X_train.shape)
print(X_test.shape)

(8692, 5)
(2174, 5)


In [117]:
#Author: Erin Brown
#Neural Network Model
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(hidden_layer_sizes=(10, 8), activation='logistic', solver='adam', alpha=1e-5, momentum=0.1, max_iter=300, verbose=True)
mlp.fit(X_train, y_train)

Iteration 1, loss = 0.75235433
Iteration 2, loss = 0.67119345
Iteration 3, loss = 0.61789529
Iteration 4, loss = 0.59171745
Iteration 5, loss = 0.58099126
Iteration 6, loss = 0.57696349
Iteration 7, loss = 0.57514380
Iteration 8, loss = 0.57429640
Iteration 9, loss = 0.57338888
Iteration 10, loss = 0.57306042
Iteration 11, loss = 0.57207035
Iteration 12, loss = 0.57160503
Iteration 13, loss = 0.57079888
Iteration 14, loss = 0.57018501
Iteration 15, loss = 0.56939475
Iteration 16, loss = 0.56882859
Iteration 17, loss = 0.56809339
Iteration 18, loss = 0.56751545
Iteration 19, loss = 0.56888038
Iteration 20, loss = 0.56991920
Iteration 21, loss = 0.56994885
Iteration 22, loss = 0.56937505
Iteration 23, loss = 0.56915266
Iteration 24, loss = 0.56883596
Iteration 25, loss = 0.56904454
Iteration 26, loss = 0.56878982
Iteration 27, loss = 0.56908164
Iteration 28, loss = 0.56700768
Iteration 29, loss = 0.56681582
Iteration 30, loss = 0.56731743
Iteration 31, loss = 0.56674594
Iteration 32, los

In [118]:
#Author: Erin Brown
accuracy = mlp.score(X_train, y_train)
print("Accuracy:", accuracy)

Accuracy: 0.7320524620340543


In [119]:
#Author: Erin Brown
# Confusion Matrix on training data
from sklearn.metrics import classification_report,confusion_matrix
train_predictions = mlp.predict(X_train)
confusion_matrix = confusion_matrix(y_train, train_predictions)
print(confusion_matrix)

[[6327   62]
 [2267   36]]


In [120]:
#Author: Erin Brown
#Classification Report on training data
c_report = classification_report(y_train, train_predictions)
print(c_report)

              precision    recall  f1-score   support

           0       0.74      0.99      0.84      6389
           1       0.37      0.02      0.03      2303

    accuracy                           0.73      8692
   macro avg       0.55      0.50      0.44      8692
weighted avg       0.64      0.73      0.63      8692



In [121]:
#Author: Erin Brown
# Confusion Matrix on test data
from sklearn.metrics import classification_report,confusion_matrix
test_predictions = mlp.predict(X_test)
cf_matrix_test = confusion_matrix(y_test, test_predictions)
print(cf_matrix_test)

[[1619   11]
 [ 532   12]]


In [122]:
#Author: Erin B
c_report_test =  classification_report(y_test, test_predictions)
print(c_report_test)

              precision    recall  f1-score   support

           0       0.75      0.99      0.86      1630
           1       0.52      0.02      0.04       544

    accuracy                           0.75      2174
   macro avg       0.64      0.51      0.45      2174
weighted avg       0.69      0.75      0.65      2174

