# Data-601: 
# Final Project: Classification Problem over IMDb Movies Dataset

# Load the Libraries

In [None]:
import numpy as np 
import pandas as pd 
from sklearn.metrics import confusion_matrix 
from sklearn.model_selection import train_test_split 
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.metrics import accuracy_score 
from sklearn.metrics import classification_report 
from sklearn.linear_model import LogisticRegression
import graphviz
import matplotlib.pylab as py
import statsmodels.api as sm

In [None]:
# Select the interactive "notebook" matplotlib backend for every figure drawn below.
# NOTE(review): this backend targets the classic Notebook front-end — confirm it is available in the environment used to run this file.
%matplotlib notebook

# Read in the data

In [None]:
# Load the weighted-movies table. Malformed rows are skipped and reported with a
# warning instead of aborting the load. `on_bad_lines` replaces the removed
# `error_bad_lines=False` flag (which raises TypeError on pandas >= 2.0).
movies = pd.read_csv("./movies_with_weights_55.csv", on_bad_lines="warn")

# Keep only the target (success_flag), the release year (used later for the
# hold-out split) and the six weight predictors.
movies_df = movies[['success_flag','year','actor_weight','director_weight','writer_weight','producer_weight','genre_weight','production_co_weight']]

Check the size of our data.

In [None]:
# (rows, columns) of the column subset selected for modelling.
movies_df.shape

(5697, 8)

Look at the first few entries of our data.

In [None]:
# Preview the first five rows (five is the default row count of head()).
movies_df.head()

Unnamed: 0,success_flag,year,actor_weight,director_weight,writer_weight,producer_weight,genre_weight,production_co_weight
0,0,1988,-0.008245,0.996885,0.0,,0.333333,0.09999
1,0,1996,-0.427318,,-0.478256,-0.413427,0.515636,0.395749
2,0,2004,-0.254381,0.0,-0.064188,0.0,0.5,0.589293
3,0,2007,-0.433054,-0.682627,,-0.866263,0.369862,0.0
4,1,1980,0.373318,0.0,,0.0,0.0,0.3333


In [None]:
# Predictor names: every column after the first two (success_flag and year).
feature_names = movies_df.columns[2:].tolist()
feature_names

['actor_weight',
 'director_weight',
 'writer_weight',
 'producer_weight',
 'genre_weight',
 'production_co_weight']

In [None]:
# Show the dtype of every column so that non-numeric columns would be spotted before modelling.
movies_df.dtypes

success_flag              int64
year                      int64
actor_weight            float64
director_weight         float64
writer_weight           float64
producer_weight         float64
genre_weight            float64
production_co_weight    float64
dtype: object

### Are There Missing Data?

In [None]:
# Per column: True when at least one value is missing.
movies_df.isna().any()

success_flag            False
year                    False
actor_weight             True
director_weight          True
writer_weight            True
producer_weight          True
genre_weight            False
production_co_weight    False
dtype: bool

#### Drop Remaining Missing Data

In [None]:
# Keep only the rows that have a value in every column; movies_df itself is left untouched.
movies_analysis_df = movies_df.dropna(axis="index", how="any")
movies_analysis_df.shape

(4056, 8)

In [None]:
# Confirm that no column of the cleaned frame still contains a missing value.
movies_analysis_df.isna().any()

success_flag            False
year                    False
actor_weight            False
director_weight         False
writer_weight           False
producer_weight         False
genre_weight            False
production_co_weight    False
dtype: bool

### Check the Distribution of the Data

In [None]:
# Count how many rows fall into each class of the target.
# The column is selected as a Series so that .sum() returns a scalar; calling
# int() on a one-element Series (the previous form) is deprecated in pandas.
# NOTE(review): the counts are taken on movies_df, i.e. before rows with missing
# values are dropped — confirm that is the intended population.
class_0 = (movies_df["success_flag"] == 0).sum()
class_1 = (movies_df["success_flag"] == 1).sum()

print("Class has", int(class_0), "zero values and", int(class_1), "one values.")

Class has 2845 zero values and 2852 one values.


------------------------

# Hold Out a Test Set (To be Used Later)
Before going much further with our analysis, let's hold out a test set for later. All training and testing should be performed on the remaining data; the hold-out set will be used afterwards to check the final accuracy of our model.

In [None]:
# Split on release year: rows after 2018 form the hold-out set, all earlier
# rows remain available for training and testing.
is_hold_out = movies_analysis_df["year"] > 2018

hold_out_test_df = movies_analysis_df[is_hold_out]
movies_analysis_train_test_df = movies_analysis_df[~is_hold_out]

In [None]:
# Report the (rows, columns) of both partitions.
for label, frame in (
    ("Hold Out Data Shape: ", hold_out_test_df),
    ("Remaining Data Shape: ", movies_analysis_train_test_df),
):
    print(label, frame.shape)

Hold Out Data Shape:  (113, 8)
Remaining Data Shape:  (3943, 8)


#### Drop the Year Column

In [None]:
# The year was only needed to build the hold-out split; remove it so that it is
# not used as a predictor. errors="ignore" keeps this cell idempotent: running it
# a second time (when the column is already gone) no longer raises KeyError.
hold_out_test_df = hold_out_test_df.drop(columns=["year"], errors="ignore")
movies_analysis_train_test_df = movies_analysis_train_test_df.drop(columns=["year"], errors="ignore")

In [None]:
# Report the (rows, columns) of both partitions after the year column is removed.
for label, frame in (
    ("Hold Out Data Final Shape: ", hold_out_test_df),
    ("Remaining Data Final Shape: ", movies_analysis_train_test_df),
):
    print(label, frame.shape)

Hold Out Data Final Shape:  (113, 7)
Remaining Data Final Shape:  (3943, 7)


--------------------------------

# Review Training/Test Dataset

### Review the Scatterplot Matrix

In [None]:
# Scatter-plot matrix of every column; points are coloured by class
# (success_flag 0 -> green, 1 -> red).
colors = ['green', 'red']
point_colors = movies_analysis_train_test_df.success_flag.map(lambda flag: colors[flag])

scatter_matrix = pd.plotting.scatter_matrix(
    movies_analysis_train_test_df,
    alpha=0.05,
    figsize=(10, 10),
    c=point_colors,
)

# Rotate the axis labels of every panel so the long column names stay readable.
for panel in scatter_matrix.ravel():
    panel.xaxis.label.set_rotation(90)
    panel.yaxis.label.set_rotation(0)

py.show()

<IPython.core.display.Javascript object>

#### Split Data into Training and Testing Sets

In [None]:
dat = movies_analysis_train_test_df.copy()

# Convert the whole frame to one NumPy array: column 0 is the target
# (success_flag) and the remaining columns are the predictors.
values_matrix = dat.values
x_full, y_full = values_matrix[:, 1:], values_matrix[:, 0]

# Reproducible 67% / 33% split.
x_train, x_test, y_train, y_test = train_test_split(
    x_full,
    y_full,
    test_size=0.33,
    random_state=123,
)

In [None]:
# Report the (rows, columns) of the training and testing predictor arrays.
for label, features in (
    ("Training Data Shape: ", x_train),
    ("Testing Data Shape: ", x_test),
):
    print(label, features.shape)

Training Data Shape:  (2641, 6)
Testing Data Shape:  (1302, 6)


# Classification Tree Model

In [None]:
# Fit a depth-limited classification tree (Gini impurity) on the training split.
clf_gini = DecisionTreeClassifier(criterion="gini", random_state=123, max_depth=4)
clf_gini.fit(x_train, y_train)

# Draw the fitted tree. The trailing semicolon suppresses the long list of Text
# annotation objects that plot_tree returns, which otherwise floods the cell output.
py.figure(figsize=(10, 10))
tree.plot_tree(clf_gini, feature_names=feature_names, filled=True, rounded=True);


<IPython.core.display.Javascript object>

[Text(387.5, 693.0, 'production_co_weight <= 0.517\ngini = 0.5\nsamples = 2641\nvalue = [1318, 1323]'),
 Text(193.75, 539.0, 'producer_weight <= 0.193\ngini = 0.466\nsamples = 1310\nvalue = [827, 483]'),
 Text(96.875, 385.0, 'writer_weight <= 0.271\ngini = 0.412\nsamples = 910\nvalue = [646, 264]'),
 Text(48.4375, 231.0, 'genre_weight <= 0.642\ngini = 0.385\nsamples = 749\nvalue = [554, 195]'),
 Text(24.21875, 77.0, 'gini = 0.365\nsamples = 640\nvalue = [486, 154]'),
 Text(72.65625, 77.0, 'gini = 0.469\nsamples = 109\nvalue = [68, 41]'),
 Text(145.3125, 231.0, 'actor_weight <= 0.259\ngini = 0.49\nsamples = 161\nvalue = [92, 69]'),
 Text(121.09375, 77.0, 'gini = 0.455\nsamples = 123\nvalue = [80, 43]'),
 Text(169.53125, 77.0, 'gini = 0.432\nsamples = 38\nvalue = [12, 26]'),
 Text(290.625, 385.0, 'producer_weight <= 0.878\ngini = 0.495\nsamples = 400\nvalue = [181, 219]'),
 Text(242.1875, 231.0, 'writer_weight <= 0.979\ngini = 0.5\nsamples = 289\nvalue = [144, 145]'),
 Text(217.96875, 77

### Apply to the Test Data

In [None]:
# Predict the class of every row in the test split with the fitted tree.
y_pred = clf_gini.predict(X=x_test)
print(f"Predicted values: {y_pred}")

Predicted values: [0. 0. 1. ... 1. 0. 0.]


#### Confusion Matrix

In [None]:
# Rows are the true classes, columns are the classes predicted by the tree.
print(confusion_matrix(y_true=y_test, y_pred=y_pred))

[[456 207]
 [227 412]]


#### Accuracy 

In [None]:
# Test-set accuracy of the tree, expressed as a percentage.
print(accuracy_score(y_true=y_test, y_pred=y_pred) * 100)

66.66666666666666


In [None]:
# Per-class precision, recall and F1 of the tree on the test split.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

         0.0       0.67      0.69      0.68       663
         1.0       0.67      0.64      0.66       639

    accuracy                           0.67      1302
   macro avg       0.67      0.67      0.67      1302
weighted avg       0.67      0.67      0.67      1302



# Logistic Regression

In [None]:
# Logistic regression with scikit-learn's default settings, fitted on the training split.
logreg = LogisticRegression()
logreg.fit(X=x_train, y=y_train)

LogisticRegression()

### Apply to Test Data

In [None]:
# Predict the test split and tabulate true classes (rows) against predicted classes (columns).
logreg_predict = logreg.predict(X=x_test)
logistic_cm = confusion_matrix(y_true=y_test, y_pred=logreg_predict)

#### Confusion Matrix

In [None]:
# Confusion matrix of the logistic regression on the test split (computed in the previous cell).
print(logistic_cm)

[[450 213]
 [215 424]]


#### Accuracy

In [None]:
# Test-set accuracy of the logistic regression, expressed as a percentage.
print(accuracy_score(y_true=y_test, y_pred=logreg_predict) * 100)

67.12749615975423


In [None]:
# Fit the same logistic model with statsmodels to obtain coefficient standard
# errors and p-values. statsmodels does not add an intercept on its own, so a
# constant column is prepended explicitly; without it the model is forced
# through the origin and does not match the scikit-learn fit above, which
# estimates an intercept by default.
x_train_design = sm.add_constant(x_train)
logit_model = sm.Logit(y_train, x_train_design)
result = logit_model.fit()
print(result.summary2())

Optimization terminated successfully.
         Current function value: 0.639986
         Iterations 5
                         Results: Logit
Model:              Logit            Pseudo R-squared: 0.077     
Dependent Variable: y                AIC:              3392.4086 
Date:               2020-12-20 09:08 BIC:              3427.6820 
No. Observations:   2641             Log-Likelihood:   -1690.2   
Df Model:           5                LL-Null:          -1830.6   
Df Residuals:       2635             LLR p-value:      1.3497e-58
Converged:          1.0000           Scale:            1.0000    
No. Iterations:     5.0000                                       
--------------------------------------------------------------------
       Coef.     Std.Err.       z       P>|z|      [0.025     0.975]
--------------------------------------------------------------------
x1     0.5213      0.1107     4.7106    0.0000     0.3044     0.7382
x2     0.1850      0.0547     3.3821    0.0007     0.0

#### <font color=blue>All predictors appear to be statistically significant.</font>

# Fit a Logistic Regression Model on the Union of Training and Testing Data

In [None]:
# Refit the logistic regression on all pre-hold-out rows (training and testing combined).
logreg_best = LogisticRegression()
logreg_best.fit(X=x_full, y=y_full)

LogisticRegression()

In [None]:
# statsmodels summary for the model fitted on the combined training and testing
# data. statsmodels does not add an intercept on its own, so a constant column
# is prepended explicitly; this keeps the model comparable to the scikit-learn
# fit, which estimates an intercept by default.
x_full_design = sm.add_constant(x_full)
logit_best_model = sm.Logit(y_full, x_full_design)
result_best = logit_best_model.fit()
print(result_best.summary2())

Optimization terminated successfully.
         Current function value: 0.638280
         Iterations 5
                         Results: Logit
Model:              Logit            Pseudo R-squared: 0.079     
Dependent Variable: y                AIC:              5045.4776 
Date:               2020-12-20 09:13 BIC:              5083.1558 
No. Observations:   3943             Log-Likelihood:   -2516.7   
Df Model:           5                LL-Null:          -2733.0   
Df Residuals:       3937             LLR p-value:      2.7947e-91
Converged:          1.0000           Scale:            1.0000    
No. Iterations:     5.0000                                       
--------------------------------------------------------------------
       Coef.     Std.Err.       z       P>|z|      [0.025     0.975]
--------------------------------------------------------------------
x1     0.5199      0.0915     5.6852    0.0000     0.3407     0.6992
x2     0.1400      0.0448     3.1235    0.0018     0.0

#### <font color=blue>All predictors appear to be statistically significant.</font>


### Apply to Hold Out Data

In [None]:
hold_out_dat = hold_out_test_df.copy()

# Same layout as the training data: column 0 is the target, the remaining columns are the predictors.
hold_out_values = hold_out_dat.values
hold_out_x, hold_out_y = hold_out_values[:, 1:], hold_out_values[:, 0]

In [None]:
# Score the hold-out rows with the refitted model and tabulate true classes (rows)
# against predicted classes (columns).
logreg_best_predict = logreg_best.predict(X=hold_out_x)
logistic_best_cm = confusion_matrix(y_true=hold_out_y, y_pred=logreg_best_predict)

#### Confusion Matrix

In [None]:
# Confusion matrix of the refitted logistic regression on the hold-out data (computed in the previous cell).
print(logistic_best_cm)

[[15 23]
 [15 60]]


#### Accuracy

In [None]:
# Hold-out accuracy of the refitted logistic regression, expressed as a percentage.
print(accuracy_score(y_true=hold_out_y, y_pred=logreg_best_predict) * 100)

66.3716814159292
