# Data-601: 
# Final Project: Classification Problem over IMDb Movies Dataset

# Load the Libraries

In [None]:
import numpy as np 
import pandas as pd 
from sklearn.metrics import confusion_matrix 
from sklearn.model_selection import train_test_split 
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.metrics import accuracy_score 
from sklearn.metrics import classification_report 
from sklearn.linear_model import LogisticRegression
import graphviz
import matplotlib.pylab as py
import statsmodels.api as sm

In [None]:
# Select the matplotlib `notebook` backend for the figures rendered below.
%matplotlib notebook

# Read in the data

In [None]:
# Load the pre-computed movie weights. Malformed rows are skipped with a warning
# instead of aborting the load.
# `on_bad_lines="warn"` (pandas >= 1.3) replaces `error_bad_lines=False`, which was
# deprecated in pandas 1.3 and removed in 2.0; with the old default
# `warn_bad_lines=True` it had the same skip-and-warn behaviour.
movies = pd.read_csv("./movies_with_weights_0.csv", on_bad_lines="warn")

# Keep the target (`success_flag`), the release year (used for the hold-out split
# further down) and the six weight columns.
movies_df = movies[[
    'success_flag',
    'year',
    'actor_weight',
    'director_weight',
    'writer_weight',
    'producer_weight',
    'genre_weight',
    'production_co_weight',
]]

Check the size of our data.

In [None]:
# (rows, columns) of the selected frame.
movies_df.shape

(5697, 8)

Look at the first few entries of our data.

In [None]:
# Preview the first five rows.
movies_df.iloc[:5]

Unnamed: 0,success_flag,year,actor_weight,director_weight,writer_weight,producer_weight,genre_weight,production_co_weight
0,0,1988,-0.152731,0.772756,0.0,,0.416667,0.40001
1,0,1996,-0.558818,,-0.552218,-0.622554,0.630212,0.5234
2,1,2004,0.138388,0.0,-0.319198,0.0,0.5,0.676592
3,0,2007,-0.517651,-0.278856,,-1.145967,0.512333,0.0
4,1,1980,0.287829,0.0,,0.0,0.0,0.3333


In [None]:
# Predictor names: every column after the first two (`success_flag`, `year`).
feature_names = movies_df.columns[2:].tolist()
feature_names

['actor_weight',
 'director_weight',
 'writer_weight',
 'producer_weight',
 'genre_weight',
 'production_co_weight']

In [None]:
# Check the data type of every selected column (target, year and the weights).
movies_df.dtypes

success_flag              int64
year                      int64
actor_weight            float64
director_weight         float64
writer_weight           float64
producer_weight         float64
genre_weight            float64
production_co_weight    float64
dtype: object

### Are There Missing Data?

In [None]:
# Per column: does it contain at least one missing value?
movies_df.isna().any()

success_flag            False
year                    False
actor_weight             True
director_weight          True
writer_weight            True
producer_weight          True
genre_weight            False
production_co_weight    False
dtype: bool

#### Drop Remaining Missing Data

In [None]:
# Keep only the rows in which every selected column is populated.
movies_analysis_df = movies_df.dropna(axis="index", how="any")
movies_analysis_df.shape

(4056, 8)

In [None]:
# Confirm that no column has missing values after the drop.
movies_analysis_df.isna().any()

success_flag            False
year                    False
actor_weight            False
director_weight         False
writer_weight           False
producer_weight         False
genre_weight            False
production_co_weight    False
dtype: bool

### Check the Distribution of the Data

In [None]:
# Count how many movies fall into each class of the target.
# Selecting the column as a Series (single brackets) makes `.sum()` return a scalar.
# The previous double-bracket selection produced one-element Series, and calling
# `int()` on those is deprecated in recent pandas. The printed text is unchanged.
# NOTE(review): the counts are taken from `movies_df` (before rows with missing
# values are dropped), not from `movies_analysis_df` — confirm this is intended.
success_flag = movies_df["success_flag"]
class_0 = int((success_flag == 0).sum())
class_1 = int((success_flag == 1).sum())

print("Class has", class_0, "zero values and", class_1, "one values.")

Class has 2166 zero values and 3531 one values.


------------------------

# Hold Out a Test Set (To be Used Later)
Before going much further with our analysis, let's hold out a test set for later.  All of the training and testing should be implemented on the remaining data and the hold out will be used afterwards to check for final accuracy of our model.

In [None]:
# Split by release year: rows with year > 2018 are reserved as the hold-out set,
# rows with year < 2019 stay available for training and testing.
is_hold_out_year = movies_analysis_df["year"] > 2018
is_train_test_year = movies_analysis_df["year"] < 2019

hold_out_test_df = movies_analysis_df.loc[is_hold_out_year]
movies_analysis_train_test_df = movies_analysis_df.loc[is_train_test_year]

In [None]:
# Report the (rows, columns) of both frames produced by the year split.
for label, frame in (
    ("Hold Out Data Shape: ", hold_out_test_df),
    ("Remaining Data Shape: ", movies_analysis_train_test_df),
):
    print(label, frame.shape)

Hold Out Data Shape:  (113, 8)
Remaining Data Shape:  (3943, 8)


#### Drop the Year Column

In [None]:
# Remove `year` from both frames; the later cells slice every column after the
# target as a predictor, so `year` must not remain in the frame.
hold_out_test_df = hold_out_test_df.drop(columns=["year"])
movies_analysis_train_test_df = movies_analysis_train_test_df.drop(columns=["year"])

In [None]:
# Report the shapes again now that `year` has been removed.
for label, frame in (
    ("Hold Out Data Final Shape: ", hold_out_test_df),
    ("Remaining Data Final Shape: ", movies_analysis_train_test_df),
):
    print(label, frame.shape)

Hold Out Data Final Shape:  (113, 7)
Remaining Data Final Shape:  (3943, 7)


--------------------------------

# Review Training/Test Dataset

### Review the Scatterplot Matrix

In [None]:
# Scatter-plot matrix of the train/test frame, one point per movie, coloured by
# class: green where success_flag == 0, red where success_flag == 1.
colors = ['green', 'red']
point_colors = movies_analysis_train_test_df.success_flag.apply(lambda flag: colors[flag])
scatter_matrix = pd.plotting.scatter_matrix(
    movies_analysis_train_test_df,
    alpha=0.05,
    figsize=(10, 10),
    c=point_colors,
);

# Set the x-axis labels to 90 degrees and the y-axis labels to 0 degrees on every panel.
for panel in scatter_matrix.reshape(-1):
    panel.xaxis.label.set_rotation(90)
for panel in scatter_matrix.reshape(-1):
    panel.yaxis.label.set_rotation(0)
py.show()

<IPython.core.display.Javascript object>

#### Split Data into Training and Testing Sets

In [None]:
# Work on a copy so the train/test frame itself is never modified.
dat = movies_analysis_train_test_df.copy()

# Convert the whole frame to one array: column 0 is the target (`success_flag`),
# the remaining columns are the predictors.
full_matrix = dat.values
x_full = full_matrix[:, 1:]
y_full = full_matrix[:, 0]

# 67% / 33% split with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x_full,
    y_full,
    test_size=0.33,
    random_state=123,
)

In [None]:
# Report the (rows, columns) of the predictor arrays for both splits.
for label, split in (
    ("Training Data Shape: ", x_train),
    ("Testing Data Shape: ", x_test),
):
    print(label, split.shape)

Training Data Shape:  (2641, 6)
Testing Data Shape:  (1302, 6)


# Classification Tree Model

In [None]:
# Depth-2 decision tree using the Gini criterion, fitted on the training split.
clf_gini = DecisionTreeClassifier(
    criterion="gini",
    max_depth=2,
    random_state=123,
)
clf_gini.fit(x_train, y_train)

# Draw the fitted tree on a 10x10 figure, labelling splits with the predictor names.
py.figure(figsize=(10, 10))
tree.plot_tree(
    clf_gini,
    feature_names=feature_names,
    filled=True,
    rounded=True,
)


<IPython.core.display.Javascript object>

[Text(387.5, 641.6666666666666, 'production_co_weight <= 0.62\ngini = 0.47\nsamples = 2641\nvalue = [999, 1642]'),
 Text(193.75, 385.0, 'producer_weight <= 0.159\ngini = 0.497\nsamples = 1193\nvalue = [639, 554]'),
 Text(96.875, 128.33333333333326, 'gini = 0.473\nsamples = 789\nvalue = [487, 302]'),
 Text(290.625, 128.33333333333326, 'gini = 0.469\nsamples = 404\nvalue = [152, 252]'),
 Text(581.25, 385.0, 'genre_weight <= 0.51\ngini = 0.374\nsamples = 1448\nvalue = [360, 1088]'),
 Text(484.375, 128.33333333333326, 'gini = 0.473\nsamples = 388\nvalue = [149, 239]'),
 Text(678.125, 128.33333333333326, 'gini = 0.319\nsamples = 1060\nvalue = [211, 849]')]

### Apply to the Test Data

In [None]:
# Predict the class of every row in the test split with the fitted tree.
y_pred=clf_gini.predict(x_test)
print(f"Predicted values: {y_pred}")

Predicted values: [0. 1. 1. ... 1. 1. 0.]


#### Confusion Matrix

In [None]:
# Confusion matrix for the tree on the test split (called as (true, predicted):
# rows are the actual class, columns the predicted class).
print(confusion_matrix(y_test,y_pred))

[[260 234]
 [133 675]]


#### Accuracy 

In [None]:
# Test-split accuracy of the tree, expressed as a percentage.
print(accuracy_score(y_test,y_pred)*100)

71.81259600614439


In [None]:
# Per-class precision, recall and F1 for the tree on the test split.
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

         0.0       0.66      0.53      0.59       494
         1.0       0.74      0.84      0.79       808

    accuracy                           0.72      1302
   macro avg       0.70      0.68      0.69      1302
weighted avg       0.71      0.72      0.71      1302



# Logistic Regression

In [None]:
# Logistic regression with scikit-learn's default settings, fitted on the same
# training split as the tree.
logreg = LogisticRegression()
logreg.fit(x_train, y_train)

LogisticRegression()

### Apply to Test Data

In [None]:
# Predict the test split and build the confusion matrix (true labels first).
logreg_predict=logreg.predict(x_test)
logistic_cm=confusion_matrix(y_test, logreg_predict)

#### Confusion Matrix

In [None]:
# Confusion matrix of the logistic regression on the test split.
print(logistic_cm)

[[246 248]
 [119 689]]


#### Accuracy

In [None]:
# Test-split accuracy of the logistic regression, expressed as a percentage.
print(accuracy_score(y_test,logreg_predict)*100)

71.81259600614439


In [None]:
# Refit the training split with statsmodels to get per-coefficient statistics
# (standard errors, z-scores, p-values) in the printed summary.
# The predictors are a plain array, so the summary labels them x1..x6 in the
# column order of `x_train`.
# NOTE(review): `x_train` is passed as-is; statsmodels does not add an intercept
# column on its own, whereas the scikit-learn model above presumably fits one by
# default. Confirm whether `sm.add_constant(x_train)` was intended before relying
# on these p-values.
logit_model=sm.Logit(y_train,x_train)
result=logit_model.fit()
print(result.summary2())

Optimization terminated successfully.
         Current function value: 0.592566
         Iterations 5
                         Results: Logit
Model:              Logit            Pseudo R-squared: 0.107     
Dependent Variable: y                AIC:              3141.9355 
Date:               2020-12-20 09:29 BIC:              3177.2090 
No. Observations:   2641             Log-Likelihood:   -1565.0   
Df Model:           5                LL-Null:          -1751.5   
Df Residuals:       2635             LLR p-value:      1.8250e-78
Converged:          1.0000           Scale:            1.0000    
No. Iterations:     5.0000                                       
--------------------------------------------------------------------
        Coef.     Std.Err.       z       P>|z|      [0.025    0.975]
--------------------------------------------------------------------
x1      0.5907      0.1172     5.0407    0.0000     0.3610    0.8204
x2      0.0070      0.0594     0.1177    0.9063    -0.

### Model Only Based on Statistically Significant Predictors

In [None]:
# Build the reduced, 4-predictor data set and split it exactly like the full model.
#
# Fix: the arrays are now taken from `dat` (pre-2019 rows, `year` already dropped)
# instead of `movies_analysis_df`. The un-split frame
#   * still contains the `year` column, so positions [1, 3, 4, 6] picked
#     year / director_weight / writer_weight / genre_weight, and
#   * still contains the 2019+ rows that are reserved as the hold-out set.
# In `dat`, positions [1, 3, 4, 6] are actor_weight, writer_weight, producer_weight
# and production_co_weight — the same selection the final 4-predictor model uses
# later in the notebook.
x_4_predictors = dat.values[:, [1, 3, 4, 6]]
y_4_predictors = dat.values[:, 0]

# Same test fraction and seed as the full-model split.
x_4_predictors_train, x_4_predictors_test, y_4_predictors_train, y_4_predictors_test = train_test_split(
    x_4_predictors,
    y_4_predictors,
    test_size=0.33,
    random_state=123,
)

In [None]:
# Logistic regression (default settings) fitted on the 4-predictor training split.
logreg2 = LogisticRegression()
logreg2.fit(x_4_predictors_train, y_4_predictors_train)

LogisticRegression()

#### Apply to Test Data

In [None]:
# Predict the 4-predictor test split and build the confusion matrix (true labels first).
logreg2_predict = logreg2.predict(x_4_predictors_test)
logistic_cm2 = confusion_matrix(y_4_predictors_test, logreg2_predict)

#### Confusion Matrix

In [None]:
# Confusion matrix of the 4-predictor logistic regression on its test split.
print(logistic_cm2)

[[ 77 433]
 [ 62 767]]


#### Accuracy

In [None]:
# Test-split accuracy of the 4-predictor logistic regression, as a percentage.
print(accuracy_score(y_4_predictors_test,logreg2_predict)*100)

63.032113517550414


# Fit a Logistic Regression Model on the Union of Training and Testing Data

In [None]:
# Refit the logistic regression on all pre-2019 rows (training + testing combined)
# before scoring the hold-out set.
logreg_best = LogisticRegression()
logreg_best.fit(x_full, y_full)

LogisticRegression()

In [None]:
# statsmodels fit on the combined train + test data, printed for its
# per-coefficient statistics; predictors are labelled x1..x6 in `x_full` column order.
# NOTE(review): `x_full` is passed as-is; statsmodels does not add an intercept
# column on its own — confirm whether `sm.add_constant(x_full)` was intended.
logit_best_model = sm.Logit(y_full,x_full)
result_best = logit_best_model.fit()
print(result_best.summary2())

Optimization terminated successfully.
         Current function value: 0.587451
         Iterations 5
                          Results: Logit
Model:              Logit            Pseudo R-squared: 0.114      
Dependent Variable: y                AIC:              4644.6404  
Date:               2020-12-20 09:29 BIC:              4682.3185  
No. Observations:   3943             Log-Likelihood:   -2316.3    
Df Model:           5                LL-Null:          -2615.8    
Df Residuals:       3937             LLR p-value:      3.4780e-127
Converged:          1.0000           Scale:            1.0000     
No. Iterations:     5.0000                                        
---------------------------------------------------------------------
         Coef.     Std.Err.       z       P>|z|      [0.025    0.975]
---------------------------------------------------------------------
x1       0.5892      0.0971     6.0662    0.0000     0.3989    0.7796
x2      -0.0003      0.0494    -0.0057   

### Apply to Hold Out Data

In [None]:
# Work on a copy of the hold-out frame.
hold_out_dat = hold_out_test_df.copy()

# Same layout as the train/test data: column 0 is the target, the rest are predictors.
hold_out_matrix = hold_out_dat.values
hold_out_x = hold_out_matrix[:, 1:]
hold_out_y = hold_out_matrix[:, 0]

In [None]:
# Score the hold-out set with the refitted model and build the confusion matrix.
logreg_best_predict = logreg_best.predict(hold_out_x)
logistic_best_cm = confusion_matrix(hold_out_y, logreg_best_predict)

#### Confusion Matrix

In [None]:
# Confusion matrix of the full logistic regression on the hold-out set.
print(logistic_best_cm)

[[ 8 16]
 [13 76]]


#### Accuracy

In [None]:
# Hold-out accuracy of the full logistic regression, as a percentage.
print(accuracy_score(hold_out_y,logreg_best_predict)*100)

74.33628318584071


### Final Model Only Based on Statistically Significant Predictors

In [None]:
# Reduced data sets for the final 4-predictor model. Both frames have `year`
# removed, so positions [1, 3, 4, 6] are actor_weight, writer_weight,
# producer_weight and production_co_weight; position 0 is the target.
train_test_matrix = dat.values
x_full_4_predictors = train_test_matrix[:, [1, 3, 4, 6]]
y_full_4_predictors = train_test_matrix[:, 0]

hold_out_matrix_4 = hold_out_dat.values
hold_out_x_4_predictors = hold_out_matrix_4[:, [1, 3, 4, 6]]
hold_out_y_4_predictors = hold_out_matrix_4[:, 0]

In [None]:
# 4-predictor logistic regression refitted on all pre-2019 rows.
logreg2_best = LogisticRegression()
logreg2_best.fit(x_full_4_predictors, y_full_4_predictors)

LogisticRegression()

#### Apply to Hold Out Data

In [None]:
# Score the hold-out set with the 4-predictor model and build the confusion matrix.
logreg2_best_predict = logreg2_best.predict(hold_out_x_4_predictors)
logistic_best_cm2 = confusion_matrix(hold_out_y_4_predictors, logreg2_best_predict)

#### Confusion Matrix

In [None]:
# Confusion matrix of the 4-predictor logistic regression on the hold-out set.
print(logistic_best_cm2)

[[ 9 15]
 [13 76]]


#### Accuracy

In [None]:
# Hold-out accuracy of the 4-predictor logistic regression, as a percentage.
print(accuracy_score(hold_out_y_4_predictors,logreg2_best_predict)*100)

75.22123893805309


In [None]:
# Final depth-2 Gini tree, refitted on all pre-2019 rows (training + testing combined).
clf_gini_final = DecisionTreeClassifier(
    criterion="gini",
    max_depth=2,
    random_state=123,
)
clf_gini_final.fit(x_full, y_full)

# Draw the refitted tree on a 10x10 figure, labelling splits with the predictor names.
py.figure(figsize=(10, 10))
tree.plot_tree(
    clf_gini_final,
    feature_names=feature_names,
    filled=True,
    rounded=True,
)

<IPython.core.display.Javascript object>

[Text(387.5, 641.6666666666666, 'production_co_weight <= 0.62\ngini = 0.471\nsamples = 3943\nvalue = [1493, 2450]'),
 Text(193.75, 385.0, 'producer_weight <= 0.245\ngini = 0.495\nsamples = 1775\nvalue = [976, 799]'),
 Text(96.875, 128.33333333333326, 'gini = 0.468\nsamples = 1254\nvalue = [785, 469]'),
 Text(290.625, 128.33333333333326, 'gini = 0.464\nsamples = 521\nvalue = [191, 330]'),
 Text(581.25, 385.0, 'genre_weight <= 0.51\ngini = 0.363\nsamples = 2168\nvalue = [517, 1651]'),
 Text(484.375, 128.33333333333326, 'gini = 0.468\nsamples = 588\nvalue = [220, 368]'),
 Text(678.125, 128.33333333333326, 'gini = 0.305\nsamples = 1580\nvalue = [297, 1283]')]

## Apply to the Hold Out Data

In [None]:
# Score the hold-out set with the tree that was refitted on all pre-2019 data.
# Fix: this previously called `clf_gini.predict`, i.e. the earlier tree fitted on
# the training split only, so `clf_gini_final` was trained but never evaluated.
y_final_pred = clf_gini_final.predict(hold_out_x)
print(f"Predicted values: {y_final_pred}")

Predicted values: [0. 1. 0. 1. 1. 0. 1. 0. 1. 1. 0. 1. 0. 0. 1. 0. 1. 1. 1. 0. 0. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 0. 1. 0. 1. 0. 1. 1. 1. 1.
 1. 0. 0. 1. 1. 0. 1. 1. 1. 1. 1. 0. 1. 1. 0. 1. 1. 0. 1. 1. 1. 1. 1. 1.
 0. 1. 1. 1. 0. 1. 1. 0. 1. 1. 1. 1. 1. 1. 0. 1. 0.]


#### Confusion Matrix

In [None]:
# Confusion matrix of the hold-out predictions `y_final_pred` (true labels first).
print(confusion_matrix(hold_out_y,y_final_pred))

[[11 13]
 [16 73]]


#### Accuracy

In [None]:
# Hold-out accuracy of `y_final_pred`, as a percentage.
print(accuracy_score(hold_out_y,y_final_pred)*100)

74.33628318584071


In [None]:
# Per-class precision, recall and F1 of `y_final_pred` on the hold-out set.
print(classification_report(hold_out_y,y_final_pred))

              precision    recall  f1-score   support

         0.0       0.41      0.46      0.43        24
         1.0       0.85      0.82      0.83        89

    accuracy                           0.74       113
   macro avg       0.63      0.64      0.63       113
weighted avg       0.76      0.74      0.75       113

