In [1]:
# Importing libs into the python environment. These functions will be referenced later in the notebook code.

from __future__ import print_function
import os
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import gzip
import numpy as np
import seaborn as sns
import itertools
from IPython.display import Markdown, display
from mpl_toolkits.mplot3d import axes3d, Axes3D  # <-- Note the capitalization!
%matplotlib inline

sns.set()


# Modules from scikit-learn
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict

from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import confusion_matrix, classification_report

from sklearn import decomposition
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

from sklearn.preprocessing import StandardScaler


In [None]:
# Predict whether the movie or the TV show is going to be nominated or win an award

Assuming the role of lead data scientist in 2005, you're presented with a challenge: Amazon Studios wants to produce award-winning films and, therefore, focus the budget on projects with the best chance of winning those awards. Using the actual dataset from IMDb, an Amazon subsidiary, you begin your investigation by looking for movies made between 1990 and 2005.

The IMDb dataset is a feature-rich, comprehensive listing of all films released during that time period; it includes critical data such as cast and crew, synopsis, and other production data. Much of this data is published on the public IMDb.com site, while other features are embargoed for studio analytics.

Your task is to predict which movies will most likely be nominated for an award during the upcoming 2005 awards season by building an awards analysis prediction model. 

This is a notebook in which we read in fields of data from the IMDB database and build a model to make predictions of whether the movie is "nominated" or "Winner".

This data set is being provided to you by permission of IMDb and is subject to the terms of the AWS Digital Training Agreement (available at https://aws.amazon.com/training/digital-training-agreement).  You are expressly prohibited from copying, modifying, selling, exporting or using this data set in any way other than for the purpose of completing this lab.

## Importing the required libraries 

For this exercise, you will use the scikit-learn library to preprocess the models and make predictions. 
(You can add any other libraries that you need below as well.)

# Importing libs into the python environment. These functions will be referenced later in the notebook code.

from __future__ import print_function
import os
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import gzip
import numpy as np
import seaborn as sns
import itertools
from IPython.display import Markdown, display
from mpl_toolkits.mplot3d import axes3d, Axes3D  # <-- Note the capitalization!
%matplotlib inline

sns.set()


# Modules from scikit-learn
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict

from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import confusion_matrix, classification_report

from sklearn import decomposition
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

from sklearn.preprocessing import StandardScaler


## Thinking about your Data

The IMDB database contains a huge amount of information, so it's important to consider what kinds of information are relevant to your prediction. Here's the schema you're using.


<img src="Data-Schema-Capstone.png">

## Cleaning and Visualizing Your Data

Replace **<LabBucketName\>** with the resource name that was provided with your lab account.

import boto3
import botocore 
bucket = 'mlu-data-109836276268-us-west-2-qls-11926192-632c22f599f6ad0c' # Update this to the bucket that was created in your lab account as part of this environment.
# NOTE(review): `prefix` is not referenced by the cells visible here; the pickle
# upload/download cells spell out the 'data/' key prefix literally.
prefix = 'data/'

# boto3 resource handle used by every download/upload call in this notebook.
s3 = boto3.resource('s3') 


Raw data files are in an S3 bucket in your AWS lab account. Six tables will be used (`title_genres`, `title_ratings`, `title_display`, `award_noms`, `title_awards`, `title_releases`). The raw tab-separated value files are downloaded to your Amazon SageMaker instance and imported into DataFrames, where it's easier to work with the structured data. The raw files do not contain a header row, so column labels are assigned at import.

def download_and_display_file(filename, names, title):
    """Fetch one tab-separated table from the lab bucket and preview it.

    The object stored under key ``filename`` in the module-level ``bucket`` is
    downloaded to a local file of the same name, parsed as a latin-1 encoded
    TSV with the supplied column labels, and its first five rows are rendered
    under a bold "<title> Table" heading.

    Parameters
    ----------
    filename : str
        S3 object key; also used as the local destination path.
    names : list of str
        Column labels to assign to the parsed table.
    title : str
        Human-readable table name shown in the heading.

    Returns
    -------
    pandas.DataFrame
        The full parsed table (not just the previewed rows).
    """
    s3.Bucket(bucket).download_file(filename, filename)
    table = pd.read_csv(filename, names=names, sep='\t', encoding='latin1')
    heading = "**" + title + " Table** \n"
    display(Markdown(heading))
    display(table.head(5))
    return table

# Download every raw table and label its columns at import time.
user_info_genres = download_and_display_file(
    'title_genres.tsv',
    ['titleId', 'genres'],
    'Genres',
)
user_info_ratings = download_and_display_file(
    'title_ratings.tsv',
    ["titleId", "rating", "ratingCount", "topRank", "bottomRank", "topRankTV"],
    'Rating',
)
user_info_display = download_and_display_file(
    'title_display.tsv',
    ["titleId", "title", "year", "adult", "runtimeMinutes",
     "imageUri", "imageId", "type", "originalTitle"],
    'Display',
)
user_info_noms = download_and_display_file(
    'award_noms.tsv',
    ["awardId", "eventId", "event", "eventEditionId", "award", "category", "year"],
    'Nomination',
)
user_info_awards = download_and_display_file(
    'title_awards.tsv',
    ["titleId", "awardId", "winner"],
    'Awards',
)
user_info_releases = download_and_display_file(
    'title_releases.tsv',
    ["titleId", "ordering", "date", "region", "premiere",
     "wide", "premiereType", "festival", "attributes"],
    'Releases',
)



The data in table format (.tsv) is read into pandas DataFrames for preprocessing. The data is split between six different files, each loaded into its own DataFrame. Merge the data to obtain a unified DataFrame, which will be used for further data exploration, feature engineering, visualization, and model building. You will use the built-in `merge` function in pandas to merge the DataFrames together. `titleId` is a unique ID that is assigned to each movie title in this dataset. A set of inner joins between `title_genres`, `title_ratings`, `title_display`, and `title_releases` will merge these tables together.

# Chain of inner joins on the shared title key:
# genres -> ratings -> display -> releases.
df_first_merge = user_info_genres.merge(user_info_ratings, how='inner', on='titleId')
df_second_merge = df_first_merge.merge(user_info_display, how='inner', on='titleId')
df_third_merge = df_second_merge.merge(user_info_releases, how='inner', on='titleId')

Rows with a duplicate `titleId` are dropped prior to doing an outer join with the `title_awards` table. Any duplicates that result from that join are dropped as well, along with a few of the data columns. The IMDb dataset is huge, with hundreds of fields; only the fields that could plausibly affect the model output are retained, and the rest are dropped as shown below.

# Keep a single row per title, then attach award records with an outer join so
# titles without any award row are retained.
df_third_merge = df_third_merge.drop_duplicates(subset=['titleId'])
df_fourth_merge = df_third_merge.merge(user_info_awards, how='outer', on='titleId')

# The join can re-introduce repeated titles; collapse them again and discard
# the columns that are not used by the rest of the notebook.
unused_columns = ['imageUri', 'topRank', 'bottomRank', 'topRankTV',
                  'ordering', 'premiereType', 'festival']
df = df_fourth_merge.drop_duplicates(subset=['titleId'])
df = df.drop(columns=unused_columns)

The resulting DataFrame is serialized with the **pickle** library and written to a local file. This file is then saved to the Amazon S3 bucket so the data can be re-used later. You generate the pickle file with `pickle.dump` and then upload it to the lab S3 bucket.

# Serialise the merged DataFrame to a local file using the highest pickle
# protocol available in this interpreter.
with open('df_pickle_nonoms_new.pkl', 'wb') as handle:
    pickle.dump(df, handle, protocol=pickle.HIGHEST_PROTOCOL)
# Copy the local file into the lab bucket under the 'data/' key prefix.
s3.Bucket(bucket).upload_file('df_pickle_nonoms_new.pkl','data/df_pickle_nonoms_new.pkl')

Review the top 30 rows of optimized table.

# Preview the first 30 rows of the merged, column-pruned table.
df.head(30)

Run some basic pandas descriptive statistics on your new DataFrame. 

`df.info()` prints more information about your DataFrame, including the index dtype, column dtypes, non-null counts, and memory usage. `df.describe()` generates descriptive statistics for the numeric columns: count, mean, standard deviation, minimum, quartiles (including the median), and maximum.

# Column dtypes, non-null counts and memory footprint.
df.info()

# Summary statistics for the numeric columns.
df.describe()

Load the Pickle file into a Pandas Dataframe and drop some of the irrelevant features.

# Pull the previously saved table back from S3 so this stage can be re-run
# without rebuilding the merges above.
s3.Bucket(bucket).download_file('data/df_pickle_nonoms_new.pkl', 'df_pickle_nonoms_new.pkl')

# Open the file in a context manager so the handle is always closed.
# NOTE: unpickling executes code embedded in the file; only load pickles that
# this notebook wrote itself.
with open('df_pickle_nonoms_new.pkl', 'rb') as handle:
    df = pickle.load(handle)

# Keep feature films only and drop columns that are not used downstream.
df = df[df.type == 'movie']
df = df.drop(['imageId', 'originalTitle', 'awardId', 'attributes'], axis=1)

The resulting data requires cleaning. The source data has `\N` as the value if the run time of a movie is not known. This null marker will cause issues when trying to plot the data. You will change any `\N` for runtime to a zero.

Similarly, if the film released year is not known, value is set to `\N`. You will change any `\N` for year to a zero in the cell below.

Below will display tables with runtimeMinutes `\N`.

# Display full column width.
# `None` is the supported way to disable truncation (pandas >= 1.0); the legacy
# value -1 is deprecated there and rejected by newer releases. Fall back to -1
# only for old pandas versions whose option validator accepts integers only.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)

# Quick look at the movie-only table.
df.head()

# Rows whose runtime is the raw-data "unknown" marker (the literal string \N).
df[df.runtimeMinutes == r'\N'].head()

Below will display tables with year `\N`.

# Rows whose release year is the raw-data "unknown" marker (the literal string \N).
df[df.year == r'\N'].head()

# Replace the literal '\N' "unknown" marker with the string '0' in both columns.
# This is done column-wise instead of looping with Series.iteritems(), which
# was removed in pandas 2.0 and writes one row at a time.
# replace() does an exact value match here, so the backslash is not treated
# as a regular-expression escape.
for column in ('runtimeMinutes', 'year'):
    df[column] = df[column].replace(r'\N', '0')


A separate column called `nomination_winner` is added to the DataFrame. If the `winner` column has a value of either `0.0` or `1.0`, the title is assumed to have been nominated; otherwise, the title has not been nominated.

# A title counts as nominated (1) when its `winner` value is 0.0 or 1.0; any
# other value, including a missing one, yields 0.
# Vectorised so it does not depend on Series.iteritems(), which was removed
# in pandas 2.0.
df['nomination_winner'] = df['winner'].isin([0.0, 1.0]).astype(int)

You will also use the `fillna` function to fill any missing values in the `year` and `runtimeMinutes` columns.

# Runtime: cast to float first (NaN is representable), then fill the gaps.
df['runtimeMinutes'] = df['runtimeMinutes'].astype(float).fillna(0.0)
# Year: fill the gaps BEFORE casting to int. An integer column cannot hold NaN,
# so casting first raises on missing values and makes the fillna a no-op.
df['year'] = df['year'].fillna(0).astype(int)

Some titles that are included in your dataset have run times that don't seem to fit. You are going to limit this data to titles with a runtime longer than 1 hour (60 minutes) and shorter than 12 hours (720 minutes). You are also going to focus on movies with a `ratingCount` of less than 20,000.

Below, you will see sample data with `runtimeMinutes` of less than 60 minutes.

# Sample of titles shorter than one hour.
df[(df.runtimeMinutes) < 60].head()

Below, you will see sample data with `runtimeMinutes` of more than 720 minutes.

# Sample of titles longer than twelve hours.
df[(df.runtimeMinutes) > 720].head()

# Keep titles running strictly between 60 and 720 minutes that have fewer than
# 20,000 ratings.
runtime_in_range = (df.runtimeMinutes > 60) & (df.runtimeMinutes < 720)
below_rating_cap = df.ratingCount < 20000
df = df[runtime_in_range & below_rating_cap]

**Review the top 30 rows of the optimized data.**

# First 30 rows after the runtime and rating-count filters.
df.head(30)

# Separator lines frame the df.info() text output; df.describe() is the cell's
# last expression and is rendered as a table.
print('='*50)
df.info()
print('='*50)
df.describe()

**Below, you are saving dataframe for year 2005 as df_2005 for future use.**

# Snapshot of the 2005 rows, taken before the region and winner filters that
# later cells apply to `df`.
df_2005 = df[(df.year) == 2005]
df_2005.head()

## Feature Selection and Feature Engineering
<a id='feature selection'></a>

**Feature Selection Box:** The toggle switches for various features and settings are below. A value of 0 disables the feature, and a value of 1 enables the feature.

# Selection of different features

feature_winner = 0          # 1: keep only nominated titles and predict which of them won.
                            # 0: predict whether a title was nominated at all.
    
feature_pca_2D = 0          # 1: run a 2-component PCA and plot it (ignored when feature_pca_3D is 1).
feature_pca_3D = 1          # 1: run a 3-component PCA and plot it (takes precedence over feature_pca_2D).

# The three flags below choose which release columns are added to the model
# inputs next to rating, ratingCount and runtimeMinutes. They are checked in
# the order premiere_wide, premiere, wide; the first one that is set wins.
feature_premiere = 0        # 1: add the `premiere` column.
feature_wide = 0            # 1: add the `wide` column.
feature_premiere_wide = 1   # 1: add both the `premiere` and `wide` columns.


# Normalize features: 1 standardises the feature matrix with StandardScaler.
normalize_flag = 0

# Enable plotting: 1 renders the exploratory plots.
plot_flag = 1                 

# Feature Selection
US_flag = 1                 # 1: limit the analysis to rows whose region is 'US'.

# Model Selection flags: 1 runs the corresponding model cell.
LR_flag = 1
DT_flag = 1
RF_flag = 1
GB_flag = 1
NN_flag = 1
SVM_flag = 1


**Clean up and constrain the data:** 
- Explore by limiting your data to only **US** releases. 
- Explore by choosing whether to use **nomination** or **winner** to be output target.

if plot_flag:
    # Share of titles per release region. Regions whose share does not exceed
    # the threshold are pooled into a single "other" bar.
    threshold = 0.02
    prob = df['region'].value_counts(normalize=True)
    mask = prob.gt(threshold)
    tail_prob = prob[~mask].sum()
    prob = prob[mask]
    prob['other'] = tail_prob
    prob.plot.bar()
    plt.xticks(rotation=25)
    plt.show()
    plt.close()



if US_flag:
    # Limit the analysis to US releases.
    df = df[df['region'] == 'US']


# This flag is set if "winner" is chosen as output label

if feature_winner:
    # Restrict to nominated titles and relabel the target: 1 = won, 0 = nominated only.
    # The explicit copy keeps the column assignment from writing into a filtered view.
    # Vectorised instead of looping over Series.iteritems() (removed in pandas 2.0).
    df = df[df['nomination_winner'] == 1].copy()
    df['nomination_winner'] = (df['winner'] == 1.0).astype(int)





You are going to graph the features to better understand how they might relate to each other. The cell below defines the plotting functions that will be called later in the notebook.

#plot Confusion Matrix function

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print a confusion matrix and draw it as an annotated heat map.

    cm        -- 2-D array of counts, indexed [true label, predicted label].
    classes   -- tick labels, one per class, used on both axes.
    normalize -- when True, each row is divided by its row total before
                 printing/plotting and cells are shown with two decimals.
    title     -- figure title.
    cmap      -- matplotlib colormap for the heat map.
    """
    if not normalize:
        print('Confusion matrix, without normalization')
    else:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        cm = cm.astype('float') / row_totals
        print("Normalized confusion matrix")

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    positions = np.arange(len(classes))
    plt.xticks(positions, classes, rotation=45)
    plt.yticks(positions, classes)

    # Cell text switches to white on the darker half of the colour scale so
    # that it stays readable.
    cell_format = '.2f' if normalize else 'd'
    midpoint = cm.max() / 2.
    for row in range(cm.shape[0]):
        for col in range(cm.shape[1]):
            value = cm[row, col]
            text_colour = "white" if value > midpoint else "black"
            plt.text(col, row, format(value, cell_format),
                     horizontalalignment="center",
                     color=text_colour)

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()
    plt.show()
    plt.close()


# end Confusion Matrix function


# ROC Curve Plotting function

def plot_roc_curve(fpr, tpr, roc_auc=None):
    """Plot a ROC curve together with the chance diagonal.

    Parameters
    ----------
    fpr, tpr : array-like
        False-positive and true-positive rates, e.g. as returned by
        ``sklearn.metrics.roc_curve``.
    roc_auc : float, optional
        Area under the curve to show in the legend. When omitted it is
        computed from ``fpr`` and ``tpr`` with ``sklearn.metrics.auc``, so the
        function does not depend on a notebook-level ``roc_auc`` variable
        having been set by an earlier cell.
    """
    if roc_auc is None:
        roc_auc = auc(fpr, tpr)

    plt.title('Receiver Operating Characteristic')
    plt.plot(fpr, tpr, 'b',
             label='AUC = %0.2f' % roc_auc)
    plt.legend(loc='lower right')
    plt.plot([0, 1], [0, 1], 'r--')  # reference line of a random classifier
    plt.xlim([-0.1, 1.2])
    plt.ylim([-0.1, 1.2])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()
    plt.close()

#end ROC plotting function


# Precision Recall (PR) function

def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
    """Plot precision and recall against the decision threshold.

    The final element of ``precisions`` and of ``recalls`` is dropped before
    plotting, so both are expected to be one element longer than
    ``thresholds``.
    """
    curves = (
        (precisions, "b--", "Precision"),
        (recalls, "g-", "Recall"),
    )
    for values, line_style, legend_name in curves:
        plt.plot(thresholds, values[:-1], line_style, label=legend_name, linewidth=2)
    plt.xlabel("Threshold", fontsize=16)
    plt.legend(loc="upper left", fontsize=16)
    plt.ylim([0, 1])
    plt.show()
    plt.close()

#end PR function
 

Below you will plot histograms of different features such as `rating` and `ratingCount`. Review the effect of the `premiere` flag on the output. Also look at the cross-correlation between the different features as well as the output. You will see that `ratingCount` has the largest effect on the output `nomination_winner` (which could be `nomination` or `winner`, depending on the feature selection). 


# Plot rating vs rating count & Histogram of Rating

if plot_flag:
    
    # Distribution of the rating values.
    df['rating'].hist()
    plt.title('Histogram of Ratings')
    plt.xlabel('Rating')
    plt.ylabel('Count')
    plt.show()
    plt.close()
    
    
    # Distribution of the rating counts, restricted to titles with fewer than
    # 10,000 ratings for this plot only.
    df_plot = df[(df.ratingCount) < 10000]
    df_plot['ratingCount'].hist(bins=100)
    plt.title('Histogram of Rating Count')
    plt.xlabel('Rating Count')
    plt.ylabel('Count')
    plt.show()
    plt.close()

    # Two histograms drawn on the same axes: the target label over all rows,
    # and the `premiere` flag over the positive-label rows only.
    # NOTE(review): plot_hist0 is assigned but not used in this cell.
    plot_hist1 = df[(df.nomination_winner) == 1]
    plot_hist0 = df[(df.nomination_winner) == 0]
    plt.hist(df.nomination_winner)
    plt.hist(plot_hist1.premiere)
    plt.xlabel('Nomination/Premiere')
    plt.ylabel('Count')
    plt.show()
    plt.close()

 
    #Plotting correlation  

    
    # Pairwise scatter/histogram grid over the first 5000 rows.
    sns.pairplot(df[['nomination_winner', 'ratingCount','rating', 'runtimeMinutes' ]].head(5000));
    plt.show()
    plt.close()


**Experiment with different features and look at the roc scores in each model.** Run steps through **Feature Selection Box.**

<a href='#feature selection'>**Link to the feature selection box**</a>


# Base inputs, plus whichever release columns the feature flags ask for.
# The flags are checked in order; only the first one that is set applies.
feature_columns = ['rating', 'ratingCount', 'runtimeMinutes']
if feature_premiere_wide:
    feature_columns += ['premiere', 'wide']
elif feature_premiere:
    feature_columns += ['premiere']
elif feature_wide:
    feature_columns += ['wide']

X_train = df[feature_columns]
Y_train = df['nomination_winner']

if normalize_flag:
    # Standardise every feature column; the result is a NumPy array.
    X_train = StandardScaler().fit_transform(X_train)


Below you will experiment with **Principal Component Analysis** and reduce the feature set to three main components.

#PCA


# Pick the projection requested in the feature selection box; the 3-D variant
# wins when both flags are set. Each variant plots a different number of rows.
if feature_pca_3D:
    n_components, n_plotted, figsize = 3, 2000, (10, 6)
elif feature_pca_2D:
    n_components, n_plotted, figsize = 2, 50, (5, 5)
else:
    n_components = 0

if n_components:
    component_names = ['principal component %d' % k
                       for k in range(1, n_components + 1)]

    pca = decomposition.PCA(n_components=n_components)
    principalComponents = pca.fit_transform(X_train)
    principalDf = pd.DataFrame(data=principalComponents, columns=component_names)

    # principalDf carries a fresh 0..n-1 index while Y_train still carries the
    # row labels of df. Concatenating them as-is aligns on those labels and
    # pairs components with the wrong (or missing) targets, so the target is
    # re-indexed positionally first.
    finalDf = pd.concat([principalDf, Y_train.reset_index(drop=True)], axis=1)
    finalDf = finalDf.head(n_plotted)
    targets = [1, 0]
    colors = ['r', 'g']

    fig = plt.figure(figsize=figsize)
    if n_components == 3:
        ax = fig.add_subplot(111, projection='3d')
    else:
        ax = fig.add_subplot(111)
    ax.set_xlabel('Principal Component 1', fontsize=8)
    ax.set_ylabel('Principal Component 2', fontsize=8)
    if n_components == 3:
        ax.set_zlabel('Principal Component 3', fontsize=8)
    ax.set_title('%d component PCA' % n_components, fontsize=15)

    # One scatter per target class: red = positive label, green = negative.
    for target, color in zip(targets, colors):
        indicesToKeep = finalDf['nomination_winner'] == target
        coordinates = [finalDf.loc[indicesToKeep, name] for name in component_names]
        ax.scatter(*coordinates, c=color, linewidth=0.5)

    ax.legend(targets)
    plt.show()
    plt.close()

    X_train_pca = pca.transform(X_train)

    # In order to use the PCA components for testing, uncomment below line and run this cell.
    #X_train = X_train_pca
    
  

#PCA ends

You will notice during the analysis that the "nomination/winner" 3D points (Red dots) are within the clusters of the non "nomination/winner" 3D points (Green dots). This suggests it is hard to classify your data in 3D model using PCA analysis.

Split the dataset into training and test data sets.


# Hold out 25% of the rows for testing; random_state=0 fixes the shuffle so the
# split is the same on every run.
x_train, x_test, y_train, y_test = train_test_split(X_train, Y_train, test_size=0.25, random_state=0)
 

## Algorithm Comparison and Selection

Now you will build models using Logistic Regression, Support Vector Machine (SVM), Random Forest (RF), Decision Tree (DT), Gradient Boosting (GB) and Multi-Layer Perceptron (NN) classification schemes. The various scores such as Precision, Recall, ROC, F1, Accuracy are measured. You will also plot the ROC curves for the different models. 

def evaluate_classifier(name, clf, roc_label, x_train, y_train, x_test, y_test,
                        roc_file=None):
    """Fit ``clf`` on the training split and report how it does on the test split.

    The same fitted model produces every number and plot: accuracy, ROC AUC,
    classification report, confusion matrix and ROC curve. The model is never
    fitted on the test split, and the ROC AUC is computed from the predicted
    probability of the positive class rather than from hard 0/1 predictions.

    Parameters
    ----------
    name : str
        Short model tag used in the printed accuracy line (e.g. ``"LR"``).
    clf : estimator
        Unfitted scikit-learn classifier exposing ``fit``, ``predict`` and
        ``predict_proba``.
    roc_label : str
        Legend text for the ROC curve.
    x_train, y_train, x_test, y_test
        Train/test split to use.
    roc_file : str, optional
        When given, the ROC figure is also saved under this file name.

    Returns
    -------
    (y_pred, report)
        Predicted test labels and the text classification report.
    """
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    # Column 1 is the probability of the second entry of clf.classes_
    # (assumes the labels are binary 0/1, so that is the positive class).
    y_score = clf.predict_proba(x_test)[:, 1]

    print(name + " Accuracy", accuracy_score(y_test, y_pred))
    roc = roc_auc_score(y_test, y_score)
    print("roc score", roc)
    report = classification_report(y_test, y_pred)
    print("Classification Report")
    print("=" * 50)
    print(report)

    # Plot confusion matrix
    cnf_matrix = confusion_matrix(y_test, y_pred)
    np.set_printoptions(precision=2)
    plot_confusion_matrix(cnf_matrix, classes=['Class 0', 'Class 1'],
                          title='Confusion matrix, without normalization')

    # ROC curve of the fitted model on the held-out data
    fpr, tpr, _ = roc_curve(y_test, y_score)
    plt.figure()
    plt.plot(fpr, tpr, label='%s (AUC = %0.2f)' % (roc_label, roc))
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')
    plt.legend(loc="lower right")
    if roc_file:
        plt.savefig(roc_file)
    plt.show()

    return y_pred, report


if LR_flag:
    # Logistic Regression Model
    logisticRegr = LogisticRegression(random_state=0, solver='lbfgs',
                                      multi_class='multinomial', max_iter=1000)
    y_test_pred_LR, LR_CR = evaluate_classifier(
        "LR", logisticRegr, 'Logistic Regression',
        x_train, y_train, x_test, y_test)


if SVM_flag:
    # NOTE(review): with loss="log" SGDClassifier trains a logistic-regression
    # model by stochastic gradient descent; a linear SVM would be loss="hinge",
    # which has no predict_proba. The "SVM" label is kept so the reports line up
    # with the rest of the notebook. scikit-learn >= 1.1 spells this loss
    # "log_loss" - confirm against the installed version.
    sgd_clf = SGDClassifier(random_state=0, loss="log", max_iter=1000, tol=3)
    y_test_pred_SVC, SVM_CR = evaluate_classifier(
        "SVM", sgd_clf, 'SVM',
        x_train, y_train, x_test, y_test)


if DT_flag:
    # Decision Tree Model
    DT = DecisionTreeClassifier(random_state=42)
    y_test_pred_DT, DT_CR = evaluate_classifier(
        "DT", DT, 'Decision Tree',
        x_train, y_train, x_test, y_test)


if RF_flag:
    # Ensemble Random Forest Model
    RF = RandomForestClassifier(random_state=42, n_estimators=100)
    y_test_pred_RF, RF_CR = evaluate_classifier(
        "RF", RF, 'Random Forest',
        x_train, y_train, x_test, y_test)


if GB_flag:
    # Gradient Boosting Model
    gb_clf = GradientBoostingClassifier(n_estimators=20, learning_rate=0.5,
                                        max_features=2, max_depth=2, random_state=0)
    y_test_pred_gb, GB_CR = evaluate_classifier(
        "GB", gb_clf, 'Gradient Boosting',
        x_train, y_train, x_test, y_test)


if NN_flag:
    # Multi-Layer Perceptron Model; random_state makes the weight
    # initialisation reproducible across runs.
    mlp = MLPClassifier(hidden_layer_sizes=(20, 20, 20), max_iter=1000, random_state=0)
    y_test_pred_NN, NN_CR = evaluate_classifier(
        "NN", mlp, 'MLP',
        x_train, y_train, x_test, y_test,
        roc_file='Log_ROC')

Below will display classification report of respective models.

def display_report(model, report):
    """Render a heading for ``model`` followed by its text classification report."""
    display(Markdown(model+" **Model Classification Report** \n"))
    print('\n',report)


# A report only exists for models whose flag was enabled in the feature
# selection box, so list only those; referencing the report of a disabled
# model would raise NameError.
_report_sources = [
    ('LR', LR_flag, 'LR_CR'),
    ('SVM', SVM_flag, 'SVM_CR'),
    ('DT', DT_flag, 'DT_CR'),
    ('RF', RF_flag, 'RF_CR'),
    ('GB', GB_flag, 'GB_CR'),
    ('NN', NN_flag, 'NN_CR'),
]
model_list = [model for model, enabled, _ in _report_sources if enabled]
report_list = [globals()[variable] for _, enabled, variable in _report_sources if enabled]

for (model,report) in zip(model_list, report_list):
    display_report(model,report)
  

# Question
Based on the **Precision, Recall and F1 metrics**, which **model** do you think best fits our prediction task?

**Now run 2005 data against your identified model.**

# Model Test

Run the code below to review the **2005 data.**

# Preview the first 30 rows of the 2005 data.
df_2005.head(n=30)

Select the X_train and Y_train data based on the features chosen in the feature selection box. You can review your selection by clicking the link below.

<a href='#feature selection'>**Link to the feature selection box**</a>

# Start from the three always-used columns and extend the list according to
# the feature-selection flags. Precedence is: both extras, then 'premiere'
# only, then 'wide' only, then neither.
selected_columns = ['rating', 'ratingCount', 'runtimeMinutes']
if feature_premiere_wide:
    selected_columns = selected_columns + ['premiere', 'wide']
elif feature_premiere:
    selected_columns = selected_columns + ['premiere']
elif feature_wide:
    selected_columns = selected_columns + ['wide']

X_train = df_2005[selected_columns]

# Target column: the nomination/winner label.
Y_train = df_2005['nomination_winner']

# Optionally standardise the selected features with a freshly fitted scaler.
if normalize_flag:
    X_train = StandardScaler().fit_transform(X_train)

Split the dataset into training and test data sets.

# Hold out 25% of the rows as the test split; random_state=0 fixes the shuffle
# so the same split is produced on every run.
x_train, x_test, y_train, y_test = train_test_split(
    X_train,
    Y_train,
    test_size=0.25,
    random_state=0,
)

**Run the selected model.** Copy the code from the cell of the model you judged best in the previous step, and paste it into the cell below.


if DT_flag:
    # Decision Tree Model

    # Seeded tree fitted on the training split of the 2005 data.
    DT = DecisionTreeClassifier(random_state=42)
    DT.fit(x_train, y_train)
    cross_val_score(DT, x_train, y_train, cv=3, scoring="accuracy")
    # Out-of-fold predictions on the test split; these feed the accuracy,
    # classification report and confusion matrix below.
    y_test_pred_DT = cross_val_predict(DT, x_test, y_test, cv=3)
    predictions = DT.predict(x_test)
    score = DT.score(x_test, y_test)
    y_test_pred = y_test_pred_DT
    print("DT Accuracy",accuracy_score(y_test, y_test_pred))
    roc = roc_auc_score(y_test, y_test_pred)
    print("roc score", roc)
    DT_CR = classification_report(y_test, y_test_pred)
    print("Classification Report")
    print("="*50)
    print(DT_CR)
    # Plot   confusion matrix
    cnf_matrix = confusion_matrix(y_test, y_test_pred)
    np.set_printoptions(precision=2)

    plot_confusion_matrix(cnf_matrix, classes=['Class 0', 'Class 1'],
                      title='Confusion matrix, without normalization')

    # ROC curve of the fitted tree, built from the class-1 probabilities
    # (column 1 of predict_proba) against the true test labels. The area
    # under that same curve is shown in the legend.
    fpr, tpr, thresholds = roc_curve(y_test, DT.predict_proba(x_test)[:,1])
    roc_auc = auc(fpr, tpr)
    plt.figure()
    plt.plot(fpr, tpr, label='Decision Tree (area = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1],'r--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')
    plt.legend(loc="lower right")

    plt.show()




In [None]:
def db2_credentials(username=None, password=None):
    """Build the Db2 connection settings without embedding secrets in the notebook.

    SECURITY: this cell used to contain the account's user id and password in
    clear text. Treat that password as leaked and rotate it; never paste
    credentials into a notebook that is saved or shared.

    username -- Db2 user id; read from the DB2_USERNAME environment variable
                when omitted.
    password -- Db2 password; read from the DB2_PASSWORD environment variable
                when omitted.

    Returns a dict with the same keys as the service-credentials document
    (db, dsn, host, hostname, https_url, jdbcurl, parameters, password, port,
    ssldsn, ssljdbcurl, uri, username).

    Raises KeyError when a value is omitted and the corresponding environment
    variable is not set.
    """
    from urllib.parse import quote  # only needed to escape the URI user-info part

    if username is None:
        username = os.environ["DB2_USERNAME"]
    if password is None:
        password = os.environ["DB2_PASSWORD"]

    host = "dashdb-txn-sbox-yp-dal09-03.services.dal.bluemix.net"
    database = "BLUDB"
    port = 50000
    ssl_port = 50001
    dsn_template = "DATABASE={db};HOSTNAME={host};PORT={port};PROTOCOL=TCPIP;UID={uid};PWD={pwd};"

    return {
        "db": database,
        "dsn": dsn_template.format(db=database, host=host, port=port, uid=username, pwd=password),
        "host": host,
        "hostname": host,
        "https_url": "https://" + host,
        "jdbcurl": "jdbc:db2://{}:{}/{}".format(host, port, database),
        "parameters": {},
        "password": password,
        "port": port,
        "ssldsn": dsn_template.format(db=database, host=host, port=ssl_port, uid=username, pwd=password)
                  + "Security=SSL;",
        "ssljdbcurl": "jdbc:db2://{}:{}/{}:sslConnection=true;".format(host, ssl_port, database),
        # Characters such as '^' are not allowed raw in a URI, so percent-encode them.
        "uri": "db2://{}:{}@{}:{}/{}".format(
            quote(username, safe=""), quote(password, safe=""), host, port, database
        ),
        "username": username,
    }

In [None]:
-- Full contents of each table.
SELECT * FROM EMPLOYEES;
SELECT * FROM DEPARTMENTS;
SELECT * FROM JOBS;
SELECT * FROM JOB_HISTORY;
SELECT * FROM LOCATIONS;

-- Employees whose address ends with 'Elgin,IL'.
SELECT EMP_ID, F_NAME
FROM EMPLOYEES
WHERE ADDRESS LIKE '%Elgin,IL';

-- Employees whose B_DATE starts with '197'.
SELECT EMP_ID, F_NAME
FROM EMPLOYEES
WHERE B_DATE LIKE '197%';

-- Employees with a salary from 60000 to 70000 (BETWEEN is inclusive).
SELECT EMP_ID, F_NAME
FROM EMPLOYEES
WHERE SALARY BETWEEN 60000 AND 70000;

-- Employees ordered by department id.
SELECT EMP_ID, F_NAME, DEP_ID
FROM EMPLOYEES
ORDER BY DEP_ID;

-- Head count per department.
SELECT DEP_ID, COUNT(*) AS NUM_EMPL
FROM EMPLOYEES
GROUP BY DEP_ID;

-- Head count and average salary per department.
SELECT DEP_ID, COUNT(*) AS NUM_EMPLOYEES, AVG(SALARY) AS AVG_SALARY
FROM EMPLOYEES
GROUP BY DEP_ID;

-- Same, ordered by average salary.
SELECT DEP_ID, COUNT(*) AS NUM_EMPLOYEES, AVG(SALARY) AS AVG_SALARY
FROM EMPLOYEES
GROUP BY DEP_ID
ORDER BY AVG_SALARY;

-- Same, restricted to departments with fewer than 4 employees.
SELECT DEP_ID, COUNT(*) AS NUM_EMPLOYEES, AVG(SALARY) AS AVG_SALARY
FROM EMPLOYEES
GROUP BY DEP_ID
HAVING COUNT(*) < 4;

-- Employee names with their department name, by department then last name descending.
SELECT D.DEP_NAME, E.F_NAME, E.L_NAME
FROM EMPLOYEES AS E
INNER JOIN DEPARTMENTS AS D
    ON E.DEP_ID = D.DEPT_ID_DEP
ORDER BY D.DEP_NAME, E.L_NAME DESC;

In [None]:
%sql select count(*) from CHICAGO_CRIME_DATA
%sql select * from CHICAGO_CRIME_DATA limit 10
%sql select count(*) from CHICAGO_CRIME_DATA where ARREST = True
%sql select PRIMARY_TYPE, LOCATION_DESCRIPTION from CHICAGO_CRIME_DATA where LOCATION_DESCRIPTION = 'GAS STATION'
%sql select COMMUNITY_AREA_NAME from CENCUS_DATA where COMMUNITY_AREA_NAME like 'B%'
%sql select NAME_OF_SCHOOL, COMMUNITY_AREA_NUMBER, HEALTHY_SCHOOL_CERTIFIED \
from CHICAGO_PUBLIC_SCHOOLS where (COMMUNITY_AREA_NUMBER between 10 and 15 and HEALTHY_SCHOOL_CERTIFIED = 'Yes')
%sql select avg(SAFETY_SCORE) from CHICAGO_PUBLIC_SCHOOLS
%sql select COMMUNITY_AREA_NAME, COLLEGE_ENROLLMENT from CHICAGO_PUBLIC_SCHOOLS order by COLLEGE_ENROLLMENT desc limit 5
%sql select COMMUNITY_AREA_NAME, SAFETY_SCORE from CHICAGO_PUBLIC_SCHOOLS where SAFETY_SCORE = (select min(SAFETY_SCORE) from CHICAGO_PUBLIC_SCHOOLS)
%sql SELECT c.PER_CAPITA_INCOME, c.COMMUNITY_AREA_NAME \
    FROM CENCUS_DATA c, CHICAGO_PUBLIC_SCHOOLS s \
    WHERE s.COMMUNITY_AREA_NUMBER  = c.COMMUNITY_AREA_NUMBER AND s.SAFETY_SCORE = '1'

In [1]:
```js
// Javascript code with syntax highlighting.
var fun = function lang(l) {
  dateformat.i18n = require('./lang/' + l)
  return true;
}
```

```ruby
# Ruby code with syntax highlighting
GitHubPages::Dependencies.gems.each do |gem, version|
  s.add_dependency(gem, "= #{version}")
end
```

#### Header 4

*   This is an unordered list following a header.
*   This is an unordered list following a header.
*   This is an unordered list following a header.

##### Header 5

1.  This is an ordered list following a header.
2.  This is an ordered list following a header.
3.  This is an ordered list following a header.

###### Header 6

| head1        | head two          | three |
|:-------------|:------------------|:------|
| ok           | good swedish fish | nice  |
| out of stock | good and plenty   | nice  |
| ok           | good `oreos`      | hmm   |
| ok           | good `zoute` drop | yumm  |

### There's a horizontal rule below this.

* * *

### Here is an unordered list:

*   Item foo
*   Item bar
*   Item baz
*   Item zip

### And an ordered list:

1.  Item one
1.  Item two
1.  Item three
1.  Item four

### And a nested list:

- level 1 item
  - level 2 item
  - level 2 item
    - level 3 item
    - level 3 item
- level 1 item
  - level 2 item
  - level 2 item
  - level 2 item
- level 1 item
  - level 2 item
  - level 2 item
- level 1 item

### Small image

![Octocat](https://github.githubassets.com/images/icons/emoji/octocat.png)

### Large image

![Branching](https://guides.github.com/activities/hello-world/branching.png)


### Definition lists can be used with HTML syntax.

<dl>
<dt>Name</dt>
<dd>Godzilla</dd>
<dt>Born</dt>
<dd>1952</dd>
<dt>Birthplace</dt>
<dd>Japan</dd>
<dt>Color</dt>
<dd>Green</dd>
</dl>

```
Long, single-line code blocks should not wrap. They should horizontally scroll if they are too long. This line should be long enough to demonstrate this.
```

```
The final element.
```


SyntaxError: invalid syntax (<ipython-input-1-99a61d151ea2>, line 1)

In [3]:
conda install -c conda-forge jupyterlab-git

Collecting package metadata (current_repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /Users/Cho/opt/anaconda3

  added / updated specs:
    - jupyterlab-git


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    certifi-2019.11.28         |           py37_0         148 KB  conda-forge
    conda-4.8.3                |   py37hc8dfbb8_0         3.0 MB  conda-forge
    gitdb-4.0.2                |             py_0          46 KB  conda-forge
    gitpython-3.1.0            |             py_0         335 KB  conda-forge
    jupyterlab-git-0.9.0       |             py_0         145 KB  conda-forge
    nbdime-2.0.0               |   py37hc8dfbb8_0         4.6 MB  conda-forge
    nodejs-11.14.0             |       h6de7cb9_1        15.4 MB  conda-forge
    python_abi-3.7             |          1_cp37m           4 KB  conda-forge
    smmap-3.0.1              

In [4]:
conda install -c conda-forge/label/cf201901 jupyterlab-git

Collecting package metadata (current_repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /Users/Cho/opt/anaconda3

  added / updated specs:
    - jupyterlab-git


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    certifi-2019.11.28         |           py37_0         153 KB
    jupyterlab-git-0.4.4       |             py_0          60 KB  conda-forge/label/cf201901
    ------------------------------------------------------------
                                           Total:         213 KB

The following packages will be SUPERSEDED by a higher-priority channel:

  certifi                                       conda-forge --> pkgs/main
  jupyterlab-git     conda-forge::jupyterlab-git-0.9.0-py_0 --> conda-forge/label/cf201901::jupyterlab-git-0.4.4-py_0



Downloading and Extracting Packages
jupyterlab-git-0.4.4 | 60 KB     | #########################

In [2]:
jupyter labextension install @jupyterlab/github

SyntaxError: invalid syntax (<ipython-input-2-6331377fa060>, line 1)

In [1]:
jupyter serverextension enable --sys-prefix jupyterlab_github

SyntaxError: invalid syntax (<ipython-input-1-5a99dc44ab60>, line 1)

In [6]:
pip install jupyterlab-git
jupyter lab build

SyntaxError: invalid syntax (<ipython-input-6-1cde7c885ebd>, line 1)

In [8]:
jupyter serverextension enable --py jupyterlab_git

SyntaxError: invalid syntax (<ipython-input-8-6c2e43bd9c6e>, line 1)

In [1]:
pip install --upgrade jupyterlab-git

Collecting jupyterlab-git
  Using cached jupyterlab_git-0.9.0-py3-none-any.whl (144 kB)
Installing collected packages: jupyterlab-git
  Attempting uninstall: jupyterlab-git
    Found existing installation: jupyterlab-git 0.4.4
    Uninstalling jupyterlab-git-0.4.4:
      Successfully uninstalled jupyterlab-git-0.4.4
Successfully installed jupyterlab-git-0.9.0
Note: you may need to restart the kernel to use updated packages.


In [5]:
jupyter serverextension enable --py juoyterlab_git

SyntaxError: invalid syntax (<ipython-input-5-68ccd730df68>, line 1)

In [4]:
pip install jupyter_contrib_nbextensions

Collecting jupyter_contrib_nbextensions
  Downloading jupyter_contrib_nbextensions-0.5.1-py2.py3-none-any.whl (20.9 MB)
[K     |████████████████████████████████| 20.9 MB 2.3 MB/s eta 0:00:01
[?25hCollecting jupyter-latex-envs>=1.3.8
  Downloading jupyter_latex_envs-1.4.6.tar.gz (861 kB)
[K     |████████████████████████████████| 861 kB 32.0 MB/s eta 0:00:01
[?25hCollecting jupyter-nbextensions-configurator>=0.4.0
  Downloading jupyter_nbextensions_configurator-0.4.1.tar.gz (479 kB)
[K     |████████████████████████████████| 479 kB 29.2 MB/s eta 0:00:01
Collecting jupyter-highlight-selected-word>=0.1.1
  Downloading jupyter_highlight_selected_word-0.2.0-py2.py3-none-any.whl (11 kB)
Collecting jupyter-contrib-core>=0.3.3
  Downloading jupyter_contrib_core-0.3.3-py2.py3-none-any.whl (18 kB)
Building wheels for collected packages: jupyter-latex-envs, jupyter-nbextensions-configurator
  Building wheel for jupyter-latex-envs (setup.py) ... [?25ldone
[?25h  Created wheel for jupyter-late

In [6]:
jupyter serverextension list

SyntaxError: invalid syntax (<ipython-input-6-9683ae3d70f1>, line 1)

In [9]:
jupyter serverextension enable --py jupyterlab_git --sys-prefix

SyntaxError: invalid syntax (<ipython-input-9-7a1db07b31e6>, line 1)