# Classifier tests

This notebook tests how well various scikit-learn classifiers perform at classifying text by LiverTox rating.

https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html

https://towardsdatascience.com/https-medium-com-chaturangarajapakshe-text-classification-with-transformer-models-d370944b50ca

https://stackabuse.com/text-classification-with-python-and-scikit-learn/


https://buhrmann.github.io/tfidf-analysis.html

https://stackabuse.com/understanding-roc-curves-with-python/

https://stackabuse.com/overview-of-classification-methods-in-python-with-scikit-learn/

About lemmatization: https://www.machinelearningplus.com/nlp/lemmatization-examples-python/
https://stackoverflow.com/questions/25534214/nltk-wordnet-lemmatizer-shouldnt-it-lemmatize-all-inflections-of-a-word
https://stackoverflow.com/questions/28475620/wordnet-lemmatizer-in-nltk-is-not-working-for-adverbs

In [1]:
import pandas as pd
from pandas import read_csv
from tqdm.notebook import trange, tqdm
import ipywidgets 
import widgetsnbextension
import time
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import pickle

In [2]:
## Function to clean up the text for analysis
def clean_up_text(data_set):
    """Add a normalised ``clean_text`` column derived from ``section_text``.

    Steps, applied in order: replace every non-word character with a space,
    collapse any single ASCII letter surrounded by whitespace into one space,
    and lower-case the result.

    Args:
        data_set: pandas DataFrame with a string column ``section_text``.

    Returns:
        The same DataFrame object, modified in place: ``clean_text`` is added
        (or overwritten); ``section_text`` is left untouched.
    """
    # regex=True is passed explicitly: pandas >= 2.0 defaults to literal
    # matching, which would silently turn every step below into a no-op.
    cleaned = data_set['section_text'].str.replace(r'\W', ' ', regex=True)
    cleaned = cleaned.str.replace(r'\s+[a-zA-Z]\s+', ' ', regex=True)
    # NOTE(review): '\^' matches a literal caret, which the first step has
    # already removed, so this substitution never fires. r'^[a-zA-Z]\s+'
    # (leading single letter) was probably intended; the pattern is kept
    # as-is so previously generated clean text stays reproducible.
    cleaned = cleaned.str.replace(r'\^[a-zA-Z]\s+', ' ', regex=True)
    data_set['clean_text'] = cleaned.str.lower()
    return(data_set)

## Generate Training and test sets from just the LiverTox corpus

In [9]:
## Pick which text type to load (other available values kept for reference)
#txttype = 'WARNING_PRECAUTION'
#txttype = 'OVERDOSE'
#txttype = 'ADVERSE REACT'
#txttype = 'INTERACTION'
#txttype = 'INDICATION'
#txttype = 'POPULATION'
#txttype = 'WARNBOX'
txttype = 'all_toxsxns'

livertox_file = read_csv(
    'results/openfda/classifier/livertox_dataset_'+txttype+'.tsv',
    delimiter='\t', header=0, index_col=0)

## Create the binary subset:
## likelihood scores A, B or C -> 'livertoxic'; score E -> 'not livertoxic'
training_set_pos = livertox_file.loc[
    livertox_file['likelihood_score'].isin(['A', 'B', 'C'])
].assign(target='livertoxic')
training_set_neg = livertox_file.loc[
    livertox_file['likelihood_score'] == 'E'
].assign(target='not livertoxic')

training_set = pd.concat([training_set_pos, training_set_neg], ignore_index=True)
print(len(training_set_pos), len(training_set_neg))

#### Clean up the text for analysis
training_set = clean_up_text(training_set)

164 166


In [None]:
#### Consider lemmatizing the text -- Note, this is a slow step, skip altogether if not needed
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer 
from nltk import pos_tag, word_tokenize

lem = WordNetLemmatizer()
#import nltk
#nltk.download('wordnet')
#nltk.download('averaged_perceptron_tagger')

def get_wordnet_pos(word):
    """Return the WordNet lemma of ``word`` using its own part-of-speech tag.

    Despite the name, the return value is the lemmatized word, not the tag.
    The word is tagged on its own (no sentence context) with ``pos_tag``;
    only the first letter of the first token's tag is used.

    Args:
        word: a single token (string).

    Returns:
        The lemma produced by the module-level ``lem`` lemmatizer.
    """
    tag_initial = pos_tag(word_tokenize(word))[0][1][0].lower()
    # Tags starting with 'J' are mapped to 'a', the code passed to the lemmatizer
    wordnet_pos = 'a' if tag_initial == 'j' else tag_initial
    if wordnet_pos not in ('r', 'a', 's', 'v'):
        # Anything else falls back to the lemmatizer's default part of speech
        return lem.lemmatize(word)
    return lem.lemmatize(word, pos=wordnet_pos)

#### Lemmatize every row's clean_text and write the result back to the DataFrame
lemmatized_texts = []
for i, text in enumerate(tqdm(training_set['clean_text'])):
    try:
        lemmatized_texts.append(' '.join(get_wordnet_pos(word) for word in text.split()))
    except (AttributeError, IndexError):
        # AttributeError: the cell is not a string (e.g. NaN) and has no .split();
        # IndexError: get_wordnet_pos received a token the tagger returned nothing for.
        # Other errors (such as missing nltk data) are no longer hidden.
        print(i,'splitting failed')
        lemmatized_texts.append(text)  # keep the row unchanged on failure

# Assign the whole column in one step. The previous
# `training_set.iloc[i]['clean_text'] = ...` was chained assignment, which
# pandas does not guarantee to write back into the DataFrame.
training_set['clean_text'] = lemmatized_texts

print(training_set.head(n=2))

#### Earlier runs showed no effect of lemmatization on training/testing results,
#### but those runs used the chained assignment described above, so the lemmatized
#### text may never have reached training_set. Re-evaluate before drawing conclusions.

In [None]:
#### Lemmatizing is time-consuming, so the result can be exported once (uncomment to save)
#training_set.to_csv('results/openfda/livertox_expanded_lemmatized_training_data.tsv',sep='\t',header=True,encoding='UTF-8')

#### Re-load the previously exported lemmatized dataset
training_set = read_csv(
    'results/openfda/livertox_expanded_lemmatized_training_data.tsv',
    delimiter='\t',
    header=0,
    index_col=0,
    encoding='UTF-8',
)


In [10]:
#### Vectorize the cleaned text (TF-IDF) for the classifier

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(training_set['clean_text'])
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old name on older installations.
if hasattr(vectorizer, 'get_feature_names_out'):
    features = list(vectorizer.get_feature_names_out())
else:
    features = vectorizer.get_feature_names()
print(X.shape)

#### Split the data into training and test data (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, training_set.target, test_size=0.2, random_state=0)

(330, 18439)


## Generate the Training and Test sets from expanded corpus

In [13]:
#### Use the Expanded Set of all toxicity sections
txttype = 'all_toxsxns'

livertox_file = read_csv(
    'results/openfda/classifier/livertox_expanded_set_'+txttype+'.tsv',
    delimiter='\t', header=0, index_col=0)

## Create the binary subset:
## scores A, B, C or Y -> 'livertoxic'; scores E or Z -> 'not livertoxic'
training_set_pos = livertox_file.loc[
    livertox_file['likelihood_score'].isin(['A', 'B', 'C', 'Y'])
].assign(target='livertoxic')
training_set_neg = livertox_file.loc[
    livertox_file['likelihood_score'].isin(['E', 'Z'])
].assign(target='not livertoxic')

print(len(training_set_pos),len(training_set_neg))

training_set = pd.concat([training_set_pos, training_set_neg], ignore_index=True)
print(len(training_set_pos), len(training_set_neg))

#### Clean up the text for analysis
training_set = clean_up_text(training_set)

311 438
311 438


In [14]:
#### Vectorize the cleaned text (TF-IDF) for the classifier

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(training_set['clean_text'])
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old name on older installations.
if hasattr(vectorizer, 'get_feature_names_out'):
    features = list(vectorizer.get_feature_names_out())
else:
    features = vectorizer.get_feature_names()
print(X.shape)

#### Split the data into training and test data (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, training_set.target, test_size=0.2, random_state=0)

(749, 23719)


## Generate the training set from the expanded corpus plus LiverTox, and the test set from LiverTox only

In [3]:
txttype = 'all_toxsxns'

livertox_file = read_csv(
    'results/openfda/classifier/livertox_expanded_set_'+txttype+'.tsv',
    delimiter='\t', header=0, index_col=0)

## Create the binary subset:
## scores A, B or C -> 'livertoxic'; score E -> 'not livertoxic'
training_set_pos = livertox_file.loc[
    livertox_file['likelihood_score'].isin(['A', 'B', 'C'])
].assign(target='livertoxic')
training_set_neg = livertox_file.loc[
    livertox_file['likelihood_score'] == 'E'
].assign(target='not livertoxic')
print(len(training_set_pos),len(training_set_neg))

## Hold out 23% of each A/B/C and E class as the test set (fixed seed)
test_set_pos = training_set_pos.sample(frac=0.23,random_state=1)
test_set_neg = training_set_neg.sample(frac=0.23,random_state=1)
print(len(test_set_pos), len(test_set_neg))

## Rows scored 'Y' / 'Z' form the expanded positives / negatives
expanded_set_pos = livertox_file.loc[
    livertox_file['likelihood_score'] == 'Y'
].assign(target='livertoxic')
expanded_set_neg = livertox_file.loc[
    livertox_file['likelihood_score'] == 'Z'
].assign(target='not livertoxic')
print(len(expanded_set_pos),len(expanded_set_neg))

total_set = pd.concat(
    [training_set_pos, training_set_neg, expanded_set_pos, expanded_set_neg],
    ignore_index=True)
test_set = pd.concat([test_set_pos, test_set_neg], ignore_index=True)

## Training set = every row of total_set that is not in the test set
## (outer merge on all shared columns, keeping rows found only on the left)
merged_sets = pd.merge(total_set,test_set, indicator=True, how='outer')
training_set = merged_sets.loc[merged_sets['_merge'] == 'left_only'].drop(columns='_merge')
print(len(training_set))

#### Clean up the text for analysis
training_set = clean_up_text(training_set)
total_set = clean_up_text(total_set)
test_set = clean_up_text(test_set)

164 166
38 38
147 272
673


In [10]:
#### Vectorize the text: fit TF-IDF on the training set only, then apply it to the test set

vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(training_set['clean_text'])
X_test = vectorizer.transform(test_set['clean_text'])
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old name on older installations.
if hasattr(vectorizer, 'get_feature_names_out'):
    features = list(vectorizer.get_feature_names_out())
else:
    features = vectorizer.get_feature_names()
y_train = training_set.target
y_test = test_set.target

# Report the matrix built in this cell; `X` is not defined here and would be
# stale (or a NameError) depending on which cells ran before.
print(X_train.shape)
print(y_test.shape)


(673, 22421)
(76,)


## Try out different classifiers

In [11]:
from sklearn.ensemble import RandomForestClassifier

## Random forest with 1000 trees and a fixed seed
classifier = RandomForestClassifier(
    n_estimators=1000,
    random_state=0,
)
classifier.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [None]:
from sklearn.naive_bayes import MultinomialNB

## Multinomial naive Bayes with default settings
classifier = MultinomialNB()
classifier.fit(X_train, y_train)

In [None]:
from sklearn.linear_model import SGDClassifier

## Linear model trained by SGD with hinge loss; tol=None with max_iter=5 runs exactly 5 epochs
classifier = SGDClassifier(
    loss='hinge',
    penalty='l2',
    alpha=1e-3,
    random_state=50,
    max_iter=5,
    tol=None,
)
classifier.fit(X_train, y_train)

In [None]:
from sklearn.neural_network import MLPClassifier

## Multi-layer perceptron with L2 penalty alpha=1 and up to 1000 iterations
classifier = MLPClassifier(
    alpha=1,
    max_iter=1000,
)
classifier.fit(X_train, y_train)

In [None]:
from sklearn import tree

## Single decision tree limited to depth 5
classifier = tree.DecisionTreeClassifier(
    max_depth=5,
)
classifier.fit(X_train, y_train)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

## k-nearest neighbours with k=3
classifier = KNeighborsClassifier(n_neighbors=3)
classifier.fit(X_train, y_train)

In [None]:
from sklearn.svm import SVC

## Support vector classifier with a linear kernel and C=0.025
classifier = SVC(
    kernel="linear",
    C=0.025,
)
classifier.fit(X_train, y_train)

In [None]:
from sklearn.gaussian_process import GaussianProcessClassifier
# RBF is needed by the kernel below; without this import the cell raises
# NameError unless the classifier-comparison cell has already been run.
from sklearn.gaussian_process.kernels import RBF

## Gaussian process classifier with a scaled RBF kernel
# NOTE(review): X_train comes from TfidfVectorizer and is sparse; this estimator
# may require dense input - confirm. If so, X_train.toarray() is needed here and
# the same conversion must be applied to X_test before predicting.
classifier = GaussianProcessClassifier(1.0 * RBF(1.0))
classifier.fit(X_train, y_train)

In [16]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

## AdaBoost with default settings
classifier = AdaBoostClassifier()
classifier.fit(X_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0,
                   n_estimators=50, random_state=None)

## Review predictive performance of classifiers

In [12]:
## Predict test-set labels with whichever `classifier` was fitted most recently
y_pred = classifier.predict(X_test)

In [7]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score

## Confusion matrix of true vs predicted labels
print(confusion_matrix(y_test,y_pred))

## Precision / recall / F1 / support per class, shown as a table
report = classification_report(y_test,y_pred,output_dict=True)
print(pd.DataFrame(report))

## Calculate AUC from the probability of the second class in classifier.classes_
probs = classifier.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, probs)
print(auc)

[[26  4]
 [ 6 30]]
                   0          1  accuracy  macro avg  weighted avg
precision   0.812500   0.882353  0.848485   0.847426      0.850602
recall      0.866667   0.833333  0.848485   0.850000      0.848485
f1-score    0.838710   0.857143  0.848485   0.847926      0.848764
support    30.000000  36.000000  0.848485  66.000000     66.000000
0.9296296296296296


## Run classifier comparisons

In [15]:
#### Load the classifiers

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.neural_network import MLPClassifier
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression

## Candidate models keyed by the display name used in the results table.
## Entries that are commented out are disabled and not evaluated.
classifiers = {
    'Random Forest':RandomForestClassifier(n_estimators=1000, random_state=0),
    'MultinomialNB':MultinomialNB(),
    'SGDClassifier':SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=50, max_iter=5, tol=None),
    'Neural Net':MLPClassifier(alpha=1, max_iter=1000),
    'Decision Tree':tree.DecisionTreeClassifier(max_depth=5),
    'Nearest Neighbor':KNeighborsClassifier(3),
    'Linear SVM':SVC(kernel="linear", C=0.025),
    'RBF SVM':SVC(gamma=2, C=1),
    #'Gaussian Process':GaussianProcessClassifier(1.0 * RBF(1.0)),
    'AdaBoost':AdaBoostClassifier(),
    #'Naive Bayes':GaussianNB(),
    #'QDA':QuadraticDiscriminantAnalysis(),
    # multi_class='ovr' was dropped: the parameter is deprecated in recent
    # scikit-learn releases, and for a two-class target one-vs-rest is what
    # the default setting already does.
    'Logistic Regression':LogisticRegression(random_state=0, solver='lbfgs')}


In [16]:
#### Fit every classifier on the current split and collect its test-set metrics
classifiers_list = list(classifiers.keys())
classifier_results_list = []

# Fixed label order so the confusion matrix is always 2x2 with
# row/column 0 = 'livertoxic' and row/column 1 = 'not livertoxic'.
class_labels = ['livertoxic', 'not livertoxic']
# (key in the classification report, prefix of the output column)
report_sections = [('livertoxic', 'livertoxic'),
                   ('not livertoxic', 'not_livertoxic'),
                   ('macro avg', 'macro_avg'),
                   ('weighted avg', 'wt_avg')]
# (key inside each report section, suffix of the output column)
report_fields = [('precision', 'precision'),
                 ('recall', 'recall'),
                 ('f1-score', 'f'),
                 ('support', 'support')]

for classifier_name in tqdm(classifiers_list):
    classifier = classifiers[classifier_name]
    ## Train via classifier
    classifier.fit(X_train, y_train)
    ## Test classifier
    y_pred = classifier.predict(X_test)
    ## Get Metrics (confusion matrix computed once rather than per cell)
    report = classification_report(y_test,y_pred,output_dict=True)
    cm = confusion_matrix(y_test, y_pred, labels=class_labels)
    ## Calculate AUC
    try:
        probs = classifier.predict_proba(X_test)[:, 1]
        auc = roc_auc_score(y_test, probs)
    except (AttributeError, ValueError):
        # AttributeError: the estimator exposes no predict_proba;
        # ValueError: roc_auc_score rejected the inputs.
        auc = 'not calculated'
    ## Save metrics (insertion order defines the column order of the TSV)
    metrics_dict = {'classifier':classifier_name,
                    'CM_0_0':cm[0][0],
                    'CM_0_1':cm[0][1],
                    'CM_1_0':cm[1][0],
                    'CM_1_1':cm[1][1]}
    for section_key, column_prefix in report_sections:
        for field_key, column_suffix in report_fields:
            metrics_dict[column_prefix + '_' + column_suffix] = report[section_key][field_key]
    metrics_dict['accuracy'] = report['accuracy']
    metrics_dict['AUC'] = auc
    classifier_results_list.append(metrics_dict)


classifier_results_df = pd.DataFrame(classifier_results_list)
print(classifier_results_df.head(n=2))
classifier_results_df.to_csv('results/openfda/classifier_results_df.tsv',sep='\t',header=True,encoding='UTF-8')

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

  _warn_prf(average, modifier, msg_start, len(result))



      classifier  CM_0_0  CM_0_1  CM_1_0  CM_1_1  livertoxic_precision  \
0  Random Forest      51       8      12      79              0.809524   
1  MultinomialNB      22      37       2      89              0.916667   

   livertoxic_recall  livertoxic_f  livertoxic_support  \
0           0.864407      0.836066                  59   
1           0.372881      0.530120                  59   

   not_livertoxic_precision  ...  macro_avg_precision  macro_avg_recall  \
0                  0.908046  ...             0.858785          0.866269   
1                  0.706349  ...             0.811508          0.675452   

   macro_avg_f  macro_avg_support  wt_avg_precision  wt_avg_recall  wt_avg_f  \
0     0.861853                150          0.869294       0.866667  0.867354   
1     0.675198                150          0.789074       0.740000  0.706148   

   wt_avg_support  accuracy       AUC  
0             150  0.866667  0.941237  
1             150  0.740000  0.891786  

[2 rows x 23 columns]

## Apply the classifier to the LiverTox dataset and predict if an SPL is livertoxic or not

https://levelup.gitconnected.com/scikit-learn-machine-learning-classification-101-c431de2dc2b2


https://machinelearningmastery.com/make-predictions-scikit-learn/

https://machinelearningmastery.com/train-final-machine-learning-model/

https://machinelearningmastery.com/save-load-machine-learning-models-python-scikit-learn/




In [30]:
#### Verify performance of top 2 classifiers using K-folds cross validator
from sklearn.model_selection import KFold
vectorizer = TfidfVectorizer()
# NOTE: the vectorizer is fitted on all rows before splitting, so the IDF
# weights have seen the rows later used as test folds.
X = vectorizer.fit_transform(training_set['clean_text'])
y = training_set.target
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old name on older installations.
if hasattr(vectorizer, 'get_feature_names_out'):
    features = list(vectorizer.get_feature_names_out())
else:
    features = vectorizer.get_feature_names()
print(X.shape)

kf = KFold(n_splits=5, random_state=None, shuffle=True)
# Materialise the folds once so both classifiers are scored on the same splits
# (calling kf.split() per classifier would reshuffle each time).
folds = list(kf.split(X))

classifier_results_list =[]
classifiers = {
    'Random Forest':RandomForestClassifier(n_estimators=1000, random_state=0),
    'AdaBoost':AdaBoostClassifier()}
classifiers_list = list(classifiers.keys())

# Fixed label order so the confusion matrix is always 2x2 with
# row/column 0 = 'livertoxic' and row/column 1 = 'not livertoxic'.
class_labels = ['livertoxic', 'not livertoxic']
# (key in the classification report, prefix of the output column)
report_sections = [('livertoxic', 'livertoxic'),
                   ('not livertoxic', 'not_livertoxic'),
                   ('macro avg', 'macro_avg'),
                   ('weighted avg', 'wt_avg')]
# (key inside each report section, suffix of the output column)
report_fields = [('precision', 'precision'),
                 ('recall', 'recall'),
                 ('f1-score', 'f'),
                 ('support', 'support')]

for classifier_name in tqdm(classifiers_list):
    # Use the estimator selected from the dict. Previously it was immediately
    # replaced by a RandomForestClassifier, so the 'AdaBoost' rows were in
    # fact random-forest results.
    classifier = classifiers[classifier_name]
    for train_index, test_index in folds:
        X_train, X_test = X[train_index], X[test_index]
        # KFold yields positions, so index the Series positionally; plain
        # y[...] is label-based and breaks when the index is not 0..n-1.
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        ## Train via classifier
        classifier.fit(X_train, y_train)
        ## Test classifier
        y_pred = classifier.predict(X_test)
        ## Get Metrics (confusion matrix computed once rather than per cell)
        report = classification_report(y_test,y_pred,output_dict=True)
        cm = confusion_matrix(y_test, y_pred, labels=class_labels)
        ## Calculate AUC
        try:
            probs = classifier.predict_proba(X_test)[:, 1]
            auc = roc_auc_score(y_test, probs)
        except (AttributeError, ValueError):
            # AttributeError: the estimator exposes no predict_proba;
            # ValueError: roc_auc_score rejected the inputs.
            auc = 'not calculated'
        ## Save metrics (insertion order defines the column order of the TSV)
        metrics_dict = {'classifier':classifier_name,
                        'CM_0_0':cm[0][0],
                        'CM_0_1':cm[0][1],
                        'CM_1_0':cm[1][0],
                        'CM_1_1':cm[1][1]}
        for section_key, column_prefix in report_sections:
            for field_key, column_suffix in report_fields:
                metrics_dict[column_prefix + '_' + column_suffix] = report[section_key][field_key]
        metrics_dict['accuracy'] = report['accuracy']
        metrics_dict['AUC'] = auc
        classifier_results_list.append(metrics_dict)

classifier_results_df = pd.DataFrame(classifier_results_list)
classifier_results_df.to_csv('results/openfda/top_two_classifier_cross_validation_results_df.tsv',sep='\t',header=True,encoding='UTF-8')

(749, 23719)


HBox(children=(IntProgress(value=0, max=2), HTML(value='')))




In [51]:
#### Train the final model on the entire training dataset (no train/test split)
#### Vectorize the training set for the classifier

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(training_set['clean_text'])
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old name on older installations.
if hasattr(vectorizer, 'get_feature_names_out'):
    features = list(vectorizer.get_feature_names_out())
else:
    features = vectorizer.get_feature_names()
print(X.shape)

## Save the vectorizer (context manager so the file is flushed and closed)
vectorizerfile = "results/openfda/classifier/models_vectorizer.pickle"
with open(vectorizerfile, "wb") as vectorizer_handle:
    pickle.dump(vectorizer, vectorizer_handle)

#### Train the model on all the data (random_state=None: not seeded)
classifier = RandomForestClassifier(n_estimators=1000, random_state=None)
classifier.fit(X, training_set.target)

## Save the model
filename = 'results/openfda/classifier/models_randomforest.sav'
with open(filename, 'wb') as model_handle:
    pickle.dump(classifier, model_handle)


(749, 23719)


In [None]:
## Load the saved model and score it on the current test split
# Security: pickle.load executes code embedded in the file; only load model
# files that were produced by this notebook.
with open(filename, 'rb') as model_handle:
    loaded_model = pickle.load(model_handle)
# The label variable is `y_test` throughout this notebook; `Y_test` was undefined.
result = loaded_model.score(X_test, y_test)

In [52]:
#### Apply the model

## Pull out all the data
spl_alltox = read_csv('results/openfda/all_toxsxns_sample_text_per_code.tsv', delimiter='\t',header=0, index_col=0)
#print(spl_alltox.head(n=2))

## Remove active_codes with no text (copied so a column can be added below)
nonan = spl_alltox.loc[~spl_alltox['section_text'].isna()].copy()
#print(len(spl_alltox), len(nonan))

## Apply the same clean-up that produced the training text, so the model sees
## input prepared the same way as the `clean_text` it was fitted on
nonan = clean_up_text(nonan)

## Vectorize the text based on the previously trained vectorizer and run the classifier
labels = nonan['active_code']
M = vectorizer.transform(nonan['clean_text'])
prediction = classifier.predict(M)

## Save the results
classifier_results = pd.DataFrame(list(zip(labels, prediction)), columns =['active_code', 'livertox_prediction'])
print(classifier_results.head(n=2))
classifier_results.to_csv('results/openfda/classifier/livertox_predictions_all_toxsxns_randomforest.tsv',
                          sep='\t', header=True)

  active_code livertox_prediction
0  2RQ1L9N089      not livertoxic
1  PDC6A3C0OX      not livertoxic


In [59]:
#### Check the model against the original data
livertox_scores = training_set[['active_code','likelihood_score','target']]

## Alternative (disabled): inner-join and list only the rows where the prediction
## disagrees with the assigned target
#classifier_results_check = classifier_results.merge(livertox_scores, on='active_code', how='inner')
#classifier_failures = classifier_results_check.loc[classifier_results_check['livertox_prediction']!=classifier_results_check['target']]
#print(classifier_failures)

## Left join keeps every prediction; score/target are NaN where the active_code
## does not occur in training_set
classifier_results_check = pd.merge(
    classifier_results, livertox_scores, on='active_code', how='left')
classifier_results_check.to_csv(
    'results/openfda/classifier/livertox_predictions_all_toxsxns_randomforest_chk.tsv',
    sep='\t', header=True)

In [4]:
## Re-load the saved prediction check table
classifier_results_check = read_csv(
    'results/openfda/classifier/livertox_predictions_all_toxsxns_randomforest_chk.tsv',
    delimiter='\t', header=0, index_col=0)

## Keep predictions whose likelihood_score is missing or is 'Y' / 'Z'
not_in_wd_or_livertox = classifier_results_check.loc[
    classifier_results_check['likelihood_score'].isna()
    | classifier_results_check['likelihood_score'].isin(['Y', 'Z'])]
print(len(not_in_wd_or_livertox))
print(not_in_wd_or_livertox.head(n=2))

1195
  active_code livertox_prediction likelihood_score target
0  2RQ1L9N089      not livertoxic              NaN    NaN
1  PDC6A3C0OX      not livertoxic              NaN    NaN


In [None]:
#### QC the results
## In WD, map UNIIs to drug classes with ratings in Livertox
## Apply drug class livertox ratings to individual drugs

In [None]:
#### Check FDA SPLs for non-compliance

## Get spls from drugs with livertoxicity

## pull/merge sections for each drug and run classifier

## Look for spls which are classified as not toxic for drugs where active ingredient is toxic

In [None]:
"""
## save plot
from sklearn.metrics import roc_curve

def plot_roc_curve(fpr, tpr):
    plt.plot(fpr, tpr, color='orange', label='ROC')
    plt.plot([0, 1], [0, 1], color='darkblue', linestyle='--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend()
    plt.show()

probs = classifier.predict_proba(X_test)
probs = probs[:, 1]
auc = roc_auc_score(y_test, probs)
fpr, tpr, thresholds = roc_curve(y_test, probs)
plot_roc_curve(fpr, tpr)

"""

In [None]:
"""
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

h = .02  # step size in the mesh

names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", "Gaussian Process",
         "Decision Tree", "Random Forest", "Neural Net", "AdaBoost",
         "Naive Bayes", "QDA"]

classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    GaussianProcessClassifier(1.0 * RBF(1.0)),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    MLPClassifier(alpha=1, max_iter=1000),
    AdaBoostClassifier(),
    GaussianNB(),
    QuadraticDiscriminantAnalysis()]

X, y = make_classification(n_features=2, n_redundant=0, n_informative=2,
                           random_state=1, n_clusters_per_class=1)
rng = np.random.RandomState(2)
X += 2 * rng.uniform(size=X.shape)
linearly_separable = (X, y)

datasets = [make_moons(noise=0.3, random_state=0),
            make_circles(noise=0.2, factor=0.5, random_state=1),
            linearly_separable
            ]

figure = plt.figure(figsize=(27, 9))
i = 1
# iterate over datasets
for ds_cnt, ds in enumerate(datasets):
    # preprocess dataset, split into training and test part
    X, y = ds
    X = StandardScaler().fit_transform(X)
    X_train, X_test, y_train, y_test = \
        train_test_split(X, y, test_size=.4, random_state=42)

    x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
    y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))

    # just plot the dataset first
    cm = plt.cm.RdBu
    cm_bright = ListedColormap(['#FF0000', '#0000FF'])
    ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
    if ds_cnt == 0:
        ax.set_title("Input data")
    # Plot the training points
    ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright,
               edgecolors='k')
    # Plot the testing points
    ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, alpha=0.6,
               edgecolors='k')
    ax.set_xlim(xx.min(), xx.max())
    ax.set_ylim(yy.min(), yy.max())
    ax.set_xticks(())
    ax.set_yticks(())
    i += 1

    # iterate over classifiers
    for name, clf in zip(names, classifiers):
        ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
        clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)

        # Plot the decision boundary. For that, we will assign a color to each
        # point in the mesh [x_min, x_max]x[y_min, y_max].
        if hasattr(clf, "decision_function"):
            Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
        else:
            Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]

        # Put the result into a color plot
        Z = Z.reshape(xx.shape)
        ax.contourf(xx, yy, Z, cmap=cm, alpha=.8)

        # Plot the training points
        ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright,
                   edgecolors='k')
        # Plot the testing points
        ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright,
                   edgecolors='k', alpha=0.6)

        ax.set_xlim(xx.min(), xx.max())
        ax.set_ylim(yy.min(), yy.max())
        ax.set_xticks(())
        ax.set_yticks(())
        if ds_cnt == 0:
            ax.set_title(name)
        ax.text(xx.max() - .3, yy.min() + .3, ('%.2f' % score).lstrip('0'),
                size=15, horizontalalignment='right')
        i += 1

plt.tight_layout()
plt.show()
"""