In [None]:
#read in the csv to pandas
import pandas as pd

# Feature table to train on. Earlier runs used 'bird_train.csv',
# 'bird_train_test.csv' and 'brid_train (copy).csv'; the "all" variant is
# 'bird_train_2_6.csv_all'.
filename = 'bird_train_2_6.csv'
filename = f'feature_tables/{filename}'
data = pd.read_csv(filename)
# column headers can carry stray whitespace/newlines; str.strip() removes both
data.columns = [header.strip() for header in data.columns]
# untouched deep copy of the table, taken before the target column is split off
data_copy = data.copy(deep=True)
data.tail()

In [None]:
##drop unimportant features to see if performance improves...
#df = data.drop( columns=['num_samples', 'sample_rate', 'species'] )#, inplace=True )

In [None]:
# Remove one species from the dataframe.
df = data.copy()
rem_spec = 'Bald Eagle'
# drop() only needs the row labels of the matching rows; the previous
# `.sample(frac=1.0)` merely shuffled those rows (unseeded) before reading the
# same labels, so it is left out.
rows_to_drop = df.index[df['species'] == rem_spec]
data = df.drop(rows_to_drop)
#data = df.drop(df.index[df['species'] == 'Mallard'])
# number of clips left after the removal
data.species.value_counts().sum()

In [None]:
## For the '2_6_all' training table some species are vastly under-represented.
## To deal with this, remove every species with fewer than `thresh` clips.
# `thresh` used to be set to 20 but was never read; the filter itself was
# hard-coded to 100. The value actually applied (100) is kept.
thresh = 100
vc = data.species.value_counts()
to_rem = vc[vc < thresh].index
new_data = data[~data.species.isin(to_rem)]
data = new_data.copy()
# shape after filtering, the removed species, and how many species remain
# (kept as the last expression so the notebook displays it)
new_data.shape, to_rem, new_data.species.value_counts().shape

In [None]:
data.species.value_counts().shape

In [None]:
#import seaborn as sns
#import numpy as np
#import matplotlib.pyplot as plt

# Bar chart of how many clips each species contributes.
species_counts = data.species.value_counts()
ax = species_counts.plot(kind='bar')
# file names containing 'all' get the smaller tick font
fontsize = 6 if 'all' in filename else 12
tick_labels = ax.xaxis.get_ticklabels()
ax.xaxis.set_ticklabels(tick_labels, rotation=70, ha='right', fontsize=fontsize)
ax.set_ylabel('number of files', fontsize=12)
ax.set_title('Clip Counts per Species', fontsize=14)

## Model Training Begins here:
- split the targets and the features
- perform one hot encoding, for other model comparison
- use a Gradient Boosting Classifier decision tree model
- plot the confusion matrix
- hyper-parameter tuning

In [None]:
#use OneHot encoding to transform categorical data into something useful
from sklearn.preprocessing import OneHotEncoder
#simple imputer will handle missing values
from sklearn.impute import SimpleImputer
#chain together multiple transformations in one custom filter
from sklearn.pipeline import Pipeline
#select (by column header/key) which columns get which kind of transformation
from sklearn.compose import ColumnTransformer

In [None]:
#choose which columns get transformed (only the target column here)
cat_cols = ['species']
#create the pipeline steps:
# 'si'  fills missing species values with the constant 'MISSING'
# 'ohe' one-hot encodes the species; categories unseen at fit time are ignored
#NOTE(review): `sparse=False` is the older OneHotEncoder keyword -- confirm it
#matches the installed scikit-learn version
cat_si_step = ('si', SimpleImputer(strategy='constant',
                   fill_value='MISSING'))
cat_ohe_step = ('ohe', OneHotEncoder(sparse=False,
                    handle_unknown='ignore'))

#combine the two transformations into a single Pipeline
cat_steps = [cat_si_step, cat_ohe_step]
cat_pipe = Pipeline(cat_steps)

#apply the pipeline to the columns listed in cat_cols
cat_transformers = [('cat', cat_pipe, cat_cols)]
ct = ColumnTransformer(transformers=cat_transformers)

#one-hot encoded target; both calls run on the same `data`
target_fit_transformed = ct.fit_transform(data)
target_transformed = ct.transform(data) #fit shouldnt matter for one-hot encoding

#show the last few encoded rows
pd.DataFrame( target_fit_transformed ).tail(6)

In [None]:
#remove the species column from the test dataframe
species_col = data.pop('species')

In [None]:
#get the feature names from the fitted one-hot encoder
all_steps = ct.named_transformers_['cat']
ohe = all_steps.named_steps['ohe']
cat_feature_names = ohe.get_feature_names()
#clean up the feature names to make them more readable.
#str.strip('x0_') removes ANY leading/trailing 'x', '0' or '_' characters, so it
#could also eat the end of a species name (e.g. one ending in "x"); only the
#literal 'x0_' prefix is removed here.
ohe_prefix = 'x0_'
ohe_column_names = [
    (cfn[len(ohe_prefix):] if cfn.startswith(ohe_prefix) else cfn).strip()
    for cfn in cat_feature_names
]
cat_feature_names, ohe_column_names

Use a stratified random sample for the train/test split;
this preserves the species ratios of the data in both the train and the test set.

In [None]:
#write the resulting model parameters with joblib
from joblib import dump, load
#dump(cbg, 'cbg_model_100samples.joblib') 
#load the model for testing
#cbg_loaded = load('cbg_model_150samples.joblib') 

In [None]:
#f1 score comparifon funtion
from sklearn.metrics import f1_score

def GetF1Scores( y_test, y_pred ):
    '''
    Compute the macro, micro and weighted F1 scores of a set of predictions.

    f1 = 2*prec*recall/(prec+recall)
    macro is the average f1 score across all species
    weighted is the weighted average of all f1
        (i.e. taking the support number for each class into account)
    micro is the f1 computed with micro averaged prec and recall
        (micro prec+recall combine the results from all classes)

    Parameters
    ----------
    y_test : true labels, passed to f1_score as its first argument
    y_pred : predicted labels, passed to f1_score as its second argument

    Returns
    -------
    tuple
        (f1macro, f1micro, f1weighted)
    '''
    # NOTE: the docstring above used to be left unterminated, which turned the
    # whole function body into string text and made the cell a SyntaxError.
    f1macro = f1_score(y_test, y_pred, average='macro')
    f1micro = f1_score(y_test, y_pred, average='micro')
    f1weighted = f1_score(y_test, y_pred, average='weighted')

    return f1macro, f1micro, f1weighted

In [None]:
#string target: y holds the species column that was popped off `data`,
#X holds whatever columns remain in `data`
import numpy as np

X,y = np.array(data), np.array(species_col)

from sklearn.ensemble import GradientBoostingClassifier
#hyper-parameter sets kept from earlier tuning runs (see the note on each line)
params_noBE = {'learning_rate': 0.2, 'loss': 'deviance', 'max_depth': 4, 'n_estimators': 125} #best params for 2_6 no bald
params_recall_2_6 = {'learning_rate': 0.2, 'loss': 'deviance', 'max_depth': 4, 'n_estimators': 150}
params_prec = {'learning_rate': 0.2, 'loss': 'deviance', 'max_depth': 4, 'n_estimators': 200} #prec hyper-tuned for v2 9_species
params = {'learning_rate': 0.2, 'loss': 'deviance', 'max_depth': 6, 'n_estimators': 150} #params hyper-tuned for v2 (9 birds 14 features)
#the classifier is created with the library defaults; none of the parameter
#dicts above is applied unless one of the commented-out `**params...` is restored
cbg = GradientBoostingClassifier( )#**params_noBE )# **params_recall_2_6 )

In [None]:
#try different under-sample resampling techniques
#(only NearMiss is active; the alternatives are kept commented out)
from imblearn.under_sampling import RandomUnderSampler, ClusterCentroids, NearMiss
##near miss under-sample
nm1 = NearMiss(version=1)
X_resampled, y_resampled = nm1.fit_resample(X, y)
##centroid cluster under-sample
#cc = ClusterCentroids(random_state=0)
#X_resampled, y_resampled = cc.fit_resample(X, y)
##random under-sample
#rus = RandomUnderSampler(random_state=0)
#X_resampled, y_resampled = rus.fit_resample(X, y)
#per-species counts after resampling vs. before resampling
pd.DataFrame( y_resampled )[0].value_counts(), pd.DataFrame(y)[0].value_counts()

In [None]:
#try different over-sample resampling techniques
#(only ADASYN is active; it reassigns X_resampled / y_resampled)
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
##random over-sample (contains repeats...)
#ros = RandomOverSampler(random_state=0)
#X_resampled, y_resampled = ros.fit_resample(X, y)
##SMOTE -- Synthetic Minority Oversampling Technique 
#X_resampled, y_resampled = SMOTE().fit_resample(X, y) #0.87...need to verify
##ADASYN -- Adaptive Synthetic sampling method
#NOTE(review): ADASYN() is created without a random_state, so the resampled set
#presumably differs from run to run -- confirm
X_resampled, y_resampled = ADASYN().fit_resample(X, y) #0.834...need to verify

#pd.DataFrame( y_resampled )[0].value_counts(), pd.DataFrame(y)[0].value_counts()

In [None]:
#loop through a series of splits to score the model and check the consistency across splits
from sklearn.model_selection import StratifiedShuffleSplit
#NOTE(review): test_size=0.7 holds out 70% of the rows for testing and fits on
#the remaining 30% -- confirm this is intended
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.7, random_state=700) #145 used for <200 #122 for <100 and <300
from sklearn.metrics import classification_report

#one classification report is appended to `ds` per split
ds = []
#passed to classification_report as output_dict; when False the report is printed
outdict = True

#NOTE(review): the split is taken from the already-resampled arrays, so the test
#rows come from the resampled data as well -- confirm this is intended
X, y = X_resampled, y_resampled

for train_index, test_index in sss.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    #fit on the train part of this split, then predict its test part
    cbg.fit(X_train, y_train)
    y_pred = cbg.predict( X_test )
    #confusion_matrix = MakeConfusionMatrix( y_test, y_pred )
    #PrintConfusionMatrix(confusion_matrix.values, confusion_matrix.columns, normalize=True);
    print(GetF1Scores( y_test, y_pred ) )
    CR = classification_report(y_test, y_pred, output_dict=outdict )
    if not outdict:
        print( CR )
    ds.append( CR )#classification_report(y_test, y_pred))#, output_dict=True ) )

In [None]:
(0.5741902466703128, 0.6382155225096761, 0.6302993684382815)
(0.5828416983016473, 0.6451415766958647, 0.637782821420117)
(0.5486738365971205, 0.6180484823793033, 0.6064974813980757)

#with tuned hyper papams

In [None]:
(0.6036591179305091, 0.6498268486453452, 0.6428665567363259)

In [None]:
def GetSplitPreds( X, y, model, nsplits=1, testsize=0.7, random=122, outdict=True):
    '''
    Score an already-fitted model on stratified test splits of (X, y).

    The model is NOT refitted here; only the test part of every split is used.

    Parameters
    ----------
    X, y : arrays indexed with the integer index arrays produced by the splitter
    model : object with a ``predict`` method
    nsplits : number of stratified shuffle splits
    testsize : ``test_size`` handed to StratifiedShuffleSplit
    random : ``random_state`` handed to StratifiedShuffleSplit
    outdict : passed to classification_report as ``output_dict``; when False
        each report is also printed

    Returns
    -------
    (y_test, y_pred, reports)
        labels and predictions of the LAST split, and the list of
        classification reports (one per split).
    '''
    reports = []
    sss = StratifiedShuffleSplit(n_splits=nsplits, test_size=testsize, random_state=random)
    for _, test_index in sss.split(X, y):
        X_test, y_test = X[test_index], y[test_index]
        y_pred = model.predict( X_test )
        CR = classification_report(y_test, y_pred, output_dict=outdict )
        if not outdict:
            print( CR )
        # collect into the returned list; the reports used to be appended to the
        # notebook-global `ds`, which left `reports` permanently empty
        reports.append( CR )
    return y_test, y_pred, reports

y_test, y_pred, reports = GetSplitPreds( X, y, cbg)#_loaded )
GetF1Scores( y_test, y_pred )

## model testing/verification

In [None]:
#how the model works...be able to explain
#grid based hyper parameter search
#sklearn parameter search...
#"I did the parameter tuning"

#cbg.fit(X_train, y_train)

# `cbg_loaded` only exists when one of the joblib `load(...)` lines was run by
# hand; fall back to the in-memory model so the cell also works on a fresh kernel.
model_to_check = cbg_loaded if 'cbg_loaded' in globals() else cbg
y_pred = model_to_check.predict(X_test)
model_to_check.score(X_test, y_test)
#print( r2_score(y_test, y_pred) )

If they ask what I would do next: check feature importance with recursive feature elimination
to eliminate features and speed up processing time, especially for the GridSearchCV hyperparameter search.

NameError: name 'y_test' is not defined

In [None]:
#get the results from the loop of splits

def GetReportResults( ds ):
    '''
    Print accuracy plus macro / weighted precision, recall and F1, each averaged
    over a list of classification reports.

    Parameters
    ----------
    ds : list of dict
        Every entry is read as ``d['accuracy']``, ``d['macro avg'][metric]`` and
        ``d['weighted avg'][metric]`` for the metrics 'precision', 'recall'
        and 'f1-score'.

    Returns
    -------
    None (the summary is printed).
    '''
    if not ds:
        # averaging an empty list would only produce NaN plus a numpy warning
        print('no classification reports to summarise')
        return

    def _avg(section, metric):
        # mean of one metric of one report section across all reports
        return np.average([d[section][metric] for d in ds])

    metrics = ('precision', 'recall', 'f1-score')
    print('avg accuracy: {:.3f}'.format( np.average([d['accuracy'] for d in ds]) ) )
    for section, title in (('macro avg', 'macro'), ('weighted avg', 'weighted')):
        print(title)
        # third column is the averaged F1 score (previously mislabelled "avg_macro")
        print(' avg_precision avg_recall avg_f1')
        print('{:.3f} {:.3f} {:.3f}'.format(*[_avg(section, m) for m in metrics]))

GetReportResults( ds )

In [None]:
#f1 scores for default values
#they are actually worse...change the hyperparameter tuning properties...
GetF1Scores( y_test, y_pred )
y_pred.shape, y_test.shape

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
#from sklearn.metrics import plot_confusion_matrix

def MakeConfusionMatrix( y_test, y_pred ):
    '''
    Build a square confusion matrix (counts) as a DataFrame.

    Rows are the actual labels, columns the predicted labels. Both axes carry
    the same sorted set of labels (every label seen in either input), so a class
    that was never predicted, or never actually present, still gets a column or
    row filled with zeros. Without this the matrix is not square and cannot be
    plotted with one shared list of class names.

    Parameters
    ----------
    y_test : 1-D sequence of actual labels
    y_pred : 1-D sequence of predicted labels, same length as y_test

    Returns
    -------
    pandas.DataFrame
        index named 'Actual', columns named 'Predicted', integer counts.
    '''
    pairs = pd.DataFrame({'y_Actual': y_test, 'y_Predicted': y_pred})
    counts = pd.crosstab(pairs['y_Actual'], pairs['y_Predicted'],
                         rownames=['Actual'], colnames=['Predicted'])
    labels = sorted(set(counts.index) | set(counts.columns))
    squared = counts.reindex(index=labels, columns=labels, fill_value=0)
    return squared.rename_axis(index='Actual', columns='Predicted')

confusion_matrix = MakeConfusionMatrix( y_test, y_pred )
confusion_matrix.columns.shape
#confusion_matrix.values, confusion_matrix.columns

In [None]:
#modified from https://gist.github.com/shaypal5/94c53d765083101efc0240d776a23823
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

def print_confusion_matrix(confusion_matrix, class_names, normalize=False, figsize = (10,7), fontsize=6):
    """Draws a confusion matrix, as returned by sklearn.metrics.confusion_matrix, as a heatmap.
    
    Arguments
    ---------
    confusion_matrix: numpy.ndarray
        The numpy.ndarray object returned from a call to sklearn.metrics.confusion_matrix. 
        Similarly constructed square ndarrays can also be used.
    class_names: list
        An ordered list of class names, in the order they index the given confusion matrix.
    normalize: bool
        When True every row is divided by its row total before plotting. Rows whose
        total is zero are left as zeros. Defaults to False.
    figsize: tuple
        A 2-long tuple, the first value determining the horizontal size of the outputted figure,
        the second determining the vertical size. Defaults to (10,7).
    fontsize: int
        Font size for the tick labels. Defaults to 6.
        
    Returns
    -------
    matplotlib.figure.Figure
        The resulting confusion matrix figure

    Raises
    ------
    ValueError
        Re-raised (with the original error chained) when seaborn rejects the data.
    """
    if normalize:
        counts = confusion_matrix.astype('float')
        row_totals = counts.sum(axis=1)[:, np.newaxis]
        # a row with no samples has a zero total; dividing would give NaN plus a
        # runtime warning, so those rows are left at 0
        confusion_matrix = np.divide(counts, row_totals,
                                     out=np.zeros_like(counts),
                                     where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    fmt = '.2f' if normalize else 'd'
    
    df_cm = pd.DataFrame(
        confusion_matrix, index=class_names, columns=class_names, 
    )
    fig = plt.figure(figsize=figsize)
    # cell annotations are switched off (too dense to read with many classes)
    numbers = False
    try:
        heatmap = sns.heatmap(df_cm, annot=numbers, fmt=fmt, cmap='Blues', square=True, xticklabels=True, yticklabels=True)#, cbar_kws={'label': 'accuracy'})
    except ValueError as err:
        raise ValueError("Confusion matrix values must be integers.") from err
    heatmap.yaxis.set_ticklabels(heatmap.yaxis.get_ticklabels(), rotation=0, ha='right', fontsize=fontsize)
    heatmap.xaxis.set_ticklabels(heatmap.xaxis.get_ticklabels(), rotation=70, ha='right', fontsize=fontsize)
    heatmap.yaxis.set_label_position('right')
    heatmap.xaxis.set_label_position('top')
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    # fix for mpl bug that cuts off top/bottom of seaborn viz
    b, t = plt.ylim() # discover the values for bottom and top
    b += 0.5 # Add 0.5 to the bottom
    t -= 0.5 # Subtract 0.5 from the top
    plt.ylim(b, t) # update the ylim(bottom, top) values
    return fig

confusion_matrix = MakeConfusionMatrix( y_test, y_pred )
print_confusion_matrix(confusion_matrix.values, confusion_matrix.columns, normalize=True);
print_confusion_matrix(confusion_matrix.values, confusion_matrix.columns);

## Feature Importance

In [None]:
# Plot the importance of every feature, least important first.
ranked = sorted( zip(data.columns, cbg.feature_importances_), key=lambda pair: pair[1] )
# importances are rounded to 3 decimals through string formatting
dfin = [(name.strip(), '{:.3f}'.format(score)) for name, score in ranked]
df = pd.DataFrame(dfin, columns=['features', 'importance'])
df['importance'] = df['importance'].astype(float)
f, ax = plt.subplots(figsize=(10, 16))
ax.xaxis.set_ticks_position('both')
bars = df.plot.barh(x='features', y='importance', ax=ax)
bars.legend(bbox_to_anchor=(0.95, 0.075))
ax.set_title('Feature Importance')

### https://librosa.github.io/librosa/generated/librosa.feature.spectral_rolloff.html
The roll-off frequency is defined for each frame as the center frequency for a spectrogram bin such that at least roll_percent (0.85 by default) of the energy of the spectrum in this frame is contained in this bin and the bins below

In [None]:
#only plot the top few features...
n_top = 10  # number of most important features to show
ranked = sorted( zip(data.columns, cbg.feature_importances_), key=lambda pair: pair[1] )
# keep the n_top highest scores (the list is sorted ascending) and round them
# to 3 decimals through string formatting
df = pd.DataFrame(
    [(feat.strip(), float('{:.3f}'.format(imp))) for feat, imp in ranked[-n_top:]],
    columns=['features', 'importance'],
)
f,ax = plt.subplots()#figsize=(10,16) )

df.plot.barh(x='features', y='importance', ax=ax ).legend(bbox_to_anchor=(0.76, 0.15))
#ax.set_yticklabels(reversed( ('Prominent Freq. at Peak Volume', 'Std Deviation Contrast in Band 6', 'Mean Contrast in Band 4', 'Std Deviation Contrast in Band 6', 'Max Contrast Band 6', 'Std Deviation Contrast in Band 5' ) ), fontsize=12)
#corresponding = 'ampmax_0 std_cont6 mean_cont4 std_cont4 max_cont6 std_cont5'.split()
# barh puts the feature names on the y axis and the scores on the x axis
# (the y axis used to be labelled 'importance' as well)
ax.set_ylabel('features', fontsize=14)
ax.set_xlabel('importance', fontsize=14)
ax.set_title( 'Feature Importance', fontsize=18 )

In [None]:
cbg.classes_

In [None]:
from sklearn.feature_selection import RFE

In [None]:
feature_list = 'prominent frequency', 'prominent Q-power frequency', 'prominent mel-frequency'
data.columns

In [None]:
#hyperparameter tuning for gradient boost classifier
#modified from  https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
from sklearn.model_selection import GridSearchCV
tuned_parameters = {
    "loss":["deviance"], #, "exponential" requires 2 target classes (not-multi...)
    "learning_rate": [0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.25, 0.3],
    #"min_samples_split": np.linspace(0.1, 0.5, 12),
    #"min_samples_leaf": np.linspace(0.1, 0.5, 12),
    "max_depth":[2,3,4,5],
    #"max_features":["log2","sqrt"],
    #"criterion": ["friedman_mse",  "mae"],
    #"subsample":[0.5, 0.618, 0.8, 0.85, 0.9, 0.95, 1.0],
    "n_estimators":[ 25, 50, 100, 125, 150]
    }

#score on precision and recall...
#clf = GridSearchCV(GradientBoostingClassifier(), parameters, cv=10, n_jobs=-1)

scores = ['f1']# ['recall']
weight = 'weighted'  # averaging used for the scorer name, e.g. 'f1_weighted'
output = False       # True -> also print every grid score and a test-set report

for score in scores:
    print(f'# Tuning hyper-parameters for {score}\n')
    #using macro scoring...try weighted after? ...for micro, precision=accuracy...for multiclass
    clf = GridSearchCV(GradientBoostingClassifier(), tuned_parameters,
                       scoring=f'{score}_{weight}', cv=10, n_jobs=-1)
    #clf = GridSearchCV( svm.SVC(), tuned_parameters, scoring=f'{score}_macro' )
    clf.fit( X_train, y_train )
    print('best params found on development set\n')
    print( clf.best_params_ )
    if output:
        print('\ngrid scores on development set:')
        means = clf.cv_results_['mean_test_score']
        stds = clf.cv_results_['std_test_score']
        for mean,std,params in zip( means, stds, clf.cv_results_['params'] ):
            print('{:.3f} +/-{:.3f} for {}'.format( mean, 2*std, params) )
        # the test-set report does not depend on the grid row, so it is printed
        # once per tuned metric instead of once per parameter combination
        print('classification report:\n')
        y_true, y_pred = y_test, clf.predict( X_test )
        print( classification_report(y_true, y_pred ) )
    

In [None]:
#print every feature name next to its importance in the random-forest model
#NOTE(review): `regr_rf` is not created anywhere in the visible notebook, so this
#cell presumably relies on a deleted cell / leftover kernel state -- confirm
for feat,imp in zip(data.columns, regr_rf.feature_importances_):
    print(feat.strip(), imp)

In [None]:

regr_rf.fit(X_train, y_train)
#evaluate the fit accuracy
regr_rf.score(X_test, y_test)
#y_pred = regr_rf.predict(X_test)#, y_test)#, multioutput='uniform_average')

#r2_score(y_test, y_pred) #...          multioutput='variance_weighted')
#y_pred, y_pred.shape, y_test.shape

In [None]:
#from sklearn.model_selection import train_test_split
#https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedShuffleSplit.html
from sklearn.metrics import r2_score

from sklearn.model_selection import StratifiedShuffleSplit
import numpy as np
sss = StratifiedShuffleSplit(n_splits=4, test_size=0.7, random_state=42)

#one hot encoding target
X,y = np.array(data), np.array(target_transformed)
#string target
#X,y = np.array(data), np.array(species_col)




In [None]:
for train_index, test_index in sss.split(X, y):
    #print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    #fit the model with the new vaues
    regr_rf.fit(X_train, y_train)
    #evaluate the fit accuracy
    #print(regr_rf.score(X_test, y_test))
    y_pred = regr_rf.predict(X_test)#, y_test)#, multioutput='uniform_average')
    print( r2_score(y_test, y_pred) )

In [None]:
# Spot-check the first few test clips: print the true label, the predicted
# label and the three most probable classes for each.
n_to_show = min(20, len(X_test))  # never index past the end of the test set
for i in range(n_to_show):
    single = X_test[i].reshape(1,-1) 

    ab = cbg.predict( single )
    probas = cbg.predict_proba( single )
    #log_probas = cbg.predict_log_proba( single )

    # (class, probability) pairs, most probable first
    top_results = sorted( zip( cbg.classes_, probas[0] ), key=lambda l: l[1], reverse=True )

    print( f'actual {y_test[i]}, predicted {ab}, top_proba=predicted? {ab[0]==top_results[0][0]}' )
    # iterate the pairs directly; wrapping the list in zip() printed 1-tuples
    for res in top_results[:3]:
        print(res)
    print()
    

In [None]:
#best params found on development set

best_params = {'criterion': 'friedman_mse', 'learning_rate': 0.2, 'loss': 'deviance', 'max_depth': 5, 'max_features': 'sqrt', 'min_samples_leaf': 0.1, 'min_samples_split': 0.17272727272727273, 'n_estimators': 10, 'subsample': 0.9}

In [None]:
from sklearn.feature_selection import RFE
#recursive feature elimination down to 5 features, removing 1 feature per step
#NOTE(review): `estimator` is not defined anywhere in the visible notebook -- confirm
#which model was meant here
selector = RFE(estimator, 5, step=1)
selector = selector.fit(X, y)

In [None]:
#compare default with best params found from hyper paramter tuning
cbg = GradientBoostingClassifier( **best_params )
cbg.fit(X_train, y_train)

y_pred = cbg.predict(X_test)
cbg.score(X_test, y_test)

Sensitivity and specificity, precision and recall -> write a short blurb about what each one says. Talk about the two that people usually use: how well the model can predict, and how good it is at avoiding false discoveries.

ROC curve: what you'd like to see is a curve that goes straight up, then straight across.

get rid of bottom row...

reason of why XGboost...why did i choose it

Export the model parameters to feed into the App

In [None]:
#write the resulting model parameters with joblib
from joblib import dump, load
dump(cbg, 'cbg_model_300samples.joblib') 
#test the loading of the model to verify it's functionality
#cbg_loaded = load('cbg_model_all.joblib') 

In [None]:
#cbg.fit(X_train, y_train)

# `cbg_loaded` only exists when the joblib `load(...)` line was run by hand;
# fall back to the in-memory model so the cell also works on a fresh kernel.
model_to_check = cbg_loaded if 'cbg_loaded' in globals() else cbg
y_pred = model_to_check.predict(X_test)
model_to_check.score(X_test, y_test)

In [None]:
from sklearn.metrics import classification_report
# y_pred was predicted from X_test, so the matching true labels are y_test
# (`y_true` is only assigned inside the grid-search cells and may not exist yet)
print( classification_report(y_test, y_pred ) )

In [None]:
from sklearn import svm
clf = svm.SVC(decision_function_shape='ovo', gamma='auto')
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

In [None]:
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

# two grids: an RBF kernel (gamma, C) and a linear kernel (C only)
tuned_parameters = [ {'kernel': ['rbf'], 'gamma': [1e-3],# 1e-4],
                      'C': [1, 10]},#, 100, 1000]},
                     {'kernel': ['linear'], 'C': [1, 10]}]#, 100, 1000]} ]

scores = ['precision', 'recall']

for score in scores:
    print(f'# Tuning hyper-parameters for {score}\n')
    
    clf = GridSearchCV(
        svm.SVC(), tuned_parameters, scoring=f'{score}_macro', n_jobs=-1 )
    clf.fit( X_train, y_train )
    print('best params found on development set\n')
    print( clf.best_params_ )
    print('\ngrid scores on development set:\n')
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean,std,params in zip( means, stds, clf.cv_results_['params'] ):
        print('{:.3f} +/-{:.3f} for {}'.format( mean, 2*std, params) )
    # the test-set report does not depend on the grid row, so it is printed once
    # instead of once per parameter combination
    print('classification report:\n')
    y_true, y_pred = y_test, clf.predict( X_test )
    print( classification_report(y_true, y_pred ) )
    # stop after the first metric in `scores`; remove to tune the others too
    break

In [None]:
y.shape, y

In [None]:
#Reciever operator curve for multi class classifier
# https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html
#https://scikit-learn.org/stable/modules/generated/sklearn.multiclass.OneVsRestClassifier.html#sklearn.multiclass.OneVsRestClassifier
import numpy as np
import matplotlib.pyplot as plt
from itertools import cycle

from sklearn import svm, datasets
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
from scipy import interp
from sklearn.metrics import roc_auc_score

from sklearn.model_selection import StratifiedShuffleSplit

# Import some data to play with
##iris = datasets.load_iris()
##X = iris.data
##y = iris.target

# Binarize the output: one indicator column per class in ohe_column_names
##y = label_binarize(y, classes=[0, 1, 2])
#NOTE(review): this assumes `y` currently holds the string species labels; an
#earlier cell also assigns the one-hot target to `y` -- confirm which one is live
y = label_binarize(y, classes=ohe_column_names )
n_classes = y.shape[1]

# Shared random state (the noisy-feature lines from the original example are disabled)
random_state = np.random.RandomState(42)
#n_samples, n_features = X.shape
#X = np.c_[X, random_state.randn(n_samples, 20 * n_features)]

# shuffle and split training and test sets
##X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.5, random_state=0)
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.7, random_state=random_state)

# n_splits=1, so this loop runs once and simply unpacks that single split
for train_index, test_index in sss.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

# Learn to predict each class against the other
#classifier = OneVsRestClassifier(svm.SVC(kernel='linear', probability=True,
classifier = OneVsRestClassifier(GradientBoostingClassifier( random_state=random_state),n_jobs=-1)
                                 #random_state=random_state))
# per-class decision scores on the test rows, used as the ROC ranking scores
y_score = classifier.fit(X_train, y_train).decision_function(X_test)

# Compute ROC curve and ROC area for each class
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Compute micro-average ROC curve and ROC area (all classes flattened together)
fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

In [None]:
plt.figure()
lw = 2
plt.plot(fpr[2], tpr[2], color='darkorange',
         lw=lw, label='ROC curve (area = %0.2f)' % roc_auc[2])
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()

In [None]:
#First aggregate all false positive rates
all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))

# Then interpolate all ROC curves at these points.
# np.interp is called directly: `scipy.interp` was only a deprecated alias of it
# and is not available in newer SciPy releases.
mean_tpr = np.zeros_like(all_fpr)
for i in range(n_classes):
    mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])

# Finally average it and compute AUC
mean_tpr /= n_classes

fpr["macro"] = all_fpr
tpr["macro"] = mean_tpr
roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

# line width for the per-class curves; set here so the cell does not depend on
# the single-curve plotting cell having been run first
lw = 2

# Plot all ROC curves
plt.figure()
plt.plot(fpr["micro"], tpr["micro"],
         label='micro-average ROC curve (area = {0:0.2f})'
               ''.format(roc_auc["micro"]),
         color='deeppink', linestyle=':', linewidth=4)

plt.plot(fpr["macro"], tpr["macro"],
         label='macro-average ROC curve (area = {0:0.2f})'
               ''.format(roc_auc["macro"]),
         color='navy', linestyle=':', linewidth=4)


##colors = cycle(['aqua', 'darkorange', 'cornflowerblue'])
colors = cycle(['tab:blue', 'tab:orange', 'tab:green', 'tab:red', 'tab:purple', 'tab:brown', 'tab:pink',
                'tab:gray', 'tab:olive', 'tab:cyan'])
for i, color in zip(range(n_classes), colors):
    plt.plot(fpr[i], tpr[i], color=color, lw=lw,
             label='ROC curve of class {0} (area = {1:0.2f})'
             ''.format(i, roc_auc[i]))

# chance-level diagonal for reference
plt.plot([0, 1], [0, 1], 'k--', lw=lw)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
#plt.title('Some extension of Receiver operating characteristic to multi-class')
plt.title('Receiver Operating Characteristic Curves')
plt.legend(loc="lower right", fontsize=8)
plt.show()

In [None]:
# Re-print the grid scores kept in `means` / `stds` from the last grid search
for mean,std,params in zip( means, stds, clf.cv_results_['params'] ):
    print('{:.3f} +/-{:.3f} for {}'.format( mean, 2*std, params) )
# the test-set report does not depend on the grid row, so it is printed once
# instead of once per parameter combination
print('classification report:\n')
y_true, y_pred = y_test, clf.predict( X_test )
print( classification_report(y_true, y_pred ) )

Take the ratio of the 'All' row and the 'All' column for each species to determine the percentage of each species that was predicted correctly.
- If it is over 1, the model was 'over-fitted' to that species (i.e. it predicted that species more often than it actually occurred).
- If it is less than 1, then (I would think) this is better, since the model just mis-identified some clips rather than deciding that everything was "a blue jay" because the data set was mostly blue jays.

In [None]:
#tn, tp are diagonals
#fp = top predicted true, but actually false
#fn = predicted false, but actually true
#tn,fp,fn,tp = confusion_matrix.ravel()
CM = np.array(confusion_matrix)
tnfp = CM.ravel()
print(CM.shape)
# same tallies as walking every cell (i, j) of the matrix:
tp = np.trace(CM)              # cells on the diagonal (i == j)
n = CM.sum()                   # every cell
fn = np.tril(CM, k=-1).sum()   # cells below the diagonal (i > j)
fp = np.triu(CM, k=1).sum()    # cells above the diagonal (i < j)

#confusion_matrix.all
tp, n, fp, fn

In [None]:
CM = multilabel_confusion_matrix(y_test, y_pred)

In [None]:
import xgboost as xgb
model=xgb.XGBClassifier(random_state=1,learning_rate=0.01)
# the split arrays are named X_train / X_test everywhere else in this notebook;
# the lower-case x_train / x_test used here before are never defined
model.fit(X_train, y_train)
model.score(X_test,y_test)

In [None]:
from sklearn.metrics import multilabel_confusion_matrix
import matplotlib.pyplot as plt
#multilabel_confusion_matrix(y_test, y_predicted)  #give error due to continous output of RF regressor
#y_pred = (y_pred > 0.5) 
CM = multilabel_confusion_matrix(y_test, y_pred)
#CM = confusion_matrix(y_test, y_pred)
#CMml, CMml.shape
plt.figure()
sns.heatmap(CM[:,:,1], annot=True)
plt.tight_layout()

plt.figure()

sns.heatmap(CM[:,:,0], annot=True)
plt.tight_layout()
CM.shape, CM[:,:,0]
#sn.heatmap(CM, annot=True)
pd.DataFrame(y_pred).head()

In [None]:
from sklearn.multioutput import MultiOutputRegressor


#wrap a random-forest regressor in MultiOutputRegressor, then fit it and score it on the test split
#NOTE(review): neither `RandomForestRegressor` nor `max_depth` is imported/defined
#in the visible notebook -- confirm where they are supposed to come from
regr_multirf = MultiOutputRegressor(RandomForestRegressor(n_estimators=1000, max_depth=max_depth, random_state=12))
regr_multirf.fit(X_train, y_train)
regr_multirf.score(X_test, y_test)

In [None]:
label_one = ('zero', 'one')
label_two = ohe_column_names
cols = pd.MultiIndex.from_product([label_one, label_two])

pd.DataFrame(CM.T.reshape(2, -1), columns=cols)

In [None]:
import seaborn as sn
#get the prediction results
y_predicted = regr_rf.predict(X_test)
#store the actual answers and the predicted answers in a DataFrame
prediction_data = {'y_Actual':   y_test,
        'y_Predicted': y_predicted }

df = pd.DataFrame( y_predicted, y_test )
#df = pd.DataFrame(prediction_data)#, columns=['y_Actual','y_Predicted'])
#calculate the confusion matrix
##confusion_matrix = pd.crosstab(df['y_Actual'], df['y_Predicted'], rownames=['Actual'], colnames=['Predicted'], margins = True)

#plot the heatmap using seaborn
##sn.heatmap(confusion_matrix, annot=True)
#df = pd.DataFrame(prediction_data, columns=['y_Actual','y_Predicted'])
#y_test
df

In [None]:
#ratio of species in train vs test (ideally should be ~3 for all)
train, test = pd.DataFrame( y_train ), pd.DataFrame( y_test )
train.columns, test.columns = ['species'], ['species']
train.species.value_counts()/test.species.value_counts(), test.species.value_counts()


In [None]:
lin_clf = svm.LinearSVC()
lin_clf.fit(X_train, y_train)
# score the model that was just fitted (this used to score the unrelated `clf`)
lin_clf.score(X_test, y_test)


In [None]:
predicted = regr_rf.predict(X_test)
for i,prediction in enumerate(predicted):
    print(i, y_test[i] - prediction)
    #print( sum(y_test[i] - prediction))

In [None]:
import seaborn as sns
sns.set(style="ticks", color_codes=True)

a = ((-1, 1, 2, 'a'),
     (3, 4, 2, 'b'),
     (6, 7, 3, 'c'),
     (9, 10, 3, 'd'))

df = pd.DataFrame( a )
df.columns = 'first second third letters'.split()
g_works = sns.pairplot(df)
g_broken = sns.pairplot(df, hue="letters")