In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt

import csv
import numpy as np


from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

import scores

In [14]:

# # Reading in the Data
# Locate the CSV files under ./data, load the training set, expand the
# string-encoded stability vector into one integer column per composition,
# and add a flag marking element pairs that form at least one stable compound.

path_f = os.getcwd()
path_f_1 = os.path.join(path_f, 'data')

# All CSV files found in the data directory (kept in `names` for inspection).
names = [files_txts for files_txts in os.listdir(path_f_1) if files_txts.endswith(".csv")]

# os.listdir() returns entries in arbitrary order, so the training and test
# files are selected by name instead of by their position in `names`.
train_candidates = [csv_name for csv_name in names if csv_name.lower().startswith('train')]
test_candidates = [csv_name for csv_name in names if csv_name.lower().startswith('test')]
if len(train_candidates) != 1 or len(test_candidates) != 1:
    raise FileNotFoundError(
        'Expected exactly one train*.csv and one test*.csv in {!r}, found: {}'.format(path_f_1, names)
    )

path_train = os.path.join(path_f_1, train_candidates[0])
path_test = os.path.join(path_f_1, test_candidates[0])

df_train = pd.read_csv(path_train)


# ## Data Manipulation
print('Training Data is being read ....')
#  - Transforming the outcome to a numpy vector

# Each 'stabilityVec' entry is a bracketed, comma-separated string; strip the
# brackets and parse the numbers into an integer row.
stab_vector = df_train['stabilityVec'].values
y = np.array([np.fromstring(vec_text[1:-1], sep=',').astype(int) for vec_text in stab_vector])

df_tmp = pd.DataFrame(y, columns=['A', 'A91B', 'A82B', 'A73B', 'A64B', 'A55B', 'A46B', 'A37B', 'A28B', 'A19B', 'B'])
# Only the mixed compositions are modelled; the two end members 'A' and 'B' are left out.
stab_vec_list = ['A91B', 'A82B', 'A73B', 'A64B', 'A55B', 'A46B', 'A37B', 'A28B', 'A19B']

df_train = df_train.drop("stabilityVec", axis=1)  # removing the results which originally are a string
feature_cols = list(df_train)  # every remaining input column is treated as a feature

print(df_train.shape)

# Replace the formula columns by the corresponding '*_elements_Number' columns
# so that element pairs can be compared numerically.
df_train['formulaA'] = df_train['formulaA_elements_Number']
df_train['formulaB'] = df_train['formulaB_elements_Number']

df_train = pd.concat([df_train, df_tmp], axis=1)
print(df_train.shape)

# ### Input Data Normalization and Feature Engineering
print('Training Data has been read and feature engineering is being performed....')

y_all = df_train[stab_vec_list]
# 1 -> the pair has at least one stable mixed composition, 0 -> none.
df_tmp_stable = pd.DataFrame({'Stable_compunds': (y_all.sum(axis=1) != 0).astype(int)})

df_train = pd.concat([df_train, df_tmp_stable], axis=1)
print(df_train.shape)

df_train.head()





Training Data is being read ....
(2572, 98)
(2572, 109)
Training Data has been read and feature engineering is being performed....
(2572, 110)


Unnamed: 0,formulaA,formulaB,formulaA_elements_AtomicVolume,formulaB_elements_AtomicVolume,formulaA_elements_AtomicWeight,formulaB_elements_AtomicWeight,formulaA_elements_BoilingT,formulaB_elements_BoilingT,formulaA_elements_BulkModulus,formulaB_elements_BulkModulus,...,A82B,A73B,A64B,A55B,A46B,A37B,A28B,A19B,B,Stable_compunds
0,89,47,37.433086,17.075648,227.0,107.8682,3473.0,2435.0,0.0,100.0,...,0,1,0,1,0,0,0,0,1,1
1,89,13,37.433086,16.594425,227.0,26.981539,3473.0,2792.0,0.0,76.0,...,0,1,0,0,0,0,0,0,1,1
2,89,33,37.433086,21.723966,227.0,74.9216,3473.0,887.0,0.0,22.0,...,0,0,0,0,0,0,0,0,1,0
3,89,56,37.433086,64.969282,227.0,137.327,3473.0,2143.0,0.0,9.6,...,0,0,0,0,0,0,0,0,1,0
4,89,83,37.433086,35.483459,227.0,208.9804,3473.0,1837.0,0.0,31.0,...,0,0,0,0,0,0,0,0,1,0


In [3]:
# Show the CSV file names that the data-loading cell collected from the data directory.
print(names)

['training_data.csv', 'test_data.csv']


## Selecting the Output for One Component of the Stability Vector (chosen by `count`)

In [54]:
## Observing how many element pairs produce a stable compound per % and overall
# For one component of the stability vector, keep only the rows whose element A
# or element B takes part in at least one stable compound for that component,
# then narrow further to rows that have any stable mixed composition at all.

y_all = df_train[stab_vec_list]

count = 8  # position in stab_vec_list of the component analysed in this run

y = df_train[stab_vec_list[count]]
print(y.value_counts())

# Find the elements that create a stable compound in this vector component
stable_comp = df_train.loc[y == 1, ['formulaA', 'formulaB']]
print('Compound being analyzed is', stab_vec_list[count])
stable_comp_num = stable_comp.values
stable_A = np.unique(stable_comp_num[:, 0])
stable_B = np.unique(stable_comp_num[:, 1])

# Collect the row labels per element (A matches first, then B matches) and
# concatenate them once; unique() keeps first-appearance order, which is the
# row order the original loop-based accumulation produced.
matching_labels = [df_train.index[df_train['formulaA'] == element_a] for element_a in stable_A]
matching_labels += [df_train.index[df_train['formulaB'] == element_b] for element_b in stable_B]
selected_labels = df_train.index[:0].append(matching_labels).unique()

# These are index *labels*, so select with .loc (label-based), not .iloc
# (position-based); the two only coincide for a default RangeIndex.
y_unique = y.loc[selected_labels]
df_unique = df_train.loc[selected_labels]
print(y_unique.value_counts())
print('The elements in these compounds create a stable compound for this component of the stability vector:',y_unique.shape)

# Rows with at least one stable mixed composition, aligned to the selection above.
has_any_stable = y_all.sum(axis=1) != 0
stable_rows = has_any_stable.loc[selected_labels]

y_stable = y_unique.loc[stable_rows]
df_stable = df_unique.loc[stable_rows]
print(y_stable.value_counts())
print('The elements in these compounds create a stable compound for this component of the stability vector and create at least one stable compound:',y_stable.shape)



0    2512
1      60
Name: A19B, dtype: int64
Compound being analyzed is A19B
0    1240
1      60
Name: A19B, dtype: int64
The elements in these compounds create a stable compound for this component of the stability vector: (1300,)
0    545
1     60
Name: A19B, dtype: int64
The elements in these compounds create a stable compound for this component of the stability vector and create at least one stable compound: (605,)


## Pearson Correlation and Input Normalization

In [56]:
# Pearson Correlation to Identify the features that influence the most on the output
# The cell (1) ranks features by absolute Pearson correlation with the selected
# target and keeps those above a threshold, (2) builds several normalised
# versions of the kept features, and (3) repeats the selection on the first 20
# principal components of the row-normalised full feature set.
from scipy.stats import zscore
from sklearn import preprocessing
from sklearn.preprocessing import normalize

print('Pearson Correlation has been calculated to build the model in the most relevant features ....')
target_name = stab_vec_list[count]

# Only rows whose elements form a stable compound for this component and that
# have at least one stable compound overall are used.
X_train_new_all = df_stable[feature_cols]

y_new = y_stable
print('Number of Results to train on:',y_new.shape)
print('Number of Training Features before Pearson correlation:', X_train_new_all.shape[1])

corr_df = pd.concat([X_train_new_all, y_new], axis=1)
a = corr_df.corr()

## Incorporating the Features that contribute the most based on a pearson correlation coefficient threshold

thr = .13

# Drop the target by name rather than by position: if the target is missing
# from the selection (e.g. its correlation is NaN), a positional delete would
# silently remove a real feature or fail on an empty list.
corr_variables = [col for col in a[a[target_name].abs() > thr].index if col != target_name]

print('Pearson Correlation has identified', len(corr_variables), 'with ', str(thr) )

## Normalization of Input Data

## Using Un-normalized data as input
X_train_new = df_stable[corr_variables]

print('Number of Training Features after Pearson correlation:', X_train_new.shape[1])

# Normalizing such that the magnitude is one (each row scaled to unit norm)
X_train_new_mag_1 = normalize(X_train_new, axis=1)
print(X_train_new_mag_1.shape)

## Normalizing by Zscore (applied column by column)
X_train_new_Z_score = X_train_new.apply(zscore)
print(X_train_new_Z_score.shape)

## Normalizing so that range is 0-1
min_max_scaler = preprocessing.MinMaxScaler()
X_train_new_0_1 = min_max_scaler.fit_transform(X_train_new)
print(X_train_new_0_1.shape)

## Normalizing so that range is -1 to 1
max_abs_scaler = preprocessing.MaxAbsScaler()
X_train_new_m1_p1 = max_abs_scaler.fit_transform(X_train_new)
print(X_train_new_m1_p1.shape)

# Using PCA as input
X_train_4_PCA = df_stable[feature_cols]
indx_4_PC = X_train_4_PCA.index
X_train_new_mag_1_PCA = normalize(X_train_4_PCA, axis=1)

pca = PCA()
pca.fit(X_train_new_mag_1_PCA)
components = pca.components_[:20, :]
# Project onto the first 20 component directions. This is a plain dot product,
# so unlike pca.transform() the data is not mean-centred first.
new_data = np.dot(X_train_new_mag_1_PCA, components.T)
X_train_new_PCA = new_data

print(X_train_new_PCA.shape)

## Using Pearson Correlation in PCA
# Re-attach the original row labels so the components align with y_new.
df1 = pd.DataFrame(data=X_train_new_PCA, index=indx_4_PC)
print(df1.shape)

corr_df_PCA = pd.concat([df1, y_new], axis=1)

a_PCA = corr_df_PCA.corr()

thr = .05
corr_variables_PCA = [col for col in a_PCA[a_PCA[target_name].abs() > thr].index if col != target_name]

print('Pearson Correlation in PCA Space has identified', len(corr_variables_PCA), 'with ', str(thr) )

X_train_PCA_PC = df1[corr_variables_PCA]

print('Number of Training Features after Pearson correlation in PCA Space:', X_train_PCA_PC.shape[1])








Pearson Correlation has been calculated to build the model in the most relevant features ....
Number of Results to train on: (605,)
Number of Training Features before Pearson correlation: 98
Pearson Correlation has identified 30 with  0.13
Number of Training Features after Pearson correlation: 30
(605, 30)
(605, 30)
(605, 30)
(605, 30)
(605, 20)
(605, 20)
Pearson Correlation in PCA Space has identified 12 with  0.05
Number of Training Features after Pearson correlation in PCA Space: 12


## Model Generation

In [29]:
print('Training Model Using Z-normalized Data')

# Hold out 15% of the z-scored rows as a test set; the split is shuffled with a
# fixed seed so it can be reproduced.
X_train, X_test, y_train, y_test = train_test_split(
    X_train_new_Z_score,
    y_new,
    test_size=.15,
    random_state=42,
    shuffle=True,
)

# Report (features, labels) shapes for the training part, then the test part.
for features, labels in ((X_train, y_train), (X_test, y_test)):
    print(features.shape, labels.shape)

Training Model Using Z-normalized Data
(1040, 29) (1040,)
(184, 29) (184,)


In [30]:
# Mean of the training labels; for 0/1 labels this is the fraction of positive samples.
print(y_train.mean())

0.4894230769230769


# Initial Hyperparameter Tuning to Identify the Best Classifier and Its Parameter Values

In [None]:
# Hyper-parameter grid search for five classifier families.
# Each search is delegated to a helper in the project-local `scores` module,
# which returns a results table; the rows with the best test metric are printed.
# NOTE(review): the fifth positional argument (2 or 10 below) is presumably the
# number of cross-validation folds -- confirm against the `scores` module.


def _best_rows(results, metric):
    """Return the first rows of `results` whose `metric` equals the column maximum."""
    return results[results[metric] == results[metric].max()].head()


# ---------------------------------------------------------------- Random Forest
print(' -- Random Forest --')

# first pass
n_estimators = [1, 3, 5, 10, 50, 100]
criterion = ['entropy']
bootstrap = [True, False]
max_depth = [2, 5, 10]
min_samples_splits = [2, 3, 4, 6, 7, 8, 9, 10, 20]
min_samples_leafs = [1, 2, 5, 10]
min_impurity_splits = [5e-7, 1e-6]

# Narrower grids tried in later passes (kept for reference):
#   second pass
#     n_estimators = [10,20,50]
#     criterion=['entropy']
#     bootstrap= [True, False]
#     max_depth=[5,6]
#     min_samples_splits=[2,3,4,5,6]
#     min_samples_leafs=[1,3,5]
#     min_impurity_splits=[3e-7, 5e-7,1e-6]
#   further pass
#     n_estimators = [1,3,5,8]
#     criterion=['entropy']
#     bootstrap= [True, False]
#     max_depth=[1,3,4]
#     min_samples_splits=[2,3,4,5]
#     min_samples_leafs=[1]
#     min_impurity_splits=[3e-7, 5e-7,8e-7]

df_results_RF = scores.hp_tune_Random_Forest(
    X_train, y_train, X_test, y_test, 2,
    n_estimators, criterion, bootstrap, max_depth,
    min_samples_splits, min_samples_leafs, min_impurity_splits,
)

print('This are the best Parameters for Random Forest:')
# Unlike the other searches, the random forest is ranked by test accuracy.
print(_best_rows(df_results_RF, 'test_accuracy'))


# --------------------------------------------------------------- Decision Trees
print(' -- Decision Trees --')

criterion = ['entropy', 'gini']
bootstrap = [True, False]  # assigned here but not passed to the decision-tree search
max_depth = [1, 2, 5, 10, 100, 250, 1000]
split = ['random', 'best']
min_samples_splits = [2, 3, 4, 6, 7, 8, 9, 10]
min_samples_leafs = [1]
min_impurity_splits = [5e-7, 1e-6]

# Narrower grids tried in later passes (kept for reference):
#   second pass
#     criterion=['entropy']
#     max_depth=[10,11,15]
#     split=['random','best']
#     min_samples_splits=[2,3,4,6]
#     min_samples_leafs=[1,3,5]
#     min_impurity_splits=[3e-7, 5e-7,1e-6]
#   further pass
#     criterion=['entropy']
#     max_depth=[1,3,510]
#     split=['best']
#     min_samples_splits=[2,3]
#     min_samples_leafs=[1]
#     min_impurity_splits=[3e-7, 5e-7,8e-5]

df_results_DT = scores.hp_tune_Decision_tree(
    X_train, y_train, X_test, y_test, 2,
    criterion, max_depth, split,
    min_samples_splits, min_samples_leafs, min_impurity_splits,
)

print('This are the best Parameters for Decision Tree:')
print(_best_rows(df_results_DT, 'test_results_auc'))


# -------------------------------------------------------------------------- KNN
print(' -- KNN --')

criterion = ['distance', 'uniform']
neighbors = [1, 2, 3, 10, 50, 100]
distances = [1, 2, 3, 4, 5]

df_results_KNN = scores.hp_tune_KNN(
    X_train, y_train, X_test, y_test, 2,
    criterion, neighbors, distances,
)

print('This are the best Parameters for KNN :')
print(_best_rows(df_results_KNN, 'test_results_auc'))


# -------------------------------------------------------------------------- SVM
print(' -- SVM --')

kernel = ['rbf', 'linear', 'poly', 'sigmoid']
gammas = [.001, .1, 1, 3, 5]
cs = [.0001, .1, 1, 5, 10, 15, 20]

df_results_SVM = scores.hp_tune_SVM(
    X_train, y_train, X_test, y_test, 10,
    kernel, gammas, cs,
)

print('This are the best Parameters for SVM :')
print(_best_rows(df_results_SVM, 'test_results_auc'))


# ---------------------------------------------------------- Logistic Regression
print(' -- Logistic Regression --')

# Here `criterion` holds the solver names to try.
criterion = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']

df_results_log_reg = scores.hp_tune_log_reg(
    X_train, y_train, X_test, y_test, 10,
    criterion,
)

print('This are the best Parameters for Logistic Regression :')
print(_best_rows(df_results_log_reg, 'test_results_auc'))


## Fitting the Best Model

In [13]:
## Fitting best Model
# Fit the tuned random forest and a default-parameter random forest on the same
# split and print precision / recall / F1 / accuracy, the confusion matrix and
# the AUC (all computed by the project-local scores.scores helper) for both the
# training and the test data.
from sklearn.ensemble import RandomForestClassifier

print(' -- Optimal Random Forest --')

rfc_opt = RandomForestClassifier(
    n_estimators=100,
    criterion='gini',
    bootstrap=False,
    max_depth=10,
    min_samples_split=10,
    min_samples_leaf=5,
    min_impurity_decrease=5e-07,
    random_state=0,
    n_jobs=-1,
    class_weight={0: .1, 1: 0.9},  # hand-picked weights that favour class 1
)
rfc_opt.fit(X_train, y_train)

# Training-set performance of the tuned model
train_pred = rfc_opt.predict(X_train)

precision,recall,F1,accuracy,confusion,roc_auc=scores.scores(y_train,train_pred)
print('Training precision: ', precision, '  recall: ', recall, '  F1: ', F1, '  accuracy: ', accuracy)
print('Training Confusion matrix')
print(confusion)
print('Training AUC:',roc_auc)

# Test-set performance of the tuned model
y_pred = rfc_opt.predict(X_test)

precision,recall,F1,accuracy,confusion,roc_auc=scores.scores(y_test,y_pred)
print('Optimal precision: ', precision, '  recall: ', recall, '  F1: ', F1, '  accuracy: ', accuracy)
print('optimal Confusion matrix')
print(confusion)
print('Optimal AUC:',roc_auc)


## Compare to Default Model
print(' -- Default Random Forest --')
# Default hyper-parameters, but seeded so that the baseline numbers are
# reproducible from run to run like those of the tuned model.
rfc_def = RandomForestClassifier(random_state=0)
rfc_def.fit(X_train, y_train)

# Training-set performance of the default model
train_pred = rfc_def.predict(X_train)

precision,recall,F1,accuracy,confusion,roc_auc=scores.scores(y_train,train_pred)
print('DEF Training precision: ', precision, '  recall: ', recall, '  F1: ', F1, '  accuracy: ', accuracy)
print('DEF Training Confusion matrix')
print(confusion)
print('DEF Training AUC:',roc_auc)

# Test-set performance of the default model
y_pred = rfc_def.predict(X_test)

precision,recall,F1,accuracy,confusion,roc_auc=scores.scores(y_test,y_pred)
print('Defualt Model precision: ', precision, '  recall: ', recall, '  F1: ', F1, '  accuracy: ', accuracy)
print('Defualt ModelConfusion matrix')
print(confusion)
print('Defualt ModelAUC:',roc_auc)

 -- Optimal Random Forest --
Training precision:  0.7241379310344828   recall:  1.0   F1:  0.8400000000000001   accuracy:  0.9700934579439252
Training Confusion matrix
[[477  16]
 [  0  42]]
Training AUC: 0.9837728194726166
Optimal precision:  0.4   recall:  0.5   F1:  0.4444444444444445   accuracy:  0.8947368421052632
optimal Confusion matrix
[[81  6]
 [ 4  4]]
Optimal AUC: 0.7155172413793104
 -- Default Random Forest --
DEF Training precision:  1.0   recall:  0.8809523809523809   F1:  0.9367088607594937   accuracy:  0.9906542056074766
DEF Training Confusion matrix
[[493   0]
 [  5  37]]
DEF Training AUC: 0.9404761904761905
Defualt Model precision:  0.6   recall:  0.375   F1:  0.4615384615384615   accuracy:  0.9263157894736842
Defualt ModelConfusion matrix
[[85  2]
 [ 5  3]]
Defualt ModelAUC: 0.6760057471264368


In [None]:
## Fitting best Model
# Fit the tuned decision tree and a default-parameter decision tree on the same
# split and print training and test metrics for both.
# The class is imported directly: no earlier cell binds the name `sklearn`, so
# `sklearn.tree.DecisionTreeClassifier` fails with NameError on a fresh kernel.
from sklearn.tree import DecisionTreeClassifier
# Kept from the original cell: a later cell uses RandomForestClassifier without importing it.
from sklearn.ensemble import RandomForestClassifier

print(' -- Optimal Decision Tree --')

# NOTE(review): class 1 is weighted by its own frequency, so whenever it is the
# rarer class it receives the smaller weight -- confirm this is intended.
positive_rate = y_train.mean()

# The variable keeps its historical `rfc_` prefix although it holds a decision tree.
rfc_opt = DecisionTreeClassifier(
    class_weight={0: 1 - positive_rate, 1: positive_rate},
    criterion='entropy',
    max_depth=510,
    random_state=0,
    splitter='best',
    min_samples_split=3,
    min_samples_leaf=1,
    min_impurity_decrease=3e-7,
)
rfc_opt.fit(X_train, y_train)

# Training-set performance of the tuned tree
train_pred = rfc_opt.predict(X_train)

precision,recall,F1,accuracy,confusion,roc_auc=scores.scores(y_train,train_pred)
print('Training precision: ', precision, '  recall: ', recall, '  F1: ', F1, '  accuracy: ', accuracy)
print('Training Confusion matrix')
print(confusion)
print('Training AUC:',roc_auc)

# Test-set performance of the tuned tree
y_pred = rfc_opt.predict(X_test)

precision,recall,F1,accuracy,confusion,roc_auc=scores.scores(y_test,y_pred)
print('Optimal precision: ', precision, '  recall: ', recall, '  F1: ', F1, '  accuracy: ', accuracy)
print('optimal Confusion matrix')
print(confusion)
print('Optimal AUC:',roc_auc)


## Compare to Default Model
print(' -- Default Decision Tree --')
# Default hyper-parameters, seeded so the baseline is reproducible.
rfc_def = DecisionTreeClassifier(random_state=0)

rfc_def.fit(X_train, y_train)

# Training-set performance of the default tree
train_pred = rfc_def.predict(X_train)

precision,recall,F1,accuracy,confusion,roc_auc=scores.scores(y_train,train_pred)
print('DEF Training precision: ', precision, '  recall: ', recall, '  F1: ', F1, '  accuracy: ', accuracy)
print('DEF Training Confusion matrix')
print(confusion)
print('DEF Training AUC:',roc_auc)

# Test-set performance of the default tree
y_pred = rfc_def.predict(X_test)

precision,recall,F1,accuracy,confusion,roc_auc=scores.scores(y_test,y_pred)
print('Defualt Model precision: ', precision, '  recall: ', recall, '  F1: ', F1, '  accuracy: ', accuracy)
print('Defualt ModelConfusion matrix')
print(confusion)
print('Defualt ModelAUC:',roc_auc)

In [None]:
## Fitting best Model
# Compare the tuned support-vector classifier with a default SVC on the test set.
import sklearn.svm

print(' -- Optimal SVM --')

# NOTE(review): class 1 is weighted by its own frequency, so whenever it is the
# rarer class it receives the smaller weight -- confirm this is intended.
positive_rate = y_train.mean()
tuned_svc_params = dict(
    kernel='poly',
    gamma=3,
    C=10,
    random_state=0,
    class_weight={0: 1 - positive_rate, 1: positive_rate},
)

# The variable keeps its historical `rfc` name although it holds an SVC.
rfc = sklearn.svm.SVC(**tuned_svc_params)
rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)

test_metrics = scores.scores(y_test, y_pred)
precision, recall, F1, accuracy, confusion, roc_auc = test_metrics
print('Optimal precision: ', precision, '  recall: ', recall, '  F1: ', F1, '  accuracy: ', accuracy)
print('optimal Confusion matrix')
print(confusion)
print('Optimal AUC:',roc_auc)


print(' -- Defualt SVM --')

# Baseline with library-default hyper-parameters.
rfc = sklearn.svm.SVC()
rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)

test_metrics = scores.scores(y_test, y_pred)
precision, recall, F1, accuracy, confusion, roc_auc = test_metrics
print('Defualt Model precision: ', precision, '  recall: ', recall, '  F1: ', F1, '  accuracy: ', accuracy)
print('Defualt ModelConfusion matrix')
print(confusion)
print('Defualt ModelAUC:',roc_auc)

In [None]:
## Fitting best Model
# Compare the tuned k-nearest-neighbours classifier with a default one on the test set.
from sklearn.neighbors import KNeighborsClassifier

print(' -- Optimal KNN --')

tuned_knn_params = dict(
    algorithm='auto',
    metric='minkowski',
    p=3,                 # Minkowski exponent
    n_neighbors=1,
    weights='distance',
    n_jobs=-1,
)

# The variable keeps its historical `rfc` name although it holds a KNN model.
rfc = KNeighborsClassifier(**tuned_knn_params)
rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)

test_metrics = scores.scores(y_test, y_pred)
precision, recall, F1, accuracy, confusion, roc_auc = test_metrics
print('Optimal precision: ', precision, '  recall: ', recall, '  F1: ', F1, '  accuracy: ', accuracy)
print('optimal Confusion matrix')
print(confusion)
print('Optimal AUC:',roc_auc)


print(' -- Defualt KNN --')

# Baseline with library-default hyper-parameters.
rfc = KNeighborsClassifier()
rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)

test_metrics = scores.scores(y_test, y_pred)
precision, recall, F1, accuracy, confusion, roc_auc = test_metrics
print('Defualt Model precision: ', precision, '  recall: ', recall, '  F1: ', F1, '  accuracy: ', accuracy)
print('Defualt ModelConfusion matrix')
print(confusion)
print('Defualt ModelAUC:',roc_auc)

# Boosting and Bagging of Best Performing Models

In [None]:
# Extra-trees ensemble: fit, report training and test metrics, then re-score
# the test set with a lowered decision threshold on the class-1 probability.
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import cross_val_score  # not used here; kept because later cells call it

print('------- Extra Trees Classifier-------')

# NOTE(review): class 1 is weighted by its own frequency, so whenever it is the
# rarer class it receives the smaller weight -- confirm this is intended.
positive_rate = y_train.mean()

clf_ex = ExtraTreesClassifier(
    class_weight={0: 1 - positive_rate, 1: positive_rate},
    min_samples_split=2,
    min_samples_leaf=1,
    min_impurity_decrease=3e-07,
    random_state=0,  # seeded so the reported numbers are reproducible
)

clf_ex.fit(X_train, y_train)

# Training metrics must come from the model fitted in this cell; the original
# used `rfc_opt`, a different classifier left over from an earlier cell.
train_pred = clf_ex.predict(X_train)

precision,recall,F1,accuracy,confusion,roc_auc=scores.scores(y_train,train_pred)
print('Training precision: ', precision, '  recall: ', recall, '  F1: ', F1, '  accuracy: ', accuracy)
print('Training Confusion matrix')
print(confusion)
print('Training AUC:',roc_auc)

# Test metrics at the classifier's default decision rule
y_pred = clf_ex.predict(X_test)

precision,recall,F1,accuracy,confusion,roc_auc=scores.scores(y_test,y_pred)
print('Defualt Model precision: ', precision, '  recall: ', recall, '  F1: ', F1, '  accuracy: ', accuracy)
print('Defualt ModelConfusion matrix')
print(confusion)
print('Defualt ModelAUC:',roc_auc)

# Test metrics after re-thresholding the predicted class-1 probability at 0.1
y_scores = clf_ex.predict_proba(X_test)[:, 1]

y_pred_adj = scores.adjusted_classes(y_scores, .1)

precision,recall,F1,accuracy,confusion,roc_auc=scores.scores(y_test,y_pred_adj)
print('Adjusted Threshold precision: ', precision, '  recall: ', recall, '  F1: ', F1, '  accuracy: ', accuracy)
print('Adjusted Threshold Confusion matrix')
print(confusion)
print('Adjusted Threshold AUC:',roc_auc)


In [None]:
## ADAboosting
# Boost the tuned decision tree with AdaBoost: 10-fold cross-validated ROC AUC
# on the training data, then test metrics at the default decision rule and at a
# lowered probability threshold.
# Everything this cell needs is imported here instead of relying on names that
# other cells happen to have put into the kernel namespace.
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

print('------ ADAboosting with Decision Tree ----')

# NOTE(review): class 1 is weighted by its own frequency, so whenever it is the
# rarer class it receives the smaller weight -- confirm this is intended.
positive_rate = y_train.mean()

rfc_opt = DecisionTreeClassifier(
    class_weight={0: 1 - positive_rate, 1: positive_rate},
    criterion='entropy',
    max_depth=10,
    random_state=0,
    splitter='best',
    min_samples_split=2,
    min_samples_leaf=1,
    min_impurity_decrease=5e-7,
)

# Seeded so that the boosting run (and the scores below) are reproducible.
# NOTE(review): scikit-learn 1.2 renamed `base_estimator` to `estimator`;
# switch the keyword when the environment is upgraded.
clf = AdaBoostClassifier(base_estimator=rfc_opt, n_estimators=100, learning_rate=.01, random_state=0)

# Despite the variable name, these are ROC AUC values, one per fold.
all_accuracies = cross_val_score(estimator=clf, X=X_train, y=y_train, cv=10, scoring='roc_auc')

print(all_accuracies.mean())

clf.fit(X_train, y_train)

# Test metrics at the classifier's default decision rule
y_pred = clf.predict(X_test)

precision,recall,F1,accuracy,confusion,roc_auc=scores.scores(y_test,y_pred)
print('Defualt Model precision: ', precision, '  recall: ', recall, '  F1: ', F1, '  accuracy: ', accuracy)
print('Defualt ModelConfusion matrix')
print(confusion)
print('Defualt ModelAUC:',roc_auc)

# Test metrics after re-thresholding the predicted class-1 probability at 0.1
y_scores = clf.predict_proba(X_test)[:, 1]

y_pred_adj = scores.adjusted_classes(y_scores, .1)

precision,recall,F1,accuracy,confusion,roc_auc=scores.scores(y_test,y_pred_adj)
print('Adjusted Threshold precision: ', precision, '  recall: ', recall, '  F1: ', F1, '  accuracy: ', accuracy)
print('Adjusted Threshold Confusion matrix')
print(confusion)
print('Adjusted Threshold AUC:',roc_auc)

In [None]:
## ADAboosting
# Boost a random forest with AdaBoost: 10-fold cross-validated ROC AUC on the
# training data, then test metrics at the default decision rule and at a
# lowered probability threshold.
# Everything this cell needs is imported here instead of relying on names that
# other cells happen to have put into the kernel namespace.
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.model_selection import cross_val_score

print('------ ADAboosting with Random Forest ----')

rfc_opt = RandomForestClassifier(
    n_estimators=50,
    criterion='entropy',
    bootstrap=True,
    max_depth=5,
    min_samples_split=2,
    min_samples_leaf=1,
    min_impurity_decrease=5e-07,
    random_state=0,
    n_jobs=-1,
)

# Seeded so that the boosting run (and the scores below) are reproducible.
# NOTE(review): scikit-learn 1.2 renamed `base_estimator` to `estimator`;
# switch the keyword when the environment is upgraded.
clf = AdaBoostClassifier(base_estimator=rfc_opt, n_estimators=10, learning_rate=.01, random_state=0)

# Despite the variable name, these are ROC AUC values, one per fold.
all_accuracies = cross_val_score(estimator=clf, X=X_train, y=y_train, cv=10, scoring='roc_auc')

print(all_accuracies.mean())

clf.fit(X_train, y_train)

# Test metrics at the classifier's default decision rule
y_pred = clf.predict(X_test)

precision,recall,F1,accuracy,confusion,roc_auc=scores.scores(y_test,y_pred)
print('Defualt Model precision: ', precision, '  recall: ', recall, '  F1: ', F1, '  accuracy: ', accuracy)
print('Defualt ModelConfusion matrix')
print(confusion)
print('Defualt ModelAUC:',roc_auc)

# Test metrics after re-thresholding the predicted class-1 probability at 0.1
y_scores = clf.predict_proba(X_test)[:, 1]

y_pred_adj = scores.adjusted_classes(y_scores, .1)

precision,recall,F1,accuracy,confusion,roc_auc=scores.scores(y_test,y_pred_adj)
print('Adjusted Threshold precision: ', precision, '  recall: ', recall, '  F1: ', F1, '  accuracy: ', accuracy)
print('Adjusted Threshold Confusion matrix')
print(confusion)
print('Adjusted Threshold AUC:',roc_auc)

In [None]:
## Gradient boosting
# Gradient-boosted trees: 10-fold cross-validated ROC AUC on the training data,
# then test metrics at the default decision rule and at a lowered probability
# threshold.
# cross_val_score is imported here so the cell does not depend on an earlier
# cell having imported it.
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score

print('------ Gradient Boosting with Decision Trees ----')

clf = GradientBoostingClassifier(
    n_estimators=60,
    learning_rate=.1,
    min_samples_split=2,
    min_samples_leaf=1,
    min_impurity_decrease=5e-7,
    random_state=0,  # seeded so the reported numbers are reproducible
)

# Despite the variable name, these are ROC AUC values, one per fold.
all_accuracies = cross_val_score(estimator=clf, X=X_train, y=y_train, cv=10, scoring='roc_auc')

print(all_accuracies.mean())

clf.fit(X_train, y_train)

# Test metrics at the classifier's default decision rule
y_pred = clf.predict(X_test)

precision,recall,F1,accuracy,confusion,roc_auc=scores.scores(y_test,y_pred)
print('Defualt Model precision: ', precision, '  recall: ', recall, '  F1: ', F1, '  accuracy: ', accuracy)
print('Defualt ModelConfusion matrix')
print(confusion)
print('Defualt ModelAUC:',roc_auc)

# Test metrics after re-thresholding the predicted class-1 probability at 0.25
y_scores = clf.predict_proba(X_test)[:, 1]

y_pred_adj = scores.adjusted_classes(y_scores, .25)

precision,recall,F1,accuracy,confusion,roc_auc=scores.scores(y_test,y_pred_adj)
print('Adjusted Threshold precision: ', precision, '  recall: ', recall, '  F1: ', F1, '  accuracy: ', accuracy)
print('Adjusted Threshold Confusion matrix')
print(confusion)
print('Adjusted Threshold AUC:',roc_auc)