In [1]:

# coding: utf-8

import os
import pandas as pd
import matplotlib.pyplot as plt

import csv
import numpy as np


from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

import scores


# # Reading in the Data

path_f=os.getcwd()

path_f_1=os.path.join(path_f, 'data')


names=[]
for files_txts in os.listdir(path_f_1):
    if files_txts.endswith(".csv"):
        #print(files_txts)
        names.append(files_txts)
        
path_train=os.path.join(path_f_1, names[1])
path_test=os.path.join(path_f_1, names[0])

df_train=pd.read_csv(path_train)
df_train.shape


# ## Data Manipulation
print('Training Data is being read ....')
#  - Transforming the outcome to a numpy vector

stab_vector=df_train['stabilityVec'].values
y=[]
for x in stab_vector:
    #print(x)
    a=np.fromstring(x[1:-1],sep=',').astype(int)
    y.append(a)
y=np.array(y) 

df_tmp = pd.DataFrame(y, columns = ['A', 'A91B', 'A82B','A73B','A64B','A55B','A46B','A37B','A28B','A19B','B'])
stab_vec_list=[ 'A91B', 'A82B','A73B','A64B','A55B','A46B','A37B','A28B','A19B']

df_train=df_train.drop("stabilityVec",axis=1) #removing the results which originally are a string
feature_cols=list(df_train)

print(df_train.shape)

df_train['formulaA']=df_train['formulaA_elements_Number']
df_train['formulaB']=df_train['formulaB_elements_Number']

df_train=pd.concat([df_train, df_tmp],axis=1)
print(df_train.shape)

# ### Input Data Normalization and Feature Engineering
print('Training Data has been read and feature engineering is being performed....')

y_all=df_train[stab_vec_list]
df_tmp_stable = pd.DataFrame( columns = ['Stable_compunds'])
df_tmp_stable['Stable_compunds']=np.logical_not(y_all.sum(axis=1)==0).astype(int) ## A one means it has a stable value  a 0 

df_train=pd.concat([df_train, df_tmp_stable],axis=1)
print(df_train.shape)

df_train.head()

# Pearson Correlation to Identify the features that influence the most on the output 
print('Pearson Correlation has been calculated to build the model in the most relevant features ....')

X_train_new=df_train[feature_cols]
y_new=df_train['Stable_compunds']

corr_df=pd.concat([X_train_new, y_new],axis=1)
a=corr_df.corr()
#a['Stable_compunds'].hist(bins=7, figsize=(18, 12), xlabelsize=10)

## Incorporating the Features that contribute the most based on a pearson correlation coefficient threshold

thr=.1

corr_variables=list(a[a['Stable_compunds'].abs()>thr].index)

del(corr_variables[-1])


print('Pearson Correlation has identified', len(corr_variables), 'with ', str(thr) )

## Normalization of Input Data

## Using Un-normalized data as input
X_train_new=df_train[corr_variables]

print(X_train_new.shape)


# Normalizing such that the magnitude is one
from sklearn.preprocessing import normalize

X_train_new_mag_1=normalize(X_train_new, axis=1) # vector magnitude is one
print(X_train_new_mag_1.shape)


## Normalizing by Zscore
from scipy.stats import zscore
X_train_new_Z_score=X_train_new.apply(zscore)
print(X_train_new_Z_score.shape)



## Normalizing so that range is 0-1
from sklearn import preprocessing
min_max_scaler = preprocessing.MinMaxScaler()
X_train_new_0_1=min_max_scaler.fit_transform(X_train_new)
print(X_train_new_0_1.shape)


## Normalizing so that range is -1 to 1
from sklearn import preprocessing
max_abs_scaler = preprocessing.MaxAbsScaler()
X_train_new_m1_p1=max_abs_scaler.fit_transform(X_train_new)
print(X_train_new_m1_p1.shape)


# Using PCA as input
X_train_4_PCA=df_train[feature_cols]
print(X_train_4_PCA.shape)
X_train_new_mag_1_PCA=normalize(X_train_4_PCA, axis=1)
print(X_train_new_mag_1_PCA.shape)

pca = PCA()
pca.fit(X_train_new_mag_1_PCA)
components = pca.components_[:20,:]
new_data = np.dot(X_train_new_mag_1_PCA, components.T)
X_train_new_PCA=new_data

print(X_train_new_PCA.shape)


## Using Pearson Correlation in PCA
df1= pd.DataFrame(data=X_train_new_PCA)
print(df1.shape)


corr_df_PCA=pd.concat([df1, y_new],axis=1)

print(corr_df_PCA.shape)
a_PCA=corr_df_PCA.corr()
#a_PCA['Stable_compunds'].hist(bins=7, figsize=(18, 12), xlabelsize=10)


thr=.01
corr_variables_PCA=list(a_PCA[a_PCA['Stable_compunds'].abs()>thr].index)

del(corr_variables_PCA[-1])


X_train_PCA_PC=df1[corr_variables_PCA]



# ### First we will build a model to determine if the input elements will produce at least one stable compound

y_new=df_train['Stable_compunds']


# # Model Generation

print('Training Model Using Z-normalized Data')
## test-train split
X_train, X_test, y_train, y_test = train_test_split(X_train_new_Z_score, y_new,
                                                    test_size=.1,
                                                    shuffle=True,
                                                    random_state=42)

print(X_train.shape,y_train.shape)
print(X_test.shape,y_test.shape)

# Hyper-Parameter Search Grid Using 10-Fold CV and Test
print(' -- Random Forest --')

#first pass
n_estimators = [1,3,5,10,50,100]
criterion=['entropy','gini']
bootstrap= [True, False]
max_depth=[2,5,10]

min_samples_splits=[2,3,4,6,7,8,9,10,20]
min_samples_leafs=[1,2,5,10]
min_impurity_splits=[5e-7 ,1e-6]

#second pass
#n_estimators = [10,20,50]
#criterion=['entropy']
#bootstrap= [True, False]
#max_depth=[5,6]
#min_samples_splits=[2,3,4,5,6]
#min_samples_leafs=[1,3,5]
#min_impurity_splits=[3e-7, 5e-7,1e-6]

#n_estimators = [1,3,5,8]
#criterion=['entropy']
#bootstrap= [True, False]
#max_depth=[1,3,4]


#min_samples_splits=[2,3,4,5]
#min_samples_leafs=[1]
#min_impurity_splits=[3e-7, 5e-7,8e-7]

df_results_RF=scores.hp_tune_Random_Forest(X_train,y_train,X_test,y_test,5,n_estimators,criterion,bootstrap,max_depth,min_samples_splits,min_samples_leafs,min_impurity_splits)





print('This are the best Parameters for Random Forest:')
print(df_results_RF[['test_results_auc','test_recall','features']][df_results_RF['test_accuracy']==df_results_RF['test_accuracy'].max()].head())


# # Decision Trees


# Hyper-Parameter Search Grid Using 10-Fold CV and Test
print(' -- Decision Trees --')


criterion=['entropy','gini']
bootstrap= [True, False]
max_depth=[1,2,5,10,100,250,1000]
split=['random','best']
min_samples_splits=[2,3,4,6,7,8,9,10]
min_samples_leafs=[1]
min_impurity_splits=[5e-7 ,1e-6]

#second pass
#criterion=['entropy']
#max_depth=[10,11,15]
#split=['random','best']
#min_samples_splits=[2,3,4,6]
#min_samples_leafs=[1,3,5]
#min_impurity_splits=[3e-7, 5e-7,1e-6]

#criterion=['entropy']
#max_depth=[1,3,510]
#split=['best']
#min_samples_splits=[2,3]
#min_samples_leafs=[1]
#min_impurity_splits=[3e-7, 5e-7,8e-5]

df_results_DT=scores.hp_tune_Decision_tree(X_train,y_train,X_test,y_test,5,criterion,max_depth,split,min_samples_splits,min_samples_leafs,min_impurity_splits)

print('This are the best Parameters for Decision Tree:')
print(df_results_DT[df_results_DT[['test_results_auc','test_recall','features']]['test_results_auc']==df_results_DT['test_results_auc'].max()].head())



# # KNN 


# Hyper-Parameter Search Grid Using 10-Fold CV and Test
print(' -- KNN --')

criterion=['distance', 'uniform']
neighbors=[1,2,3,10,50,100]
distances = [1, 2, 3, 4, 5]

df_results_KNN=scores.hp_tune_KNN(X_train,y_train,X_test,y_test,5,criterion,neighbors,distances)




print('This are the best Parameters for KNN :')
print(df_results_KNN[df_results_KNN[['test_results_auc','test_recall','features']]['test_results_auc']==df_results_KNN['test_results_auc'].max()].head())


# # SVM


# Hyper-Parameter Search Grid Using 10-Fold CV and Test
print(' -- SVM --')

kernel=['rbf', 'linear', 'poly', 'sigmoid']
gammas = [.001,.1,1,3,5]
cs = [.0001,.1,1,5,10,15,20]

df_results_SVM=scores.hp_tune_SVM(X_train,y_train,X_test,y_test,10,kernel,gammas,cs)



print('This are the best Parameters for SVM :')
print(df_results_SVM[df_results_SVM[['test_results_auc','test_recall','features']]['test_results_auc']==df_results_SVM['test_results_auc'].max()].head())


# # Logistic Regression

# Hyper-Parameter Search Grid Using 10-Fold CV and Test
print(' -- Logistic Regression --')

criterion=['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']

df_results_log_reg=scores.hp_tune_log_reg(X_train,y_train,X_test,y_test,10,criterion)

print('This are the best Parameters for Logistic Regression :')
print(df_results_log_reg[df_results_log_reg[['test_results_auc','test_recall','features']]['test_results_auc']==df_results_log_reg['test_results_auc'].max()].head())



Training Data is being read ....
(2572, 98)
(2572, 109)
Training Data has been read and feature engineering is being performed....
(2572, 110)
Pearson Correlation has been calculated to build the model in the most relevant features ....
Pearson Correlation has identified 38 with  0.1
(2572, 38)
(2572, 38)
(2572, 38)
(2572, 38)
(2572, 38)
(2572, 98)
(2572, 98)
(2572, 20)
(2572, 20)
(2572, 21)
Training Model Using Z-normalized Data
(2314, 38) (2314,)
(258, 38) (258,)
 -- Random Forest --
 -- Random Forest --
This are the best Parameters for Random Forest:
      test_results_auc  test_recall                               features
2144          0.903148     0.949153  [5, entropy, False, 10, 10, 1, 5e-07]
2145          0.903148     0.949153  [5, entropy, False, 10, 10, 1, 1e-06]
4256          0.903148     0.949153     [50, gini, False, 10, 3, 1, 5e-07]
4257          0.903148     0.949153     [50, gini, False, 10, 3, 1, 1e-06]
4265          0.902482     0.940678     [50, gini, False, 10, 4, 

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


This are the best Parameters for SVM :
    train_results_mean  train_results_std  train_results_auc  \
10            0.873408           0.022131           0.966419   

    test_results_auc  test_accuracy  test_precision  train_recall  \
10          0.894673       0.891473        0.846154       0.98018   

    test_recall       features  
10     0.932203  [rbf, 0.1, 5]  
 -- Logistic Regression --
 -- Logistic Regression --
This are the best Parameters for Logistic Regression :
   train_results_mean  train_results_std  train_results_auc  test_results_auc  \
0            0.745904           0.035418           0.761530          0.732506   
1            0.745904           0.035418           0.761530          0.732506   
2            0.746337           0.035584           0.761115          0.732506   
3            0.745904           0.035418           0.761530          0.732506   
4            0.745904           0.035418           0.761530          0.732506   

   test_accuracy  test_precisio

In [2]:
## Fitting best Model
print(' -- Optimal Random Forest --')
from sklearn.ensemble import RandomForestClassifier

rfc_opt = RandomForestClassifier(n_estimators=5,criterion='entropy',bootstrap=False,max_depth=10, 
                                 min_samples_split=10,
                                 min_samples_leaf=1,
                                 min_impurity_decrease=5e-07,
                                 random_state=0
                                 ,n_jobs=-1,class_weight={0:y_train.mean(), 1:1-y_train.mean()})
rfc_opt.fit(X_train, y_train)

train_pred = rfc_opt.predict(X_train)
    
precision,recall,F1,accuracy,confusion,roc_auc=scores.scores(y_train,train_pred)
print('Training precision: ', precision, '  recall: ', recall, '  F1: ', F1, '  accuracy: ', accuracy)
print('Training Confusion matrix')
print(confusion)
print('Training AUC:',roc_auc)


y_pred = rfc_opt.predict(X_test)


precision,recall,F1,accuracy,confusion,roc_auc=scores.scores(y_test,y_pred)
print('Optimal precision: ', precision, '  recall: ', recall, '  F1: ', F1, '  accuracy: ', accuracy)
print('optimal Confusion matrix')
print(confusion)
print('Optimal AUC:',roc_auc)


## Compare to Default Model
print(' -- Default Random Forest --')
from sklearn.ensemble import RandomForestClassifier
rfc_def = RandomForestClassifier(class_weight={0:y_train.mean(), 1:1-y_train.mean()})
rfc_def.fit(X_train, y_train)



train_pred = rfc_def.predict(X_train)
    
precision,recall,F1,accuracy,confusion,roc_auc=scores.scores(y_train,train_pred)
print('DEF Training precision: ', precision, '  recall: ', recall, '  F1: ', F1, '  accuracy: ', accuracy)
print('DEF Training Confusion matrix')
print(confusion)
print('DEF Training AUC:',roc_auc)

y_pred = rfc_def.predict(X_test)



precision,recall,F1,accuracy,confusion,roc_auc=scores.scores(y_test,y_pred)
print('Defualt Model precision: ', precision, '  recall: ', recall, '  F1: ', F1, '  accuracy: ', accuracy)
print('Defualt ModelConfusion matrix')
print(confusion)
print('Defualt ModelAUC:',roc_auc)

 -- Optimal Random Forest --
Training precision:  0.9054393305439331   recall:  0.9747747747747748   F1:  0.9388286334056399   accuracy:  0.939066551426102
Training Confusion matrix
[[1091  113]
 [  28 1082]]
Training AUC: 0.9404604770883842
Optimal precision:  0.8484848484848485   recall:  0.9491525423728814   F1:  0.896   accuracy:  0.8992248062015504
optimal Confusion matrix
[[120  20]
 [  6 112]]
Optimal AUC: 0.9031476997578695
 -- Default Random Forest --
DEF Training precision:  0.9972850678733032   recall:  0.9927927927927928   F1:  0.9950338600451466   accuracy:  0.9952463267070009
DEF Training Confusion matrix
[[1201    3]
 [   8 1102]]
DEF Training AUC: 0.9951505492203168
Defualt Model precision:  0.8455284552845529   recall:  0.8813559322033898   F1:  0.8630705394190872   accuracy:  0.872093023255814
Defualt ModelConfusion matrix
[[121  19]
 [ 14 104]]
Defualt ModelAUC: 0.872820823244552


In [6]:
## ADAboosting
print('------ ADAboosting with Random Forest ----')

from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import cross_val_score 
from sklearn.metrics import roc_curve, auc

rfc_opt = RandomForestClassifier(n_estimators=5,criterion='entropy',bootstrap=False,max_depth=10, 
                                 min_samples_split=10,
                                 min_samples_leaf=1,
                                 min_impurity_decrease=5e-07,
                                 random_state=0
                                 ,n_jobs=-1,class_weight={0:y_train.mean(), 1:1-y_train.mean()})

clf = AdaBoostClassifier(base_estimator=rfc_opt, n_estimators=100,learning_rate=1)

all_accuracies = cross_val_score(estimator=clf,X=X_train, y=y_train, cv=10,scoring='roc_auc')

print(all_accuracies.mean())

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)



precision,recall,F1,accuracy,confusion,roc_auc=scores.scores(y_test,y_pred)
print('Defualt Model precision: ', precision, '  recall: ', recall, '  F1: ', F1, '  accuracy: ', accuracy)
print('Defualt ModelConfusion matrix')
print(confusion)
print('Defualt ModelAUC:',roc_auc)


------ ADAboosting with Random Forest ----
0.9608907874816964
Defualt Model precision:  0.8449612403100775   recall:  0.923728813559322   F1:  0.8825910931174089   accuracy:  0.8875968992248062
Defualt ModelConfusion matrix
[[120  20]
 [  9 109]]
Defualt ModelAUC: 0.8904358353510896
