In [2]:

import os
import pandas as pd
import matplotlib.pyplot as plt

import csv
import numpy as np


from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

import scores


# In[14]:



# # Reading in the Data

def _pick_csv(csv_names, keyword, fallback_position):
    """Return the first entry of ``csv_names`` whose name contains ``keyword``.

    ``os.listdir`` returns entries in arbitrary order, so the role of a file
    (train / test) is decided from its name.  When no name matches, the entry
    at ``fallback_position`` is returned, which is the purely positional
    choice the notebook made originally.

    Raises
    ------
    IndexError
        If nothing matches and ``csv_names`` has no ``fallback_position``.
    """
    for candidate in csv_names:
        if keyword in candidate.lower():
            return candidate
    return csv_names[fallback_position]


path_f = os.getcwd()
path_f_1 = os.path.join(path_f, 'data')

# Every CSV in ./data, in the order reported by os.listdir.
names = [files_txts for files_txts in os.listdir(path_f_1) if files_txts.endswith(".csv")]

path_train = os.path.join(path_f_1, _pick_csv(names, 'train', 0))
path_test = os.path.join(path_f_1, _pick_csv(names, 'test', 1))

df_train = pd.read_csv(path_train)


# ## Data Manipulation
print('Training Data is being read ....')
#  - Transforming the outcome to a numpy vector

# Each 'stabilityVec' cell is a bracketed, comma-separated string; strip the
# brackets and parse the numbers into an integer row.
stab_vector = df_train['stabilityVec'].values
y = np.array([np.fromstring(x[1:-1], sep=',').astype(int) for x in stab_vector])

df_tmp = pd.DataFrame(y, columns=['A', 'A91B', 'A82B', 'A73B', 'A64B', 'A55B', 'A46B', 'A37B', 'A28B', 'A19B', 'B'])
# Mixed compositions only: the two end members 'A' and 'B' are left out.
stab_vec_list = ['A91B', 'A82B', 'A73B', 'A64B', 'A55B', 'A46B', 'A37B', 'A28B', 'A19B']

df_train = df_train.drop("stabilityVec", axis=1)  # removing the results which originally are a string
# Snapshot of the input columns taken before any label column is appended.
feature_cols = list(df_train)

print(df_train.shape)

# formulaA / formulaB are replaced by the matching *_elements_Number columns.
df_train['formulaA'] = df_train['formulaA_elements_Number']
df_train['formulaB'] = df_train['formulaB_elements_Number']

df_train = pd.concat([df_train, df_tmp], axis=1)
print(df_train.shape)

# ### Input Data Normalization and Feature Engineering
print('Training Data has been read and feature engineering is being performed....')

y_all = df_train[stab_vec_list]
df_tmp_stable = pd.DataFrame(columns=['Stable_compunds'])
# 1 when at least one mixed composition is stable for the row, 0 otherwise.
df_tmp_stable['Stable_compunds'] = np.logical_not(y_all.sum(axis=1) == 0).astype(int)

df_train = pd.concat([df_train, df_tmp_stable], axis=1)
print(df_train.shape)

df_train.head()





# In[3]:


# Show the CSV file names that were discovered in the data folder.
print(names)


# ## Selecting Output for Component 1 of Stability Vector (index 1 of `stab_vec_list`, i.e. A82B)

# In[15]:


## Observing how many element pairs produce a stable compound per % and overall

y_all = df_train[stab_vec_list]

# Position of the analysed component inside stab_vec_list.
count = 1

y = df_train[stab_vec_list[count]]
print(y.value_counts())

# Find the elements that create a stable compound in this vector component.
stable_comp = df_train.loc[y == 1, ['formulaA', 'formulaB']]
print('Compound being analyzed is', stab_vec_list[count])
stable_comp_num = stable_comp.values
stable_A = np.unique(stable_comp_num[:, 0])
stable_B = np.unique(stable_comp_num[:, 1])

# Row labels of every pair whose A element is in stable_A or whose B element
# is in stable_B.  The labels are gathered group by group (all A groups, then
# all B groups) and de-duplicated by first appearance, so the resulting row
# order is the one the original concat loops produced.
matching_labels = [
    df_train.index[(df_train[column] == element).values].values
    for column, elements in (('formulaA', stable_A), ('formulaB', stable_B))
    for element in elements
]
if matching_labels:
    selected_rows = pd.Index(np.concatenate(matching_labels)).unique()
else:
    # No stable pair for this component: keep an empty selection.
    selected_rows = df_train.index[:0]

# Label-based lookup; unlike .iloc this does not depend on the index being
# the default 0..n-1 range.
y_unique = y.loc[selected_rows]
df_unique = df_train.loc[selected_rows]
print(y_unique.value_counts())
print('The elements in these compounds create a stable compound for this component of the stability vector:',y_unique.shape)

# Keep only the rows with at least one stable mixed composition.  The mask is
# aligned to the selected rows explicitly instead of relying on implicit
# index alignment inside .loc.
has_stable = (y_all.sum(axis=1) != 0).loc[selected_rows].values
y_stable = y_unique.loc[has_stable]
df_stable = df_unique.loc[has_stable]
print(y_stable.value_counts())
print('The elements in these compounds create a stable compound for this component of the stability vector and create at least one stable compound:',y_stable.shape)



# ## Pearson Correlation and Input Normalization

# In[17]:


# Pearson Correlation to Identify the features that influence the most on the output
from scipy.stats import zscore
from sklearn import preprocessing
from sklearn.preprocessing import normalize

print('Pearson Correlation has been calculated to build the model in the most relevant features ....')

# Name of the label column being modelled.
target_name = stab_vec_list[count]

# Only train on the elements that create a stable compound for this component
# of the stability vector and have at least one stable compound.
X_train_new_all = df_stable[feature_cols]

y_new = y_stable
print('Number of Results to train on:',y_new.shape)
print('Number of Training Features before Pearson correlation:', X_train_new_all.shape[1])

corr_df = pd.concat([X_train_new_all, y_new], axis=1)
a = corr_df.corr()

## Incorporating the Features that contribute the most based on a pearson correlation coefficient threshold

thr = .09

# Keep every column whose |r| with the target exceeds the threshold.  The
# target is removed by name rather than by position, so a real feature is
# never dropped when the target itself does not pass the threshold
# (e.g. a NaN correlation for a constant label).
corr_variables = [name for name in a.index[a[target_name].abs() > thr] if name != target_name]


print('Pearson Correlation has identified', len(corr_variables), 'with ', str(thr) )

## Normalization of Input Data

## Using Un-normalized data as input
X_train_new = df_stable[corr_variables]

print('Number of Training Features after Pearson correlation:', X_train_new.shape[1])


# Normalizing such that the magnitude is one (each row is rescaled)
X_train_new_mag_1 = normalize(X_train_new, axis=1)
print(X_train_new_mag_1.shape)


## Normalizing by Zscore (column by column)
X_train_new_Z_score = X_train_new.apply(zscore)
print(X_train_new_Z_score.shape)


## Normalizing so that range is 0-1
min_max_scaler = preprocessing.MinMaxScaler()
X_train_new_0_1 = min_max_scaler.fit_transform(X_train_new)
print(X_train_new_0_1.shape)


## Normalizing so that range is -1 to 1
max_abs_scaler = preprocessing.MaxAbsScaler()
X_train_new_m1_p1 = max_abs_scaler.fit_transform(X_train_new)
print(X_train_new_m1_p1.shape)


# Using PCA as input: all original features, row-normalised first
X_train_4_PCA = df_stable[feature_cols]
indx_4_PC = X_train_4_PCA.index
X_train_new_mag_1_PCA = normalize(X_train_4_PCA, axis=1)

pca = PCA()
pca.fit(X_train_new_mag_1_PCA)
# Keep the first 20 principal axes.
components = pca.components_[:20, :]
# NOTE(review): this is a plain dot product with the axes, so unlike
# pca.transform the data is not mean-centred first -- confirm this is intended.
new_data = np.dot(X_train_new_mag_1_PCA, components.T)
X_train_new_PCA = new_data

print(X_train_new_PCA.shape)

## Using Pearson Correlation in PCA
# Re-attach the original row labels so the projections line up with y_new.
df1 = pd.DataFrame(data=X_train_new_PCA, index=indx_4_PC)
print(df1.shape)

corr_df_PCA = pd.concat([df1, y_new], axis=1)

a_PCA = corr_df_PCA.corr()

thr = .05
# Same name-based removal of the target as above.
corr_variables_PCA = [name for name in a_PCA.index[a_PCA[target_name].abs() > thr] if name != target_name]

print('Pearson Correlation in PCA Space has identified', len(corr_variables_PCA), 'with ', str(thr) )

X_train_PCA_PC = df1[corr_variables_PCA]

print('Number of Training Features after Pearson correlation in PCA Space:', X_train_PCA_PC.shape[1])








# ## Model Generation

# In[9]:


print('Training Model Using Z-normalized Data')
## test-train split
# NOTE(review): X_train_new_Z_score was z-scored, and its columns were chosen
# by Pearson correlation, on all rows before this split, so the held-out rows
# have already influenced the preprocessing -- confirm this is acceptable.
split_options = dict(test_size=.15, shuffle=True, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X_train_new_Z_score,
                                                    y_new,
                                                    **split_options)

# Report the (features, labels) shapes of the training and hold-out parts.
for features_part, labels_part in ((X_train, y_train), (X_test, y_test)):
    print(features_part.shape,labels_part.shape)



Training Data is being read ....
(2572, 98)
(2572, 109)
Training Data has been read and feature engineering is being performed....
(2572, 110)
['training_data.csv', 'test_data.csv']
0    2484
1      88
Name: A82B, dtype: int64
Compound being analyzed is A82B
0    1702
1      88
Name: A82B, dtype: int64
The elements in these compounds create a stable compound for this component of the stability vector: (1790,)
0    813
1     88
Name: A82B, dtype: int64
The elements in these compounds create a stable compound for this component of the stability vector and create at least one stable compound: (901,)
Pearson Correlation has been calculated to build the model in the most relevant features ....
Number of Results to train on: (901,)
Number of Training Features before Pearson correlation: 98
Pearson Correlation has identified 30 with  0.09
Number of Training Features after Pearson correlation: 30
(901, 30)
(901, 30)
(901, 30)
(901, 30)
(901, 20)
(901, 20)
Pearson Correlation in PCA Space has i

In [5]:
## Fitting best Model
from sklearn.ensemble import RandomForestClassifier
import sklearn.tree
from sklearn.neighbors import KNeighborsClassifier
import sklearn.svm
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
def _fit_and_report(model):
    """Fit ``model`` on the training split and print its hold-out metrics.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_test`` and
    ``y_test`` and scores the predictions with the project-local
    ``scores.scores`` helper.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``.

    Returns
    -------
    tuple
        ``(y_pred, (precision, recall, F1, accuracy, confusion, roc_auc))``.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    precision, recall, F1, accuracy, confusion, roc_auc = scores.scores(y_test, predictions)
    print('Optimal precision: ', precision, '  recall: ', recall, '  F1: ', F1, '  accuracy: ', accuracy)
    print('optimal Confusion matrix')
    print(confusion)
    print('Optimal AUC:',roc_auc)
    return predictions, (precision, recall, F1, accuracy, confusion, roc_auc)


# Class 0 is weighted by the share of positives and class 1 by the share of
# negatives, i.e. the less frequent class receives the larger weight.
class_weights = {0: y_train.mean(), 1: 1 - y_train.mean()}

print(' -- Optimal Random Forest --')
rfc_opt = RandomForestClassifier(n_estimators=1,
                                 criterion='gini',
                                 bootstrap=False,
                                 max_depth=10,
                                 class_weight=class_weights,
                                 min_samples_split=10,
                                 min_samples_leaf=1,
                                 min_impurity_decrease=5e-7,
                                 random_state=0,
                                 n_jobs=-1)
_fit_and_report(rfc_opt)

print(' -- Optimal Decision Tree --')
#[cr,max_d,sp,min_sample_sp,min_samples_le,min_impurity_sp]
rfc_opt_DT = sklearn.tree.DecisionTreeClassifier(class_weight=class_weights,
                                                 criterion='gini',
                                                 max_depth=100,
                                                 random_state=0,
                                                 splitter='random',
                                                 min_samples_split=8,
                                                 min_samples_leaf=1,
                                                 min_impurity_decrease=5e-7)
_fit_and_report(rfc_opt_DT)

print(' -- Optimal KNN --')
rf_opt_KNN = KNeighborsClassifier(algorithm='auto',
                                  metric='minkowski',
                                  n_jobs=-1,
                                  n_neighbors=1,
                                  p=1,
                                  weights='distance')
_fit_and_report(rf_opt_KNN)

print(' -- Optimal SVM --')
rfc_opt_SVM = sklearn.svm.SVC(kernel='rbf',
                              gamma=.1, C=15,
                              random_state=0,
                              class_weight=class_weights)
_fit_and_report(rfc_opt_SVM)


print('-------- Bagging and Boosting Result-------')

print('------- ADA Boosting Random Forest Classifier-------')
#[estimator,cr,boots,max_d,min_sample_sp,min_samples_le,min_impurity_sp,num_e,lr]
rfc_opT_rf = RandomForestClassifier(n_estimators=1,
                                    criterion='gini',
                                    bootstrap=False,
                                    max_depth=10,
                                    class_weight=class_weights,
                                    min_samples_split=10,
                                    min_samples_leaf=10,
                                    min_impurity_decrease=5e-7,
                                    random_state=0,
                                    n_jobs=-1)
# The weak learner is passed positionally because its keyword was renamed
# between scikit-learn releases (base_estimator -> estimator).  The booster
# is seeded so that repeated runs give the same model.
clf_RF = AdaBoostClassifier(rfc_opT_rf,
                            n_estimators=1000,
                            learning_rate=.0001,
                            random_state=0)
_fit_and_report(clf_RF)

print('------- ADA Boosting Decision Tree Classifier-------')
#([cr,max_d,sp,min_sample_sp,min_samples_le,min_impurity_sp,num_e,lr]
rfc = sklearn.tree.DecisionTreeClassifier(class_weight=class_weights,
                                          criterion='gini',
                                          max_depth=100,
                                          random_state=0,
                                          splitter='random',
                                          min_samples_split=8,
                                          min_samples_leaf=1,
                                          min_impurity_decrease=5e-7)
# Positional weak learner and explicit seed, for the same reasons as above.
clf = AdaBoostClassifier(rfc,
                         n_estimators=1,
                         learning_rate=10,
                         random_state=0)
_fit_and_report(clf)

print('------- Gradient Boosting Classifier-------')
#[max_d,min_sample_sp,min_samples_le,min_impurity_sp,num_e,lr]
rfc_opt_GRAD = GradientBoostingClassifier(n_estimators=1,
                                          learning_rate=10,
                                          min_samples_split=8,
                                          min_samples_leaf=1,
                                          max_depth=100,
                                          random_state=0)
_fit_and_report(rfc_opt_GRAD)

print('------- Extra Trees Classifier-------')
#[estimator,cr,boots,max_d,min_sample_sp,min_samples_le,min_impurity_sp]
rfc_opt_Extra = ExtraTreesClassifier(n_estimators=1,
                                     criterion='gini',
                                     bootstrap=False,
                                     max_depth=10,
                                     class_weight=class_weights,
                                     min_samples_split=10,
                                     min_samples_leaf=1,
                                     min_impurity_decrease=5e-7,
                                     random_state=0, n_jobs=-1)
# The last evaluation also restores the notebook-level result names
# (y_pred, precision, ...) that the original cell left holding these values.
y_pred, (precision, recall, F1, accuracy, confusion, roc_auc) = _fit_and_report(rfc_opt_Extra)


 -- Optimal Random Forest --
Optimal precision:  0.3548387096774194   recall:  0.6875   F1:  0.4680851063829787   accuracy:  0.8161764705882353
optimal Confusion matrix
[[100  20]
 [  5  11]]
Optimal AUC: 0.7604166666666666
 -- Optimal Decision Tree --
Optimal precision:  0.34615384615384615   recall:  0.5625   F1:  0.4285714285714286   accuracy:  0.8235294117647058
optimal Confusion matrix
[[103  17]
 [  7   9]]
Optimal AUC: 0.7104166666666667
 -- Optimal KNN --
Optimal precision:  0.5294117647058824   recall:  0.5625   F1:  0.5454545454545455   accuracy:  0.8897058823529411
optimal Confusion matrix
[[112   8]
 [  7   9]]
Optimal AUC: 0.7479166666666667
 -- Optimal SVM --
Optimal precision:  0.36363636363636365   recall:  0.5   F1:  0.4210526315789474   accuracy:  0.8382352941176471
optimal Confusion matrix
[[106  14]
 [  8   8]]
Optimal AUC: 0.6916666666666667
-------- Bagging and Boosting Result-------
------- ADA Boosting Random Forest Classifier-------
Optimal precision:  0.666666

In [None]:
## Saving the Best Model

Selecting the model that has the highest AUC.

In [6]:
# joblib is a standalone package; the copy bundled as sklearn.externals.joblib
# is absent from recent scikit-learn releases, so it is only used as a
# fallback for old environments.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

filename = 'Best_model_component_2.sav'
# NOTE(review): the model to persist is hard-coded to the random forest rather
# than picked from the computed AUC values -- re-check this choice whenever
# the evaluation cell is re-run.
# The estimator was fitted on z-scored, Pearson-selected columns, so inputs
# must receive the same preprocessing before calling predict on a reloaded model.
model = rfc_opt
joblib.dump(model, filename)

['Best_model_component_2.sav']