# Case Study: Biodegradability Analysis

# Import Libraries

**Import the usual libraries**

In [92]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
%matplotlib inline

# Data Set : QSAR BioDegradation Data Set

The data have been used to develop QSAR (Quantitative Structure Activity Relationships) models for the study of the relationships between chemical structure and biodegradation of molecules. Biodegradation experimental values of 1055 chemicals were collected.

Data can be found on UCI Machine Learning Repository: 
https://archive.ics.uci.edu/ml/datasets/QSAR+biodegradation

Attribute Information:

1) SpMax_L: Leading eigenvalue from Laplace matrix <br>
2) J_Dz(e): Balaban-like index from Barysz matrix weighted by Sanderson electronegativity <br>
3) nHM: Number of heavy atoms <br>
4) F01[N-N]: Frequency of N-N at topological distance 1 <br>
5) F04[C-N]: Frequency of C-N at topological distance 4 <br>
6) NssssC: Number of atoms of type ssssC <br>
7) nCb-: Number of substituted benzene C(sp2) <br>
8) C%: Percentage of C atoms <br>
9) nCp: Number of terminal primary C(sp3) <br>
10) nO: Number of oxygen atoms <br>
11) F03[C-N]: Frequency of C-N at topological distance 3 <br>
12) SdssC: Sum of dssC E-states <br>
13) HyWi_B(m): Hyper-Wiener-like index (log function) from Burden matrix weighted by mass <br>
14) LOC: Lopping centric index <br>
15) SM6_L: Spectral moment of order 6 from Laplace matrix <br>
16) F03[C-O]: Frequency of C - O at topological distance 3 <br>
17) Me: Mean atomic Sanderson electronegativity (scaled on Carbon atom) <br>
18) Mi: Mean first ionization potential (scaled on Carbon atom) <br>
19) nN-N: Number of N hydrazines <br>
20) nArNO2: Number of nitro groups (aromatic) <br>
21) nCRX3: Number of CRX3 <br>
22) SpPosA_B(p): Normalized spectral positive sum from Burden matrix weighted by polarizability <br>
23) nCIR: Number of circuits <br>
24) B01[C-Br]: Presence/absence of C - Br at topological distance 1 <br>
25) B03[C-Cl]: Presence/absence of C - Cl at topological distance 3 <br>
26) N-073: Ar2NH / Ar3N / Ar2N-Al / R..N..R <br>
27) SpMax_A: Leading eigenvalue from adjacency matrix (Lovasz-Pelikan index) <br>
28) Psi_i_1d: Intrinsic state pseudoconnectivity index - type 1d <br>
29) B04[C-Br]: Presence/absence of C - Br at topological distance 4 <br>
30) SdO: Sum of dO E-states <br>
31) TI2_L: Second Mohar index from Laplace matrix <br>
32) nCrt: Number of ring tertiary C(sp3) <br>
33) C-026: R--CX--R <br>
34) F02[C-N]: Frequency of C - N at topological distance 2 <br>
35) nHDon: Number of donor atoms for H-bonds (N and O) <br>
36) SpMax_B(m): Leading eigenvalue from Burden matrix weighted by mass <br>
37) Psi_i_A: Intrinsic state pseudoconnectivity index - type S average <br>
38) nN: Number of Nitrogen atoms <br>
39) SM6_B(m): Spectral moment of order 6 from Burden matrix weighted by mass <br>
40) nArCOOR: Number of esters (aromatic) <br>
41) nX: Number of halogen atoms <br>
42) experimental class: ready biodegradable (RB) and not ready biodegradable (NRB)<br>

## Get the Data

**Use pandas to read the data into a dataframe called `df`.**

In [93]:
# Column labels for the raw file: 41 molecular descriptors followed by the class label.
names = [
    'SpMax_L', 'J_Dz', 'nHM', 'F01[N-N]', 'F04[C-N]', 'NssssC', 'nCb-',
    'C%', 'nCp', 'nO', 'F03[C-N]', 'SdssC', 'HyWi_B', 'LOC',
    'SM6_L', 'F03[C-O]', 'Me', 'Mi', 'nN-N', 'nArNO2', 'nCRX3',
    'SpPosA', 'nCIR', 'B01[C-Br]', 'B03[C-Cl]', 'N-073', 'SpMax_A', 'Psi_i_1d',
    'B04[C-Br]', 'SdO', 'TI2_L', 'nCrt', 'C-026', 'F02[C-N]', 'nHDon',
    'SpMax_B', 'Psi_i_A', 'nN', 'SM6_B', 'nArCOOR', 'nX',
    'experimentalclass',
]

# The file is semicolon-separated; column names are supplied via `names`.
df = pd.read_csv(
    'bio-degradabale-data.csv',
    sep=";",
    names=names,
)
df.head()


Unnamed: 0,SpMax_L,J_Dz,nHM,F01[N-N],F04[C-N],NssssC,nCb-,C%,nCp,nO,...,C-026,F02[C-N],nHDon,SpMax_B,Psi_i_A,nN,SM6_B,nArCOOR,nX,experimentalclass
0,3.919,2.6909,0,0,0,0,0,31.4,2,0,...,0,0,0,2.949,1.591,0,7.253,0,0,RB
1,4.17,2.1144,0,0,0,0,0,30.8,1,1,...,0,0,0,3.315,1.967,0,7.257,0,0,RB
2,3.932,3.2512,0,0,0,0,0,26.7,2,4,...,0,0,1,3.076,2.417,0,7.601,0,0,RB
3,3.0,2.7098,0,0,0,0,0,20.0,0,2,...,0,0,1,3.046,5.0,0,6.69,0,0,RB
4,4.236,3.3944,0,0,0,0,0,29.4,2,4,...,0,0,0,3.351,2.405,0,8.003,0,0,RB


In [94]:
# Check the data: there should be no missing values.
# info() lists every column with its non-null count and dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1055 entries, 0 to 1054
Data columns (total 42 columns):
SpMax_L              1055 non-null float64
J_Dz                 1055 non-null float64
nHM                  1055 non-null int64
F01[N-N]             1055 non-null int64
F04[C-N]             1055 non-null int64
NssssC               1055 non-null int64
nCb-                 1055 non-null int64
C%                   1055 non-null float64
nCp                  1055 non-null int64
nO                   1055 non-null int64
F03[C-N]             1055 non-null int64
SdssC                1055 non-null float64
HyWi_B               1055 non-null float64
LOC                  1055 non-null float64
SM6_L                1055 non-null float64
F03[C-O]             1055 non-null int64
Me                   1055 non-null float64
Mi                   1055 non-null float64
nN-N                 1055 non-null int64
nArNO2               1055 non-null int64
nCRX3                1055 non-null int64
SpPosA        

In [95]:
# Check Distribution of Output Class

In [96]:
# Group the rows by class label; count() reports the non-null values of every column per class.
df.groupby('experimentalclass').count()

Unnamed: 0_level_0,SpMax_L,J_Dz,nHM,F01[N-N],F04[C-N],NssssC,nCb-,C%,nCp,nO,...,nCrt,C-026,F02[C-N],nHDon,SpMax_B,Psi_i_A,nN,SM6_B,nArCOOR,nX
experimentalclass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
NRB,699,699,699,699,699,699,699,699,699,699,...,699,699,699,699,699,699,699,699,699,699
RB,356,356,356,356,356,356,356,356,356,356,...,356,356,356,356,356,356,356,356,356,356


Get the names of the features in a list

In [97]:
# Every column name except the trailing class label is a feature.
feature_names = names[:-1]
print(feature_names)

['SpMax_L', 'J_Dz', 'nHM', 'F01[N-N]', 'F04[C-N]', 'NssssC', 'nCb-', 'C%', 'nCp', 'nO', 'F03[C-N]', 'SdssC', 'HyWi_B', 'LOC', 'SM6_L', 'F03[C-O]', 'Me', 'Mi', 'nN-N', 'nArNO2', 'nCRX3', 'SpPosA', 'nCIR', 'B01[C-Br]', 'B03[C-Cl]', 'N-073', 'SpMax_A', 'Psi_i_1d', 'B04[C-Br]', 'SdO', 'TI2_L', 'nCrt', 'C-026', 'F02[C-N]', 'nHDon', 'SpMax_B', 'Psi_i_A', 'nN', 'SM6_B', 'nArCOOR', 'nX']


#### Convert the experimental class column to 1/0 and store it in `target_data`


In [98]:
from sklearn.preprocessing import LabelEncoder

In [99]:
# Encode the experimental class label
# RB -> 1
# NRB -> 0

In [100]:
# Fit a label encoder on the class column (all rows) and keep the
# resulting integer codes as the prediction target.
encoder = LabelEncoder()
target_data = encoder.fit_transform(df["experimentalclass"])

#### Drop the experimental class column from the dataframe for simplicity; the encoded labels are kept in `target_data`

In [101]:
# Remove the label column so that `df` holds only the feature columns.
# Rebinding (instead of inplace=True) together with errors="ignore" keeps this
# cell safe to re-run: once the column is gone, the drop is simply a no-op
# rather than a KeyError.
df = df.drop(columns=["experimentalclass"], errors="ignore")

## Ada Boosting

In [102]:
from sklearn.ensemble import AdaBoostClassifier #For Classification
from sklearn.ensemble import AdaBoostRegressor #For Regression
from sklearn.tree import DecisionTreeClassifier

In [103]:
from sklearn import model_selection

In [104]:
# Seed shared by the stochastic estimators defined later in the notebook.
random_state = 101

# 5-fold cross-validation over contiguous, unshuffled folds.
# KFold only consults random_state when shuffle=True: older scikit-learn
# releases silently ignored a seed passed with the default shuffle=False, and
# newer releases reject that combination with a ValueError. Omitting it yields
# the same deterministic folds and keeps the cell runnable.
# NOTE(review): if the rows in the file are ordered by class, consider
# KFold(n_splits=5, shuffle=True, random_state=random_state) or
# StratifiedKFold instead - that would change the reported scores.
kfold = model_selection.KFold(n_splits=5)

#### With AdaBoost, the result changes with the number of estimators

In [105]:
def get_result_score(num_trees):
    """Return the mean cross-validated score of an AdaBoost classifier.

    Parameters
    ----------
    num_trees : int
        Passed to ``AdaBoostClassifier`` as ``n_estimators``.

    Returns
    -------
    float
        Mean of the per-fold scores returned by ``cross_val_score``.

    Notes
    -----
    Relies on the notebook-level names ``df`` (features), ``target_data``
    (encoded labels), ``kfold`` (CV splitter) and ``random_state`` (seed).
    """
    booster = AdaBoostClassifier(
        n_estimators=num_trees,
        random_state=random_state,
    )
    fold_scores = model_selection.cross_val_score(
        booster, df, target_data, cv=kfold
    )
    return fold_scores.mean()

**Compute the mean score for different values of n_estimators**

In [109]:
# Ensemble sizes to compare.
num_tree_list = [50, 75, 100, 150, 200, 250, 300]

# One report line per ensemble size, printed as soon as its score is available.
report_line = "Score for {:d} count is {:f} :"
for n_trees in num_tree_list:
    print(report_line.format(n_trees, get_result_score(n_trees)))


Score for 50 count is 0.797156 :
Score for 75 count is 0.810427 :
Score for 100 count is 0.811374 :
Score for 150 count is 0.822749 :
Score for 200 count is 0.813270 :
Score for 250 count is 0.813270 :
Score for 300 count is 0.805687 :


# End of Case Study