# Solubility Challenge

### originally by team: C Di Paola, J. Manson and K. Makobe
### revised and adapted By: C. Di Paola

## Import necessary initial libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
np.set_printoptions(threshold=np.inf)
#%load_ext autotime

## Read files for analyis and prediction

We need solubility data and DRAGON 2D descriptors from the training set and DRAGON 2D descriptors for prediction data set

In [2]:
#solub=pd.read_excel('soldata.xls')
solub_train_data=pd.read_excel('soldata_trainingset.xls')
solub_train_descriptors=pd.read_excel('Solubility_training_descriptors_cleaned.xlsx')
solub_pred_descriptors=pd.read_excel('Solubility_prediction_descriptors_cleaned.xlsx')

## Pre-processing data

* training data shape anf formatting

In [3]:
solub_train_data.head()

Unnamed: 0,Substance,Temperature,assays,Ionic Strength (M),S0 (mM),SD of S0 (mM),Kinetic Solubility (mM),SD of Kinetic Solubility (mM),Unnamed: 8,Unnamed: 9,SMILES,InChI
0,1_naphthol,25.89,4.0,0.17121,10432.3,408.616,18026.0,11884.0,,,c1ccc2c(cccc2O)c1,InChI=1/C10H8O/c11-10-7-3-5-8-4-1-2-6-9(8)10/h...
1,2_amino_5_bromobenzoic_acid,25.0,5.0,0.16295,842.692,14.6303,2562.0,863.3,,,Brc1cc(C(O)=O)c(N)cc1,InChI=1/C7H6BrNO2/c8-4-1-2-6(9)5(3-4)7(10)11/h...
2,4_iodophenol,25.74,4.0,0.218635,19312.0,604.678,25091.0,8427.0,,,c1cc(ccc1O)I,"InChI=1/C6H5IO/c7-5-1-3-6(8)4-2-5/h1-4,8H"
3,5_bromo_2_4_dihydroxybenzoic_acid,25.05,5.0,0.186497,2397.22,40.1944,8726.0,753.8,,,Oc1c(Br)cc(C(O)=O)c(O)c1,InChI=1/C7H5BrO4/c8-4-1-3(7(11)12)5(9)2-6(4)10...
4,5_fluorouracil,25.1,,No precipitation detected. Kinetic solubility ...,,,,,,,Fc1c(=O)[nH]c(=O)[nH]c1,"InChI=1/C4H3FN2O2/c5-2-1-6-4(9)7-3(2)8/h1H,(H2..."


In [4]:
solub_train_descriptors.head()

Unnamed: 0,No.,NAME,MW,AMW,Sv,Mv,Me,Mp,Mi,nBM,...,Depressant-50,Psychotic-80,Hypertens-80,Hypertens-50,Hypnotic-80,Hypnotic-50,Neoplastic-80,Neoplastic-50,Infective-80,Infective-50
0,1,1-Naphthol ...,144.18,7.588,12.822,0.675,0.993,0.711,1.098,11,...,0,0,0,0,0,0,0,0,0,0
1,2,2_amino_5_bromo_benzoic_acid ...,216.04,12.708,12.057,0.709,1.037,0.738,1.118,7,...,0,0,0,0,0,0,0,0,1,0
2,3,4-Iodophenol ...,220.01,16.924,9.612,0.739,1.004,0.877,1.09,6,...,0,0,0,0,0,0,0,0,0,0
3,4,5_bromo_2_4_dihydroxybenzoic_acid ...,233.02,13.707,12.465,0.733,1.07,0.733,1.113,7,...,0,0,0,0,0,0,0,0,0,0
4,5,5_fluorouracil ...,130.09,10.841,8.383,0.699,1.105,0.635,1.181,3,...,0,0,0,0,0,0,0,0,0,0


In [5]:
#for i,aa in enumerate(solub_train_descriptors['NAME']):
#    print(i+1)
#for j,columns in enumerate(solub_train_descriptors.columns):
    #print(j,columns)
    #unique, counts = np.unique(solub_train_descriptors[columns], return_counts=True)
    #print(unique,counts)

* **Searching for missing solubility (S0) data in the form of null/NaN values**

In [6]:
print(solub_train_data['S0 (mM)'].isna().value_counts()) ## specific fro NaN search
print(solub_train_data['S0 (mM)'].isnull().value_counts()) ## null data general

False    94
True     11
Name: S0 (mM), dtype: int64
False    94
True     11
Name: S0 (mM), dtype: int64


* **check the data are in the rigth format**

In [7]:
print(solub_train_data.shape)
print(solub_train_data[['Substance','S0 (mM)']].dtypes)

(105, 12)
Substance     object
S0 (mM)      float64
dtype: object


In [8]:
print(solub_train_descriptors.shape)
print(solub_train_descriptors.dtypes) # truncated list of data types
#print(solub_train_descriptors.info(verbose=True)) # full list of data types

(101, 1468)
No.                int64
NAME              object
MW               float64
AMW              float64
Sv               float64
                  ...   
Hypnotic-50        int64
Neoplastic-80      int64
Neoplastic-50      int64
Infective-80       int64
Infective-50       int64
Length: 1468, dtype: object


In [9]:
print(solub_pred_descriptors.shape)
print(solub_pred_descriptors.dtypes) # truncated list of data types
#print(solub_pred_descriptors.info(verbose=True)) # full list of data types

(32, 1171)
No.                int64
NAME              object
MW               float64
AMW              float64
Sv               float64
                  ...   
Hypnotic-80        int64
Hypnotic-50        int64
Neoplastic-80      int64
Neoplastic-50      int64
Infective-50       int64
Length: 1171, dtype: object


* Need to scale descriptors data to the same range of value [0,1]: MIN-MAX SCALER does this for us (from scikit-learn lib)

In [10]:
from sklearn import preprocessing
%matplotlib inline
min_max_scaler = preprocessing.MinMaxScaler()

In [11]:
columns_train_descrit=solub_train_descriptors.columns
columns_pred_descrit=solub_pred_descriptors.columns

In [12]:
x_train_all_minmax = min_max_scaler.fit_transform(solub_train_descriptors[columns_train_descrit[2:]])
solub_train_descriptors_values=pd.DataFrame(x_train_all_minmax, columns=columns_train_descrit[2:])

In [13]:
solub_train_descriptors_values.describe()

Unnamed: 0,MW,AMW,Sv,Mv,Me,Mp,Mi,nBM,RBN,RBF,...,Depressant-50,Psychotic-80,Hypertens-80,Hypertens-50,Hypnotic-80,Hypnotic-50,Neoplastic-80,Neoplastic-50,Infective-80,Infective-50
count,101.0,101.0,101.0,101.0,101.0,101.0,101.0,101.0,101.0,101.0,...,101.0,101.0,101.0,101.0,101.0,101.0,101.0,101.0,101.0,101.0
mean,0.322169,0.245792,0.435356,0.469429,0.32974,0.295363,0.424655,0.491449,0.278751,0.393447,...,0.059406,0.158416,0.594059,0.207921,0.415842,0.108911,0.712871,0.247525,0.782178,0.207921
std,0.170704,0.162795,0.220415,0.217406,0.210968,0.148663,0.175305,0.179383,0.196975,0.225203,...,0.237562,0.366952,0.493522,0.407844,0.495325,0.313081,0.454679,0.433727,0.414824,0.407844
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.220879,0.152591,0.295478,0.322275,0.162963,0.199313,0.307692,0.363636,0.153846,0.253219,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
50%,0.305828,0.214999,0.415792,0.445498,0.303704,0.281787,0.395604,0.454545,0.230769,0.377682,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
75%,0.419389,0.305963,0.592755,0.620853,0.451852,0.347079,0.527473,0.590909,0.384615,0.536481,...,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [14]:
x_pred_all_minmax = min_max_scaler.fit_transform(solub_pred_descriptors[columns_pred_descrit[2:]])
solub_pred_descriptors_values=pd.DataFrame(x_pred_all_minmax, columns=columns_pred_descrit[2:])

In [15]:
solub_pred_descriptors_values.describe()

Unnamed: 0,MW,AMW,Sv,Mv,Me,Mp,Mi,nBM,RBN,RBF,...,Inflammat-80,Depressant-80,Psychotic-80,Hypertens-80,Hypertens-50,Hypnotic-80,Hypnotic-50,Neoplastic-80,Neoplastic-50,Infective-50
count,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0,...,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0
mean,0.431442,0.370234,0.378107,0.479992,0.419215,0.410227,0.546274,0.408482,0.295455,0.482904,...,0.46875,0.34375,0.1875,0.5625,0.1875,0.4375,0.09375,0.6875,0.25,0.21875
std,0.259477,0.268941,0.23521,0.310569,0.267304,0.260742,0.24349,0.26802,0.266298,0.260128,...,0.507007,0.482559,0.396558,0.504016,0.396558,0.504016,0.296145,0.470929,0.439941,0.420013
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.291934,0.115574,0.220477,0.207317,0.242021,0.215152,0.360577,0.214286,0.090909,0.3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.394978,0.358668,0.333328,0.512195,0.351064,0.375758,0.605769,0.428571,0.227273,0.467647,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
75%,0.602383,0.607312,0.510609,0.740854,0.609043,0.563636,0.677885,0.571429,0.363636,0.626471,...,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.25,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [16]:
corr_train=solub_train_descriptors_values.corr()
corr_pred=solub_pred_descriptors_values.corr()

In [17]:
#(solub_train_descriptors_values == 0.).sum() > 366.5

In [18]:
#drop_cols=solub_train_descriptors_values[(solub_train_descriptors_values == 0.).sum() > 366.5]

In [19]:
corr_train

Unnamed: 0,MW,AMW,Sv,Mv,Me,Mp,Mi,nBM,RBN,RBF,...,Depressant-50,Psychotic-80,Hypertens-80,Hypertens-50,Hypnotic-80,Hypnotic-50,Neoplastic-80,Neoplastic-50,Infective-80,Infective-50
MW,1.000000,0.031237,0.910694,0.001753,-0.043419,0.089877,-0.050468,0.563880,0.541781,0.230990,...,0.014457,0.290774,0.257669,0.336994,-0.214303,-0.137593,0.177703,0.206110,0.041647,-0.013661
AMW,0.031237,1.000000,-0.342867,0.846911,0.703868,0.767263,-0.029596,0.000640,-0.344031,-0.274207,...,-0.176107,-0.179826,-0.275913,-0.130785,-0.214674,-0.074872,-0.182223,-0.160659,-0.322508,-0.135975
Sv,0.910694,-0.342867,1.000000,-0.290206,-0.325175,-0.171028,-0.090980,0.562606,0.619992,0.278378,...,0.105051,0.387845,0.347709,0.363897,-0.140178,-0.112242,0.223879,0.251185,0.159198,0.030090
Mv,0.001753,0.846911,-0.290206,1.000000,0.675294,0.810388,-0.214844,0.287182,-0.445590,-0.385744,...,-0.145258,-0.091363,-0.223051,-0.137389,-0.188840,-0.026838,-0.099996,-0.165477,-0.221193,-0.125630
Me,-0.043419,0.703868,-0.325175,0.675294,1.000000,0.209322,0.477224,-0.137567,-0.361641,-0.227970,...,-0.202617,-0.199250,-0.086007,0.032876,-0.147884,0.012714,0.010758,-0.040372,-0.256188,-0.108312
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Hypnotic-50,-0.137593,-0.074872,-0.112242,-0.026838,0.012714,-0.087459,-0.004163,-0.048000,-0.035692,0.050885,...,-0.087860,-0.151679,0.288996,-0.179118,0.414359,1.000000,0.221875,-0.053227,0.184490,0.212461
Neoplastic-80,0.177703,-0.182223,0.223879,-0.099996,0.010758,-0.184260,0.138781,0.147934,0.189723,0.248774,...,0.159495,0.035605,0.767744,0.325160,0.535465,0.221875,1.000000,0.363995,0.672448,0.325160
Neoplastic-50,0.206110,-0.160659,0.251185,-0.165477,-0.040372,-0.202395,0.101040,-0.019262,0.120706,0.061589,...,0.244073,0.002488,0.474110,0.667183,0.307396,-0.053227,0.363995,1.000000,0.302664,0.327994
Infective-80,0.041647,-0.322508,0.159198,-0.221193,-0.256188,-0.170564,0.025928,0.139648,0.100946,0.124415,...,0.132621,0.228954,0.638383,0.270372,0.396574,0.184490,0.672448,0.302664,1.000000,0.270372


* **Correlation heatmap: load seaborn lib**

* **Uniforming training and pediction data to the same descriptors**

In [20]:
bool_same_descriptors=solub_pred_descriptors_values.columns.isin(solub_train_descriptors.columns)
#unique, counts = np.unique(a, return_counts=True)
#print(unique,counts)
#print(bool_same_descriptors) # check the boolean list

In [21]:
solub_pred_descriptors_new=solub_pred_descriptors_values.loc[:,bool_same_descriptors]

In [22]:
solub_pred_descriptors_new.shape

(32, 1044)

In [23]:
bool_same_descriptors1=solub_train_descriptors_values.columns.isin(solub_pred_descriptors.columns)
#unique, counts = np.unique(b, return_counts=True)
#print(unique,counts)
#print(bool_same_descriptors1) # check the boolean list

In [24]:
solub_train_descriptors_new=solub_train_descriptors_values.loc[:,bool_same_descriptors1]

In [25]:
solub_train_descriptors_new.shape

(101, 1044)

In [26]:
corr_train1=solub_train_descriptors_new.corr()
corr_pred1=solub_pred_descriptors_new.corr()

* **clean data from different isomer forms (only DRAGON 2D descriptors available)**

In [27]:
solub_train_data[solub_train_data['Substance'].str.contains('form')]
solub_train_data1=solub_train_data[~solub_train_data['Substance'].str.contains('form_II')]

In [28]:
solub_train_data1[solub_train_data1['Substance'].str.contains('form')]

Unnamed: 0,Substance,Temperature,assays,Ionic Strength (M),S0 (mM),SD of S0 (mM),Kinetic Solubility (mM),SD of Kinetic Solubility (mM),Unnamed: 8,Unnamed: 9,SMILES,InChI
24,chlorprothixene_form_I,25.48,9.0,0.153915,0.177964,0.00812546,1.554,0.1589,,,CN(C)CC\C=C1\c2ccccc2Sc2ccc(cc12)Cl,InChI=1/C18H18ClNS/c1-20(2)11-5-7-14-15-6-3-4-...
76,phthalic_acid_form_I,25.1,7.0,0.280791,32158.4,2311.89,52659.0,17955.0,,,C(c1c(C(=O)O)cccc1)(=O)O,InChI=1/C8H6O4/c9-7(10)5-3-1-2-4-6(5)8(11)12/h...
92,sulindac_form_I,24.84,20.0,0.160486,210.015,23.1367,231.5,26.48,,,CC1=C(CC(O)=O)c2cc(F)ccc2C\1=C\c1ccc(cc1)S(C)=O,InChI=1/C20H17FO3S/c1-12-17(9-13-3-6-15(7-4-13...
98,trichloromethiazide_form_I,25.66,3.0,0.163679,660.645,17.5235,4530.0,2054.0,,,NS(=O)(=O)c1cc2c(NC(NS2(=O)=O)C(Cl)Cl)cc1Cl,InChI=1/C8H12Cl3N3O4Rn2/c9-3-1-4-6(2-5(3)19(12...


In [29]:
solub_train_data1['Substance'].replace(regex=True,inplace=True,to_replace=r'_form_I',value=r'')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)


In [30]:
solub_train_data1[solub_train_data1['Substance'].str.contains('phthalic')]

Unnamed: 0,Substance,Temperature,assays,Ionic Strength (M),S0 (mM),SD of S0 (mM),Kinetic Solubility (mM),SD of Kinetic Solubility (mM),Unnamed: 8,Unnamed: 9,SMILES,InChI
76,phthalic_acid,25.1,7.0,0.280791,32158.4,2311.89,52659.0,17955,,,C(c1c(C(=O)O)cccc1)(=O)O,InChI=1/C8H6O4/c9-7(10)5-3-1-2-4-6(5)8(11)12/h...


In [31]:
solub_train_data1.reset_index(drop=True,inplace=True)

In [32]:
solub_train_data1.shape

(101, 12)

* **clean data from null/NaN values of solubility S0**

In [33]:
S0_train_descrip=pd.merge(solub_train_data1[['Substance','S0 (mM)']], solub_train_descriptors_new,left_index=True,right_index=True)

In [34]:
S0_train_descrip

Unnamed: 0,Substance,S0 (mM),MW,AMW,Sv,Mv,Me,Mp,Mi,nBM,...,Inflammat-80,Depressant-80,Psychotic-80,Hypertens-80,Hypertens-50,Hypnotic-80,Hypnotic-50,Neoplastic-80,Neoplastic-50,Infective-50
0,1_naphthol,10432.3000,0.054753,0.189654,0.133119,0.620853,0.170370,0.429553,0.087912,0.500000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2_amino_5_bromobenzoic_acid,842.6920,0.190287,0.634060,0.110178,0.781991,0.496296,0.522337,0.307692,0.318182,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4_iodophenol,19312.0000,0.197774,1.000000,0.036856,0.924171,0.251852,1.000000,0.000000,0.272727,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,5_bromo_2_4_dihydroxybenzoic_acid,2397.2200,0.222312,0.720771,0.122413,0.895735,0.740741,0.505155,0.252747,0.318182,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5_fluorouracil,,0.028178,0.472008,0.000000,0.734597,1.000000,0.168385,1.000000,0.136364,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,trimethoprim,1119.2500,0.330460,0.177242,0.465873,0.327014,0.333333,0.164948,0.571429,0.545455,...,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0
97,trimipramine,16.2552,0.338231,0.063536,0.599202,0.222749,0.037037,0.216495,0.384615,0.545455,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
98,tryptamine,506.2790,0.085043,0.110581,0.188748,0.322275,0.103704,0.250859,0.417582,0.454545,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99,verapamil,105.7430,0.640362,0.086885,0.989654,0.184834,0.162963,0.140893,0.450549,0.590909,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [35]:
S0_train_descrip.dropna(subset=['S0 (mM)'], axis=0, inplace=True)
S0_train_descrip.reset_index(drop=True,inplace=True)

In [36]:
S0_train_descrip.shape

(90, 1046)

In [37]:
columns_S0_descrip=S0_train_descrip.columns

In [38]:
corr_S0_descrip=S0_train_descrip[columns_S0_descrip[1:]].corr()

na_free = solub2['S0 (mM)'].dropna()
remove1=solub2['S0 (mM)'].index.isin(na_free.index)
print(remove1)

* **Import the necessary libs for modeling with SciKit.learn**

In [39]:
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import AdaBoostRegressor
from sklearn.feature_selection import f_regression, chi2
from sklearn.feature_selection import mutual_info_regression
from scipy.stats import randint as sp_randint
from sklearn import preprocessing

In [40]:
S0_train_descrip[columns_S0_descrip[1:]]
columns_descriptors=S0_train_descrip[columns_S0_descrip[2:]].columns
X_train=S0_train_descrip[columns_descriptors]
y_train=S0_train_descrip['S0 (mM)']

In [None]:
%matplotlib notebook
#Fitting the PCA algorithm with our Data
pca = PCA().fit(X_train)
#Plotting the Cumulative Summation of the Explained Variance
plt.figure()
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('Number of Components')
plt.ylabel('Variance (%)') #for each component
plt.title('Pulsar Dataset Explained Variance')
plt.show()
#pca = PCA()

In [None]:
reg = Ridge()

In [None]:
anova_filter = SelectKBest(f_regression, k=40)
pipe = Pipeline(steps=[('anova', anova_filter),('pca', pca), ('regressor', reg)])
pipe.fit(X_train, y_train)
plt.figure()
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('Number of Components')
plt.ylabel('Variance (%)') #for each component
plt.title('Pulsar Dataset Explained Variance')
plt.show()

In [41]:
X_train

Unnamed: 0,MW,AMW,Sv,Mv,Me,Mp,Mi,nBM,RBN,RBF,...,Inflammat-80,Depressant-80,Psychotic-80,Hypertens-80,Hypertens-50,Hypnotic-80,Hypnotic-50,Neoplastic-80,Neoplastic-50,Infective-50
0,0.054753,0.189654,0.133119,0.620853,0.170370,0.429553,0.087912,0.500000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.190287,0.634060,0.110178,0.781991,0.496296,0.522337,0.307692,0.318182,0.076923,0.253219,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.197774,1.000000,0.036856,0.924171,0.251852,1.000000,0.000000,0.272727,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.222312,0.720771,0.122413,0.895735,0.740741,0.505155,0.252747,0.318182,0.076923,0.253219,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.067955,0.187137,0.125202,0.398104,0.333333,0.213058,0.428571,0.318182,0.076923,0.214592,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85,0.330460,0.177242,0.465873,0.327014,0.333333,0.164948,0.571429,0.545455,0.384615,0.536481,...,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0
86,0.338231,0.063536,0.599202,0.222749,0.037037,0.216495,0.384615,0.545455,0.307692,0.343348,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
87,0.085043,0.110581,0.188748,0.322275,0.103704,0.250859,0.417582,0.454545,0.153846,0.343348,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
88,0.640362,0.086885,0.989654,0.184834,0.162963,0.140893,0.450549,0.590909,1.000000,0.776824,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [42]:
y_train

0     10432.3000
1       842.6920
2     19312.0000
3      2397.2200
4     86329.6000
         ...    
85     1119.2500
86       16.2552
87      506.2790
88      105.7430
89       16.4550
Name: S0 (mM), Length: 90, dtype: float64

In [43]:
pca=PCA(0.95)

In [None]:
%%time
np.seterr(divide='ignore', invalid='ignore')
for i in [10]:
    anova_filter = SelectKBest(f_regression, k=i)
    pipe = Pipeline(steps=[('anova', anova_filter),('pca', pca), ('regressor', reg)])
    pipe.fit(X_train, y_train)
    pipe.score(X_train, y_train)
    param_dist = {"anova__score_func": [mutual_info_regression, f_regression],
                  "anova__k": sp_randint(i/2, i),
                  "regressor__alpha": [0.01,0.1,1.0,10.0,100.0]}
              
    test = RandomizedSearchCV(pipe,
                              param_distributions = param_dist,
                             cv=3,
                             n_iter=100)

    test.fit(X_train, y_train)
    print(test.score(X_train,y_train))
    print(test.best_estimator_)
    #print(test.best_score_)
    mask = anova_filter.get_support() #list of booleans
    new_features = [] # The list of your K best features

    for bool, feature in zip(mask, X_train.columns.values):
        if bool:
            new_features.append(feature)
    print(i,new_features)
np.seterr(divide='warn', invalid='warn')

In [None]:
pca.n_components_

In [None]:
test.predict(solub_pred_descriptors_new)

In [None]:
test.cv_results_['mean_test_score'].size

In [None]:
solub_pred_descriptors_new.head()

In [None]:
test.predict(solub_pred_descriptors_new)

In [None]:
y_hat=test.predict(solub_pred_descriptors_new)

In [None]:
y_hat

In [None]:
solub_pred_data=pd.read_excel('soldata_prediction_withSvalues.xlsx')

In [None]:
solub_pred_data

In [None]:
solub_pred_descriptors

In [44]:
reg1 = AdaBoostRegressor()

In [45]:
%%time
np.seterr(divide='ignore', invalid='ignore')
for i in [100]:
    anova_filter = SelectKBest(f_regression, k=i)
    pipe1 = Pipeline(steps=[('anova', anova_filter),('pca', pca), ('regressor', reg1)])
    pipe1.fit(X_train, y_train)
    pipe1.score(X_train, y_train)
    param_dist1 = {"anova__score_func": [mutual_info_regression, f_regression],
                "anova__k": sp_randint(i/2, i),
                "regressor__n_estimators": [50, 100],
                'regressor__learning_rate' : [0.01,0.05,0.1,0.3,1],
                'regressor__loss' : ['linear', 'square', 'exponential']}
              
    test1 = RandomizedSearchCV(pipe1,
                              param_distributions = param_dist1,
                             cv=3,
                             n_iter=100)

    test1.fit(X_train, y_train)
    print(test1.score(X_train,y_train))
    print(test1.best_estimator_)
    #print(test.best_score_)
    mask1 = anova_filter.get_support() #list of booleans
    new_features1 = [] # The list of your K best features

    for bool1, feature1 in zip(mask1, X_train.columns.values):
        if bool1:
            new_features1.append(feature1)
    print(i,new_features1)
np.seterr(divide='warn', invalid='warn')

0.9288912639566896
Pipeline(memory=None,
         steps=[('anova',
                 SelectKBest(k=85,
                             score_func=<function mutual_info_regression at 0x1a249ad9e0>)),
                ('pca',
                 PCA(copy=True, iterated_power='auto', n_components=0.95,
                     random_state=None, svd_solver='auto', tol=0.0,
                     whiten=False)),
                ('regressor',
                 AdaBoostRegressor(base_estimator=None, learning_rate=0.3,
                                   loss='exponential', n_estimators=50,
                                   random_state=None))],
         verbose=False)
100 ['MW', 'Sv', 'nBM', 'nCsp2', 'ZM1Mad', 'Xt', 'Psi_e_1', 'piPC02', 'piPC07', 'piPC09', 'piID', 'IDDE', 'VE1_A', 'J_D', 'VE2_X', 'VE2_D/Dt', 'WiA_Dz(Z)', 'VE1_B(m)', 'SpMaxA_B(s)', 'VE1_B(s)', 'ATS8m', 'ATS7e', 'GATS7p', 'GATS8p', 'GATS7i', 'GATS8i', 'SpMax2_Bh(m)', 'SpMax3_Bh(m)', 'SpMax4_Bh(m)', 'SpMax5_Bh(m)', 'SpMax2_Bh(v)', 'SpMax3_Bh(

{'divide': 'ignore', 'over': 'warn', 'under': 'ignore', 'invalid': 'ignore'}

In [46]:
test1.predict(solub_pred_descriptors_new)

array([ 3457.57674712,  3247.88525152,  2325.00164227, 34323.2       ,
        3396.57454833, 31764.46363636,  3396.57454833,  3166.85821455,
        2045.60520625,  2800.34057123,  4015.62860486, 30204.        ,
        3363.9616    ,  2325.00164227,  3522.25868786,  3166.85821455,
        2800.34057123,  2045.60520625,  9179.63500156,  4025.15773435,
        2325.00164227,  9015.5606075 ,  3166.85821455, 30029.2       ,
        3396.57454833, 26817.6       ,  3166.85821455,  9786.79181818,
        3522.25868786, 12439.96526087,  3166.85821455,  3363.9616    ])

In [47]:
pca.n_components_

20

In [None]:
%%time
np.seterr(divide='ignore', invalid='ignore')
for i in [40]:
    anova_filter = SelectKBest(f_regression, k=i)
    pipe1 = Pipeline(steps=[('anova', anova_filter),('pca', pca), ('regressor', reg1)])
    pipe1.fit(X_train, y_train)
    pipe1.score(X_train, y_train)
    param_dist1 = {"anova__score_func": [mutual_info_regression, f_regression],
                "anova__k": sp_randint(i/2, i),
                "pca__n_components": sp_randint(1, 5),
                "regressor__n_estimators": [50, 100],
                'regressor__learning_rate' : [0.01,0.05,0.1,0.3,1],
                'regressor__loss' : ['linear', 'square', 'exponential']}
              
    test1 = RandomizedSearchCV(pipe1,
                              param_distributions = param_dist1,
                             cv=3,
                             n_iter=100)

    test1.fit(X_train, y_train)
    print(test1.score(X_train,y_train))
    print(test1.best_estimator_)
    #print(test.best_score_)
    mask1 = anova_filter.get_support() #list of booleans
    new_features1 = [] # The list of your K best features

    for bool1, feature1 in zip(mask1, X_train.columns.values):
        if bool1:
            new_features1.append(feature1)
    print(i,new_features1)
np.seterr(divide='warn', invalid='warn')

In [None]:
test1.predict(solub_pred_descriptors_new)

In [None]:
%%time
np.seterr(divide='ignore', invalid='ignore')
for i in [100]:
    anova_filter = SelectKBest(f_regression, k=i)
    pipe1 = Pipeline(steps=[('anova', anova_filter),('pca', pca), ('regressor', reg1)])
    pipe1.fit(X_train, y_train)
    pipe1.score(X_train, y_train)
    param_dist1 = {"anova__score_func": [mutual_info_regression, f_regression],
                "anova__k": sp_randint(i/2, i),
                "pca__n_components": sp_randint(1, 5),
                "regressor__n_estimators": [50, 100],
                'regressor__learning_rate' : [0.01,0.05,0.1,0.3,1],
                'regressor__loss' : ['linear', 'square', 'exponential']}
              
    test1 = RandomizedSearchCV(pipe1,
                              param_distributions = param_dist1,
                             cv=3,
                             n_iter=100)

    test1.fit(X_train, y_train)
    print(test1.score(X_train,y_train))
    print(test1.best_estimator_)
    #print(test.best_score_)
    mask1 = anova_filter.get_support() #list of booleans
    new_features1 = [] # The list of your K best features

    for bool1, feature1 in zip(mask1, X_train.columns.values):
        if bool1:
            new_features1.append(feature1)
    print(i,new_features1)
np.seterr(divide='warn', invalid='warn')

In [None]:
test1.predict(solub_pred_descriptors_new)

In [None]:
pca = PCA().fit(X_train)
#Plotting the Cumulative Summation of the Explained Variance
plt.figure()
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('Number of Components')
plt.ylabel('Variance (%)') #for each component
plt.title('Pulsar Dataset Explained Variance')
plt.show()

In [None]:
param_dist = {"anova__score_func": [mutual_info_regression, f_regression],
              "anova__k": sp_randint(5, 40),
              "pca__n_components": sp_randint(1, 5),
              "regressor__alpha": [0.01,0.1,1.0,10.0,100.0]}
              
test = RandomizedSearchCV(pipe,
                          param_distributions = param_dist,
                         cv=3,
                         n_iter=100)

test.fit(newX, newY)

In [None]:
%%time
for i in [10,30,50,80,100]:
    param_dist = {"anova__score_func": [mutual_info_regression, f_regression],
                  "anova__k": sp_randint(5, i),
                  "pca__n_components": sp_randint(1, 5),
                  "regressor__alpha": [0.01,0.1,1.0,10.0,100.0]}
              
    test = RandomizedSearchCV(pipe,
                              param_distributions = param_dist,
                             cv=3,
                             n_iter=100)

    test.fit(newX, newY)
    print(test.score(newX, newY))
    print(test.best_estimator_)
    #print(test.best_score_)
    mask = anova_filter.get_support() #list of booleans
    new_features = [] # The list of your K best features

    for bool, feature in zip(mask, newX.columns.values):
        if bool:
            new_features.append(feature)
    print(i,new_features)

In [None]:
print(test.score(newX, newY))
print(test.best_estimator_)
print(test.best_score_)

In [None]:
%%time
anova_filter = SelectKBest(f_regression, k=40)
reg1 = AdaBoostRegressor()

pipe1 = Pipeline(steps=[('anova', anova_filter),
                      ('pca', pca), ('regressor', reg1)])

#pipe.fit(Xpd, ytrain)
#pipe.score(Xpd, ytrain)
pipe1.fit(newX, newY)
pipe1.score(newX, newY)

In [None]:
np.seterr(divide='ignore', invalid='ignore')

In [None]:
%%time

param_dist1 = {"anova__score_func": [mutual_info_regression, f_regression],
                "anova__k": sp_randint(5, 40),
                "pca__n_components": sp_randint(1, 5),
                "regressor__n_estimators": [50, 100],
                'regressor__learning_rate' : [0.01,0.05,0.1,0.3,1],
                'regressor__loss' : ['linear', 'square', 'exponential']}
              
test1 = RandomizedSearchCV(pipe1,
                param_distributions = param_dist1,
                cv=3,
                n_iter=100)

test1.fit(newX, newY)
print(test1.score(newX, newY))
print(test1.best_estimator_)
    #print(test.best_score_)
mask1 = anova_filter.get_support() #list of booleans
new_features1 = [] # The list of your K best features

for bool, feature1 in zip(mask1, newX.columns.values):
    if bool:
        new_features1.append(feature1)
print(new_features1)
    

In [None]:
np.seterr(divide='warn', invalid='warn')

In [None]:
columns3=['Xt', 'MPC07', 'MPC09', 'piPC02', 'piPC09', 'piPC10', 'IDDE', 'Yindex', 'VE2_A', 'ChiA_D', 'VE2_X', 'Chi_Dt', 'VE2_Dt', 'VE2_D/Dt', 'HyWi_B(m)']

In [None]:
df11=pd.read_excel('Solubility_prediction_descriptors_cleaned.xlsx')

In [None]:
df11[columns3]

In [None]:
df11.columns=='piPC10'

In [None]:
%%time
param_dist1 = {"anova__score_func": [mutual_info_regression, f_regression],
              "anova__k": sp_randint(5, 40),
              "pca__n_components": sp_randint(1, 5),
              "regressor__n_estimators": [50, 100],
              'regressor__learning_rate' : [0.01,0.05,0.1,0.3,1],
              'regressor__loss' : ['linear', 'square', 'exponential']}
              
test1 = RandomizedSearchCV(pipe1,
                          param_distributions = param_dist1,
                         cv=3,
                         n_iter=100)

test1.fit(newX, newY)

In [None]:
print(test1.score(newX, newY))
print(test1.best_estimator_)
print(test1.best_score_)

In [None]:
%%time
param_dist2 = {"anova__score_func": [mutual_info_regression, f_regression],
              "anova__k": sp_randint(5, 20),
              "pca__n_components": sp_randint(1, 5),
              "regressor__n_estimators": [50, 100],
              'regressor__learning_rate' : [0.01,0.05,0.1,0.3,1],
              'regressor__loss' : ['linear', 'square', 'exponential']}
              
test2 = RandomizedSearchCV(pipe1,
                          param_distributions = param_dist2,
                         cv=3,
                         n_iter=100)

test2.fit(newX, newY)

In [None]:
print(test1.score(newX, newY))
print(test1.best_estimator_)
print(test1.best_score_)

In [None]:
#solub_pred=pd.read_excel('soldata_prediction.xls')
solub_pred_descriptors=pd.read_excel('Solubility_prediction_descriptors_cleaned.xlsx')

In [None]:
solub_pred_descriptors.head()

In [None]:
#print(newX)

In [None]:
rr = Ridge(alpha=0.01) # higher the alpha value, more restriction on the coefficients; low alpha > more generalization, coefficients are barely
# restricted and in this case linear and ridge regression resembles
rr.fit(X_train, y_train)

In [None]:
print(len(X_test), len(y_test))
print(len(X_train), len(y_train))

In [None]:
rr100 = Ridge(alpha=100) #  comparison with alpha value
rr100.fit(X_train, y_train)

In [None]:
rr01 = Ridge(alpha=0.1) #  comparison with alpha value
rr01.fit(X_train, y_train)

In [None]:
rr1 = Ridge(alpha=1.0) #  comparison with alpha value
rr1.fit(X_train, y_train)

In [None]:
rr10 = Ridge(alpha=10.0) #  comparison with alpha value
rr10.fit(X_train, y_train)

In [None]:
train_score=lr.score(X_train, y_train)
test_score=lr.score(X_test, y_test)
Ridge_train_score = rr.score(X_train,y_train)
Ridge_test_score = rr.score(X_test, y_test)
Ridge_train_score01 = rr01.score(X_train,y_train)
Ridge_test_score01 = rr01.score(X_test, y_test)
Ridge_train_score1 = rr1.score(X_train,y_train)
Ridge_test_score1 = rr1.score(X_test, y_test)
Ridge_train_score10 = rr10.score(X_train,y_train)
Ridge_test_score10 = rr10.score(X_test, y_test)
Ridge_train_score100 = rr100.score(X_train,y_train)
Ridge_test_score100 = rr100.score(X_test, y_test)

In [None]:
print("linear regression train score:", train_score)
print("linear regression test score:", test_score)
print("ridge regression train score low alpha 0.01:", Ridge_train_score)
print("ridge regression test score low alpha 0.01:", Ridge_test_score)
print("ridge regression train score low alpha 0.1:", Ridge_train_score01)
print("ridge regression test score low alpha 0.1:", Ridge_test_score01)
print("ridge regression train score low alpha 1.0:", Ridge_train_score1)
print("ridge regression test score low alpha 1.0:", Ridge_test_score1)
print("ridge regression train score low alpha 10.0:", Ridge_train_score10)
print("ridge regression test score low alpha 10.0:", Ridge_test_score10)
print("ridge regression train score high alpha 100.0:", Ridge_train_score100)
print("ridge regression test score high alpha 100.0:", Ridge_test_score100)
plt.plot(rr.coef_,alpha=0.7,linestyle='none',marker='*',markersize=5,color='red',label=r'Ridge; $\alpha = 0.01$',zorder=7) # zorder for ordering the markers
plt.plot(rr01.coef_,alpha=0.7,linestyle='none',marker='*',markersize=5,color='blue',label=r'Ridge; $\alpha = 0.1$',zorder=7) # zorder for ordering the markers
plt.plot(rr1.coef_,alpha=0.7,linestyle='none',marker='*',markersize=5,color='magenta',label=r'Ridge; $\alpha = 1.0$',zorder=7) # zorder for ordering the markers
plt.plot(rr10.coef_,alpha=0.7,linestyle='none',marker='*',markersize=5,color='yellow',label=r'Ridge; $\alpha = 10.0$',zorder=7) # zorder for ordering the markers
plt.plot(rr100.coef_,alpha=0.5,linestyle='none',marker='d',markersize=6,color='blue',label=r'Ridge; $\alpha = 100$') # alpha here is for transparency
plt.plot(lr.coef_,alpha=0.4,linestyle='none',marker='o',markersize=7,color='green',label='Linear Regression')
plt.xlabel('Coefficient Index',fontsize=16)
plt.ylabel('Coefficient Magnitude',fontsize=16)
plt.legend(fontsize=13,loc=4)

In [None]:
data2.describe(include='all')

In [None]:
batch=data2[['S0 (mM)','nHAcc','nHet','nHDon','Sv' ,'ZM1Mad' ,'piPC07', 'IDDE', 'SpMin2_Bh(m)', 'P_VSA_MR_7', 'SpMAD_EA','SM10_AEA(bo)', 'Mor03u' ,'Mor27u' ,'Mor11m' ,'Mor15p' ,'E2u' ,'R7u+' ,'nCrs']]

In [None]:
#batch

In [None]:
#data2.corr()

In [None]:
batch.corr()
columns2=['nHAcc','nHet','nHDon','Sv' ,'ZM1Mad' ,'piPC07', 'IDDE', 'SpMin2_Bh(m)', 'P_VSA_MR_7', 'SpMAD_EA','SM10_AEA(bo)', 'Mor03u' ,'Mor27u' ,'Mor11m' ,'Mor15p' ,'E2u' ,'R7u+' ,'nCrs']
from scipy import stats
for i in columns2:
    #print(i)
    pearson_coef, p_value = stats.pearsonr(batch[i], batch['S0 (mM)'])
    print("descriptor= ",i,"Pearson Correlation Coeff= ", pearson_coef, " with P-value= ", p_value)  

In [None]:
set1=['S0 (mM)','Sv' ,'ZM1Mad' ,'piPC07', 'IDDE', 'SpMin2_Bh(m)', 'P_VSA_MR_7', 'SpMAD_EA','SM10_AEA(bo)', 'Mor03u' ,'Mor27u' ,'Mor11m' ,'Mor15p' ,'E2u' ,'R7u+' ,'nCrs']
set2=set1[0]
set2
data2_group=batch.groupby(set2,as_index=False).mean()

In [None]:
data2_group

In [None]:
print(solub2.shape)
print(solub_descriptors.shape)

In [None]:
data2=pd.merge(solub2[['Substance','S0 (mM)']], solub_descriptors,left_index=True,right_index=True)

In [None]:
data2.head(60)

In [None]:
solub2.head(30)

In [None]:
data3.head(106)

In [None]:
solub2.head()

In [None]:
solub_descriptors.head(7)


In [None]:
solub2.dropna(subset=['S0 (mM)'], axis=0)

In [None]:
na_free = solub2['S0 (mM)'].dropna()
remove1=solub2['S0 (mM)'].index.isin(na_free.index)
print(remove1)

In [None]:
na_free.index

In [None]:
solub2.head(7)

In [None]:
solub2.dropna(subset=['S0 (mM)'], axis=0, inplace=True)
solub2.reset_index(drop=True,inplace=True)

In [None]:
solub2.head(7)
#solub2.shape

In [None]:
solub_descriptors[remove1]
#data = pd.concat([solub2[['Substance','S0 (mM)']],solub_descriptors],axis=1)


In [None]:
data.head()

In [None]:
solub2[['Substance','S0 (mM)']]

In [None]:
data = [solub2[['Substance','S0 (mM)']], solub_descriptors[['Sv' ,'ZM1Mad' ,'piPC07', 'IDDE', 'SpMin2_Bh(m)', 'P_VSA_MR_7', 'SpMAD_EA','SM10_AEA(bo)', 'Mor03u' ,'Mor27u' ,'Mor11m' ,'Mor15p' ,'E2u' ,'R7u+' ,'nCrs']]]
solub_data=pd.concat(data,axis=1)

In [None]:
solub_data.head()

In [None]:
solub_data.dtypes

In [None]:
solub_data.describe()

In [None]:
x_train=solub_descriptors[['Sv' ,'ZM1Mad' ,'piPC07', 'IDDE', 'SpMin2_Bh(m)', 'P_VSA_MR_7', 'SpMAD_EA','SM10_AEA(bo)', 'Mor03u' ,'Mor27u' ,'Mor11m' ,'Mor15p' ,'E2u' ,'R7u+' ,'nCrs']]

In [None]:
x_train.head()
from sklearn import preprocessing
min_max_scaler = preprocessing.MinMaxScaler()
x_train_minmax = min_max_scaler.fit_transform(x_train)

In [None]:
print(x_train.head())
print(x_train_minmax)

In [None]:
columns_train=x_train.columns

In [None]:
X_pd_train_minmax=pd.DataFrame(x_train_minmax, columns=columns_train)
Y_pd_train=solub_data['S0 (mM)']

In [None]:
X_pd_train_minmax.head(90)

In [None]:
X_pd_train_minmax.describe()

In [None]:
Y_pd_train.describe()

In [None]:
data_set_all_train_standard=pd.