# Solubility Challenge

### originally by team: C Di Paola, J. Manson and K. Makobe
### revised and adapted By: C. Di Paola

## Import necessary initial libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
# Print NumPy arrays in full: with an infinite threshold they are never
# summarised with "...".
np.set_printoptions(threshold=np.inf)
# Disabled IPython magic; presumably loads the third-party `autotime`
# extension for per-cell timing -- TODO confirm before re-enabling.
#%load_ext autotime

## Read files for analysis and prediction

We need the solubility data and DRAGON 2D descriptors for the training set, and the DRAGON 2D descriptors for the prediction data set.

In [None]:
# Training set: solubility table and its descriptor table.
solub_train_data = pd.read_excel('soldata_trainingset.xls')
solub_train_descriptors = pd.read_excel('Solubility_training_descriptors_cleaned.xlsx')

# Prediction set: reference solubility table and its descriptor table.
solub_pred_data = pd.read_excel('soldata_prediction_withSvalues.xlsx')
solub_pred_descriptors = pd.read_excel('Solubility_prediction_descriptors_cleaned.xlsx')

## Pre-processing data

* training data shape and formatting

In [None]:
# Preview the first rows of the training solubility table.
solub_train_data.head()

In [None]:
# Preview the first rows of the training descriptor table.
solub_train_descriptors.head()

In [None]:
# Preview the first rows of the prediction-set solubility table.
solub_pred_data.head()

In [None]:
# Convert the reference solubility column to numbers; entries that cannot be
# parsed become NaN (errors='coerce').
ref_solubility_col = 'Solubility (from findings) (micro M)'
solub_pred_data[ref_solubility_col] = pd.to_numeric(
    solub_pred_data[ref_solubility_col],
    errors='coerce',
)

In [None]:
# Display the reference solubility column of the prediction table.
solub_pred_data['Solubility (from findings) (micro M)']


In [None]:
#for i,aa in enumerate(solub_train_descriptors['NAME']):
#    print(i+1)
#for j,columns in enumerate(solub_train_descriptors.columns):
    #print(j,columns)
    #unique, counts = np.unique(solub_train_descriptors[columns], return_counts=True)
    #print(unique,counts)

* **Searching for missing solubility (S0) data in the form of null/NaN values**

In [None]:
# Count missing solubility values in both tables. `isna` and `isnull` are
# aliases in pandas, so each pair of counts is expected to be identical.
train_s0 = solub_train_data['S0 (mM)']
pred_s0 = solub_pred_data['Solubility (from findings) (micro M)']
for missing_mask in (train_s0.isna(), train_s0.isnull(), pred_s0.isnull(), pred_s0.isna()):
    print(missing_mask.value_counts())

* **check the data are in the right format**

In [None]:
# Report the size of the training table and the dtypes of the two columns
# used later (substance name and S0).
train_shape = solub_train_data.shape
print(train_shape)
train_dtypes = solub_train_data[['Substance', 'S0 (mM)']].dtypes
print(train_dtypes)

In [None]:
# Report the size and column dtypes of the training descriptor table.
print(solub_train_descriptors.shape)
print(solub_train_descriptors.dtypes) # truncated list of data types
# Alternative (disabled): info(verbose=True) for the full list of data types.
#print(solub_train_descriptors.info(verbose=True)) # full list of data types

In [None]:
# Report the size and column dtypes of the prediction descriptor table.
print(solub_pred_descriptors.shape)
print(solub_pred_descriptors.dtypes) # truncated list of data types
# Alternative (disabled): info(verbose=True) for the full list of data types.
#print(solub_pred_descriptors.info(verbose=True)) # full list of data types

In [None]:
# Report the size and column dtypes of the prediction-set solubility table.
pred_shape = solub_pred_data.shape
print(pred_shape)
pred_dtypes = solub_pred_data.dtypes
print(pred_dtypes)

* The descriptor data need to be scaled to the same range of values [0,1]: the MinMaxScaler from the scikit-learn library does this for us

In [None]:
from sklearn import preprocessing
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# Scaler that rescales each feature to the [0, 1] range (MinMaxScaler default).
min_max_scaler = preprocessing.MinMaxScaler()

In [None]:
# Keep the column indexes of both descriptor tables; the scaling cells slice
# them with [2:] to skip the first two columns.
columns_train_descrit=solub_train_descriptors.columns
columns_pred_descrit=solub_pred_descriptors.columns

In [None]:
# Min-max scale every training descriptor. The first two columns are skipped
# (presumably identifier columns -- verify against the spreadsheet).
train_descriptor_names = columns_train_descrit[2:]
x_train_all_minmax = min_max_scaler.fit_transform(
    solub_train_descriptors[train_descriptor_names]
)
solub_train_descriptors_values = pd.DataFrame(
    data=x_train_all_minmax,
    columns=train_descriptor_names,
)

In [None]:
# Summary statistics of the scaled training descriptors.
solub_train_descriptors_values.describe()

In [None]:
# Scale the prediction descriptors with the TRAINING-set minima/maxima.
# Re-fitting the scaler on the prediction set would map the two data sets onto
# different scales, so a model trained on the training features would see
# inconsistently scaled inputs at prediction time.
#
# A scaler can only transform the columns it was fitted on, so it is fitted on
# the descriptors that both tables share. Prediction-only descriptors are
# dropped here; the column-matching step further down discards them anyway.
# Scaled prediction values may fall outside [0, 1] when a molecule lies
# outside the training range -- this is expected.
train_name_lookup = set(columns_train_descrit[2:])
shared_descriptor_names = [
    name for name in columns_pred_descrit[2:] if name in train_name_lookup
]
# Separate scaler instance so `min_max_scaler` keeps its training-set fit.
pred_scaler = preprocessing.MinMaxScaler()
pred_scaler.fit(solub_train_descriptors[shared_descriptor_names])
x_pred_all_minmax = pred_scaler.transform(solub_pred_descriptors[shared_descriptor_names])
solub_pred_descriptors_values = pd.DataFrame(x_pred_all_minmax, columns=shared_descriptor_names)

In [None]:
# Summary statistics of the scaled prediction descriptors.
solub_pred_descriptors_values.describe()

In [None]:
# Pairwise correlation matrices of the scaled descriptors
# (DataFrame.corr computes Pearson correlation by default).
corr_train=solub_train_descriptors_values.corr()
corr_pred=solub_pred_descriptors_values.corr()

In [None]:
#(solub_train_descriptors_values == 0.).sum() > 366.5

In [None]:
#drop_cols=solub_train_descriptors_values[(solub_train_descriptors_values == 0.).sum() > 366.5]

In [None]:
# Display the training-descriptor correlation matrix.
corr_train

* **Correlation heatmap: load seaborn lib**

* **Restricting training and prediction data to the same descriptors**

In [None]:
# Boolean mask over the scaled prediction descriptor columns: True where the
# column name also appears in the (unscaled) training descriptor table.
bool_same_descriptors=solub_pred_descriptors_values.columns.isin(solub_train_descriptors.columns)
# Disabled debugging helpers kept from the original notebook:
#unique, counts = np.unique(a, return_counts=True)
#print(unique,counts)
#print(bool_same_descriptors) # check the boolean list

In [None]:
# Keep only the prediction descriptor columns flagged by the mask.
solub_pred_descriptors_new=solub_pred_descriptors_values.loc[:,bool_same_descriptors]

In [None]:
# (rows, columns) of the filtered prediction descriptors.
solub_pred_descriptors_new.shape

In [None]:
# Boolean mask over the scaled training descriptor columns: True where the
# column name also appears in the (unscaled) prediction descriptor table.
bool_same_descriptors1=solub_train_descriptors_values.columns.isin(solub_pred_descriptors.columns)
# Disabled debugging helpers kept from the original notebook:
#unique, counts = np.unique(b, return_counts=True)
#print(unique,counts)
#print(bool_same_descriptors1) # check the boolean list

In [None]:
# Keep only the training descriptor columns flagged by the mask.
solub_train_descriptors_new=solub_train_descriptors_values.loc[:,bool_same_descriptors1]

In [None]:
# (rows, columns) of the filtered training descriptors.
solub_train_descriptors_new.shape

In [None]:
# Correlation matrices recomputed on the filtered descriptor tables.
corr_train1=solub_train_descriptors_new.corr()
corr_pred1=solub_pred_descriptors_new.corr()

* **clean data from different isomer forms (only DRAGON 2D descriptors available)**

**clean the training set**

In [None]:
# Show training rows whose substance name contains 'form'.
solub_train_data[solub_train_data['Substance'].str.contains('form')]

In [None]:
# Drop every training row whose substance name contains 'form_II'.
form_ii_rows = solub_train_data['Substance'].str.contains('form_II')
solub_train_data1 = solub_train_data[~form_ii_rows]

In [None]:
# Show the rows of the filtered training table that still contain 'form'.
solub_train_data1[solub_train_data1['Substance'].str.contains('form')]

In [None]:
# Remove the '_form_I' tag from the substance names.
# The cleaned column is assigned back explicitly: calling
# `.replace(..., inplace=True)` on a column selected from a filtered frame is
# chained assignment, which emits warnings on pandas 2.x and does not modify
# the frame at all under Copy-on-Write. `assign` returns a new frame, so no
# view of `solub_train_data` is mutated.
solub_train_data1 = solub_train_data1.assign(
    Substance=solub_train_data1['Substance'].replace(
        to_replace=r'_form_I', value=r'', regex=True
    )
)

In [None]:
# Spot-check: show rows whose substance name contains 'phthalic'.
solub_train_data1[solub_train_data1['Substance'].str.contains('phthalic')]

In [None]:
# Renumber the rows 0..n-1 after filtering; the later merge with the
# descriptor table joins on this index.
solub_train_data1.reset_index(drop=True,inplace=True)

In [None]:
# (rows, columns) of the cleaned training table.
solub_train_data1.shape

**clean the reference set**

In [None]:
# Show reference rows whose name contains '_I' (this also matches '_II',
# '_III' and '_IV').
solub_pred_data[solub_pred_data['name'].str.contains('_I')]

In [None]:
# Drop every reference row whose name is tagged '_II', '_III' or '_IV'.
# A single mask is built on the original frame and applied once. Previously
# the second and third masks came from the unfiltered frame but were applied
# to the already-filtered one, which relies on implicit index re-alignment
# (pandas warns "Boolean Series key will be reindexed").
# `.copy()` makes the result an independent frame for the in-place steps below.
higher_form_rows = solub_pred_data['name'].str.contains(r'_(?:II|III|IV)')
solub_pred_data1 = solub_pred_data[~higher_form_rows].copy()

In [None]:
# Inspect the filtered reference table.
solub_pred_data1

In [None]:
# Remove the '_I' tag from the compound names.
# The replacement is limited to the 'name' column (a frame-wide regex replace
# would also rewrite any other text column containing '_I'), mirroring how the
# training set is cleaned on its 'Substance' column only. The result is
# assigned back rather than replaced in place, so no slice of
# `solub_pred_data` is mutated.
solub_pred_data1 = solub_pred_data1.assign(
    name=solub_pred_data1['name'].replace(to_replace=r'_I', value=r'', regex=True)
)

In [None]:
# Inspect the reference table after the '_I' removal.
solub_pred_data1

In [None]:
# Renumber the rows 0..n-1 after filtering (the later merge with the
# descriptor table joins on this index), then show the table size.
solub_pred_data1.reset_index(drop=True,inplace=True)
solub_pred_data1.shape

In [None]:
# Inspect the cleaned, re-indexed reference table.
solub_pred_data1

* **clean data from null/NaN values of solubility S0**

In [None]:
# Attach the scaled descriptors to each training substance and its S0 value.
# The join is on the row index of both tables.
# NOTE(review): this assumes row i of the cleaned solubility table and row i
# of the descriptor table describe the same compound -- confirm, since rows
# were dropped from the solubility table above.
train_targets = solub_train_data1[['Substance', 'S0 (mM)']]
S0_train_descrip = train_targets.merge(
    solub_train_descriptors_new,
    left_index=True,
    right_index=True,
)

In [None]:
# Preview the merged training table (name, S0, descriptors).
S0_train_descrip.head()

In [None]:
# Discard training rows without a measured S0 and renumber the remainder.
S0_train_descrip = (
    S0_train_descrip
    .dropna(axis=0, subset=['S0 (mM)'])
    .reset_index(drop=True)
)

In [None]:
# Attach the scaled descriptors to each reference compound and its solubility.
# The join is on the row index of both tables.
# NOTE(review): this assumes row i of the cleaned reference table and row i of
# the descriptor table describe the same compound -- confirm, since rows were
# dropped from the reference table above.
pred_targets = solub_pred_data1[['name', 'Solubility (from findings) (micro M)']]
S0_pred_descrip = pred_targets.merge(
    solub_pred_descriptors_new,
    left_index=True,
    right_index=True,
)

In [None]:
# Inspect the merged reference table (name, solubility, descriptors).
S0_pred_descrip

In [None]:
# Discard reference rows without a solubility value and renumber the remainder.
S0_pred_descrip = (
    S0_pred_descrip
    .dropna(axis=0, subset=['Solubility (from findings) (micro M)'])
    .reset_index(drop=True)
)

* create final training and prediction sets

In [None]:
# Column index of the merged training table: name, target, then descriptors.
columns_S0_descrip=S0_train_descrip.columns
print(columns_S0_descrip)

In [None]:
# Preview everything except the name column (target + descriptors).
S0_train_descrip[columns_S0_descrip[1:]].head()

In [None]:
# Column index of the merged reference table: name, target, then descriptors.
columns_S0_predict=S0_pred_descrip.columns
print(columns_S0_predict)

In [None]:
# Preview everything except the name column (target + descriptors).
S0_pred_descrip[columns_S0_predict[1:]].head()

In [None]:
# Build the feature matrices and target vectors for modelling.
# The first two columns of each merged table are the compound name and the
# solubility value; everything after them is a descriptor.
columns_descriptors = S0_train_descrip.columns[2:]
columns_descriptors1 = S0_pred_descrip.columns[2:]

# A fitted model identifies features by position, so both matrices must hold
# exactly the same descriptors in exactly the same order. Fail loudly if the
# two tables disagree instead of silently feeding mismatched features.
mismatched_descriptors = columns_descriptors.symmetric_difference(columns_descriptors1)
if not mismatched_descriptors.empty:
    raise ValueError(
        'Training and prediction descriptor sets differ: '
        f'{list(mismatched_descriptors)}'
    )

X_train = S0_train_descrip[columns_descriptors]
# Select with the TRAINING column order so X_pred lines up with X_train.
X_pred = S0_pred_descrip[columns_descriptors]

# NOTE(review): the training target column is labelled mM while the reference
# column is labelled micro M -- confirm the units are reconciled before
# comparing predictions with y_ref.
y_train = S0_train_descrip['S0 (mM)']
y_ref = S0_pred_descrip['Solubility (from findings) (micro M)']

In [None]:
# (samples, descriptors) of the training feature matrix.
X_train.shape

In [None]:
# (samples, descriptors) of the prediction feature matrix.
X_pred.shape

In [None]:
# Number of training targets; should equal the row count of X_train.
y_train.shape

In [None]:
# Number of reference targets; should equal the row count of X_pred.
y_ref.shape

## Model prediction

* PCA, ANOVA, Ridge or AdaBoost regressors

In [None]:
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import AdaBoostRegressor
from sklearn.feature_selection import f_regression, chi2
from sklearn.feature_selection import mutual_info_regression
from scipy.stats import randint as sp_randint
from sklearn import preprocessing