In [1]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from scipy import stats
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report,confusion_matrix
%matplotlib inline

In [2]:
def loaddata():
    col = ['TPSA(Tot)','SAacc','H-050','MLOGP','RDCHI','GATS1p','nN','C-040','quantitative response']
    data = pd.read_csv('qsar_aquatic_toxicity.csv',sep=';',error_bad_lines=False)
    data.columns = col
    return data

In [17]:
def printaccuracy(y_test,predict,model):
    print(model," report")
    print("-------------------------------------")
    print(" ")
    print('MAE:', metrics.mean_absolute_error(y_test, predict))
    print('MSE:', metrics.mean_squared_error(y_test, predict))
    print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, predict)))
    print(" ")
    print("-------------------------------------")
    print(" ")

In [4]:
def normalizedata(X):
    SS = StandardScaler()
    X = SS.fit_transform(X)
    print("Normalization done")
    

In [5]:
def PC(components,x):
    cols = []
    pca = PCA(n_components=components)
    pc = pca.fit_transform(x)
    for i in range(components):
        cols.append('pc'+str(i))
    pc_data = pd.DataFrame(data = pc, columns = cols)
    return pc_data

In [6]:
def removeoutliers(data,inplace=False):
    prev_rows = len(data)
    data_copy = data.copy()
    z_score = np.abs(stats.zscore(data_copy))
    data_copy = data_copy[(z_score < 3).all(axis=1)]
    if inplace:
        data=data_copy
    #print("Before removing outliers , rows - ", prev_rows)
    #print("After removing outliers , rows -", len(data_copy))
    #print("Number of records deleted - ", (prev_rows - len(data_copy))#)

In [7]:
def validatecols(data):
    if len(data.columns) == len(col):
        return True
    else:
        return False

In [8]:
def validatedatatypes(trained, newdata):
    for i in range(trained.columns):
        if trained[trained.columns[i]] != newdata[newdata.columns[i]]:
            return False
    return True    

In [19]:
def preprocess():
    #removeoutliers(data,inplace=True)
    X = data.drop('quantitative response',axis=1)
    X_copy = X.copy()
    y= data['quantitative response']
    #normalizedata(X)
    X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=0)
    return X_train,X_test,y_train,y_test,X_copy,y

In [10]:
def k_vs_error_graph():
    knn_error = []
    for i in range(2,50):
        knn = KNeighborsClassifier(n_neighbors=i)
        knn.fit(X_train,y_train)
        knn_predict= knn.predict(X_test)
        knn_error.append(np.mean(y_test!=knn_predict))
    plt.plot(range(2,50),knn_error)
    plt.xlabel("K value")
    plt.ylabel("Error")

In [11]:
def logisticregression():
    lr = LogisticRegression(solver='lbfgs',max_iter=10000,random_state=0)
    lr.fit(X_train,y_train)
    lr_predict = lr.predict(X_test)
    printaccuracy(y_test,lr_predict,"Logistic Regression")
    return f1_score(y_test,lr_predict)

In [12]:
def KNN():
    neighbors={'n_neighbors':np.array(range(2,50))}
    knn_grid=GridSearchCV(KNeighborsClassifier(),neighbors,verbose=False,refit=True,cv=3)
    knn_grid.fit(X_train,y_train)
    #knn_grid.best_params_
    knn_predict = knn_grid.predict(X_test)
    printaccuracy(y_test,knn_predict,"KNN")
    return f1_score(y_test,knn_predict)

In [13]:
def SVM():
    svm = SVC(kernel='rbf',random_state=0)
    params = { 'C' : np.logspace(0, 3, 4), 'gamma' : np.logspace(-2, 1, 4)}
    svm_grid = GridSearchCV(svm, params,cv=3,verbose=False,return_train_score=True)
    svm_X = X.copy()
    svm_X = PC(2,svm_X)
    svm_X_train,svm_X_test,svm_y_train,svm_y_test=train_test_split(svm_X,y,test_size=0.3)
    svm_grid.fit(svm_X_train,svm_y_train)
    svm_predict = svm_grid.predict(svm_X_test)
    printaccuracy(svm_y_test,svm_predict,"SVM")
    return f1_score(svm_y_test,svm_predictict)

In [14]:
def DecisionTree():
    dt = DecisionTreeClassifier(random_state=0)
    dt.fit(X_train,y_train)
    dt_predict = dt.predict(X_test)
    printaccuracy(y_test,dt_predict,"Decision Tree")
    return f1_score(y_test,dt_predict)

In [15]:
def RandomForest():
    rf = RandomForestClassifier(random_state=0)
    params = { 'n_estimators' : np.arange(10,100,10), 'max_depth' : np.arange(5,50,5)}
    rf_grid = GridSearchCV(rf, params, verbose=False, cv=3)
    rf_grid.fit(X_train,y_train)
    rf_predict = rf_grid.predict(X_test)
    printaccuracy(y_test,rf_predict,"Random Forest")
    rf
    return f1_score(y_test,rf_predict)

In [16]:
def Adaboost():
    ab = AdaBoostClassifier(random_state=0)
    params = { 'n_estimators' : np.arange(10,100,10)}
    ab_grid = GridSearchCV(ab, params, verbose=False, cv=3)
    ab_grid.fit(X_train,y_train)
    ab_predict = ab_grid.predict(X_test)
    printaccuracy(y_test,ab_predict,"Adaboost")
    return f1_score(y_test,ab_predict)

In [17]:
def GaussionNB():
    gnb = GaussianNB()
    gnb.fit(X_train,y_train)
    gnb_predict = gnb.predict(X_test)
    printaccuracy(y_test,gnb_predict,"GaussionNB")
    return f1_score(y_test,gnb_predict)

In [18]:
def NueralNetwork():
    nn = MLPClassifier(solver='sgd',random_state=0)
    params = {
    'hidden_layer_sizes': np.arange(50,150,20),
    'learning_rate': ['constant','adaptive'],
    'max_iter': np.arange(200,300,50)
    }
    #'hidden_layer_sizes': [(100,50), (50,20), (20,10)],
    #'hidden_layer_sizes': np.arange(10,100,20)
    # 'activation': ['tanh', 'relu'],
    # 'alpha': 10.0 ** -np.arange(1, 5),
    #'solver': ['sgd', 'adam'],
    nn_grid = GridSearchCV(nn, params, cv=3,verbose=False)
    nn_grid.fit(X_train,y_train)
    nn_predict = nn_grid.predict(X_test)
    printaccuracy(y_test,nn_predict,"Nueral Network")
    return f1_score(y_test,nn_predict)

In [19]:
def EvaluateModels(f1scores):
    models=['Logistic Regression',
            'K Nearest Neighbours',
            'Support Vector Machine',
            'Decision Tree',
            'Random Forest',
            'AdaBoost',
            'Guassian Naive Bayes',
            'Nueral Network'
           ]

    print("        Model Results    ")
    print(" --------------------------- ")
    for i in range(len(f1scores)):
        print(models[i]," : f1 score - ",f1scores[i])
    print(" -----------------------------")
    print("Best model")
    idx = f1scores.index(np.max(f1scores))
    print(models[idx] ," : f1 score - ", np.max(f1scores))
    
    
    
    
    

In [20]:
def trainmodels():
    f1scores=[]
    f1scores.append(logisticregression())
    f1scores.append(KNN())
    f1scores.append(SVM())
    f1scores.append(DecisionTree())
    f1scores.append(RandomForest())
    f1scores.append(Adaboost())
    f1scores.append(GaussionNB())
    f1scores.append(NueralNetwork())
    return f1scores

In [4]:
data = loaddata()
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 9 columns):
TPSA(Tot)                545 non-null float64
SAacc                    545 non-null float64
H-050                    545 non-null int64
MLOGP                    545 non-null float64
RDCHI                    545 non-null float64
GATS1p                   545 non-null float64
nN                       545 non-null int64
C-040                    545 non-null int64
quantitative response    545 non-null float64
dtypes: float64(6), int64(3)
memory usage: 38.4 KB


In [20]:
data = loaddata()
X_train,X_test,y_train,y_test,X,y = preprocess()
#f1scores = trainmodels()
#EvaluateModels(f1scores)

In [5]:
data.describe()

Unnamed: 0,TPSA(Tot),SAacc,H-050,MLOGP,RDCHI,GATS1p,nN,C-040,quantitative response
count,545.0,545.0,545.0,545.0,545.0,545.0,545.0,545.0,545.0
mean,48.561872,58.977035,0.93945,2.313299,2.494624,1.04696,1.005505,0.354128,4.660106
std,46.760699,68.182392,1.619619,1.743392,0.809925,0.40372,1.39786,0.807426,1.666279
min,0.0,0.0,0.0,-6.446,1.0,0.281,0.0,0.0,0.122
25%,15.79,11.0,0.0,1.228,1.975,0.737,0.0,0.0,3.601
50%,40.46,42.683,0.0,2.273,2.344,1.021,1.0,0.0,4.53
75%,70.14,77.727,1.0,3.395,2.913,1.267,2.0,0.0,5.61
max,347.32,571.952,18.0,9.148,6.439,2.5,11.0,11.0,10.047


In [21]:
def SVM():
    svr = SVR()
    svr.fit(X_train,y_train)
    svr_predict = svr.predict(X_test)
    printaccuracy(y_test,svr_predict,"SVR")

In [6]:
data.head()

Unnamed: 0,TPSA(Tot),SAacc,H-050,MLOGP,RDCHI,GATS1p,nN,C-040,quantitative response
0,0.0,0.0,0,2.638,1.401,0.632,0,0,4.33
1,9.23,11.0,0,5.799,2.93,0.486,0,0,7.019
2,9.23,11.0,0,5.453,2.887,0.495,0,0,6.723
3,9.23,11.0,0,4.068,2.758,0.695,0,0,5.979
4,215.34,327.629,3,0.189,4.677,1.333,0,4,6.064
