## Goodness of fit test

Does our full space, or any subspace have a goodness of fit that is normal or t?

## Simulation Scores

Lets explore what happens when we take a sample of 1000 simulated points based on the normal parameters of our full space and subspaces, then combat it against our models that have been claimed to do very well.

 

## Simulation Scores Analysis

This should tell us how well the central tendency of our chemical compounds is predictive of potency. If so, we may have some money, or an interesting conversation.

## Libraries

In [1]:
# Core
import numpy as np
import pandas as pd
import scipy as sp
import scipy.stats
import os
import subprocess
from __future__ import division
import pickle

def install_package(name):
    sudoPassword = ''
    command = 'pip install ' + name
    p = os.system('echo %s|sudo -S %s' % (sudoPassword, command))

# Stats
from statsmodels.regression import linear_model
import statsmodels.api as sm

# ML
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import Lasso
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import AdaBoostClassifier

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

from sklearn.utils import shuffle


# OR
from skcriteria import Data, MIN, MAX
from skcriteria.madm import closeness, simple

# Plotting
import matplotlib.pyplot as plt
import seaborn as sns
from ggplot import *



  from pandas.core import datetools


## Load Data

In [2]:
# MODELS
"""
Models = {'lda':get_LDA_scores, 
          'rf':get_RF_scores,
          'svc':get_SVC_scores,
          'log_reg':get_LogReg_scores,
          'nb':get_NB_scores,
          'knn':get_KNN_scores}
"""

# FEATURE SPACES
mixed_selection_features = pickle.load(open('../pickled/mixed_selection_features.pickle', 'rw'))
mixed_selection_1x_features = pickle.load(open('../pickled/mixed_selection_1x_features.pickle', 'rw'))
lasso_features = pickle.load(open('../pickled/lasso_features.pickle', 'rw'))
topsis_features = pickle.load(open('../pickled/topsis_features.pickle', 'rw'))
rf_features = pickle.load(open('../pickled/rf_features.pickle', 'rw'))
feature_space      = {  'topsis_features':topsis_features, 
                         'rf_features':rf_features, 
                         'lasso_features':lasso_features,
                         'mixed_selection_1x_features':mixed_selection_1x_features,
                         'mixed_selection_features':mixed_selection_features     }

# FULL SPACE DATA SCORES
lda_full_scores     = pickle.load(open('../pickled/lda_full_scores.pickle', 'rw'))
rf_full_scores      = pickle.load(open('../pickled/rf_full_scores.pickle', 'rw'))
svm_full_scores     = pickle.load(open('../pickled/svm_full_scores.pickle', 'rw'))
log_reg_full_scores = pickle.load(open('../pickled/log_reg_full_scores.pickle', 'rw'))
nb_full_scores      = pickle.load(open('../pickled/nb_full_scores.pickle', 'rw'))
knn_full_scores     = pickle.load(open('../pickled/knn_full_scores.pickle', 'rw'))
full_scores         = {'lda':lda_full_scores,
                       'rf':rf_full_scores,
                       'svm':svm_full_scores,
                       'log_reg':log_reg_full_scores,
                       'nb':nb_full_scores,
                       'knn':knn_full_scores}

# DATA IN EACH SUBSPACE
data_per_space=pickle.load(open("../pickled/data_per_space.pickle", 'rw'))


# FULL SPACE C.I.
full_confidence_intervals = pickle.load(open("../pickled/full_confidence_intervals.pickle", "rw"))

# SUB SPACE DATA SCORES
lda_space_scores=pickle.load(open("../pickled/lda_space_scores.pickle", 'rw'))
rf_space_scores=pickle.load(open("../pickled/rf_space_scores.pickle", "rw"))
svm_space_scores=pickle.load(open("../pickled/svm_space_scores.pickle", "rw"))
log_reg_space_scores=pickle.load(open("../pickled/log_reg_space_scores.pickle", "rw"))
nb_space_scores=pickle.load(open("../pickled/nb_space_scores.pickle", "rw"))
knn_space_scores=pickle.load(open("../pickled/knn_space_scores.pickle", "rw"))
sub_scores = {'lda':lda_space_scores,
             'rf':rf_space_scores,
             'svm':svm_space_scores,
             'log_reg':log_reg_space_scores,
             'nb':nb_space_scores,
             'knn':knn_space_scores}

# SUB SPACE C.I.
sub_model_confidence_interval=pickle.load(open("../pickled/sub_model_confidence_interval.pickle", "rw") )

# INITIAL DATA
bigframe = pickle.load(open('../pickled/bigframe.pickle', 'rw'))


## Correctly Partition

In [37]:
main_df = pd.read_csv("../../data/Series3_6.15.17_padel.csv")
main_mean = main_df.iloc[:,2:].mean().as_matrix()
main_cov = main_df.iloc[:,2:].cov().as_matrix()

bigframe = shuffle(bigframe, random_state=0)
bigframe_train, bigframe_test = train_test_split(bigframe.iloc[:,2:], train_size=0.5)

ytrain = bigframe_train["bin2"]
Xtrain = StandardScaler().fit_transform(bigframe_train.iloc[:,1:])

ytest = bigframe_test["bin2"]
Xtest = StandardScaler().fit_transform(bigframe_test.iloc[:,1:])
print("Data ready to model.")

Data ready to model.


In [36]:
main.mean

Unnamed: 0,nAcid,ALogP,ALogp2,AMR,apol,naAromAtom,nAromBond,nAtom,nHeavyAtom,nH,...,AMW,WTPT-1,WTPT-2,WTPT-3,WTPT-4,WTPT-5,WPATH,WPOL,XLogP,Zagreb
0,0,-1.0958,1.200778,23.1572,39.59193,15,17,30,20,10,...,10.200817,40.71361,2.035681,22.242022,4.765434,11.047825,799,31,0.727,110
1,0,1.1547,1.333332,38.023,49.112481,15,16,40,23,17,...,8.202895,48.001779,2.087034,18.848454,0.0,13.231952,1321,34,3.185,124
2,0,0.5906,0.348808,7.7935,31.554344,15,17,23,15,8,...,9.219166,31.354825,2.090322,9.306953,0.0,6.214417,343,21,4.191,80
3,0,2.2246,4.948845,25.3114,36.320758,15,17,23,17,6,...,15.993993,35.091951,2.064232,14.383663,0.0,6.220694,480,27,5.059,92
4,0,1.3358,1.784362,16.7123,33.937551,15,17,23,16,7,...,12.60658,33.230029,2.076877,11.846725,0.0,6.220707,411,23,4.625,86


## Create the simulated test set.

In [31]:
sim_test_set = [np.random.multivariate_normal(bigframe_mean, bigframe_cov ,size=1000)
               for i in range(10)]
pickle.dump(sim_test_set, open("../pickled/sim_test_set.pickle", "w"))
sim_test_set=pickle.load(open("../pickled/sim_test_set.pickle","rw"))
sim_test_set

  from ipykernel import kernelapp as app


[array([[  5.10664640e-03,  -5.01622853e-01,   3.92252097e+00, ...,
           5.01271929e+01,   4.68065472e+00,   1.79612654e+02],
        [ -1.04988757e-01,  -9.29680234e-01,   1.72961963e+00, ...,
           4.71241906e+01,   5.98858280e+00,   1.60273574e+02],
        [  8.56716894e-02,  -1.48545390e+00,   3.17128191e+00, ...,
           2.67537144e+01,   1.05256119e+00,   1.05975495e+02],
        ..., 
        [  1.16230098e-02,  -1.56377922e+00,   5.83931270e+00, ...,
           5.39474674e+01,   5.94545783e+00,   1.71344235e+02],
        [  1.51169191e-01,  -1.99021838e+00,   6.86411180e+00, ...,
           5.27843658e+01,   5.10110881e+00,   1.90614845e+02],
        [  4.30425611e-02,  -3.57320948e-01,  -6.60748520e-01, ...,
           3.50539707e+01,   5.78642025e-01,   1.20382176e+02]]),
 array([[ -1.96880591e-01,  -3.28958764e+00,   1.00717387e+01, ...,
           3.58232406e+01,   7.59459719e-01,   1.24968379e+02],
        [  9.88239124e-02,  -2.15667291e+00,   1.29075578e+0

In [25]:
bigframe_mean.as_matrix().shape

(1445,)

In [26]:
bigframe_cov.as_matrix().shape

(1444, 1444)

In [28]:
bigframe.iloc[:,2:].columns

Index([u'bin2', u'nAcid', u'ALogP', u'ALogp2', u'AMR', u'apol', u'naAromAtom',
       u'nAromBond', u'nAtom', u'nHeavyAtom',
       ...
       u'AMW', u'WTPT-1', u'WTPT-2', u'WTPT-3', u'WTPT-4', u'WTPT-5', u'WPATH',
       u'WPOL', u'XLogP', u'Zagreb'],
      dtype='object', length=1445)