## Quick Results

This notebook is an extension to <code>main_analysis</code> in order to provide a quick way to reproduce results.

In [5]:
# Core
import numpy as np
import pandas as pd
import scipy as sp
import scipy.stats
import os
import subprocess
from __future__ import division
import pickle

def install_package(name):
    sudoPassword = ''
    command = 'pip install ' + name
    p = os.system('echo %s|sudo -S %s' % (sudoPassword, command))

# Stats
from statsmodels.regression import linear_model
import statsmodels.api as sm

# ML
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import Lasso
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import AdaBoostClassifier

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

from sklearn.utils import shuffle


# OR
from skcriteria import Data, MIN, MAX
from skcriteria.madm import closeness, simple

# Plotting
import matplotlib.pyplot as plt
import seaborn as sns
from ggplot import *



  from pandas.core import datetools


In [6]:
# MODELS
"""
Models = {'lda':get_LDA_scores, 
          'rf':get_RF_scores,
          'svc':get_SVC_scores,
          'log_reg':get_LogReg_scores,
          'nb':get_NB_scores,
          'knn':get_KNN_scores}
"""

# FEATURE SPACES
mixed_selection_features = pickle.load(open('../pickled/mixed_selection_features.pickle', 'rw'))
mixed_selection_1x_features = pickle.load(open('../pickled/mixed_selection_1x_features.pickle', 'rw'))
lasso_features = pickle.load(open('../pickled/lasso_features.pickle', 'rw'))
topsis_features = pickle.load(open('../pickled/topsis_features.pickle', 'rw'))
rf_features = pickle.load(open('../pickled/rf_features.pickle', 'rw'))
feature_space      = {  'topsis_features':topsis_features, 
                         'rf_features':rf_features, 
                         'lasso_features':lasso_features,
                         'mixed_selection_1x_features':mixed_selection_1x_features,
                         'mixed_selection_features':mixed_selection_features     }

# FULL SPACE DATA SCORES
lda_full_scores     = pickle.load(open('../pickled/lda_full_scores.pickle', 'rw'))
rf_full_scores      = pickle.load(open('../pickled/rf_full_scores.pickle', 'rw'))
svm_full_scores     = pickle.load(open('../pickled/svm_full_scores.pickle', 'rw'))
log_reg_full_scores = pickle.load(open('../pickled/log_reg_full_scores.pickle', 'rw'))
nb_full_scores      = pickle.load(open('../pickled/nb_full_scores.pickle', 'rw'))
knn_full_scores     = pickle.load(open('../pickled/knn_full_scores.pickle', 'rw'))
full_scores         = {'lda':lda_full_scores,
                       'rf':rf_full_scores,
                       'svm':svm_full_scores,
                       'log_reg':log_reg_full_scores,
                       'nb':nb_full_scores,
                       'knn':knn_full_scores}

# DATA IN EACH SUBSPACE
data_per_space=pickle.load(open("../pickled/data_per_space.pickle", 'rw'))


# FULL SPACE C.I.
full_confidence_intervals = pickle.load(open("../pickled/full_confidence_intervals.pickle", "rw"))

# SUB SPACE DATA SCORES
lda_space_scores=pickle.load(open("../pickled/lda_space_scores.pickle", 'rw'))
rf_space_scores=pickle.load(open("../pickled/rf_space_scores.pickle", "rw"))
svm_space_scores=pickle.load(open("../pickled/svm_space_scores.pickle", "rw"))
log_reg_space_scores=pickle.load(open("../pickled/log_reg_space_scores.pickle", "rw"))
nb_space_scores=pickle.load(open("../pickled/nb_space_scores.pickle", "rw"))
knn_space_scores=pickle.load(open("../pickled/knn_space_scores.pickle", "rw"))
sub_scores = {'lda':lda_space_scores,
             'rf':rf_space_scores,
             'svm':svm_space_scores,
             'log_reg':log_reg_space_scores,
             'nb':nb_space_scores,
             'knn':knn_space_scores}

# SUB SPACE C.I.
sub_model_confidence_interval=pickle.load(open("../pickled/sub_model_confidence_interval.pickle", "rw") )

# INITIAL DATA
bigframe = pickle.load(open('../pickled/bigframe.pickle', 'rw'))
bigframe

Unnamed: 0,Name,IC50,bin2,nAcid,ALogP,ALogp2,AMR,apol,naAromAtom,nAromBond,...,AMW,WTPT-1,WTPT-2,WTPT-3,WTPT-4,WTPT-5,WPATH,WPOL,XLogP,Zagreb
0,OSM-S-106,0.036,1,0,-1.0958,1.200778,23.1572,39.591930,15,17,...,10.200817,40.713610,2.035681,22.242022,4.765434,11.047825,799.0,31.0,0.727,110.0
1,OSM-S-107,10.000,1,0,1.1547,1.333332,38.0230,49.112481,15,16,...,8.202895,48.001779,2.087034,18.848454,0.000000,13.231952,1321.0,34.0,3.185,124.0
2,OSM-S-118,50.000,0,0,0.5906,0.348808,7.7935,31.554344,15,17,...,9.219166,31.354825,2.090322,9.306953,0.000000,6.214417,343.0,21.0,4.191,80.0
3,OSM-S-119,50.000,0,0,2.2246,4.948845,25.3114,36.320758,15,17,...,15.993993,35.091951,2.064232,14.383663,0.000000,6.220694,480.0,27.0,5.059,92.0
4,OSM-S-120,50.000,0,0,1.3358,1.784362,16.7123,33.937551,15,17,...,12.606580,33.230029,2.076877,11.846725,0.000000,6.220707,411.0,23.0,4.625,86.0
5,OSM-S-121,8.000,1,0,1.8695,3.495030,22.8145,22.223586,9,10,...,19.067774,22.284522,2.025866,14.175567,0.000000,6.058732,139.0,14.0,2.124,58.0
6,OSM-S-122,50.000,0,0,0.5906,0.348808,7.7935,44.781516,21,24,...,8.729458,44.175287,2.103585,9.464868,0.000000,6.263543,913.0,32.0,6.788,114.0
7,OSM-S-123,50.000,0,0,-0.1593,0.025376,12.0156,33.321137,15,17,...,9.082069,33.223092,2.076443,11.820971,0.000000,8.662751,426.0,23.0,3.276,86.0
8,OSM-S-124,35.000,1,0,0.4443,0.197402,38.4752,49.059481,15,17,...,8.177604,48.056794,2.089426,18.752813,5.799000,9.748505,1221.0,36.0,2.428,124.0
9,OSM-S-125,50.000,0,0,1.2413,1.540826,40.0782,34.319930,9,10,...,11.498954,33.230029,2.076877,18.333672,3.011505,9.696149,411.0,23.0,1.342,86.0


In [12]:
full_scores

{'knn': [0.83287839082360071,
  0.99927210964738211,
  0.83230199342337563,
  0.99924174040508573,
  0.91569621979215299,
  0.99909020329530673,
  0.91609044589598598,
  0.99915074307166207,
  0.99899909556841771,
  0.99930236849621323],
 'lda': [0.66618137699333069,
  0.66648471615720517,
  0.66690926734594846,
  0.50054585152838427,
  0.50006069432720668,
  0.66654536632702577,
  0.83272674332035646,
  0.66563521636324363,
  0.50066715186802524,
  0.16709130617018217],
 'log_reg': [0.99927215380476808,
  0.99866569626394952,
  0.83284799950261112,
  0.9989688146408936,
  0.99939345414440917,
  0.83248409848368843,
  0.66654536632702577,
  0.83260535466594321,
  0.99915089762251341,
  0.9987869966035906],
 'nb': [0.49981802741184556,
  0.49987869966035908,
  0.49987869966035903,
  0.49981800533315246,
  0.49975733308463899,
  0.5,
  0.49981800533315246,
  0.58309073265405142,
  0.66642399975130562,
  0.58309068849666534],
 'rf': [0.83287845705967978,
  0.91606016496846188,
  0.9990597

In [18]:
#full_scores["lda"] vs. sub_scores["lda"]["mixed_selection_features"]
#full_scores["svm"] vs. sub_scores["svm"]["rf_features"]
#full_scores["log_reg"] vs. sub_scores["log_reg"]["topsis_features"]
#full_scores["nb"] vs. sub_scores["nb"]["rf_features"]
#full_scores["knn"] vs. sub_Scores["knn"]["rf_features"]
import scipy.stats as stats

print("(LDA) Is Reduced set more predictive than Full set?")
for k,v in sub_scores["lda"].items():    
    lda_pval=stats.ttest_ind(v,  # >=
                    full_scores["lda"], 
                    axis=0, equal_var=True)
    print(k,lda_pval[1]/2)

print("(SVM) Small P means 2nd > 1st")
for k,v in sub_scores["svm"].items(): 
    svm_pval=stats.ttest_ind(v,  # >=
                    full_scores["svm"], 
                    axis=0, equal_var=True)
    print(k, svm_pval[1]/2)

print("(Log_Reg) Small P means 2nd > 1st")
for k,v in sub_scores["log_reg"].items(): 
    log_reg_pval=stats.ttest_ind(v,  # >=
                    full_scores["log_reg"], 
                    axis=0, equal_var=True)
    print(k, log_reg_pval[1]/2)

print("(NB) Small P means 2nd > 1st")
for k,v in sub_scores["nb"].items(): 
    nb_pval=stats.ttest_ind(v,  # >=
                    full_scores["nb"], 
                    axis=0, equal_var=True)

    print(k, nb_pval[1]/2)
print("(KNN) Small P means 2nd > 1st")
for k,v in sub_scores["knn"].items(): 
    knn_pval=stats.ttest_ind(v,  # >=
                    full_scores["knn"], 
                    axis=0, equal_var=True)
    print(k,knn_pval[1]/2)
print("(RF) Small P means 2nd > 1st")
for k,v in sub_scores["rf"].items(): 
    rf_pval=stats.ttest_ind(v,  # >=
                    full_scores["rf"], 
                    axis=0, equal_var=True)
    print(k,rf_pval[1]/2)
# LDA topsis_features 0.380512957039085
# SVM lasso_features 0.21698345384201057
# Log_Reg mixed_selection_features 0.37566324944873475
# NB lasso_features 0.000000002 reject null fullspace is greater.
# KNN mixed_selection_features 0.34511637590581723

(LDA) Is Reduced set more predictive than Full set?
('topsis_features', 0.380512957039085)
('lasso_features', 0.098324070265796759)
('mixed_selection_1x_features', 0.028071975619860336)
('mixed_selection_features', 0.008654197441309408)
('rf_features', 0.05777661125381079)
(SVM) Small P means 2nd > 1st
('topsis_features', 0.00030688569037786356)
('lasso_features', 0.21698345384201057)
('mixed_selection_1x_features', 0.0086206004775333696)
('mixed_selection_features', 0.0048227973496748257)
('rf_features', 0.00018036814977609173)
(Log_Reg) Small P means 2nd > 1st
('topsis_features', 0.018826118300759656)
('lasso_features', 0.0046659740282534758)
('mixed_selection_1x_features', 0.17932548819272764)
('mixed_selection_features', 0.37566324944873475)
('rf_features', 0.19716495543953705)
(NB) Small P means 2nd > 1st
('topsis_features', 8.9513312699864532e-14)
('lasso_features', 2.3795197121061658e-09)
('mixed_selection_1x_features', 3.2311275394699811e-05)
('mixed_selection_features', 7.6513

In [None]:
# Recap
# Feature Space

full_confidence_intervals={}
def mean_confidence_interval(data, confidence=0.95):
    a = 1.0*np.array(data)
    n = len(a)
    m, se = np.mean(a), scipy.stats.sem(a)
    h = se * sp.stats.t._ppf((1+confidence)/2., n-1)
    return m-h, m+h

for k,v in full_scores.items():
    
    full_confidence_intervals[k] = mean_confidence_interval(v)

full_confidence_intervals

In [9]:
pd.DataFrame({'feature':feature_space["lasso_features"]}).to_csv("lasso_features.csv", sep=",")
pd.DataFrame({'feature':feature_space["mixed_selection_features"]}).to_csv("mixed_selection_features.csv", sep=",")
pd.DataFrame({'feature':feature_space["mixed_selection_1x_features"]}).to_csv("mixed_selection_1x_features.csv", sep=",")
pd.DataFrame({'feature':feature_space["topsis_features"]}).to_csv("topsis_features.csv", sep=",")
pd.DataFrame({'feature':feature_space["rf_features"]}).to_csv("rf_features.csv", sep=",")





In [3]:
full_scores

NameError: name 'full_scores' is not defined

In [9]:
feature_space.keys()

['pickled/mixed_selection_1x_features',
 'pickled/mixed_selection_features',
 'pickled/topsis_features',
 'pickled/lasso_features',
 'pickled/rf_features']

In [15]:
for k,v in feature_space.items():
    print(len(v))

46
33
91
28
38


{'knn': (0.89895714076062472, 0.99944752132321191),
 'lda': (0.45475543878895508, 0.71181409929122663),
 'log_reg': (0.83175376297037928, 0.99999040344149925),
 'nb': (0.49150592186912728, 0.57480891640797893),
 'rf': (0.85079240358872865, 0.93115190396781911),
 'svm': (0.9991957980838384, 0.9993300340753497)}