In [152]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from scipy.stats import mode

In [153]:
# Load the feature table (first CSV column becomes the row index) and drop
# the two descriptive columns that are not used downstream.
dataset = (
    pd.read_csv('data/brain_tumor_dataset.csv', index_col=0)
    .drop(columns=['image_name', 'class_name'])
)

dataset

Unnamed: 0,mean,variance,std,skewness,kurtosis,entropy,contrast,dissimilarity,homogeneity,asm,energy,correlation,class
0,61.404978,3838.785287,61.957932,0.947800,0.238857,14.884915,456.794107,11.602917,0.322584,0.006458,0.080363,0.940544,0
1,85.822325,8252.643519,90.844062,0.850352,-0.651271,17.351693,129.419093,5.408986,0.476613,0.075636,0.275020,0.992161,2
2,42.516818,3767.802299,61.382427,1.438846,1.309734,16.815211,132.465535,4.752191,0.596372,0.254782,0.504760,0.982437,0
3,61.010380,2452.392131,49.521633,0.477713,-0.256893,17.441209,61.453243,4.125462,0.432363,0.009812,0.099058,0.987458,3
4,30.124722,1406.812993,37.507506,1.320606,1.318425,16.898935,25.445848,2.229429,0.645524,0.070971,0.266403,0.990962,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3201,57.453850,1725.060584,41.533849,0.617645,1.086687,17.559813,39.304546,3.051932,0.478069,0.006733,0.082053,0.988592,3
3202,39.196328,1860.866781,43.137765,1.389074,2.615294,14.628598,230.485699,7.462348,0.395298,0.071769,0.267897,0.938115,0
3203,35.322576,1444.718532,38.009453,1.255980,2.149883,16.545681,119.112805,5.286287,0.467381,0.088983,0.298300,0.958792,0
3204,55.876904,2996.048083,54.736168,0.745804,-0.180871,17.252259,34.421305,2.834611,0.523024,0.016794,0.129591,0.994256,2


In [134]:
# Split the table into a feature matrix and a label vector. The label
# (`class`) is the last column, so every column before it is a feature.
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

# Hold out 20% of the rows for testing. The feature matrix is named `X`
# (upper case); passing a lower-case `x` here raised a NameError.
# A fixed seed keeps the split reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

Unnamed: 0,mean,variance,std,skewness,kurtosis,entropy,contrast,dissimilarity,homogeneity,asm,energy,correlation
2230,79.580706,7986.815269,89.368984,0.629460,-0.963562,14.685605,341.652110,6.924378,0.530912,0.179508,0.423684,0.978630
550,48.133879,2558.013719,50.576810,0.787765,-0.332754,17.064253,44.192575,3.480668,0.492614,0.057758,0.240328,0.991364
2015,73.718238,6388.582615,79.928609,0.723898,-0.474584,14.766872,568.879156,11.368878,0.438676,0.131322,0.362384,0.955504
540,32.691471,1602.602362,40.032516,1.037502,0.147772,16.889949,38.674115,2.861737,0.594351,0.088420,0.297355,0.987942
2982,20.247963,822.674957,28.682311,1.635198,3.041553,16.662520,19.606432,1.576818,0.757446,0.139758,0.373842,0.988095
...,...,...,...,...,...,...,...,...,...,...,...,...
1697,51.332851,2446.646149,49.463584,0.734615,-0.344179,17.281369,36.677222,3.297529,0.477628,0.014804,0.121674,0.992505
2586,33.776344,1716.032902,41.425027,1.000472,0.484231,16.844153,45.491591,2.518851,0.677619,0.104608,0.323431,0.986754
1871,39.627712,2030.474031,45.060782,1.186853,1.300900,17.070429,30.574207,2.347721,0.632596,0.050494,0.224708,0.992475
776,75.115777,6883.399378,82.966254,1.253906,0.344116,15.257065,435.219801,10.195177,0.354213,0.046815,0.216369,0.968410


In [223]:
def get_models():
    """Build the candidate classifiers to compare.

    Returns:
        dict: maps a short model name to a freshly constructed, unfitted
        estimator. Every estimator uses its default hyper-parameters; the
        'stacking' entry is whatever ``get_stacking()`` returns.
    """
    return {
        'lr': LogisticRegression(),
        'knn': KNeighborsClassifier(),
        'cart': DecisionTreeClassifier(),
        'svm': SVC(),
        'bayes': GaussianNB(),
        'rf': RandomForestClassifier(),
        'gb': GradientBoostingClassifier(),
        'stacking': get_stacking(),
    }

In [224]:

# compare standalone models for multi-class classification
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from matplotlib import pyplot

In [225]:
# evaluate a given model using cross-validation
def evaluate_model(model, x, y):
    """Score ``model`` on ``(x, y)`` with repeated stratified k-fold CV.

    Args:
        model: an unfitted scikit-learn estimator.
        x: feature matrix passed to ``cross_val_score``.
        y: target labels, also used to stratify the folds.

    Returns:
        Array of accuracy scores, one per fold per repeat
        (10 splits x 3 repeats).

    Raises:
        Any exception raised while fitting or scoring a fold is propagated
        (``error_score='raise'``).
    """
    # Fixed random_state so every model is scored on the same folds.
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    # Score the data passed in as `x`; the previous version ignored the
    # argument and silently used the notebook-global `X` instead.
    scores = cross_val_score(model, x, y, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
    return scores

In [242]:
# Cross-validate every candidate model and report its mean (std) accuracy.
# `results` and `names` are filled in parallel, one entry per model.
models = get_models()
results = []
names = []

for name in models:
    model = models[name]
    scores = evaluate_model(model, X, y)
    names.append(name)
    results.append(scores)
    print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))


>lr 0.571 (0.020)
>knn 0.624 (0.023)
>cart 0.755 (0.022)
>svm 0.468 (0.025)
>bayes 0.551 (0.015)
>rf 0.824 (0.018)
>gb 0.774 (0.017)
>stacking 0.820 (0.016)


In [241]:
# get a stacking ensemble of models
def get_stacking():
    """Build the stacking ensemble used in the model comparison.

    Returns:
        StackingClassifier: unfitted ensemble with three base learners
        (decision tree, Gaussian naive Bayes, random forest) and a logistic
        regression meta learner, using ``cv=4``.
    """
    # Level-0 (base) learners. The other candidates (lr, knn, svm, gb) are
    # not used as base learners here.
    base_learners = [
        ('cart', DecisionTreeClassifier()),
        ('bayes', GaussianNB()),
        ('rf', RandomForestClassifier()),
    ]
    # Level-1 (meta) learner that combines the base learners' predictions.
    meta_learner = LogisticRegression()
    return StackingClassifier(estimators=base_learners, final_estimator=meta_learner, cv=4)