# manual retry
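WARNING:
R must be reachable from this notebook: https://github.com/IRkernel/IRkernel needs to be installed, and the `%%R` cells below additionally require the Python package `rpy2`.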

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import pprint
from sklearn.pipeline import Pipeline

import utils
import skutils
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import StratifiedShuffleSplit, cross_val_score
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Global random seed; it is handed to evaluateCV, which uses it as the
# random_state of its data splits.
gSeed = 47

# Use ggplot style
plt.style.use('ggplot')

# IPython magic: show matplotlib figures inline in the notebook output.
%matplotlib inline



In [2]:
# Load the training data (loading of the test set is disabled).
big = pd.read_csv('train.csv')
#test = pd.read_csv('test.csv')

# Replace the species labels by the integer codes of their categorical encoding.
big['species'] = big['species'].astype('category').cat.codes

In [3]:
def transform(data):
    """Split a raw data frame into ids, features and target.

    Returns the tuple ``(ID, X, y)``: the ``id`` column, a copy of ``data``
    without the ``species`` and ``id`` columns, and the ``species`` column.
    """
    features = data.drop(labels=['species', 'id'], axis=1)
    return data['id'], features, data['species']

# Split the training frame into the id column, the feature frame and the target labels.
ID, X, y = transform(big)

def addZeroColumn(df, colName):
    """Add an indicator column flagging small values of ``colName``.

    The new column ``<colName>_is_small`` is added to ``df`` in place. It
    holds 1.0 where the value of ``colName`` is below 0.01 and 0.0 everywhere
    else (including rows where ``colName`` is missing).
    """
    # Assign the whole column in one step. Filling the gaps afterwards with
    # ``df[new].fillna(0, inplace=True)`` is an in-place call on a column
    # selection, which pandas does not guarantee to write back to ``df``
    # (under Copy-on-Write the NaN values would simply stay in the frame).
    df[colName + '_is_small'] = (df[colName] < 0.01).astype(float)

def addZeroColumns(df, colBaseName, numColumns=64):
    """Add ``_is_small`` indicator columns for a numbered family of columns.

    Calls ``addZeroColumn`` for the columns ``<colBaseName>1`` up to and
    including ``<colBaseName><numColumns>``; ``df`` is modified in place.
    ``numColumns`` defaults to 64, the count that was previously hard-coded.
    """
    for n in range(1, numColumns + 1):
        addZeroColumn(df, colBaseName + str(n))
        
# Add the "is small" indicator columns for the numbered margin and texture columns of X.
addZeroColumns(X, 'margin')
addZeroColumns(X, 'texture')

In [4]:
# Reload imported modules automatically before code is executed.
%load_ext autoreload
%autoreload 2
# Load the rpy2 IPython extension that provides the %%R, %R, %Rpush and %Rpull
# magics used further down (the rpy2 package has to be installed).
%load_ext rpy2.ipython

ImportError: No module named 'rpy2'

In [None]:
%%R 
# use the R function to create all the same evaluation metrics
library(MLmetrics)

evaluateModel <- function(data,results) {
  # Compute binary classification metrics from "J"/"N" label vectors.
  #
  # data:    actual labels, "J" (positive class) or "N" (negative class)
  # results: predicted labels, "J" or "N"
  #
  # Prints the confusion matrix and returns an unnamed list of 12 named
  # scalars in this order: error rate, accuracy, sensitivity, specificity,
  # precision J, precision N, false positive rate, false negative rate,
  # rate of positive predictions, rate of negative predictions, lift, kappa.

  # Pin both factors to the levels J and N so the table is always 2x2.
  # Without this, a class that is absent from the actual or the predicted
  # labels is missing from the table and indexing such as
  # confMatrix["J","N"] stops with "subscript out of bounds".
  classLevels <- c("J", "N")
  confMatrix <- table(data = factor(data, levels = classLevels),
                      results = factor(results, levels = classLevels))
  print(confMatrix)

  # Rows hold the actual labels, columns the predicted labels.
  tp <- confMatrix["J","J"]
  fn <- confMatrix["J","N"]
  fp <- confMatrix["N","J"]
  tn <- confMatrix["N","N"]
  total <- sum(confMatrix)

  err <- (fn + fp) / total
  acc <- (tp + tn) / total

  tpr <- tp / (tp + fn)
  tnr <- tn / (tn + fp)

  ppv <- tp / (tp + fp)
  npv <- tn / (tn + fn)

  fpr <- fp / (tn + fp)
  fnr <- fn / (tp + fn)

  rpp <- (tp + fp) / total
  # Negative predictions are the "N" column (tn + fn). Summing the actual-"J"
  # row (tp + fn) here would give the rate of actual positives instead.
  rnp <- (tn + fn) / total

  kappaStats <- vcd::Kappa(confMatrix)
  kappaValue <- kappaStats$Unweighted[1]
  names(kappaValue) <- c("kappa")

  lift <- tpr / rpp

  names(err) <- c("Error rate")
  names(acc) <- c("Accuracy")
  names(tpr) <- c("Sensitivity (true positives rate)")
  names(tnr) <- c("Specificity (true negatives rate)")
  names(ppv) <- c("Precision J")
  names(npv) <- c("Precision N")
  names(fpr) <- c("False positive rate")
  names(fnr) <- c("False negative rate")
  names(rpp) <- c("Rate of positive predictions")
  names(rnp) <- c("Rate of negative predictions")
  names(lift) <- c("Lift value")

  list(err, acc, tpr, tnr, ppv, npv, fpr, fnr, rpp, rnp, lift, kappaValue)
}

evaluateAllTheThings <- function(groundTruth, prediction){
    # Evaluate predictions against the ground truth.
    #
    # Computes F1 and AUC with MLmetrics on the values as given, recodes both
    # vectors to "N" (value 0) / "J" (any other value), and returns the metric
    # list of evaluateModel with the F1 and AUC scores appended at the end.
    # NOTE(review): no `positive` class is passed to F1_Score -- confirm which
    # class MLmetrics treats as positive for these labels.
    f1Score <- MLmetrics::F1_Score(y_pred = prediction, y_true = groundTruth)
    aucScore <- MLmetrics::AUC(y_pred = prediction, y_true = groundTruth)
    names(f1Score) <- "f1_R"
    names(aucScore) <- "AUC_R"

    toJN <- function(values) ifelse(values == 0, "N", "J")
    metrics <- evaluateModel(toJN(groundTruth), toJN(prediction))

    # Append both scores behind the existing metrics, F1 first.
    c(metrics, list(f1Score, aucScore))
}

In [None]:
def to_str(val):
    """Return the text between the first pair of double quotes in ``str(val)``.

    Raises ``IndexError`` when the string form contains no double quote.
    """
    pieces = str(val).split('"')
    return pieces[1]


def flatten_dict(d, prefix='__'):
    """Collapse nested dicts into a single-level dict.

    Entries of nested dicts are lifted to the top level under their own key;
    the key of the enclosing dict is dropped. When the same key occurs more
    than once, the value met last wins. ``prefix`` is accepted but not used.
    """
    flat = {}
    for key, value in d.items():
        if isinstance(value, dict):
            # Recurse and merge the already flattened children.
            flat.update(flatten_dict(value))
        else:
            flat[key] = value
    return flat


class Observation:
    """Evaluation record of one model run: a model name plus named metrics."""

    def __init__(self):
        # Keep this attribute order: the instance ``__dict__`` is exported
        # as-is when the results table is built.
        self.statValues = dict()
        self.modelName = str()

    def setModelName(self, nameOfModel):
        """Store the name that identifies the evaluated model."""
        self.modelName = nameOfModel

    def addStatMetric(self, metricName, metricValue):
        """Record ``metricValue`` under ``metricName``, replacing any earlier value."""
        self.statValues.update({metricName: metricValue})

def evalSingleModel(X, y_test, clf, modelName, variant, _verbose):
    y_predicted = clf.predict(X)

    if(_verbose):
        print(classification_report(y_test, y_predicted))
    # send the data to R
    groundTruth = y_test.values

    %Rpush groundTruth
    %Rpush y_predicted
    %R res <- evaluateAllTheThings(groundTruth, y_predicted)
    %Rpull res
    statsResults = dict([[to_str(j.names),j[0]] for i,j in enumerate(res)])
    obs = Observation()
    obs.setModelName(modelName + '-' + variant)
    
    for _kpi, value in statsResults.items():
        obs.addStatMetric(_kpi, value)
        
    obs.addStatMetric('typeOfRun', variant)
    if(_verbose):
        pp = pprint.PrettyPrinter(indent=4)
        pp.pprint(statsResults)
    return obs

def splitOffValidation(X, y, _seed):
    """Hold out a stratified validation set of 30% of the rows.

    Returns ``(X_work, X_validation, y_work, y_validation)``; ``_seed`` is
    used as the random_state of the split. ``X`` and ``y`` are indexed by
    position through ``.iloc``.
    """
    splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=_seed)
    # n_splits=1: the splitter produces a single (work, validation) index pair.
    work_idx, validation_idx = next(splitter.split(X, y))
    return (X.iloc[work_idx], X.iloc[validation_idx],
            y.iloc[work_idx], y.iloc[validation_idx])

def evaluateCV(X, y, pipeline, labelData,allResultsOfModels,_seed, _verbose=True):
    """Cross-validate ``pipeline`` and score it on a held-out validation set.

    The data is first split by ``splitOffValidation`` into a working part and
    a validation part. The working part is split five times (stratified
    shuffle split, 20% test); ``pipeline`` is refitted on each train part and
    scored on the matching test part. Finally ``pipeline`` is fitted on the
    whole working part and scored on the validation part.

    Every score is an Observation from ``evalSingleModel`` and is appended to
    ``allResultsOfModels``; nothing is returned. Fold results are named
    ``<labelData>_<fold>`` and tagged 'training' (although they are computed
    on the fold's test part); the final result is tagged 'validation'.
    ``_seed`` is the random_state of all splits, ``_verbose`` switches the
    progress output on.
    """
    X_work, X_validation, y_work, y_validation = splitOffValidation(X, y, _seed)

    ##############################################################
    ### Train /Test
    split = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=_seed)
    for foldCounter, (train_index, test_index) in enumerate(split.split(X_work, y_work), start=1):
        if _verbose:
            print("###################### Training Fold: ", foldCounter, " #################")
            print("TRAIN:", train_index)
            print("TEST:", test_index)

        # The same pipeline object is refitted for every fold.
        pipeline.fit(X_work.iloc[train_index], y_work.iloc[train_index])

        allResultsOfModels.append(evalSingleModel(
            X_work.iloc[test_index], y_work.iloc[test_index], pipeline,
            labelData + '_' + str(foldCounter), 'training', _verbose))

    #############################################################
    ### Evaluation on validation set
    if _verbose:
        print("###################### Validation #################")

    pipeline.fit(X_work, y_work)
    allResultsOfModels.append(evalSingleModel(X_validation, y_validation, pipeline, labelData, 'validation', _verbose))
    
# Shared accumulator: every evaluateCV call appends its Observation objects to this list.
allResultsOfModels = []

In [None]:
from sklearn.svm import SVC

In [None]:
# Random forests of increasing size. Each label is built from the actual
# number of trees, so the 500-tree forest is recorded as '01_rf500' and not as
# '01_rf1000'. The forests are seeded with gSeed to make repeated runs of this
# cell reproducible.
for nTrees in (10, 100, 500):
    clf = RandomForestClassifier(n_estimators=nTrees, n_jobs=-1, random_state=gSeed)
    evaluateCV(X, y, clf, '01_rf' + str(nTrees), allResultsOfModels, gSeed)

# Support vector classifier with default parameters. It gets a label of its
# own; under '01_rf' the results overview would group it as a random forest.
clf = SVC()
evaluateCV(X, y, clf, '01_svc', allResultsOfModels, gSeed)

In [None]:
# strange - this crashes for me
#clf = LogisticRegression(solver='lbfgs', multi_class='multinomial', n_jobs=-1)
#evaluateCV(X, y, clf, '02_lr', allResultsOfModels, gSeed)

In [None]:
# Export every Observation as its attribute dict, flatten the nested metric
# dict into top-level keys, and build one data frame row per Observation.
results = [observation.__dict__ for observation in allResultsOfModels]

l = [flatten_dict(record) for record in results]
results = pd.DataFrame.from_dict(l)

In [None]:
# Keep only the rows recorded on the cross-validation folds (typeOfRun 'training').
train_res = results[results.typeOfRun != 'validation']
# Group by the second '_'-separated token of modelName (e.g. 'rf10' for
# '01_rf10_1-training') and keep the 'mean' and 'std' statistics per column.
# NOTE(review): the describe().unstack() chain presumably targets a pandas
# version in which groupby describe() returns the statistics as an index
# level -- confirm against the installed pandas version.
overview = train_res.groupby([train_res.modelName.str.split('_').str[1]]).describe().unstack(
    fill_value=0).loc[:,
           pd.IndexSlice[:, ['mean', 'std']]]#[['kappa', 'Lift value', 'False positive rate', 'False negative rate']]
# Flatten the two-level column index into '<column>_<statistic>' names.
overview.columns = ['{0[0]}_{0[1]}'.format(tup) for tup in overview.columns]
# Show the model groups ordered by their mean kappa, highest first.
overview.sort_values('kappa_mean', ascending=False)