In [1]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score
import pandas as pd



# Load the raw data set.
df = pd.read_csv('data_challenge_dataset.csv', encoding='Latin-1')

# Missing values: their meaning is unknown, so they are replaced by -300, a value the
# author chose as an outlier after checking the min and max of the columns.
# Author's note: the mean of Class is 1.72%; Class could be a diagnosis, in which
# case some missing values may have been hidden deliberately.
df1 = df.fillna(-300)

# Every column except 'Class' is a feature; 'Class' is the target.
is_feature = df1.columns != 'Class'
X = df1.loc[:, is_feature]  # feature columns
y = df1['Class']            # target column

# 75% / 25% train / test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

# Random forest on all features; its importances drive the feature selection below.
# Note: cross-validation could be used here to tune for the best accuracy.
clf = RandomForestClassifier(n_estimators=10, random_state=0, n_jobs=-1).fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Keep only the features whose importance is at least 0.06 (noise reduction).
sfm = SelectFromModel(clf, threshold=0.06).fit(X_train, y_train)

# To list the selected features:
#for feature_list_index in sfm.get_support(indices=True):
    #print(df1.columns[feature_list_index])

X_important_train = sfm.transform(X_train)
X_important_test = sfm.transform(X_test)

# Random forest restricted to the selected features.
clf_important = RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1)
clf_important.fit(X_important_train, y_train)
y_important_pred = clf_important.predict(X_important_test)

# Compare the forest trained on all variables with the forest trained only on the
# variables above the 0.06 threshold: print the accuracy difference, then the
# accuracy of the reduced model.
# NOTE(review): clf uses 10 trees and clf_important uses 100, so the difference mixes
# the effect of feature selection with the effect of forest size.
# NOTE(review): with a rare positive class, accuracy is dominated by the majority
# class - consider also reporting ROC AUC / F1 here.
acc_important = accuracy_score(y_test, y_important_pred)
acc_all = accuracy_score(y_test, y_pred)
print(acc_important - acc_all, ',', acc_important)

# The difference in accuracy is small enough that the variables selected by the
# threshold can be used on their own for fitting.


  from numpy.core.umath_tests import inner1d


9.83118451728382e-05 , 0.9994382180275835


In [2]:
# First model: logistic regression on the features selected by the random forest.
# The target is binary (0 / 1), which is why logistic regression is used.
# class_weight='balanced' is set on the estimator below to account for how rarely
# the positive class occurs.

# GridSearchCV is imported from sklearn.model_selection: the old sklearn.grid_search
# module was deprecated in scikit-learn 0.18 and removed in 0.20.
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_auc_score

# The important features are V2, V10, V11, V14, V17; restrict the data to them
# (plus the target) for the classification.
# fillna is applied to the selection and the result assigned, instead of calling
# fillna(inplace=True) on a slice of df, which raises SettingWithCopyWarning.
# Missing values are replaced by the outlier -300 (the author reports that
# fillna(method='pad') did not work).
df2 = df[['V2', 'V10', 'V11', 'V14', 'V17', 'Class']].fillna(-300)

X2 = df2.loc[:, df2.columns != 'Class']  # feature columns
y2 = df2.Class                           # target column

# 75% / 25% train / test split with a fixed seed.
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, test_size=0.25, random_state=0)

# Min-max scale the features; the scaler is fitted on the training split only and
# then applied to the test split.
scaler = MinMaxScaler()
X_train_Scl = scaler.fit_transform(X_train2)
X_test_Scl = scaler.transform(X_test2)

# L1-penalised logistic regression (the author's rationale: feature selection has
# already been done).
# NOTE(review): tol=10 is far looser than scikit-learn's default (1e-4) and may stop
# the solver almost immediately - confirm this is intended.
lr = LogisticRegression(penalty='l1', class_weight = 'balanced', max_iter=40, tol=10)

# Grid over C with the liblinear solver.
# NOTE(review): this is 200 values of C, from 1e-100 to 1e99; a much narrower range
# (e.g. 1e-4 .. 1e4) would probably be sufficient - confirm.
grid = { 'C': np.power(10.0, np.arange(-100, 100)), 'solver': ['liblinear'] }

# One grid search optimising ROC AUC ...
grid_AUC_scores = GridSearchCV(lr, param_grid = grid, scoring = 'roc_auc')
grid_AUC_scores.fit(X_train_Scl, y_train2)

# ... and one optimising accuracy.
Accuracy = GridSearchCV(lr, grid, scoring='accuracy')
Accuracy.fit(X_train_Scl, y_train2)

# The AUC should be above .5 and as close to 1 as possible for the model to be
# distinguishable from a dummy classifier.
print( 'AUC:', grid_AUC_scores.best_score_ ,', accuracy:', Accuracy.best_score_)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  downcast=downcast, **kwargs)


AUC: 0.7818960118963864 , accuracy: 0.9982584677324969


In [3]:
# Logistic regression on the entire feature set: L2 regularisation, 'sag' solver,
# grid search over C.
# Author's note: without class_weight='balanced' the ROC AUC was .53; with
# class_weight='balanced' there is a slight improvement, to .54-.55.

# sklearn.cross_validation was removed in scikit-learn 0.20 and its contents moved to
# sklearn.model_selection. Nothing in this cell uses the name; the alias only keeps
# it bound under the old name.
from sklearn import model_selection as cross_validation

scaler = MinMaxScaler()

# No fillna is needed here: df1 was created as df.fillna(-300), and X_train / X_test
# were split from it, so they contain no missing values.

# Min-max scale the features; the scaler is fitted on the training split only and
# then applied to the test split.
X_train_Scl1 = scaler.fit_transform(X_train)
X_test_Scl1 = scaler.transform(X_test)

# L2-penalised logistic regression. class_weight='balanced' accounts for the target
# classes not having the same frequency; it is a way around adjusting the
# probability threshold.
# NOTE(review): tol=10 is far looser than scikit-learn's default (1e-4) and may stop
# the solver almost immediately - confirm this is intended.
lr2 = LogisticRegression(penalty='l2',class_weight = 'balanced', max_iter=40, tol=10)

# Grid over C (1e-10 .. 1e9) with the 'sag' solver.
grid1 = { 'C': np.power(10.0, np.arange(-10, 10)), 'solver': ['sag'] }

# One grid search optimising ROC AUC ...
grid_AUC_scores1 = GridSearchCV(lr2, param_grid = grid1, scoring = 'roc_auc')
grid_AUC_scores1.fit(X_train_Scl1, y_train)

# ... and one optimising accuracy.
Accuracy1 = GridSearchCV(lr2, grid1, scoring='accuracy')
Accuracy1.fit(X_train_Scl1, y_train)

# The AUC should be above .5 and as close to 1 as possible for the model to be
# distinguishable from a dummy classifier.
print( 'AUC:', grid_AUC_scores1.best_score_ ,', accuracy:', Accuracy1.best_score_)



AUC: 0.5417902316797034 , accuracy: 0.9982584677324969


In [4]:
# Lasso (L1-regularised linear regression) on the filtered feature set.
# Author's note: the model behaves badly. The relationship is very likely nonlinear
# and a grid search will probably not improve the scores much.
# NOTE(review): alpha=.5 is a strong penalty for features scaled to [0, 1]; it may
# shrink every coefficient to zero, which would explain an AUC of exactly 0.5 - confirm.
from sklearn.linear_model import LinearRegression
from sklearn import linear_model

linreg = linear_model.Lasso(alpha=.5)
linreg.fit(X_train_Scl, y_train2)
PredLinY = linreg.predict(X_test_Scl)

# ROC AUC measures how well the scores rank positives above negatives, so it is given
# the continuous predictions; rounding them first would throw the ranking away.
# Accuracy needs hard labels, so it is computed on the rounded predictions.
print('AUC:', roc_auc_score(y_test2, PredLinY), ', accuracy:', accuracy_score(y_test2, PredLinY.round()))

AUC: 0.5 , accuracy: 0.9983146540827504


In [5]:
# RBF-kernel SVC on the filtered feature set, tuned by grid search with 5-fold
# cross-validation (the training data is split in 5).

from sklearn import svm

nfolds = 5
Cs = [0.1, 1, 10]       # candidate values for C (regularisation: simpler model vs. tighter fit)
gammas = [0.01, 1, 10]  # candidate values for gamma (how far one training point's influence reaches)
grid2 = dict(C=Cs, gamma=gammas)

# Search 1: no `scoring` argument, so the estimator's own score method is used.
grid_search = GridSearchCV(svm.SVC(kernel='rbf'), grid2, cv=nfolds)
grid_search.fit(X_train_Scl, y_train2)

# Search 2: same estimator and grid, scored by ROC AUC.
grid_search_AUC = GridSearchCV(svm.SVC(kernel='rbf'), grid2, cv=nfolds, scoring='roc_auc')
grid_search_AUC.fit(X_train_Scl, y_train2)

# NOTE(review): the parameters printed here are the best ones from search 1, while the
# AUC is the best score from search 2; the two may belong to different (C, gamma) pairs.
print(grid_search.best_params_,', AUC:', grid_search_AUC.best_score_) # best parameters


{'C': 10, 'gamma': 10} , AUC: 0.9137948109075315


In [7]:
# Hold-out evaluation of the RBF SVC with the selected parameters (C=10, gamma=10):
# fit on the scaled training split, then report accuracy and F1 on the test split.
import sklearn.metrics as skm

model = svm.SVC(C=10, gamma=10, kernel='rbf')
model.fit(X_train_Scl, y_train2)

PredY = model.predict(X_test_Scl)
test_accuracy = accuracy_score(y_test2, PredY)
test_f1 = skm.f1_score(y_test2, PredY)
print('accuracy:',test_accuracy,', F1 score:', test_f1)

accuracy: 0.9990449706468919 , F1 score: 0.6458333333333335


In [12]:
# Neural network (MLP) with L2 regularisation, tuned by grid search on ROC AUC.
# NOTE(review): the original heading said "entire set", but the search below is fitted
# on X_train_Scl / y_train2, i.e. the filtered feature set - confirm which was intended.

from sklearn.neural_network import MLPClassifier

# alpha=5 is the L2 penalty; 'lbfgs' solver; fixed seed for reproducibility.
MLP = MLPClassifier(solver = 'lbfgs', alpha = 5, random_state = 0)

# Candidate architectures: two hidden layers of 7 and 4 units, or of 30 and 20 units.
grid_values = dict(hidden_layer_sizes=[[7, 4], [30, 20]])

grid_AUC_scores = GridSearchCV(MLP, param_grid = grid_values, scoring = 'roc_auc')
grid_AUC_scores.fit(X_train_Scl, y_train2)

# Disabled experiments kept from the original cell: a direct fit, and an
# accuracy-scored search on the scaled full feature set.
#MLP.fit(X_train_Scl, y_train)
#grid_Acc_scores = GridSearchCV(MLP, param_grid = grid_values, scoring = 'accuracy')
#grid_Acc_scores.fit(X_train_Scl1, y_train)
#print('AUC:',grid_AUC_scores.best_score_,', accuracy:', grid_Acc_scores.best_score_)

print('AUC:',grid_AUC_scores.best_score_)

AUC: 0.6660534371271403
