In [1]:
# Libraries: Standard ones
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random as rnd

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split 

In [2]:
# Load the data: data_banknote_authentification
data1 = pd.read_csv("data_banknote_authentication.csv",sep=";")
#print(data)
data1.describe() # Statistical summary of the data

#Data were extracted from images that were taken from genuine and forged banknote-like specimens. For digitization, an industrial camera usually used for print inspection was used. The final images have 400x 400 pixels. Due to the object lens and distance to the investigated object gray-scale pictures with a resolution of about 660 dpi were gained. Wavelet Transform tool were used to extract features from images.

# variance
# skewness
# curtosis
# entropy
# class

Unnamed: 0,var,skew,curt,ent,class
count,1373.0,1373.0,1373.0,1373.0,1373.0
mean,0.436057,1.927265,1.394565,-1.191114,0.444283
std,2.843028,5.86973,4.309953,2.100343,0.497067
min,-7.0421,-13.7731,-5.2861,-8.5482,0.0
25%,-1.7713,-1.7051,-1.6283,-2.4099,0.0
50%,0.49665,2.3259,0.61663,-0.58665,0.0
75%,2.8232,6.8162,3.1769,0.39481,1.0
max,6.8248,12.9516,17.9274,2.4495,1.0


In [9]:
# Load the data: kidney disease
data2 = pd.read_csv("kidney_disease.csv",sep=",")
#print(data2)
data2.describe() # Statistical summary of the data

#age - age 
#bp - blood pressure 
#sg - specific gravity 
#al - albumin 
#su - sugar 
#rbc - red blood cells 
#pc - pus cell 
#pcc - pus cell clumps 
#ba - bacteria 
#bgr - blood glucose random 
#bu - blood urea 
#sc - serum creatinine 
#sod - sodium 
#pot - potassium 
#hemo - hemoglobin 
#pcv - packed cell volume 
#wc - white blood cell count 
#rc - red blood cell count 
#htn - hypertension 
#dm - diabetes mellitus 
#cad - coronary artery disease 
#appet - appetite 
#pe - pedal edema 
#ane - anemia 
#class - class

Unnamed: 0,id,age,bp,sg,al,su,bgr,bu,sc,sod,pot,hemo
count,400.0,391.0,388.0,353.0,354.0,351.0,356.0,381.0,383.0,313.0,312.0,348.0
mean,199.5,51.483376,76.469072,1.017408,1.016949,0.450142,148.036517,57.425722,3.072454,137.528754,4.627244,12.526437
std,115.614301,17.169714,13.683637,0.005717,1.352679,1.099191,79.281714,50.503006,5.741126,10.408752,3.193904,2.912587
min,0.0,2.0,50.0,1.005,0.0,0.0,22.0,1.5,0.4,4.5,2.5,3.1
25%,99.75,42.0,70.0,1.01,0.0,0.0,99.0,27.0,0.9,135.0,3.8,10.3
50%,199.5,55.0,80.0,1.02,0.0,0.0,121.0,42.0,1.3,138.0,4.4,12.65
75%,299.25,64.5,80.0,1.02,2.0,0.0,163.0,66.0,2.8,142.0,4.9,15.0
max,399.0,90.0,180.0,1.025,5.0,5.0,490.0,391.0,76.0,163.0,47.0,17.8


On s'entraine sur data1 qui est plus léger 

### Recherche des paramètres utiles

In [4]:
corr_matrix = data1.corr().abs()
high_corr_var=np.where(corr_matrix>0.75)
high_corr_var=np.array([(corr_matrix.columns[x],corr_matrix.columns[y]) for x,y in zip(*high_corr_var) if x!=y and x<y])
print(high_corr_var)

[['skew' 'curt']]


In [5]:
data1.pop('skew')

0        8.66610
1        8.66610
2        8.16740
3       -2.63830
4        9.52280
          ...   
1368     1.34920
1369    -4.87730
1370   -13.45860
1371    -8.38270
1372    -0.65804
Name: skew, Length: 1373, dtype: float64

### Initialisation train/test

In [6]:
Y = data1['class']
data1.pop('class')
X = data1
X = StandardScaler().fit_transform(X)
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.20, random_state=42)

In [10]:
Y = np.multiply([data2['classification']=='ckd'],1)[0]
data2.drop('classification', 1, inplace=True)
for c in data2 :
    if isinstance(data2[c][3], str):
        a = np.multiply([data2[c] == data2[c][3]],1)
        data2.drop(c, 1, inplace=True)
        data2[c] = a[0]
    data2[c] = np.nan_to_num(data2[c], copy=True, nan=data2[c].mean())
X = data2
X = StandardScaler().fit_transform(X)
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.20, random_state=42)

### Result evaluation 

In [11]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

def accuracy(y_predict, y_test):
    return np.mean([y_predict==y_test])

### Méthode 1: SVM

In [12]:
from sklearn import svm

In [13]:
clf1_1 = svm.SVC(kernel='linear')
clf1_1.fit(x_train,y_train)
test_predict = clf1_1.predict(x_test) 
clf1_1.score(x_test,y_test)

0.9625

In [14]:
clf1_2 = svm.SVC(kernel="poly", degree=2, gamma='auto')
clf1_2.fit(x_train,y_train)
test_predict = clf1_2.predict(x_test)
clf1_2.score(x_test,y_test)

0.925

In [15]:
clf1_3 = svm.SVC(kernel='rbf', gamma='auto')
clf1_3.fit(x_train,y_train)
test_predict = clf1_3.predict(x_test)
clf1_3.score(x_test,y_test)

0.9875

In [16]:
clf1_4 = svm.SVC(kernel='sigmoid', gamma=1./150)
clf1_4.fit(x_train,y_train)
test_predict = clf1_4.predict(x_test)
clf1_4.score(x_test,y_test)

0.9875

### Méthode 2: Stochastic Gradient Descent

In [17]:
from sklearn.linear_model import SGDClassifier 

In [18]:
clf2 = SGDClassifier()
clf2.fit(x_train, y_train)
test_predict = clf2.predict(x_test)
clf2.score(x_test,y_test)

0.9375

### Méthode 3: Desicion trees

In [20]:
from sklearn.tree import DecisionTreeClassifier

In [21]:
clf3 = DecisionTreeClassifier()
clf3.fit(x_train, y_train)
test_predict = clf3.predict(x_test)
clf3.score(x_test, y_test)

0.95

### Méthode 4: Baysien Network

In [22]:
from sklearn.naive_bayes import GaussianNB

In [23]:
clf4 = GaussianNB()
clf4.fit(x_train, y_train)
test_predict = clf4.predict(x_test)
clf4.score(x_test,y_test)

0.975

### Méthode 5: Random Forest

In [24]:
from sklearn.ensemble import RandomForestRegressor

In [25]:
# Instantiate model with 1000 decision trees
clf5 = RandomForestRegressor(n_estimators = 1000, random_state = 42)
# Train the model on training data
clf5.fit(x_train, y_train)
# Use the forest's predict method on the test data
test_predict = clf5.predict(x_test)
clf5.score(x_test,y_test)

0.9361325274725274

### Méthode 6: Neural Network

In [26]:
from sklearn.neural_network import MLPClassifier

In [28]:
clf6 = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5,2), random_state=1)
clf6.fit(x_train, y_train)
test_predict = clf6.predict(x_test)
clf6.score(x_test, y_test)

0.65

### Méthode 7: Probit model

In [29]:
from sklearn.linear_model import LogisticRegression

In [30]:
clf7 = LogisticRegression(random_state=0).fit(x_train, y_train)
test_predict = clf7.predict(x_test)
clf7.score(x_test,y_test)

0.9625