In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix,accuracy_score,precision_score,recall_score,f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,scale,minmax_scale
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2,mutual_info_classif
from pandas.tools.plotting import scatter_matrix
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier

  from numpy.core.umath_tests import inner1d


In [2]:
data = pd.read_csv('all/train.csv',delimiter=',')

In [3]:
y = data['Classification']
X = data.drop('Classification',axis=1)
X = X.drop('id',axis=1)

In [4]:
X.head()

Unnamed: 0,Age,BMI,Glucose,Insulin,HOMA,Leptin,Adiponectin,Resistin,MCP.1
0,47,22.03,84,2.869,0.59,26.65,38.04,3.32,191.72
1,75,30.48,152,7.01,2.6283,50.53,10.06,11.73,99.45
2,25,22.86,82,4.09,0.8273,20.45,23.67,5.14,313.73
3,54,24.2188,86,3.73,0.7913,8.6874,3.7052,10.3446,635.049
4,69,35.0927,101,5.646,1.4066,83.4821,6.797,82.1,263.499


# Solução 1

In [5]:
x_train,x_test,y_train,y_test = train_test_split(X,y,test_size=0.1,random_state=42)

In [6]:
clf_rf = RandomForestClassifier(random_state=43)
clf_rf = clf_rf.fit(x_train,y_train)

In [7]:
ac = accuracy_score(y_test,clf_rf.predict(x_test))
f = f1_score(y_test,clf_rf.predict(x_test))
pr = precision_score(y_test, clf_rf.predict(x_test), average='macro')

In [8]:
print('Accurácia é: ',ac)
print('F1-Score é:',f)
print('Precisão é: ',pr)

Accurácia é:  0.7
F1-Score é: 0.6666666666666665
Precisão é:  0.7083333333333333


# Solução 2

In [9]:
# 5 melhores atributos
select_feature = SelectKBest(chi2, k=5).fit(x_train, y_train)

In [10]:
print('Score Lista:',select_feature.scores_)
print('Caracteristicas lista:',x_train.columns)

Score Lista: [2.11739278e-02 2.70182175e+00 4.28986176e+01 4.91577422e+01
 2.43315875e+01 8.48884666e-01 2.94505062e+00 1.36325063e+01
 5.82138642e+00]
Caracteristicas lista: Index(['Age', 'BMI', 'Glucose', 'Insulin', 'HOMA', 'Leptin', 'Adiponectin',
       'Resistin', 'MCP.1'],
      dtype='object')


In [11]:
x_train_2 = select_feature.transform(x_train)
x_test_2 = select_feature.transform(x_test)

clf_rf_2 = RandomForestClassifier()
clf_rf_2 = clf_rf_2.fit(x_train_2,y_train)

ac_2 = accuracy_score(y_test,clf_rf_2.predict(x_test_2))
f_2 = f1_score(y_test,clf_rf_2.predict(x_test_2))
pr_2 = precision_score(y_test, clf_rf_2.predict(x_test_2), average='macro')

In [12]:
print('Accurácia é: ',ac_2)
print('F1-Score é:',f_2)
print('Precisão é: ',pr_2)

Accurácia é:  0.6
F1-Score é: 0.6
Precisão é:  0.6


# Solução 3

In [14]:
from sklearn.feature_selection import RFE

#clf_rf_3 = MLPClassifier(solver='lbfgs',activation='relu',hidden_layer_sizes=(8,4))      

clf_rf_3 = RandomForestClassifier()

rfe = RFE(estimator=clf_rf_3, n_features_to_select=5, step=1)

rfe = rfe.fit(x_train, y_train)

In [15]:
x_train.columns[rfe.support_]

Index(['Age', 'BMI', 'Glucose', 'HOMA', 'Resistin'], dtype='object')

In [18]:
rfe.predict(x_test)

array([1, 2, 2, 2, 2, 1, 2, 2, 1, 1])

In [19]:
ac_3 = accuracy_score(y_test,rfe.predict(x_test))

f_3 = f1_score(y_test,rfe.predict(x_test))

pr_3 = precision_score(y_test, rfe.predict(x_test), average='macro')

In [20]:
print('Accurácia é: ',ac_3)
print('F1-Score é:',f_3)
print('Precisão é: ',pr_3)

Accurácia é:  0.9
F1-Score é: 0.888888888888889
Precisão é:  0.9166666666666667


# Teste com o Melhor Classificador

In [33]:
data_test = pd.read_csv('all/test.csv',delimiter=',')

In [34]:
data_test.head()

Unnamed: 0,id,Age,BMI,Glucose,Insulin,HOMA,Leptin,Adiponectin,Resistin,MCP.1
0,100,62,22.6562,92,3.482,0.7902,9.8648,11.2362,10.6955,703.973
1,78,29,23.01,82,5.663,1.1454,35.59,26.72,4.58,174.8
2,77,75,25.7,94,8.079,1.8733,65.926,3.7412,4.4968,206.802
3,113,44,27.8876,99,9.208,2.2486,12.6757,5.4782,23.0331,407.206
4,86,75,23.0,83,4.952,1.0138,17.127,11.579,7.0913,318.302


In [35]:
data_test = data_test.drop('id',axis=1)

In [36]:
data_test.head()

Unnamed: 0,Age,BMI,Glucose,Insulin,HOMA,Leptin,Adiponectin,Resistin,MCP.1
0,62,22.6562,92,3.482,0.7902,9.8648,11.2362,10.6955,703.973
1,29,23.01,82,5.663,1.1454,35.59,26.72,4.58,174.8
2,75,25.7,94,8.079,1.8733,65.926,3.7412,4.4968,206.802
3,44,27.8876,99,9.208,2.2486,12.6757,5.4782,23.0331,407.206
4,75,23.0,83,4.952,1.0138,17.127,11.579,7.0913,318.302


In [37]:
rfe.predict(data_test)

array([2, 1, 1, 2, 1, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2,
       1, 1])

In [38]:
out = rfe.predict(data_test)

In [39]:
out = pd.DataFrame(out,columns=['Classificacion'])

In [40]:
out.to_csv('saida.csv',index=False)