In [1]:
import pandas as pd
import numpy as np

## Pré-processamento de dados 

In [2]:
coluns = ['age',
 'workclass',
 'fnlwgt',
 'education',
 'education-num',
 'marital-status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'capital-gain',
 'capital-loss',
 'hours-per-week',
 'native-country',
 'Y']

In [3]:
data = pd.read_csv('train.csv', names=coluns)

In [4]:
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,Y
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [5]:
data.shape

(1344, 15)

In [6]:
data.isnull().sum()

age                0
workclass         85
fnlwgt             0
education          0
education-num      0
marital-status     0
occupation        85
relationship       0
race               0
sex                0
capital-gain       0
capital-loss       0
hours-per-week     0
native-country    28
Y                  1
dtype: int64

In [7]:
data.drop('education', axis=1, inplace=True)

In [8]:
data = data.dropna(axis=0)

In [9]:
data.isnull().sum()

age               0
workclass         0
fnlwgt            0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
Y                 0
dtype: int64

In [10]:
data.shape

(1234, 14)

### Temos esses dados categoricos para serem transformados em numéricos

workclass -> nominal

**education -> ordinal**

marital -> nominal

occupation -> nominal

relationship -> nominal

race -> nominal

sex -> nominal

native-country -> nominal

y -> nominal

In [11]:
sex_mapping = {' Female': 0, ' Male': 1}
data['sex'] = data['sex'].map(sex_mapping)

In [12]:
Y_mapping = {' <=50K' : 0, ' >50K' : 1}
data['Y'] = data['Y'].map(Y_mapping)

In [13]:
data.head()

Unnamed: 0,age,workclass,fnlwgt,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,Y
0,39,State-gov,77516,13,Never-married,Adm-clerical,Not-in-family,White,1,2174,0,40,United-States,0
1,50,Self-emp-not-inc,83311,13,Married-civ-spouse,Exec-managerial,Husband,White,1,0,0,13,United-States,0
2,38,Private,215646,9,Divorced,Handlers-cleaners,Not-in-family,White,1,0,0,40,United-States,0
3,53,Private,234721,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,1,0,0,40,United-States,0
4,28,Private,338409,13,Married-civ-spouse,Prof-specialty,Wife,Black,0,0,0,40,Cuba,0


In [14]:
data = pd.get_dummies(data, prefix=['workclass', 'marital', 'occupation', 'relationship', 'race', 'native-country' ],drop_first=True)

In [15]:
data.head()

Unnamed: 0,age,fnlwgt,education-num,sex,capital-gain,capital-loss,hours-per-week,Y,workclass_ Local-gov,workclass_ Private,...,native-country_ Mexico,native-country_ Philippines,native-country_ Poland,native-country_ Portugal,native-country_ Puerto-Rico,native-country_ South,native-country_ Taiwan,native-country_ Thailand,native-country_ United-States,native-country_ Yugoslavia
0,39,77516,13,1,2174,0,40,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,50,83311,13,1,0,0,13,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,38,215646,9,1,0,0,40,0,0,1,...,0,0,0,0,0,0,0,0,1,0
3,53,234721,7,1,0,0,40,0,0,1,...,0,0,0,0,0,0,0,0,1,0
4,28,338409,13,0,0,0,40,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [16]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1234 entries, 0 to 1342
Data columns (total 70 columns):
age                                   1234 non-null int64
fnlwgt                                1234 non-null int64
education-num                         1234 non-null int64
sex                                   1234 non-null int64
capital-gain                          1234 non-null int64
capital-loss                          1234 non-null int64
hours-per-week                        1234 non-null int64
Y                                     1234 non-null int64
workclass_ Local-gov                  1234 non-null uint8
workclass_ Private                    1234 non-null uint8
workclass_ Self-emp-inc               1234 non-null uint8
workclass_ Self-emp-not-inc           1234 non-null uint8
workclass_ State-gov                  1234 non-null uint8
marital_ Married-AF-spouse            1234 non-null uint8
marital_ Married-civ-spouse           1234 non-null uint8
marital_ Married-spouse

## Divisão dos dados

In [17]:
from sklearn.model_selection import train_test_split

In [18]:
data['age'] = data['age'].astype(float)
data['fnlwgt'] = data['fnlwgt'].astype(float)
data['education-num'] = data['education-num'].astype(float)
data['sex'] = data['sex'].astype(float)
data['capital-gain'] = data['capital-gain'].astype(float)
data['capital-loss'] = data['capital-loss'].astype(float)
data['hours-per-week'] = data['hours-per-week'].astype(float)
data['Y'] = data['Y'].astype(float)

In [19]:
y = data['Y'].values
data.drop('Y', axis=1)
X = data.values

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=3)

In [21]:
from sklearn.preprocessing import StandardScaler

In [22]:
stdsc = StandardScaler()
X_train_std = stdsc.fit_transform(X_train)
X_test_std = stdsc.transform(X_test)

## Criando modelos e testando eles

In [23]:
from sklearn import metrics as mt

In [24]:
from sklearn.neighbors import KNeighborsClassifier

In [25]:
Knn = KNeighborsClassifier(n_neighbors=30) 
Knn_std = KNeighborsClassifier(n_neighbors=30) 

In [26]:
Knn.fit(X_train, y_train)
Knn_std.fit(X_train_std, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=30, p=2,
           weights='uniform')

In [27]:
y_pred_Knn = Knn.predict(X_test)
y_pred_KnnStd = Knn_std.predict(X_test_std)

In [28]:
accuracyKNN = mt.accuracy_score(y_test, y_pred_Knn)
accuracyKNNStd = mt.accuracy_score(y_test, y_pred_KnnStd)

In [29]:
from sklearn.ensemble import RandomForestClassifier

In [30]:
RF = RandomForestClassifier(random_state=0, n_estimators=30)
RF_Sdt = RandomForestClassifier(random_state=0, n_estimators=30)

In [31]:
RF.fit(X_train, y_train)
RF_Sdt.fit(X_train_std, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [32]:
y_pred_RF = RF.predict(X_test)
y_pred_RFStd = RF_Sdt.predict(X_test_std)

In [33]:
accuracyRF = mt.accuracy_score(y_test, y_pred_RF)
accuracyRFStd = mt.accuracy_score(y_test, y_pred_RFStd)

In [34]:
from sklearn.naive_bayes import GaussianNB

$$ P(A | B) = \frac{P(B | A) * P(A)}{P(B)} $$

In [35]:
Gnb = GaussianNB()
Gnb_Std = GaussianNB() 

In [36]:
Gnb.fit(X_train, y_train)
Gnb_Std.fit(X_train_std, y_train)

GaussianNB(priors=None)

In [37]:
y_pred_Gnb = Gnb.predict(X_test)
y_pred_GnbStd = Gnb_Std.predict(X_test_std)

In [38]:
accuracyGnb = mt.accuracy_score(y_test, y_pred_Gnb)
accuracyGnbStd = mt.accuracy_score(y_test, y_pred_GnbStd)

In [39]:
print("ACCURACY")
print("KNN: {:.4f}    KNNStd: {:.4f}".format(accuracyKNN, accuracyKNNStd))
print("RF: {:.4f}     RF: {:.4f}".format(accuracyRF, accuracyRFStd))
print("Gnb: {:.4f}    GnbStd: {:.4f}".format(accuracyGnb, accuracyGnbStd))

ACCURACY
KNN: 0.7493    KNNStd: 0.8787
RF: 1.0000     RF: 1.0000
Gnb: 0.7520    GnbStd: 0.9892
