# Application - *correction*

In [1]:
from ucimlrepo import fetch_ucirepo
import pandas
  
# Fetch the dataset registered under id 2 in the UCI repository.
adult = fetch_ucirepo(id = 2)

# Features and targets, as pandas DataFrames.
X = adult.data.features
y = adult.data.targets

# Remove the "." characters from the income labels (presumably some labels
# carry a trailing period - TODO confirm with y.income.unique()).
# regex=False makes "." a literal dot on every pandas version; interpreted as
# a regular expression it would match, and therefore delete, every character.
y = y.assign(income = y.income.str.replace(".", "", regex = False))

## Nettoyage des données

- Supprimer les lignes avec données manquantes
- Supprimer les colonnes `fnlwgt`, `education-num` et `native-country`
- Encoder en *one-hot* les variables `workclass`, `education`, `marital-status`, `occupation`, `relationship`, `race`, `sex`
- Binariser les variables `capital-gain` et `capital-loss`

### Suppression des données manquantes

In [2]:
# Drop every row of X that holds at least one missing value (NaN).
# NOTE(review): a "workclass:?" column still shows up after the one-hot
# encoding, so "?" placeholders are not removed here - confirm whether they
# should be treated as missing values too.
X1 = X.dropna()
# Keep only the targets of the surviving rows. The selection must use the
# labels of the *filtered* frame (X1.index); indexing with X.index would
# return every row of y and leave X1 and y1 with different lengths.
# Assumes X and y share the same index labels (both come from the same fetch).
y1 = y.loc[X1.index]

In [3]:
# Renumber the rows of both frames from 0. Because reset_index() is called
# without drop=True, the previous labels are kept as a leading "index" column.
# NOTE(review): that "index" column travels with the features from here on -
# confirm it is excluded before any model is fitted.
X2, y2 = (frame.reset_index() for frame in (X1, y1))

### Suppression des colonnes inutiles

In [4]:
# Remove the three columns listed as unused in the cleaning plan.
X3 = X2.drop(["fnlwgt", "education-num", "native-country"], axis = "columns")

### Encoder en *one-hot*

In [5]:
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder()

# Categorical variables to expand into one indicator column per category.
liste = ["workclass", "education", "marital-status", "occupation", "relationship", "race", "sex"]

# Start from the non-categorical columns, gather one indicator frame per
# variable, and concatenate everything once at the end (calling concat inside
# the loop would copy the growing frame again on every iteration).
parts = [X3.drop(columns = liste)]

for v in liste:
    # The same encoder object is refitted on one column at a time, so that
    # categories_[0] holds the categories of the current variable only.
    enc = encoder.fit(X3[[v]])
    enc_df = pandas.DataFrame(
        enc.transform(X3[[v]]).toarray(),
        # Column names follow the "<variable>:<category>" pattern.
        columns = [v+":"+str(c) for c in enc.categories_[0]],
        # Reuse the row labels of X3: with a default RangeIndex here, concat
        # would misalign rows whenever X3 is not indexed 0..n-1.
        index = X3.index,
    )
    parts.append(enc_df)

X4 = pandas.concat(parts, axis = 1)

X4

Unnamed: 0,index,age,capital-gain,capital-loss,hours-per-week,workclass:?,workclass:Federal-gov,workclass:Local-gov,workclass:Never-worked,workclass:Private,...,relationship:Own-child,relationship:Unmarried,relationship:Wife,race:Amer-Indian-Eskimo,race:Asian-Pac-Islander,race:Black,race:Other,race:White,sex:Female,sex:Male
0,0,39,2174,0,40,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
1,1,50,0,0,13,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
2,2,38,0,0,40,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
3,3,53,0,0,40,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
4,4,28,0,0,40,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47616,48836,33,0,0,40,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
47617,48837,39,0,0,36,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
47618,48839,38,0,0,50,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
47619,48840,44,5455,0,40,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


### Binariser les variables `capital-gain` et `capital-loss`

In [6]:
# Turn the two capital amounts into boolean flags: True when the amount is
# strictly positive, False otherwise.
has_gain = X4["capital-gain"] > 0
has_loss = X4["capital-loss"] > 0

# Replace the raw amounts with the flags, which are appended as the last two
# columns under underscore names (capital_gain, capital_loss).
X5 = (
    X4
    .drop(columns = ["capital-gain", "capital-loss"])
    .assign(capital_gain = has_gain, capital_loss = has_loss)
)
X5

Unnamed: 0,index,age,hours-per-week,workclass:?,workclass:Federal-gov,workclass:Local-gov,workclass:Never-worked,workclass:Private,workclass:Self-emp-inc,workclass:Self-emp-not-inc,...,relationship:Wife,race:Amer-Indian-Eskimo,race:Asian-Pac-Islander,race:Black,race:Other,race:White,sex:Female,sex:Male,capital_gain,capital_loss
0,0,39,40,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,True,False
1,1,50,13,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,False,False
2,2,38,40,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,False,False
3,3,53,40,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,False,False
4,4,28,40,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47616,48836,33,40,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,False,False
47617,48837,39,36,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,False,False
47618,48839,38,50,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,False,False
47619,48840,44,40,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,True,False
