<a href="https://colab.research.google.com/github/dajebbar/FreeCodeCamp-python-data-analysis/blob/main/evaluate_feature_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the adult dataset from the CSV in the working directory and preview
# its first three rows.
adult = pd.read_csv('./adult.csv')
adult.head(n=3)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0.0,0.0,40.0,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0.0,0.0,50.0,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0.0,0.0,40.0,United-States,>50K


In [3]:
def clean(col_name):
  """Return ``col_name`` with every hyphen swapped for an underscore."""
  # Splitting on '-' and re-joining with '_' replaces each hyphen one-for-one.
  return '_'.join(col_name.split('-'))

# Rebind `adult` to the renamed frame instead of mutating it with
# `inplace=True`: the result is the same, but the cell no longer modifies an
# object that earlier cells already displayed.
adult = adult.rename(columns=clean)
adult.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education_num',
       'marital_status', 'occupation', 'relationship', 'race', 'sex',
       'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
       'class'],
      dtype='object')

In [4]:
# Summarise columns, non-null counts and dtypes; 'deep' asks pandas to
# introspect object columns so the reported memory usage is not an estimate.
adult.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25783 entries, 0 to 25782
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             25783 non-null  int64  
 1   workclass       25783 non-null  object 
 2   fnlwgt          25783 non-null  int64  
 3   education       25783 non-null  object 
 4   education_num   25783 non-null  int64  
 5   marital_status  25783 non-null  object 
 6   occupation      25783 non-null  object 
 7   relationship    25783 non-null  object 
 8   race            25783 non-null  object 
 9   sex             25782 non-null  object 
 10  capital_gain    25782 non-null  float64
 11  capital_loss    25782 non-null  float64
 12  hours_per_week  25782 non-null  float64
 13  native_country  25782 non-null  object 
 14  class           25782 non-null  object 
dtypes: float64(3), int64(3), object(9)
memory usage: 16.0 MB


In [5]:
# Count how many columns share each dtype.
column_dtypes = adult.dtypes
column_dtypes.value_counts()

object     9
int64      3
float64    3
dtype: int64

In [6]:
# Summary statistics, transposed so that each row describes one column.
adult.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,25783.0,38.639607,13.75526,17.0,28.0,37.0,48.0,90.0
fnlwgt,25783.0,189777.668192,105724.101492,13492.0,117422.0,178109.0,238432.0,1490400.0
education_num,25783.0,10.072102,2.559228,1.0,9.0,10.0,12.0,16.0
capital_gain,25782.0,1074.74552,7480.336533,0.0,0.0,0.0,0.0,99999.0
capital_loss,25782.0,88.440734,403.458492,0.0,0.0,0.0,0.0,4356.0
hours_per_week,25782.0,40.440773,12.407337,1.0,40.0,40.0,45.0,99.0


In [7]:
# Number of missing values in each column.
adult.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               1
capital_gain      1
capital_loss      1
hours_per_week    1
native_country    1
class             1
dtype: int64

In [8]:
# Frequency of each value in the `sex` column, with NaN counted as well.
adult['sex'].value_counts(dropna=False)

 Male      17239
 Female     8543
NaN            1
Name: sex, dtype: int64

In [9]:
# Drop every row that contains at least one missing value. `adult` is rebound
# to the filtered frame rather than mutated with `inplace=True`, so the cell
# does not alter an object that earlier cells already displayed.
adult = adult.dropna()

# Confirm that no missing values remain in any column.
adult.isna().sum()

age               0
workclass         0
fnlwgt            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
class             0
dtype: int64

In [10]:
# Count '?' placeholders per column. The string values in this file carry a
# leading space (see ' Male' / ' Female' in the `sex` value counts), so an
# exact `== '?'` comparison could never match a padded ' ?'. Every cell is
# therefore converted to text and stripped of surrounding whitespace before
# the comparison; numeric columns simply never match.
adult.astype(str).apply(lambda col: col.str.strip()).eq('?').sum()

age               0
workclass         0
fnlwgt            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
class             0
dtype: int64

In [11]:
# Separate the label column from the predictors. `education_num` is removed
# from the predictors too (presumably because it duplicates `education` --
# TODO confirm).
target_feature = 'class'
data = adult.drop(columns=[target_feature, 'education_num'])
target = adult[target_feature]

In [12]:
from sklearn.compose import make_column_selector as selector

# Build one dtype-based selector per feature family, then apply each of them
# to `data` to obtain the matching column names.
select_numeric = selector(dtype_include='number')
select_categorical = selector(dtype_include='object')

numeric_features = select_numeric(data)
cat_features = select_categorical(data)

# One list per line: numeric columns first, categorical columns second.
print(numeric_features, cat_features, sep='\n')

['age', 'fnlwgt', 'capital_gain', 'capital_loss', 'hours_per_week']
['workclass', 'education', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'native_country']


In [13]:
%%time
from sklearn.model_selection import cross_validate
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import HistGradientBoostingClassifier


cat_preprocessor = OrdinalEncoder(
    handle_unknown='use_encoded_value', 
    unknown_value=-1)

preprocessor = ColumnTransformer([
                                  ('categorical', cat_preprocessor, cat_features)
], remainder='passthrough')

model = make_pipeline(preprocessor, HistGradientBoostingClassifier())

cv_results = cross_validate(model, data, target)

scores = cv_results['test_score']
print("The mean cross-validation accuracy is: "
      f"{scores.mean():.3f} +/- {scores.std():.3f}")

The mean cross-validation accuracy is: 0.869 +/- 0.003
CPU times: user 9.01 s, sys: 208 ms, total: 9.21 s
Wall time: 5.69 s
