In [105]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.ensemble import RandomForestClassifier

from imblearn.under_sampling import NearMiss

In [106]:
# Read the income CSV once; `data` stays as the untouched original while
# every later cell works on the independent copy `df`.
data = pd.read_csv('files/income_evaluation.csv')
df = data.copy()

# Preview the last 10 of the first 60 rows (rows 50-59 of a full-size frame).
df.iloc[:60].iloc[-10:]

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
50,25,Private,32275,Some-college,10,Married-civ-spouse,Exec-managerial,Wife,Other,Female,0,0,40,United-States,<=50K
51,18,Private,226956,HS-grad,9,Never-married,Other-service,Own-child,White,Female,0,0,30,?,<=50K
52,47,Private,51835,Prof-school,15,Married-civ-spouse,Prof-specialty,Wife,White,Female,0,1902,60,Honduras,>50K
53,50,Federal-gov,251585,Bachelors,13,Divorced,Exec-managerial,Not-in-family,White,Male,0,0,55,United-States,>50K
54,47,Self-emp-inc,109832,HS-grad,9,Divorced,Exec-managerial,Not-in-family,White,Male,0,0,60,United-States,<=50K
55,43,Private,237993,Some-college,10,Married-civ-spouse,Tech-support,Husband,White,Male,0,0,40,United-States,>50K
56,46,Private,216666,5th-6th,3,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,Mexico,<=50K
57,35,Private,56352,Assoc-voc,11,Married-civ-spouse,Other-service,Husband,White,Male,0,0,40,Puerto-Rico,<=50K
58,41,Private,147372,HS-grad,9,Married-civ-spouse,Adm-clerical,Husband,White,Male,0,0,48,United-States,<=50K
59,30,Private,188146,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,5013,0,40,United-States,<=50K


In [107]:
# Print column names, non-null counts and dtypes of the working frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   age              32561 non-null  int64 
 1    workclass       32561 non-null  object
 2    fnlwgt          32561 non-null  int64 
 3    education       32561 non-null  object
 4    education-num   32561 non-null  int64 
 5    marital-status  32561 non-null  object
 6    occupation      32561 non-null  object
 7    relationship    32561 non-null  object
 8    race            32561 non-null  object
 9    sex             32561 non-null  object
 10   capital-gain    32561 non-null  int64 
 11   capital-loss    32561 non-null  int64 
 12   hours-per-week  32561 non-null  int64 
 13   native-country  32561 non-null  object
 14   income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [108]:
# Normalise the header: trim surrounding whitespace and lower-case each name,
# so columns can be addressed without the stray leading spaces from the CSV.
df.columns = [name.strip().lower() for name in df.columns]
df.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'income'],
      dtype='object')

In [109]:
# Total number of NaN cells across the whole frame (per-column sums, then summed).
# NOTE(review): the row preview earlier in this notebook shows '?' entries
# (e.g. in native-country); those are plain strings, so isna() does not count
# them - confirm whether '?' is the dataset's missing-value marker.
df.isna().sum().sum()

0

In [110]:
# Share of rows in the '>50K' class (the class-imbalance ratio).
# The class is selected by its label rather than with `value_counts()[1]`:
# an integer key on a string-labelled Series is a deprecated positional
# fallback in recent pandas, and it also silently depends on the ordering of
# the counts. The raw labels carry a leading space, hence the strip().
(df['income'].str.strip() == '>50K').mean()

0.2408095574460244

In [111]:
# List the distinct target labels exactly as stored (including any whitespace),
# since the encoding of the target has to match these strings literally.
df['income'].unique()

array([' <=50K', ' >50K'], dtype=object)

In [112]:
# Features: every column except the target.
X = df.drop("income", axis=1)

# Binary target: 0 for ' <=50K', 1 for ' >50K' (keys keep the leading space
# present in the raw labels). `map` is used instead of `replace` because
# `replace` on an object column relies on implicit downcasting to int, which
# is deprecated in recent pandas; `map` yields an integer Series directly.
y = df['income'].map({' <=50K': 0, ' >50K': 1})

# 80/20 hold-out split with a fixed seed so the partition is reproducible.
# NOTE(review): the split is not stratified although the classes are
# imbalanced - consider `stratify=y` if the reported metrics may change.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=4
)

In [113]:
# Partition the feature columns by dtype so each group gets its own
# preprocessing: numeric columns vs. object (string) columns.
num_col = X.select_dtypes(include=['number']).columns
cat_col = X.select_dtypes(include=['object']).columns

In [114]:
# Numeric features: standardise to zero mean / unit variance.
num_Pipeline = Pipeline([('scaler', StandardScaler())])

# Categorical features: one-hot encode.
# handle_unknown='ignore' makes a category that was not seen during fit
# encode as an all-zero row instead of raising at transform time, which
# otherwise happens whenever a rare level lands only in the test split.
# The step keeps its original name 'scaler' so existing parameter paths
# (e.g. 'cat__scaler__...') stay valid, although the step is an encoder.
cat_Pipeline = Pipeline([('scaler', OneHotEncoder(handle_unknown='ignore'))])

In [115]:
# Route each column group through its own sub-pipeline; the outputs are
# concatenated in the order listed (numeric block first, then categorical).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_Pipeline, num_col),
        ('cat', cat_Pipeline, cat_col),
    ]
)

In [116]:
# End-to-end baseline model: preprocessing followed by a random forest.
# The forest is seeded (same seed value as the train/test split) so that
# re-running the notebook reproduces the reported metrics; an unseeded
# forest gives slightly different numbers on every run.
final_pipeline = Pipeline(
    steps=[
        ('transformer', preprocessor),
        ('estimator', RandomForestClassifier(random_state=4)),
    ]
)

In [117]:
# Fit the full pipeline on the training split, then score the hold-out split.
final_pipeline.fit(X_train, y_train)
y_pred = final_pipeline.predict(X_test)

# Per-class precision / recall / F1 on the test set.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.89      0.92      0.90      4972
           1       0.70      0.63      0.67      1541

    accuracy                           0.85      6513
   macro avg       0.80      0.77      0.78      6513
weighted avg       0.85      0.85      0.85      6513



In [118]:
# Preprocessing-only pipeline, used to obtain the encoded feature matrix
# separately from the model (needed for resampling before training).
# It wraps the same `preprocessor` object that `final_pipeline` uses.
transformer_pipeline = Pipeline(steps=[('transformer', preprocessor)])

In [121]:
# Encode the training features. fit_transform is used (not transform) so this
# cell fits the preprocessing itself on the training data instead of relying
# on `preprocessor` having been fitted as a side effect of an earlier
# `final_pipeline.fit(...)` call, which would fail on a fresh kernel if that
# cell were skipped or reordered.
# Note: X_train is overwritten with the encoded matrix, so this cell must not
# be re-run without re-creating the split first.
X_train = transformer_pipeline.fit_transform(X_train)


In [122]:
# Balance the classes by undersampling the training set with NearMiss
# (version 1, 3 neighbours). Only the training data is resampled; the test
# set is left as is.
undersample = NearMiss(version=1, n_neighbors=3)
X_train, y_train = undersample.fit_resample(X_train, y_train)

In [123]:
# Encode the hold-out features with the preprocessing pipeline; only
# transform is called here, so nothing is fitted on the test data.
# X_test is overwritten with the encoded matrix, so re-running this cell
# without re-creating the split would transform already-encoded data.
X_test=transformer_pipeline.transform(X_test)

In [124]:
# Random forest trained on the undersampled, pre-encoded training data.
# Seeded (same seed value as the train/test split) so the metrics reported
# below are reproducible across runs.
model = RandomForestClassifier(random_state=4)

In [126]:
# Train on the resampled training matrix, then predict the encoded test set.
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Per-class precision / recall / F1 on the (un-resampled) test set.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.90      0.69      0.78      4972
           1       0.43      0.74      0.54      1541

    accuracy                           0.70      6513
   macro avg       0.66      0.72      0.66      6513
weighted avg       0.78      0.70      0.72      6513

