In [140]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler,OneHotEncoder
# from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score,confusion_matrix,classification_report,accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from src.utils import strip_text
# import warnings
# warnings.filterwarnings('ignore')

In [123]:
# Reading the dataset
df=pd.read_csv('./Data/AdultCensusIncome.csv')
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [124]:
# Stripping the text in the dataset
strip_text(df)

In [125]:
# Drop the duplicate records
df.drop_duplicates(keep='first',inplace=True)

In [126]:
# Now convert the '?' with nan
df.replace('?',np.nan,inplace=True)

In [127]:
# Mapping the output variable
salary_map = {'<=50K':0,'>50K':1}
df['salary'] = df['salary'].map(salary_map)

In [128]:
# Dropping the least efective column fnlwgt
df.drop(labels=['fnlwgt'],axis=1,inplace=True)

In [129]:
# Separating the input and output features
X=df.iloc[:,:13]
y=df['salary']

In [130]:
X.head()

Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,country
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba


In [131]:
y.head()

0    0
1    0
2    0
3    0
4    0
Name: salary, dtype: int64

In [141]:
# Defining which column should be ordinal encoded and which should be scaled
categorical_cols=X.select_dtypes(include='object').columns
numerical_cols=X.select_dtypes(exclude='object').columns
print(f"Categorical_features={categorical_cols}")
print(f"Numerical_features={numerical_cols}")

Categorical_features=Index(['workclass', 'education', 'marital-status', 'occupation',
       'relationship', 'race', 'sex', 'country'],
      dtype='object')
Numerical_features=Index(['age', 'education-num', 'capital-gain', 'capital-loss',
       'hours-per-week'],
      dtype='object')


In [142]:
# Numerical pipeline

num_pipeline=Pipeline(
    steps=[
    ("Imputer",SimpleImputer(strategy='median')),
    ('Scaler',StandardScaler())
    ]
)
# Categorical pipeline

cat_pipeline=Pipeline(
    steps=[
    ('Imputer',SimpleImputer(strategy='most_frequent')),
    ('One_hot_encoder',OneHotEncoder(sparse=False)),
    ('Scaler',StandardScaler())
    ]
)

preprocessor=ColumnTransformer([
    ('num_pipeline',num_pipeline,numerical_cols),
    ('cat_pipeline',cat_pipeline,categorical_cols)
])

In [143]:
# Splitting the training and testing data
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=35)

In [144]:
X_train=pd.DataFrame(preprocessor.fit_transform(X_train),columns=preprocessor.get_feature_names_out())

AttributeError: Estimator Imputer does not provide get_feature_names_out. Did you mean to call pipeline[:-1].get_feature_names_out()?