In [33]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score,confusion_matrix,classification_report,accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from src.utils import strip_text
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Reading the dataset
df=pd.read_csv('./Data/AdultCensusIncome.csv')
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [34]:
# Stripping the text in the dataset
strip_text(df)

TypeError: descriptor 'strip' for 'str' objects doesn't apply to a 'float' object

In [35]:
# Drop the duplicate records
df.drop_duplicates(keep='first',inplace=True)

In [6]:
# Now convert the '?' with nan
df.replace('?',np.nan,inplace=True)

In [7]:
# Mapping the output variable
salary_map = {'<=50K':0,'>50K':1}
df['salary'] = df['salary'].map(salary_map)

In [8]:
# Dropping the least efective column fnlwgt
df.drop(labels=['fnlwgt'],axis=1,inplace=True)

In [9]:
# Separating the input and output features
X=df.iloc[:,:13]
y=df['salary']

In [10]:
X.head()

Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,country
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba


In [11]:
y.head()

0    0
1    0
2    0
3    0
4    0
Name: salary, dtype: int64

In [12]:
# Defining which column should be ordinal encoded and which should be scaled
categorical_cols=X.select_dtypes(include='object').columns
numerical_cols=X.select_dtypes(exclude='object').columns
print(f"Categorical_features={categorical_cols}")
print(f"Numerical_features={numerical_cols}")

Categorical_features=Index(['workclass', 'education', 'marital-status', 'occupation',
       'relationship', 'race', 'sex', 'country'],
      dtype='object')
Numerical_features=Index(['age', 'education-num', 'capital-gain', 'capital-loss',
       'hours-per-week'],
      dtype='object')


In [14]:
# Numerical pipeline

num_pipeline=Pipeline(
    steps=[
    ("Imputer",SimpleImputer(strategy='median')),
    ('Scaler',StandardScaler())
    ]
)
# Categorical pipeline

cat_pipeline=Pipeline(
    steps=[
    ('Imputer',SimpleImputer(strategy='most_frequent')),
    ('One_hot_encoder',OneHotEncoder(sparse_output=False)),
    ('Scaler',StandardScaler())
    ]
)

preprocessor=ColumnTransformer([
    ('num_pipeline',num_pipeline,numerical_cols),
    ('cat_pipeline',cat_pipeline,categorical_cols)
])

In [20]:
# Splitting the training and testing data
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=35)

In [21]:
X_train=pd.DataFrame(preprocessor.fit_transform(X_train),columns=preprocessor.get_feature_names_out())

In [22]:
X_test=pd.DataFrame(preprocessor.transform(X_test),columns=preprocessor.get_feature_names_out())

In [18]:
X_train.head()

Unnamed: 0,num_pipeline__age,num_pipeline__education-num,num_pipeline__capital-gain,num_pipeline__capital-loss,num_pipeline__hours-per-week,cat_pipeline__workclass_Federal-gov,cat_pipeline__workclass_Local-gov,cat_pipeline__workclass_Never-worked,cat_pipeline__workclass_Private,cat_pipeline__workclass_Self-emp-inc,...,cat_pipeline__country_Portugal,cat_pipeline__country_Puerto-Rico,cat_pipeline__country_Scotland,cat_pipeline__country_South,cat_pipeline__country_Taiwan,cat_pipeline__country_Thailand,cat_pipeline__country_Trinadad&Tobago,cat_pipeline__country_United-States,cat_pipeline__country_Vietnam,cat_pipeline__country_Yugoslavia
0,-0.701111,-0.419149,-0.145209,-0.217336,0.777693,-0.17319,-0.263257,-0.013861,0.572781,-0.18914,...,-0.034531,-0.058575,-0.020562,-0.048067,-0.039232,-0.021476,-0.024801,0.303472,-0.046847,-0.020562
1,0.615598,-0.0297,-0.145209,-0.217336,0.372056,-0.17319,-0.263257,-0.013861,0.572781,-0.18914,...,-0.034531,-0.058575,-0.020562,-0.048067,-0.039232,-0.021476,-0.024801,0.303472,-0.046847,-0.020562
2,1.639706,-0.0297,-0.145209,-0.217336,0.372056,-0.17319,-0.263257,-0.013861,0.572781,-0.18914,...,-0.034531,-0.058575,-0.020562,-0.048067,-0.039232,-0.021476,-0.024801,0.303472,-0.046847,-0.020562
3,2.151759,-0.0297,-0.145209,-0.217336,-0.03358,-0.17319,-0.263257,-0.013861,-1.745868,-0.18914,...,-0.034531,-0.058575,-0.020562,-0.048067,-0.039232,-0.021476,-0.024801,0.303472,-0.046847,-0.020562
4,-0.408509,-0.419149,-0.145209,-0.217336,1.183329,-0.17319,-0.263257,-0.013861,-1.745868,-0.18914,...,-0.034531,-0.058575,-0.020562,-0.048067,-0.039232,-0.021476,-0.024801,-3.295196,-0.046847,-0.020562


In [23]:
X_test.head()

Unnamed: 0,num_pipeline__age,num_pipeline__education-num,num_pipeline__capital-gain,num_pipeline__capital-loss,num_pipeline__hours-per-week,cat_pipeline__workclass_Federal-gov,cat_pipeline__workclass_Local-gov,cat_pipeline__workclass_Never-worked,cat_pipeline__workclass_Private,cat_pipeline__workclass_Self-emp-inc,...,cat_pipeline__country_Portugal,cat_pipeline__country_Puerto-Rico,cat_pipeline__country_Scotland,cat_pipeline__country_South,cat_pipeline__country_Taiwan,cat_pipeline__country_Thailand,cat_pipeline__country_Trinadad&Tobago,cat_pipeline__country_United-States,cat_pipeline__country_Vietnam,cat_pipeline__country_Yugoslavia
0,0.9082,-0.419149,-0.145209,-0.217336,-0.03358,-0.17319,-0.263257,-0.013861,0.572781,-0.18914,...,-0.034531,-0.058575,-0.020562,-0.048067,-0.039232,-0.021476,-0.024801,0.303472,-0.046847,-0.020562
1,1.712856,-0.419149,-0.145209,-0.217336,4.022785,-0.17319,-0.263257,-0.013861,-1.745868,-0.18914,...,-0.034531,-0.058575,-0.020562,-0.048067,-0.039232,-0.021476,-0.024801,0.303472,-0.046847,-0.020562
2,0.83505,-0.0297,-0.145209,-0.217336,-0.03358,-0.17319,-0.263257,-0.013861,0.572781,-0.18914,...,-0.034531,-0.058575,-0.020562,-0.048067,-0.039232,-0.021476,-0.024801,0.303472,-0.046847,-0.020562
3,-0.042757,1.138647,1.916983,-0.217336,0.372056,-0.17319,-0.263257,-0.013861,0.572781,-0.18914,...,-0.034531,-0.058575,-0.020562,-0.048067,-0.039232,-0.021476,-0.024801,0.303472,-0.046847,-0.020562
4,0.103544,-0.419149,-0.145209,-0.217336,-0.03358,-0.17319,-0.263257,-0.013861,0.572781,-0.18914,...,-0.034531,-0.058575,-0.020562,-0.048067,-0.039232,-0.021476,-0.024801,0.303472,-0.046847,-0.020562


In [44]:
from sklearn.ensemble import RandomForestClassifier

rf=RandomForestClassifier()
forest_params=[{'max_depth':list(range(5,15))}]
clf=GridSearchCV(rf,forest_params,cv=5,scoring='accuracy')
clf.fit(X_train,y_train)
print(clf.best_params_)
print(clf.best_score_)

{'max_depth': 14}
0.8614238503345175


In [47]:
best_clf=clf.best_estimator_
best_clf.fit(X_train,y_train)

In [29]:
def evaluate_model(true,predicted):
    mae=mean_absolute_error(true,predicted)
    mse=mean_squared_error(true,predicted)
    rmse=np.sqrt(mean_squared_error(true,predicted))
    r2score=r2_score(true,predicted)
    return mae,rmse,r2score

In [45]:
# Training multiple models:
from sklearn.ensemble import RandomForestClassifier
models={
    'LogisticRegression':LogisticRegression(),
    'SVC':SVC(),
    # 'DTC':DTC(),
    'Random Forest':RandomForestClassifier()
}

trained_model_list=[]
model_list=[]
r2_list=[]

for i in range(len(list(models))):
    model=list(models.values())[i]
    model.fit(X_train,y_train)
    # Prediction

    y_pred=model.predict(X_test)
    mae,rmse,r2score=evaluate_model(y_test,y_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])
    print('Model performance (Training)')
    print('RMSE',rmse)
    print('MAE',mae)
    print('R2_SCORE',r2score*100)
    print('Model testing Score: ',model.score(X_test,y_test))

    r2_list.append(r2score)

    print('-'*40)
    print('\n')

LogisticRegression
Model performance (Training)
RMSE 0.3910099133076495
MAE 0.15288875230485557
R2_SCORE 17.33014882324794
Model testing Score:  0.8471112476951445
----------------------------------------


SVC
Model performance (Training)
RMSE 0.39218706573497486
MAE 0.15381069452980947
R2_SCORE 16.83163715786049
Model testing Score:  0.8461893054701906
----------------------------------------


Random Forest
Model performance (Training)
RMSE 0.4049099393745508
MAE 0.1639520590043024
R2_SCORE 11.348008838598545
Model testing Score:  0.8360479409956976
----------------------------------------




In [57]:
from sklearn.metrics import f1_score
f1=f1_score(y_test,y_pred)
f1

0.6577227382180942

In [None]:
# For decision Tree Classifier:
params={'max_depth':list(range(5,10)),'max_features':[]}