In [1]:
import pandas as pd 

#fetch the dataframe 
df = pd.read_csv("data/census_income.csv")

#display the head 
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [2]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [3]:
#checking for the duplicates 
df.duplicated().sum()

np.int64(29)

In [4]:
df.drop_duplicates(keep="first",inplace=True)

In [5]:
df.drop(["fnlwgt","education-num","native-country"],axis=1,inplace=True)

In [6]:
df.head()

Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,income
0,39,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,<=50K
1,50,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,<=50K
2,38,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,<=50K
3,53,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,<=50K
4,28,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,<=50K


In [7]:
import numpy as np
col_to_operate = ["workclass","occupation","income"]
char_to_remove = ["?"," ?"]

for char in char_to_remove:
    for col in col_to_operate:
        if col == "income":
            df[col] = df[col].str.replace(".","")
        else:
            df[col] = df[col].replace(char,np.nan)

In [8]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler,OneHotEncoder,LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [18]:
le = LabelEncoder()
df["income"] = le.fit_transform(df["income"])
df["income"].unique()

array([0, 1])

In [10]:
#Setting independent and dependent variables 
X = df.drop("income",axis=1)
y = df[["income"]]

In [19]:
#segregating numerical and categorical columns 
num_columns = X.select_dtypes(exclude="O").columns
cat_columns = X.select_dtypes(include="O").columns

print("Numerical Columns: ",num_columns)
print("Categorical Columns: ",cat_columns)

Numerical Columns:  Index(['age', 'capital-gain', 'capital-loss', 'hours-per-week'], dtype='object')
Categorical Columns:  Index(['workclass', 'education', 'marital-status', 'occupation',
       'relationship', 'race', 'sex'],
      dtype='object')


In [12]:
#setting up the pipeline 
num_pipeline = Pipeline(
    steps=[
        ("imputer",SimpleImputer(strategy="median")),
        ("scaler",StandardScaler())
    ]
)

cat_pipeline = Pipeline(
    steps=[
        ("imputer",SimpleImputer(strategy="most_frequent")),
        ("encoder",OneHotEncoder(sparse_output=False))
    ]
)

preprocessor = ColumnTransformer([
    ("num_pipeline",num_pipeline,num_columns),
    ("cat_pipeline",cat_pipeline,cat_columns)
])

In [13]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.30,random_state=42)

In [14]:
X_train = pd.DataFrame(preprocessor.fit_transform(X_train),columns=preprocessor.get_feature_names_out())
X_test = pd.DataFrame(preprocessor.transform(X_test),columns=preprocessor.get_feature_names_out())

In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix


In [16]:
def eval_models(true,predicted):
    accuracy  = accuracy_score(true,predicted)
    class_report = classification_report(true,predicted)
    confi_matrix = confusion_matrix(true,predicted)

    return (
        accuracy,
        class_report,
        confi_matrix
    )

In [17]:
models = {
    "LogisticRegression":LogisticRegression(),
    "DecisionTreeClassifier":DecisionTreeClassifier(),
    "SVC":SVC(),
    "RandomForestClassifier":RandomForestClassifier()
}

param_grid = {
    "LogisticRegression":{
        "penalty":['l1', 'l2', 'elasticnet'],
        'solver':["liblinear","sag","saga"],
        "C":[0.1,0.2,0.3,0.4,0.5,1,2],
        'random_state':[10,20,30,42,-1],
        "max_iter":[100,200,300],
    },
    "DecisionTreeClassifier":{
        "criterion":["gini","entropy"],
        "splitter":["best","random"],
        "max_depth":[None,10,20,30,40,50],
        "min_samples_split":[2,3,4,5,6,7],
        "min_samples_leaf":[1,2,3,4,5],
        "max_features":["sqrt", "log2"]
    },
    "SVC":{
        "C":[0.1,0.2,0.3,0.4,0.5,1,2],
        "kernel":["poly", "rbf", "sigmoid"],
        "gamma":["scale", "auto"]
    },
    "RandomForestClassifier":{
        "n_estimators":[100,200,300],
        "criterion":["gini","entropy"],
        "max_depth":[None,10,20,30,40,50],
        "min_samples_split":[2,3,4,5,6,7],
        "min_samples_leaf":[1,2,3,4,5]

    }
}
best_score = -1
best_estimator = None

for model_name,model in models.items():
    print(f"Evaluating the model: {model_name}")

    gridsearch = GridSearchCV(estimator=model,param_grid=param_grid[model_name],n_jobs=-1,verbose=1,cv=5)
    gridsearch.fit(X_train,y_train)

    #making prediction on the test dataset 
    y_pred = gridsearch.predict(X_test)
    best_model_score = gridsearch.best_score_
    best_model_param = gridsearch.best_params_
    best_model = gridsearch.best_estimator_

    print(f"{model_name} best score is :{best_model_score} and the best param for the model is: {best_model_param}")

    accuracy,class_report,confi_matrix = eval_models(y_test,y_pred)
    print(f"Accuracy score for the model:{model_name} is {accuracy}")
    print(f"Classification Report for the model:{model_name} is\n {class_report}")
    print(f"Confusion Matrix for the model:{model_name} is\n {confi_matrix}")

    if best_model_score > best_score:
        best_score = best_model_score
        best_estimator = best_model

print(f"Best estimator found: {best_estimator} and the best model score is: {best_score}.")

Evaluating the model: LogisticRegression
Fitting 5 folds for each of 945 candidates, totalling 4725 fits


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

LogisticRegression best score is :0.850185809616358 and the best param for the model is: {'C': 0.4, 'max_iter': 200, 'penalty': 'l2', 'random_state': 10, 'solver': 'sag'}
Accuracy score for the model:LogisticRegression is 0.8501092597650916
Classification Report for the model:LogisticRegression is
               precision    recall  f1-score   support

           0       0.88      0.93      0.90     11078
           1       0.74      0.59      0.66      3566

    accuracy                           0.85     14644
   macro avg       0.81      0.76      0.78     14644
weighted avg       0.84      0.85      0.84     14644

Confusion Matrix for the model:LogisticRegression is
 [[10328   750]
 [ 1445  2121]]
Evaluating the model: DecisionTreeClassifier
Fitting 5 folds for each of 1440 candidates, totalling 7200 fits
DecisionTreeClassifier best score is :0.8497176097285802 and the best param for the model is: {'criterion': 'entropy', 'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf'

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

SVC best score is :0.8579999769576974 and the best param for the model is: {'C': 2, 'gamma': 'scale', 'kernel': 'rbf'}
Accuracy score for the model:SVC is 0.8568014203769462
Classification Report for the model:SVC is
               precision    recall  f1-score   support

           0       0.88      0.94      0.91     11078
           1       0.76      0.60      0.67      3566

    accuracy                           0.86     14644
   macro avg       0.82      0.77      0.79     14644
weighted avg       0.85      0.86      0.85     14644

Confusion Matrix for the model:SVC is
 [[10412   666]
 [ 1431  2135]]
Evaluating the model: RandomForestClassifier
Fitting 5 folds for each of 1080 candidates, totalling 5400 fits


  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **

RandomForestClassifier best score is :0.8644970754049931 and the best param for the model is: {'criterion': 'entropy', 'max_depth': 50, 'min_samples_leaf': 3, 'min_samples_split': 4, 'n_estimators': 200}
Accuracy score for the model:RandomForestClassifier is 0.8624009833378858
Classification Report for the model:RandomForestClassifier is
               precision    recall  f1-score   support

           0       0.88      0.94      0.91     11078
           1       0.78      0.61      0.68      3566

    accuracy                           0.86     14644
   macro avg       0.83      0.78      0.80     14644
weighted avg       0.86      0.86      0.86     14644

Confusion Matrix for the model:RandomForestClassifier is
 [[10460   618]
 [ 1397  2169]]
Best estimator found: RandomForestClassifier(criterion='entropy', max_depth=50, min_samples_leaf=3,
                       min_samples_split=4, n_estimators=200) and the best model score is: 0.8644970754049931.
