In [1]:
# adult makes more than 50k
import numpy as np
import pandas as pd

# load the data
file_url = 'https://archive.ics.uci.edu/static/public/2/data.csv'
df = pd.read_csv(file_url)
df.head().T

Unnamed: 0,0,1,2,3,4
age,39,50,38,53,28
workclass,State-gov,Self-emp-not-inc,Private,Private,Private
fnlwgt,77516,83311,215646,234721,338409
education,Bachelors,Bachelors,HS-grad,11th,Bachelors
education-num,13,13,9,7,13
marital-status,Never-married,Married-civ-spouse,Divorced,Married-civ-spouse,Married-civ-spouse
occupation,Adm-clerical,Exec-managerial,Handlers-cleaners,Handlers-cleaners,Prof-specialty
relationship,Not-in-family,Husband,Not-in-family,Husband,Wife
race,White,White,White,Black,Black
sex,Male,Male,Male,Male,Female


In [2]:
df.columns = df.columns.str.replace('-','_').str.lower()

In [3]:
df.income.value_counts()

income
<=50K     24720
<=50K.    12435
>50K       7841
>50K.      3846
Name: count, dtype: int64

In [4]:
# 0: <=50k , 1: >50k 
df.income = df.income.map(lambda x: 1 if x.count('>') > 0 else 0)

In [5]:
df.dtypes

age                int64
workclass         object
fnlwgt             int64
education         object
education_num      int64
marital_status    object
occupation        object
relationship      object
race              object
sex               object
capital_gain       int64
capital_loss       int64
hours_per_week     int64
native_country    object
income             int64
dtype: object

In [6]:
categorical = ['workclass','education','marital_status','occupation','relationship','race','sex','native_country']

In [7]:
df.isna().any()

age               False
workclass          True
fnlwgt            False
education         False
education_num     False
marital_status    False
occupation         True
relationship      False
race              False
sex               False
capital_gain      False
capital_loss      False
hours_per_week    False
native_country     True
income            False
dtype: bool

In [8]:
df.fillna('UNK', inplace=True)

In [9]:
from sklearn.metrics import mutual_info_score

In [10]:
mi = df[categorical].apply(lambda x: mutual_info_score(df['income'], x))
mi.sort_values(ascending=False)

relationship      0.114663
marital_status    0.108826
education         0.063819
occupation        0.063730
sex               0.025431
workclass         0.015504
native_country    0.005686
race              0.005679
dtype: float64

In [11]:
from IPython.display import display

for c in categorical:
    display(df[c].value_counts())
    print()
    print()

workclass
Private             33906
Self-emp-not-inc     3862
Local-gov            3136
State-gov            1981
?                    1836
Self-emp-inc         1695
Federal-gov          1432
UNK                   963
Without-pay            21
Never-worked           10
Name: count, dtype: int64





education
HS-grad         15784
Some-college    10878
Bachelors        8025
Masters          2657
Assoc-voc        2061
11th             1812
Assoc-acdm       1601
10th             1389
7th-8th           955
Prof-school       834
9th               756
12th              657
Doctorate         594
5th-6th           509
1st-4th           247
Preschool          83
Name: count, dtype: int64





marital_status
Married-civ-spouse       22379
Never-married            16117
Divorced                  6633
Separated                 1530
Widowed                   1518
Married-spouse-absent      628
Married-AF-spouse           37
Name: count, dtype: int64





occupation
Prof-specialty       6172
Craft-repair         6112
Exec-managerial      6086
Adm-clerical         5611
Sales                5504
Other-service        4923
Machine-op-inspct    3022
Transport-moving     2355
Handlers-cleaners    2072
?                    1843
Farming-fishing      1490
Tech-support         1446
Protective-serv       983
UNK                   966
Priv-house-serv       242
Armed-Forces           15
Name: count, dtype: int64





relationship
Husband           19716
Not-in-family     12583
Own-child          7581
Unmarried          5125
Wife               2331
Other-relative     1506
Name: count, dtype: int64





race
White                 41762
Black                  4685
Asian-Pac-Islander     1519
Amer-Indian-Eskimo      470
Other                   406
Name: count, dtype: int64





sex
Male      32650
Female    16192
Name: count, dtype: int64





native_country
United-States                 43832
Mexico                          951
?                               583
Philippines                     295
UNK                             274
Germany                         206
Puerto-Rico                     184
Canada                          182
El-Salvador                     155
India                           151
Cuba                            138
England                         127
China                           122
South                           115
Jamaica                         106
Italy                           105
Dominican-Republic              103
Japan                            92
Guatemala                        88
Poland                           87
Vietnam                          86
Columbia                         85
Haiti                            75
Portugal                         67
Taiwan                           65
Iran                             59
Greece                           49
Nicaragua    





In [12]:
df.replace(['unk','?','UNK'], 'UNKNOWN', inplace=True)

In [13]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0


In [14]:
for education in df['education'].unique():
    print('%s: %.0f' % (education,df[df['education'].isin([education])]['education_num'].unique()[0]))

Bachelors: 13
HS-grad: 9
11th: 7
Masters: 14
9th: 5
Some-college: 10
Assoc-acdm: 12
Assoc-voc: 11
7th-8th: 4
Doctorate: 16
Prof-school: 15
5th-6th: 3
10th: 6
1st-4th: 2
Preschool: 1
12th: 8


In [15]:
df.drop('education_num', axis=1, inplace=True)

In [16]:
numerical = ['age','fnlwgt','capital_gain','capital_loss','hours_per_week']
features = categorical + numerical

In [17]:
# correlation
df[numerical].corrwith(df.income)

age               0.230369
fnlwgt           -0.006339
capital_gain      0.223013
capital_loss      0.147554
hours_per_week    0.227687
dtype: float64

In [18]:
from sklearn.model_selection import train_test_split

# split data into train/val/test with 60%/20%/20%
df_full, df_test = train_test_split(df, test_size=np.round(len(df)*.2).astype(int), random_state=42)
df_train, df_val = train_test_split(df, test_size=np.round(len(df)*.2).astype(int), random_state=42)

In [19]:
y_train = df_train.income.values
y_val = df_val.income.values
y_test = df_test.income.values

del df_train['income']
del df_val['income']
del df_test['income']

df_train = df_train.to_dict(orient='records')
df_val = df_val.to_dict(orient='records')
df_test = df_test.to_dict(orient='records')

from sklearn.feature_extraction import DictVectorizer
dv = DictVectorizer(sparse=False)
dv.fit(df_train)

X_train = dv.transform(df_train)
X_val = dv.transform(df_val)
X_test = dv.transform(df_test)

In [20]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

# Logistic Regression
lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)

# Decision Tree
dt_model = DecisionTreeClassifier()
dt_model.fit(X_train, y_train)

# Random Forest
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)

# XGBoost
xgb_model = xgb.XGBClassifier()
xgb_model.fit(X_train, y_train)

In [21]:
# You can now use these models for predictions or further evaluation
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Define a function for model evaluation
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)
    return accuracy, precision, recall, f1, roc_auc

# Evaluate the models
lr_scores = evaluate_model(lr_model, X_val, y_val)
dt_scores = evaluate_model(dt_model, X_val, y_val)
rf_scores = evaluate_model(rf_model, X_val, y_val)
xgb_scores = evaluate_model(xgb_model, X_val, y_val)

# Compare the performances
models = ['Logistic Regression', 'Decision Tree', 'Random Forest', 'XGBoost']
metrics = ['Accuracy', 'Precision', 'Recall', 'F1-score', 'ROC AUC']

all_scores = [lr_scores, dt_scores, rf_scores, xgb_scores]

for i, model in enumerate(models):
    print(f"Metrics for {model}:")
    for j, metric in enumerate(metrics):
        print(f"{metric}: {all_scores[i][j]}")
    print('\n')

Metrics for Logistic Regression:
Accuracy: 0.799037674037674
Precision: 0.7416563658838071
Recall: 0.2548853016142736
F1-score: 0.37938665823585205
ROC AUC: 0.6133476953175225


Metrics for Decision Tree:
Accuracy: 0.8159295659295659
Precision: 0.6168067226890757
Recall: 0.6236193712829227
F1-score: 0.6201943388255174
ROC AUC: 0.7503044253231447


Metrics for Random Forest:
Accuracy: 0.8528869778869779
Precision: 0.737441740031072
Recall: 0.6049277824978759
F1-score: 0.6646441073512253
ROC AUC: 0.7682718221904


Metrics for XGBoost:
Accuracy: 0.8746928746928747
Precision: 0.7888548057259713
Recall: 0.6554800339847069
F1-score: 0.7160092807424594
ROC AUC: 0.7998873059052211




In [None]:
# import re
# 
# # CREATING THE DMARTIX:
# features = dv.feature_names_
# 
# # Define a function to replace special characters in column names
# def clean_column_names(features):
#     regex = re.compile(r"[=\(\)&]", re.IGNORECASE)
#     return [regex.sub("_", c) for c in features]
# 
# 
# features = clean_column_names(features)
# dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=features)
# dval = xgb.DMatrix(X_val, label=y_val, feature_names=features)