In [1]:
from sklearn.datasets import fetch_openml
import pandas as pd 
import numpy as np

data = fetch_openml(data_id=1590, as_frame=True)

X = data.data
y = data.target

In [2]:
income_data = pd.concat([X, y], axis= 1)
income_data.head()


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,,103497,Some-college,10,Never-married,,Own-child,White,Female,0,0,30,United-States,<=50K


### Cleaning Data

In [3]:
print(f'Data NA:')
print(income_data.isna().sum())
print()
print(f'Data shape   : {income_data.shape}')
print()
print(f'Data Duplikat: {income_data.duplicated().sum()}')


Data NA:
age                  0
workclass         2799
fnlwgt               0
education            0
education-num        0
marital-status       0
occupation        2809
relationship         0
race                 0
sex                  0
capital-gain         0
capital-loss         0
hours-per-week       0
native-country     857
class                0
dtype: int64

Data shape   : (48842, 15)

Data Duplikat: 52


Sepertinya, data yang NA dan Duplikat bisa dihapus saja, mengingat perbandingan antara data NA dan data Duplikat dengan data keseluruhan sangat besar. 

Selain itu feature 'fnlwgt' dihapus karena dianggap tidak relevan. 

In [4]:
income_data = income_data.drop_duplicates(keep='last').dropna()
income_data = income_data.drop(columns = 'fnlwgt')

print(f'Data setelah NA, duplikat, dan irrelevant feature di-drop: {income_data.shape}')

Data setelah NA, duplikat, dan irrelevant feature di-drop: (45175, 14)


In [5]:
X = income_data.drop('class', axis=1)
y = income_data['class']

In [6]:
y.value_counts(normalize=True)

class
<=50K    0.752031
>50K     0.247969
Name: proportion, dtype: float64

Terlihat bahwa terjadi proporsi kelas yang tidak seimbang. Diperlukan perlakuan khusus saat splitting data. 

### Sampling & Splitting

In [7]:
num_samples = 5000
X = X.sample(n=num_samples, random_state=42)
y = y.loc[X.index]

print(f'X shape after sampling: {X.shape}')
print(f'y shape after sampling: {y.shape}')

X shape after sampling: (5000, 13)
y shape after sampling: (5000,)


Sampling dilakukan untuk menyederhanakan dataset agar proses modeling tidak memakan waktu terlalu lama. 

In [8]:
from sklearn.model_selection import train_test_split 

X_train, X_not_train, y_train, y_not_train = train_test_split(X, y, test_size=0.2, random_state=123, stratify=y)

X_valid, X_test, y_valid, y_test           = train_test_split(X_not_train, y_not_train, test_size=0.5, random_state=123, stratify=y_not_train)

print('X train shape:', X_train.shape)
print('y train shape:', y_train.shape)
print('X test shape :', X_test.shape)
print('y test shape :', y_test.shape)
print('X valid shape:', X_valid.shape)
print('y valid shape:', y_valid.shape)

X train shape: (4000, 13)
y train shape: (4000,)
X test shape : (500, 13)
y test shape : (500,)
X valid shape: (500, 13)
y valid shape: (500,)


In [9]:
num_features = []
cat_features = []

for i in X_train.columns:
    if pd.api.types.is_numeric_dtype(X_train[i]):  # Check if the column is numeric
        num_features.append(i)
    else:
        cat_features.append(i)


In [10]:
print(num_features)

['age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']


In [11]:
print(cat_features)

['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']


In [12]:
for i in cat_features: 
    n = len(np.unique(X[i]))
    print(f'{i}: {n}')

workclass: 7
education: 16
marital-status: 7
occupation: 14
relationship: 6
race: 5
sex: 2
native-country: 40


### Label Encoder

In [13]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

encoded_y_train = label_encoder.fit_transform(y_train)
encoded_y_valid = label_encoder.transform(y_valid)
encoded_y_test  = label_encoder.transform(y_test)



In [14]:
y_train

47277    <=50K
48804    <=50K
38721    <=50K
24852    <=50K
38819    <=50K
         ...  
15106    <=50K
4364     <=50K
31234    <=50K
26284     >50K
26871     >50K
Name: class, Length: 4000, dtype: category
Categories (2, object): ['<=50K', '>50K']

In [15]:
encoded_y_train

array([0, 0, 0, ..., 0, 1, 1])

In [16]:
y_valid

48022    <=50K
46640    <=50K
26311    <=50K
18081    <=50K
31101    <=50K
         ...  
11852    <=50K
6940     <=50K
8045     <=50K
1220     <=50K
2542     <=50K
Name: class, Length: 500, dtype: category
Categories (2, object): ['<=50K', '>50K']

In [17]:
encoded_y_valid

array([0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0,
       1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [18]:
y_test

36220    <=50K
46290    <=50K
37900    <=50K
47785    <=50K
18374     >50K
         ...  
4492      >50K
1117     <=50K
16624    <=50K
12809    <=50K
45883    <=50K
Name: class, Length: 500, dtype: category
Categories (2, object): ['<=50K', '>50K']

In [19]:
encoded_y_test

array([0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0,
       0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0,
       0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,

Label Encoder mengubah: 
a. <=50K menjadi 0 
b. > 50K menjadi 1


### Scaling & One Hot Encoder

In [20]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer 
from sklearn.pipeline import Pipeline

cat_transformer = OneHotEncoder(handle_unknown='ignore')
num_transformer = StandardScaler() 

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', cat_transformer, cat_features),
        ('num', num_transformer, num_features)
    ]
)

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor)
])

X_train_processed = pipeline.fit_transform(X_train)
X_valid_processed = pipeline.transform(X_valid)
X_test_processed  = pipeline.transform(X_test)

X_train_processed = X_train_processed.toarray() if hasattr(X_train_processed, 'toarray') else X_train_processed 
X_valid_processed = X_valid_processed.toarray() if hasattr(X_valid_processed, 'toarray') else X_valid_processed 
X_test_processed  = X_test_processed.toarray() if hasattr(X_test_processed, 'toarray') else X_test_processed 




Karena adaboost menggunakan data target -1 dan 1, maka y perlu dikonversikan dari 0 dan 1 menjadi -1 dan 1

In [21]:
# Convert 0 to -1 for AdaBoost
ab_encoded_y_train = 2 * encoded_y_train - 1
ab_encoded_y_valid = 2 * encoded_y_valid - 1
ab_encoded_y_test  = 2 * encoded_y_test  - 1

In [22]:
print(np.unique(ab_encoded_y_train))
print(np.unique(ab_encoded_y_valid))
print(np.unique(ab_encoded_y_test))

[-1  1]
[-1  1]
[-1  1]


### Modeling

In [24]:
# custom model
from Logistic_Regression import Logistic_Regression
from KNN_Classifier import KNearest_Neighbor_Classifier
from SVM_Classifier import SVM_Classifier

from DecTree_Classifier import Decision_Tree_Classifier
from RandomForest_Classifier import Random_Forest_Classifier
from AdaBoost_Classifier import Adaboost_Classifier

# sklearn model
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC 

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier

# accuracy metric 
def accuracy(y_true, y_pred):
    return np.sum(y_true == y_pred) / len(y_true)



##### Logistic Regression

In [25]:
# Custom Logistic Regression Classifier
lre_cus = Logistic_Regression(learning_rate=0.01, iters_number=1000)
lre_cus.fit(X_train_processed, encoded_y_train)

# Sklearn Logistic Regression Classifier
lre_skl = LogisticRegression()
lre_skl.fit(X_train_processed, encoded_y_train)

# Predictions
cus_y_pred_valid = lre_cus.predict(X_valid_processed)
cus_y_pred_test  = lre_cus.predict(X_test_processed)

skl_y_pred_valid = lre_skl.predict(X_valid_processed)
skl_y_pred_test  = lre_skl.predict(X_test_processed)

# Accuracy
cus_acc_valid = accuracy(encoded_y_valid, cus_y_pred_valid)
cus_acc_test  = accuracy(encoded_y_test, cus_y_pred_test)

skl_acc_valid = accuracy(encoded_y_valid, skl_y_pred_valid)
skl_acc_test  = accuracy(encoded_y_test, skl_y_pred_test)

# Print Results
print('Custom Logistic Regression Accuracy Valid :', cus_acc_valid)
print('Custom Logistic Regression Accuracy Test  :', cus_acc_test)
print()
print('SKLearn Logistic Regression Accuracy Valid:', skl_acc_valid)
print('SKLearn Logistic Regression Accuracy Test :', skl_acc_test)

Custom Logistic Regression Accuracy Valid : 0.808
Custom Logistic Regression Accuracy Test  : 0.816

SKLearn Logistic Regression Accuracy Valid: 0.834
SKLearn Logistic Regression Accuracy Test : 0.848


##### KNearest Neighbors

In [26]:
# Custom KNN Classifier
knn_cus = KNearest_Neighbor_Classifier(k_value=5, distance_metric='manhattan')
knn_cus.fit(X_train_processed, encoded_y_train)

# Sklearn KNN Classifier
knn_skl = KNeighborsClassifier()
knn_skl.fit(X_train_processed, encoded_y_train)

# Predictions
cus_y_pred_valid = knn_cus.predict(X_valid_processed)
cus_y_pred_test  = knn_cus.predict(X_test_processed)

skl_y_pred_valid = knn_skl.predict(X_valid_processed)
skl_y_pred_test  = knn_skl.predict(X_test_processed)

# Accuracy
cus_acc_valid = accuracy(encoded_y_valid, cus_y_pred_valid)
cus_acc_test  = accuracy(encoded_y_test, cus_y_pred_test)

skl_acc_valid = accuracy(encoded_y_valid, skl_y_pred_valid)
skl_acc_test  = accuracy(encoded_y_test, skl_y_pred_test)

# Print Results
print('Custom  KNN Classifier Accuracy Valid:', cus_acc_valid)
print('Custom  KNN Classifier Accuracy Test :', cus_acc_test)
print()
print('SKLearn KNN Classifier Accuracy Valid:', skl_acc_valid)
print('SKLearn KNN Classifier Accuracy Test :', skl_acc_test)

Custom  KNN Classifier Accuracy Valid: 0.812
Custom  KNN Classifier Accuracy Test : 0.846

SKLearn KNN Classifier Accuracy Valid: 0.818
SKLearn KNN Classifier Accuracy Test : 0.844


##### Support Vector Machine

In [27]:
# Custom SVM Classifier
svm_cus = SVM_Classifier(lambda_param=0.0001, learning_rate=0.001, num_of_iters=1000)
svm_cus.fit(X_train_processed, encoded_y_train)

# Sklearn SVM Classifier
svm_skl = SVC() 
svm_skl.fit(X_train_processed, encoded_y_train)

# Predictions
cus_y_pred_valid = svm_cus.predict(X_valid_processed)
cus_y_pred_test  = svm_cus.predict(X_test_processed)

skl_y_pred_valid = svm_skl.predict(X_valid_processed)
skl_y_pred_test  = svm_skl.predict(X_test_processed)

# Accuracy
cus_acc_valid = accuracy(encoded_y_valid, cus_y_pred_valid)
cus_acc_test  = accuracy(encoded_y_test, cus_y_pred_test)

skl_acc_valid = accuracy(encoded_y_valid, skl_y_pred_valid)
skl_acc_test  = accuracy(encoded_y_test, skl_y_pred_test)

# Print Results
print('Custom  SVM Classifier Accuracy Valid:', cus_acc_valid)
print('Custom  SVM Classifier Accuracy Test :', cus_acc_test)
print()
print('SKLearn SVM Classifier Accuracy Valid:', skl_acc_valid)
print('SKLearn SVM Classifier Accuracy Test :', skl_acc_test)

Custom  SVM Classifier Accuracy Valid: 0.834
Custom  SVM Classifier Accuracy Test : 0.842

SKLearn SVM Classifier Accuracy Valid: 0.846
SKLearn SVM Classifier Accuracy Test : 0.854


##### Decision Tree

In [28]:
# Custom Decision Tree Classifier
dtr_cus = Decision_Tree_Classifier(min_samples_split=2, max_depth=5)
dtr_cus.fit(X_train_processed, encoded_y_train)

# Sklearn Decision Tree Classifier
dtr_skl = DecisionTreeClassifier()
dtr_skl.fit(X_train_processed, encoded_y_train)

# Predictions
cus_y_pred_valid = dtr_cus.predict(X_valid_processed)
cus_y_pred_test  = dtr_cus.predict(X_test_processed)

skl_y_pred_valid = dtr_skl.predict(X_valid_processed)
skl_y_pred_test  = dtr_skl.predict(X_test_processed)

# Accuracy
cus_acc_valid = accuracy(encoded_y_valid, cus_y_pred_valid)
cus_acc_test  = accuracy(encoded_y_test, cus_y_pred_test)

skl_acc_valid = accuracy(encoded_y_valid, skl_y_pred_valid)
skl_acc_test = accuracy(encoded_y_test, skl_y_pred_test)


# Print Results
print('Custom Decision Tree Classifier Accuracy Valid :', cus_acc_valid)
print('Custom Decision Tree Classifier Accuracy Test  :', cus_acc_test)
print()
print('SKLearn Decision Tree Classifier Accuracy Valid:', skl_acc_valid)
print('SKLearn Decision Tree Classifier Accuracy Test :', skl_acc_test)


Custom Decision Tree Classifier Accuracy Valid : 0.84
Custom Decision Tree Classifier Accuracy Test  : 0.85

SKLearn Decision Tree Classifier Accuracy Valid: 0.798
SKLearn Decision Tree Classifier Accuracy Test : 0.834


##### Random Forest

In [29]:
# Custom Random Forest Classifier
rfo_cus = Random_Forest_Classifier(n_trees=10, min_samples_split=2, max_depth=10)
rfo_cus.fit(X_train_processed, encoded_y_train)

# Sklearn Random Forest Classifier
rfo_skl = RandomForestClassifier()
rfo_skl.fit(X_train_processed, encoded_y_train)

# Predictions
cus_y_pred_valid = rfo_cus.predict(X_valid_processed)
cus_y_pred_test  = rfo_cus.predict(X_test_processed)

skl_y_pred_valid = rfo_skl.predict(X_valid_processed)
skl_y_pred_test  = rfo_skl.predict(X_test_processed)

# Accuracy
cus_acc_valid = accuracy(encoded_y_valid, cus_y_pred_valid)
cus_acc_test  = accuracy(encoded_y_test, cus_y_pred_test)

skl_acc_valid = accuracy(encoded_y_valid, skl_y_pred_valid)
skl_acc_test  = accuracy(encoded_y_test, skl_y_pred_test)

# Print Results
print('Custom Random Forest Classifier Accuracy Valid :', cus_acc_valid)
print('Custom Random Forest Classifier Accuracy Test  :', cus_acc_test)
print()
print('SKLearn Random Forest Classifier Accuracy Valid:', skl_acc_valid)
print('SKLearn Random Forest Classifier Accuracy Test :', skl_acc_test)


Custom Random Forest Classifier Accuracy Valid : 0.846
Custom Random Forest Classifier Accuracy Test  : 0.864

SKLearn Random Forest Classifier Accuracy Valid: 0.832
SKLearn Random Forest Classifier Accuracy Test : 0.848


##### AdaBoost 

In [30]:
# Custom AdaBoost Classifier
ada_cus = Adaboost_Classifier(n_clf=500)
ada_cus.fit(X_train_processed, ab_encoded_y_train)

# Sklearn AdaBoost Classifier
ada_skl = AdaBoostClassifier()
ada_skl.fit(X_train_processed, ab_encoded_y_train)

# Predictions
cus_y_pred_valid = ada_cus.predict(X_valid_processed)
cus_y_pred_test  = ada_cus.predict(X_test_processed)

skl_y_pred_valid = ada_skl.predict(X_valid_processed)
skl_y_pred_test  = ada_skl.predict(X_test_processed)

# Accuracy
cus_acc_valid = accuracy(ab_encoded_y_valid, cus_y_pred_valid)
cus_acc_test  = accuracy(ab_encoded_y_test, cus_y_pred_test)

skl_acc_valid = accuracy(ab_encoded_y_valid, skl_y_pred_valid)
skl_acc_test  = accuracy(ab_encoded_y_test, skl_y_pred_test)

# Print Results
print('Custom AdaBoost Classifier Accuracy Valid :', cus_acc_valid)
print('Custom AdaBoost Classifier Accuracy Test  :', cus_acc_test)
print()
print('SKLearn AdaBoost Classifier Accuracy Valid:', skl_acc_valid)
print('SKLearn AdaBoost Classifier Accuracy Test :', skl_acc_test)




Custom AdaBoost Classifier Accuracy Valid : 0.832
Custom AdaBoost Classifier Accuracy Test  : 0.854

SKLearn AdaBoost Classifier Accuracy Valid: 0.836
SKLearn AdaBoost Classifier Accuracy Test : 0.862
