In [12]:
import pandas as pd

# Set the path to the CSV containing the dataset to train on.
csv_path = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'

# Set the column names for the columns in the CSV. If the CSV's first line is a header line containing
# the column names, then set this to None.
csv_columns = [
  "Age", "Workclass", "fnlwgt", "Education", "Education-Num", "Marital-Status",
  "Occupation", "Relationship", "Race", "Sex", "Capital-Gain", "Capital-Loss",
  "Hours-per-week", "Country", "Over-50K"]

# Read the dataset from the provided CSV and print out information about it.
df = pd.read_csv(csv_path, names=csv_columns, skipinitialspace=True)
categorical_features = ["Workclass", "Marital-Status", "Occupation", "Relationship", "Race", "Sex", "Country","Education"]
df = df.drop(df[(df['Workclass'] == '?') | (df['Country'] == '?') | (df['Occupation'] == '?')].index)
df

Unnamed: 0,Age,Workclass,fnlwgt,Education,Education-Num,Marital-Status,Occupation,Relationship,Race,Sex,Capital-Gain,Capital-Loss,Hours-per-week,Country,Over-50K
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [13]:
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt

categorical_features = ["Workclass", "Marital-Status", "Occupation", "Relationship", "Race", "Sex", "Country"]
category_mappings = {}
for feature in categorical_features:
    label_encoder = LabelEncoder()
    df[feature] = label_encoder.fit_transform(df[feature])
    category_mappings[feature] = {label: index for index, label in enumerate(label_encoder.classes_)}
df['Over-50K'] = df['Over-50K'].map({'<=50K': 0, '>50K': 1})
df["in_USA"] = df.apply(lambda x: str(x["Country"]=='United-States'), axis=1).map({'False': 0, 'True': 1})
df = df.drop("Country", axis = 1)
df.head()

Unnamed: 0,Age,Workclass,fnlwgt,Education,Education-Num,Marital-Status,Occupation,Relationship,Race,Sex,Capital-Gain,Capital-Loss,Hours-per-week,Over-50K,in_USA
0,39,5,77516,Bachelors,13,4,0,1,4,1,2174,0,40,0,0
1,50,4,83311,Bachelors,13,2,3,0,4,1,0,0,13,0,0
2,38,2,215646,HS-grad,9,0,5,1,4,1,0,0,40,0,0
3,53,2,234721,11th,7,2,5,0,2,1,0,0,40,0,0
4,28,2,338409,Bachelors,13,2,9,5,2,0,0,0,40,0,0


In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, accuracy_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
import numpy as np


columns_to_drop = ['fnlwgt', 'Education', 'Over-50K']
X = df.drop(columns_to_drop, axis=1)
y = df['Over-50K']

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
lr = LogisticRegression()
lr.fit(X_train, y_train)

In [15]:
def find_best_threshold(model, X, y_true):
    y_scores = model.predict_proba(X)[:, 1]
    best_threshold = 0
    best_precision = 0

    for threshold in np.arange(0, 1, 0.01):

        y_pred = (y_scores >= threshold).astype(int)
        precision = precision_score(y_true, y_pred)
        if precision > best_precision:
            best_threshold = threshold
            best_precision = precision

    return best_threshold, best_precision

In [16]:
best_threshold, best_precision = find_best_threshold(lr, X_test, y_test)
y_scores = lr.predict_proba(X_test)[:, 1]
y_pred_optimized = (y_scores >= best_threshold).astype(int)
print(f"Best Threshold: {best_threshold}")
print(f"Precision at best threshold: {best_precision}")
print(f"Recall at best threshold: {recall_score(y_test, y_pred_optimized)}")
print(f"Accuracy at best threshold: {accuracy_score(y_test, y_pred_optimized)}")
print(f"Confusion Matrix at best threshold:\n {confusion_matrix(y_test, y_pred_optimized)}")

Best Threshold: 0.96
Precision at best threshold: 0.9916666666666667
Recall at best threshold: 0.07777777777777778
Accuracy at best threshold: 0.7659539201060832
Confusion Matrix at best threshold:
 [[4502    1]
 [1411  119]]


In [17]:
!pip install imblearn



In [18]:
from imblearn.over_sampling import RandomOverSampler

oversample = RandomOverSampler(sampling_strategy='minority')
X_train_over, y_train_over = oversample.fit_resample(X_train, y_train)

lr_over = LogisticRegression()
lr_over.fit(X_train_over, y_train_over)

best_threshold_over, best_precision_over = find_best_threshold(lr_over, X_test, y_test)
y_scores_over = lr_over.predict_proba(X_test)[:, 1]
y_pred_optimized_over = (y_scores_over >= best_threshold_over).astype(int)

print(f"Best Threshold (Oversampled): {best_threshold_over}")
print(f"Precision at best threshold (Oversampled): {best_precision_over}")
print(f"Recall at best threshold (Oversampled): {recall_score(y_test, y_pred_optimized_over)}")
print(f"Accuracy at best threshold (Oversampled): {accuracy_score(y_test, y_pred_optimized_over)}")
print(f"Confusion Matrix at best threshold (Oversampled):\n{confusion_matrix(y_test, y_pred_optimized_over)}")


Best Threshold (Oversampled): 0.99
Precision at best threshold (Oversampled): 0.990909090909091
Recall at best threshold (Oversampled): 0.07124183006535947
Accuracy at best threshold (Oversampled): 0.7642963699651915
Confusion Matrix at best threshold (Oversampled):
[[4502    1]
 [1421  109]]


In [19]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(sampling_strategy='minority')
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

lr_smote = LogisticRegression()
lr_smote.fit(X_train_smote, y_train_smote)


best_threshold_smote, best_precision_smote = find_best_threshold(lr_smote, X_test, y_test)
y_scores_smote = lr_smote.predict_proba(X_test)[:, 1]
y_pred_optimized_smote = (y_scores_smote >= best_threshold_smote).astype(int)

print(f"Best Threshold (SMOTE): {best_threshold_smote}")
print(f"Precision at best threshold (SMOTE): {best_precision_smote}")
print(f"Recall at best threshold (SMOTE): {recall_score(y_test, y_pred_optimized_smote)}")
print(f"Accuracy at best threshold (SMOTE): {accuracy_score(y_test, y_pred_optimized_smote)}")
print(f"Confusion Matrix at best threshold (SMOTE):\n{confusion_matrix(y_test, y_pred_optimized_smote)}")


Best Threshold (SMOTE): 0.99
Precision at best threshold (SMOTE): 0.991304347826087
Recall at best threshold (SMOTE): 0.07450980392156863
Accuracy at best threshold (SMOTE): 0.7651251450356373
Confusion Matrix at best threshold (SMOTE):
[[4502    1]
 [1416  114]]


In [20]:
lr_weighted = LogisticRegression(class_weight='balanced')
lr_weighted.fit(X_train, y_train)

best_threshold_weighted, best_precision_weighted = find_best_threshold(lr_weighted, X_test, y_test)

y_scores_weighted = lr_weighted.predict_proba(X_test)[:, 1]
y_pred_optimized_weighted = (y_scores_weighted >= best_threshold_weighted).astype(int)

print(f"Best Threshold (Class Weighted): {best_threshold_weighted}")
print(f"Precision at best threshold (Class Weighted): {best_precision_weighted}")
print(f"Recall at best threshold (Class Weighted): {recall_score(y_test, y_pred_optimized_weighted)}")
print(f"Accuracy at best threshold (Class Weighted): {accuracy_score(y_test, y_pred_optimized_weighted)}")
print(f"Confusion Matrix at best threshold (Class Weighted):\n{confusion_matrix(y_test, y_pred_optimized_weighted)}")


Best Threshold (Class Weighted): 0.99
Precision at best threshold (Class Weighted): 0.9910714285714286
Recall at best threshold (Class Weighted): 0.07254901960784314
Accuracy at best threshold (Class Weighted): 0.7646278799933698
Confusion Matrix at best threshold (Class Weighted):
[[4502    1]
 [1419  111]]


In [21]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_curve

rf = RandomForestClassifier(n_estimators=1000, random_state=57)
rf.fit(X_train, y_train)

y_scores_rf = rf.predict_proba(X_test)[:, 1]

precisions, recalls, thresholds = precision_recall_curve(y_test, y_scores_rf)
max_precision_index = np.argmax(precisions[precisions < 1])
threshold_max_precision = thresholds[max_precision_index]

y_pred_max_precision = (y_scores_rf >= threshold_max_precision).astype(int)

precision_max = precision_score(y_test, y_pred_max_precision)
recall_max = recall_score(y_test, y_pred_max_precision)
accuracy_max = accuracy_score(y_test, y_pred_max_precision)

print(f"Threshold for Max Precision: {threshold_max_precision}")
print(f"Precision: {precision_max}")
print(f"Recall: {recall_max}")
print(f"Accuracy: {accuracy_max}")
print(f"Confusion Matrix:\n{confusion_matrix(y_test, y_pred_max_precision)}")


Threshold for Max Precision: 0.9948333333333332
Precision: 0.9932885906040269
Recall: 0.09673202614379085
Accuracy: 0.7707608155146694
Confusion Matrix:
[[4502    1]
 [1382  148]]


In [22]:
from sklearn.metrics import precision_recall_curve, precision_score, recall_score, accuracy_score, confusion_matrix

smote = SMOTE(random_state=57,sampling_strategy='minority')
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
rf_smote = RandomForestClassifier(n_estimators=1000, random_state=57)
rf_smote.fit(X_train_smote, y_train_smote)
y_scores_smote = rf_smote.predict_proba(X_test)[:, 1]
precisions_smote, recalls_smote, thresholds_smote = precision_recall_curve(y_test, y_scores_smote)
threshold_smote = thresholds_smote[np.argmax(precisions_smote[precisions_smote < 1])]
y_pred_smote = (y_scores_smote >= threshold_smote).astype(int)


precision_smote = precision_score(y_test, y_pred_smote)
recall_smote = recall_score(y_test, y_pred_smote)
accuracy_smote = accuracy_score(y_test, y_pred_smote)

print("SMOTE Resampling Optimized for Precision")
print(f"Threshold: {threshold_smote}")
print(f"Precision: {precision_smote}")
print(f"Recall: {recall_smote}")
print(f"Accuracy: {accuracy_smote}")
print(f"Confusion Matrix:\n{confusion_matrix(y_test, y_pred_smote)}")

ros = RandomOverSampler(random_state=57,sampling_strategy='minority')
X_train_ros, y_train_ros = ros.fit_resample(X_train, y_train)
rf_ros = RandomForestClassifier(n_estimators=1000, random_state=57)
rf_ros.fit(X_train_ros, y_train_ros)
y_scores_ros = rf_ros.predict_proba(X_test)[:, 1]
precisions_ros, recalls_ros, thresholds_ros = precision_recall_curve(y_test, y_scores_ros)
threshold_ros = thresholds_ros[np.argmax(precisions_ros[precisions_ros < 1])]
y_pred_ros = (y_scores_ros >= threshold_ros).astype(int)


precision_ros = precision_score(y_test, y_pred_ros)
recall_ros = recall_score(y_test, y_pred_ros)
accuracy_ros = accuracy_score(y_test, y_pred_ros)

print("\nRandom Over Sampling Optimized for Precision")
print(f"Threshold: {threshold_ros}")
print(f"Precision: {precision_ros}")
print(f"Recall: {recall_ros}")
print(f"Accuracy: {accuracy_ros}")
print(f"Confusion Matrix:\n{confusion_matrix(y_test, y_pred_ros)}")


SMOTE Resampling Optimized for Precision
Threshold: 0.9974444444444445
Precision: 0.9652777777777778
Recall: 0.09084967320261438
Accuracy: 0.76860600033151
Confusion Matrix:
[[4498    5]
 [1391  139]]

Random Over Sampling Optimized for Precision
Threshold: 0.9998181818181818
Precision: 0.8992248062015504
Recall: 0.07581699346405228
Accuracy: 0.7634675948947456
Confusion Matrix:
[[4490   13]
 [1414  116]]
