In [2]:
from ucimlrepo import fetch_ucirepo 
import pandas as pd

# Fetch the UCI "Adult" dataset (repository id 2)
adult = fetch_ucirepo(id=2)

# Features and targets (as pandas dataframes).
# Explicit copies are taken because later cells assign into X with .loc;
# writing into a frame that may be a slice of the fetched data can raise
# SettingWithCopyWarning and would leave the `adult` object modified.
X = adult.data.features.copy()
y = adult.data.targets.copy()

# Uncomment to inspect dataset metadata / per-variable information
# print(adult.metadata)
# print(adult.variables)

In [3]:
# Preview the first rows of the feature table
X.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba


In [4]:
# Preview the first rows of the target (income label)
y.head()

Unnamed: 0,income
0,<=50K
1,<=50K
2,<=50K
3,<=50K
4,<=50K


In [5]:
# Map the label variants that carry a trailing period onto the plain spelling
label_fixes = {"<=50K.": "<=50K", ">50K.": ">50K"}
y = y.replace(label_fixes)

In [6]:
# Number of distinct labels per target column
y.nunique()

income    2
dtype: int64

In [7]:
# Count missing (NaN) entries per feature column.
# Literal "?" strings are ordinary values and are not counted here.
X.isnull().sum()

age                 0
workclass         963
fnlwgt              0
education           0
education-num       0
marital-status      0
occupation        966
relationship        0
race                0
sex                 0
capital-gain        0
capital-loss        0
hours-per-week      0
native-country    274
dtype: int64

In [14]:
# Categorical columns whose distinct values we want to inspect
categorical_features = [
    'workclass', 'education', 'marital-status', 'occupation',
    'relationship', 'race', 'sex', 'native-country',
]

# Collect the distinct values of each column and print them as we go
unique_values = {}
for feature in categorical_features:
    values = X[feature].unique().tolist()
    unique_values[feature] = values
    print(f"{feature}: {values}")

workclass: ['State-gov', 'Self-emp-not-inc', 'Private', 'Federal-gov', 'Local-gov', '?', 'Self-emp-inc', 'Without-pay', 'Never-worked', nan]
education: ['Bachelors', 'HS-grad', '11th', 'Masters', '9th', 'Some-college', 'Assoc-acdm', 'Assoc-voc', '7th-8th', 'Doctorate', 'Prof-school', '5th-6th', '10th', '1st-4th', 'Preschool', '12th']
marital-status: ['Never-married', 'Married-civ-spouse', 'Divorced', 'Married-spouse-absent', 'Separated', 'Married-AF-spouse', 'Widowed']
occupation: ['Adm-clerical', 'Exec-managerial', 'Handlers-cleaners', 'Prof-specialty', 'Other-service', 'Sales', 'Craft-repair', 'Transport-moving', 'Farming-fishing', 'Machine-op-inspct', 'Tech-support', '?', 'Protective-serv', 'Armed-Forces', 'Priv-house-serv', nan]
relationship: ['Not-in-family', 'Husband', 'Wife', 'Own-child', 'Unmarried', 'Other-relative']
race: ['White', 'Black', 'Asian-Pac-Islander', 'Amer-Indian-Eskimo', 'Other']
sex: ['Male', 'Female']
native-country: ['United-States', 'Cuba', 'Jamaica', 'India'

# Data cleaning

### Handling Missing Values and '?'

In [None]:
import numpy as np

# Columns that use "?" as a stand-in for a missing value
placeholder_columns = ['workclass', 'occupation', 'native-country']

for column in placeholder_columns:
    # Turn the "?" placeholder into a real NaN ...
    X.loc[X[column] == "?", column] = np.nan
    # ... then fill every NaN with the column's most frequent value (mode)
    most_frequent = X[column].mode()[0]
    X.loc[:, column] = X[column].fillna(most_frequent)

In [None]:
# Standardize 'native-country' by lower-casing every value before encoding
X.loc[:, 'native-country'] = X['native-country'].str.lower()

In [None]:
# Re-check missing values per feature column after the imputation step
X.isnull().sum()

### One-hot Encoding

In [None]:
# One-hot encode the categorical features; drop_first=True omits the first
# level of each feature from the generated indicator columns
columns_to_encode = [
    'workclass', 'education', 'marital-status', 'occupation',
    'relationship', 'race', 'sex', 'native-country',
]
X = pd.get_dummies(X, columns=columns_to_encode, drop_first=True)

### Ensuring Correct Data Types

In [None]:
# Cast the numeric columns to int.
# DataFrame.astype is used instead of `X.loc[:, col] = ...`: a full-column
# .loc assignment writes the values into the existing column in place (pandas >= 2.0)
# and therefore does not reliably change the column's dtype.
integer_columns = ['age', 'fnlwgt', 'capital-gain', 'capital-loss', 'hours-per-week']
X = X.astype({column: int for column in integer_columns})

In [None]:
# Show the dtype of every column after encoding and casting
X.dtypes

### Feature Engineering

In [None]:
# New feature: capital gain minus capital loss
net_gain = X['capital-gain'] - X['capital-loss']
X['net-capital-gain'] = net_gain

# The two source columns are no longer needed once the net value exists
X = X.drop(columns=['capital-gain', 'capital-loss'])

In [None]:
# `education-num` appears redundant, so remove it from the feature set
X = X.drop(columns=['education-num'])

### Handling Outliers

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Plot the distribution of 'hours-per-week' as it is before any filtering
weekly_hours = X['hours-per-week']
sns.boxplot(x=weekly_hours)
plt.show()

# Then keep only the rows with at most 80 hours per week
X = X[weekly_hours <= 80]

In [None]:
# Function to remove outliers based on IQR
def remove_outliers_iqr(df, columns, factor=1.5):
    """Drop rows whose values fall outside the IQR fences of the given columns.

    Columns are handled one after another, and each column's quartiles are
    computed on the rows that survived the previous columns, so the result
    can depend on the order of ``columns``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified; a filtered frame is returned.
    columns : iterable of str
        Names of the numeric columns to filter on.
    factor : float, default 1.5
        Multiplier applied to the IQR when building the fences.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` whose value in every listed column lies within
        ``[Q1 - factor * IQR, Q3 + factor * IQR]`` (bounds inclusive). Rows
        with a missing value in a listed column are dropped as well.

    Raises
    ------
    KeyError
        If a name in ``columns`` is not a column of ``df``.
    ValueError
        If ``factor`` is negative.
    """
    import warnings

    columns = list(columns)
    if factor < 0:
        raise ValueError(f"factor must be non-negative, got {factor!r}")

    # Fail before any filtering happens, with a message naming every bad column
    missing = [col for col in columns if col not in df.columns]
    if missing:
        raise KeyError(f"columns not found in DataFrame: {missing}")

    for col in columns:
        # Q1 (25th percentile) and Q3 (75th percentile) of the current rows
        q1, q3 = df[col].quantile([0.25, 0.75])
        iqr = q3 - q1

        # Lower and upper fences for this column
        lower_bound = q1 - factor * iqr
        upper_bound = q3 + factor * iqr

        # between() is inclusive on both ends, i.e. (>= lower) & (<= upper)
        keep = df[col].between(lower_bound, upper_bound)

        # With IQR == 0 both fences equal the quartile value, so every row
        # holding any other value is removed. Make that visible to the caller.
        if iqr == 0 and not keep.all():
            warnings.warn(
                f"IQR of column {col!r} is 0; all rows whose value differs "
                f"from {q1!r} are being removed",
                stacklevel=2,
            )

        df = df[keep]

    return df

# Columns to check for outliers (adjust as needed)
# NOTE(review): if 'net-capital-gain' is 0 for most rows, its Q1 and Q3 are both 0,
# the IQR is 0, and this call removes every row with a non-zero value. Confirm
# that this is intended before trusting the downstream accuracy figures.
columns_to_check = ['age', 'fnlwgt', 'net-capital-gain']

# Apply the function to remove outliers; X itself is left as it was
X_cleaned = remove_outliers_iqr(X, columns_to_check)

# Display the first rows of the cleaned dataframe
X_cleaned.head()

In [None]:
# Largest 'hours-per-week' value remaining after outlier removal
X_cleaned['hours-per-week'].max()

In [None]:
# Compare shapes: y has not yet been restricted to the rows kept in X_cleaned
print(X_cleaned.shape)
print(y.shape)

In [None]:
# Keep only the target rows whose index labels are present in X_cleaned,
# so that features and targets line up row for row
y = y.loc[X_cleaned.index]

In [None]:
# Shapes after alignment: the row counts of X_cleaned and y should now match
print(X_cleaned.shape)
print(y.shape)

### Splitting the Data

In [None]:
from sklearn.model_selection import train_test_split

# 80% train / 20% test, with a fixed seed so the split is repeatable
split_parts = train_test_split(X_cleaned, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Shapes of the four pieces, in the order train/test features, train/test targets
tuple(part.shape for part in split_parts)

## **Random Forest** Accuracy = 85%

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Initialize the Random Forest model: 100 trees, fixed seed for repeatable results
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

In [None]:
# Train the model; squeeze() is applied to y_train so the target is passed as a 1D array
rf_model.fit(X_train, y_train.squeeze())

In [None]:
# Predict the income label for every row of the test features
y_pred = rf_model.predict(X_test)

In [None]:
# Evaluation
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Overall accuracy on the test split
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.4f}')

# Per-class report and confusion matrix, each printed under its heading
rf_report = classification_report(y_test, y_pred)
print("Classification Report:", rf_report, sep="\n")

rf_confusion = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:", rf_confusion, sep="\n")

## **Neural Network Classification** Accuracy = 83%

In [None]:
# Import libraries
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import StandardScaler

In [None]:
# One-hot encode the income labels of both splits for the neural network
# (the test version is used for evaluation)
y_train_nn, y_test_nn = (
    pd.get_dummies(labels).values for labels in (y_train, y_test)
)

In [None]:
# Standardize the features: the scaler is fitted on the training split only
# and then applied unchanged to both splits
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Define NN model: three ReLU hidden layers (64, 32, 16 units) and a 2-unit output layer.
# The output uses softmax, not sigmoid: the model is compiled with
# categorical_crossentropy on one-hot targets, which expects the two outputs
# to form a probability distribution over the classes.
nn_model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train_scaled.shape[1],)),
    Dense(32, activation='relu'),
    Dense(16, activation='relu'),
    Dense(2, activation='softmax'),  # Two neurons, one probability per class
])

In [None]:
# Compile the model: Adam optimizer, categorical cross-entropy loss
# (the targets are one-hot encoded), and accuracy as the tracked metric
nn_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Train the model for 20 epochs with batches of 32, holding out 20% of the
# training data for validation; the returned history is kept in `history`
history = nn_model.fit(X_train_scaled, y_train_nn, epochs=20, batch_size=32, validation_split=0.2)

In [None]:
# Evaluate the model on the scaled test features against the one-hot test labels
test_loss, test_accuracy = nn_model.evaluate(X_test_scaled, y_test_nn)
print(f'Test accuracy: {test_accuracy:.4f}')

In [None]:
# Integer-encode the test labels so they match the format of y_pred_classes
income_to_int = {"<=50K": 0, ">50K": 1}
y_test_int = y_test.replace(income_to_int).astype(int)

# Network outputs for the test set; take the second output column and
# threshold it at 0.5 to obtain a 0/1 prediction
y_pred_nn = nn_model.predict(X_test_scaled)
second_output = y_pred_nn[:, 1]
y_pred_classes = (second_output > 0.5).astype(int)

In [None]:
# Evaluation
# Import the evaluation metrics
from sklearn.metrics import classification_report, confusion_matrix

# Print each metric under its heading, using the integer labels
nn_metrics = (
    ("Classification Report:", classification_report),
    ("\nConfusion Matrix:", confusion_matrix),
)
for heading, metric_fn in nn_metrics:
    print(heading)
    print(metric_fn(y_test_int, y_pred_classes))

## **XGBoost** Accuracy = 86%

In [None]:
import xgboost as xgb

# Hyper-parameters for the XGBoost classifier
xgb_params = {
    'n_estimators': 100,       # Number of boosting rounds
    'max_depth': 6,            # Maximum tree depth for base learners
    'learning_rate': 0.1,      # Step size shrinkage used to prevent overfitting
    'eval_metric': 'logloss',  # Evaluation metric for the validation dataset
    'random_state': 42,
}
xgb_model = xgb.XGBClassifier(**xgb_params)

# Integer-encoded training labels as a NumPy array
income_codes = {"<=50K": 0, ">50K": 1}
y_train_int = y_train.replace(income_codes).astype(int).values

In [None]:
# Fit the model on the training features and the integer-encoded training labels
xgb_model.fit(X_train, y_train_int)  # y_train_int is the integer-encoded y_train

In [None]:
# Predict a class for every row of the test features
y_pred_xgb = xgb_model.predict(X_test)

In [None]:
# Evaluate the XGBoost predictions against the integer-encoded test labels
accuracy_xgb = accuracy_score(y_test_int, y_pred_xgb)
print(f'Accuracy: {accuracy_xgb:.4f}')

# Per-class report and confusion matrix, each under its heading
xgb_report = classification_report(y_test_int, y_pred_xgb)
print("\nClassification Report:", xgb_report, sep="\n")

xgb_confusion = confusion_matrix(y_test_int, y_pred_xgb)
print("\nConfusion Matrix:", xgb_confusion, sep="\n")

In [None]:
# Save models
import os

import joblib

MODEL_DIR = 'Resources/Models'

# joblib.dump does not create missing directories, so make sure the target exists
os.makedirs(MODEL_DIR, exist_ok=True)

# Save Random Forest model
joblib.dump(rf_model, f'{MODEL_DIR}/random_forest_model.pkl')

# Save Neural Network model
# NOTE(review): this pickles the Keras model through joblib. Whether that
# round-trips depends on the installed Keras version; Keras' own
# `nn_model.save(...)` may be the safer format. Confirm before relying on this file.
# NOTE(review): the `scaler` fitted for the network is not saved here, yet the
# network was trained on scaled inputs. Confirm whether it must be persisted too.
joblib.dump(nn_model, f'{MODEL_DIR}/neural_network_model.pkl')

# Save XGBoost model
joblib.dump(xgb_model, f'{MODEL_DIR}/xgboost_model.pkl')

print("Models saved successfully!")