In [None]:
from ucimlrepo import fetch_ucirepo 
import pandas as pd

# Fetch dataset id=2 from the UCI ML repository
adult = fetch_ucirepo(id=2)

# Features and targets as pandas DataFrames.
# Work on explicit copies: later cells assign into X via .loc, and writing
# into a frame owned by the fetched `adult` object can raise
# SettingWithCopyWarning or alter the fetched data.
X = adult.data.features.copy()
y = adult.data.targets.copy()

# metadata
print(adult.metadata)

# variable information
print(adult.variables)

In [None]:
# Preview the first five feature rows
X.head(n=5)

In [None]:
# Preview the first five target rows
y.head(n=5)

In [None]:
# Merge the label variants that carry a trailing period into their plain form
y = y.replace(
    to_replace={
        "<=50K.": "<=50K",
        ">50K.": ">50K",
    }
)

In [None]:
# Count distinct target values after merging the trailing-period label variants
y.nunique()

In [None]:
# Number of missing entries per feature column
X.isna().sum()

# Data cleaning

### Handling Missing Values and '?'

In [None]:
import numpy as np
# Treat the "?" placeholder as a missing value in each affected column, then
# fill every missing entry of that column with its most frequent value (mode).
for col in ('workclass', 'occupation', 'native-country'):
    X.loc[X[col] == "?", col] = np.nan
    X.loc[:, col] = X[col].fillna(X[col].mode()[0])

In [None]:
# Lower-case the 'native-country' labels so spelling variants collapse together
lowered_country = X['native-country'].str.lower()
X.loc[:, 'native-country'] = lowered_country

In [None]:
# Re-count missing entries per feature column after imputation
X.isna().sum()

### One-hot Encoding

In [None]:
# Categorical features to expand into indicator columns
categorical_cols = [
    'workclass',
    'education',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native-country',
]

# One-hot encode; drop_first=True omits the first level of each feature
X = pd.get_dummies(X, columns=categorical_cols, drop_first=True)

### Ensuring Correct Data Types

In [None]:
# Cast the numeric feature columns to int.
# Assigning through `X.loc[:, col] = ...` writes into the existing column and
# keeps that column's current dtype (pandas >= 2.0), so a cast done that way is
# silently lost. DataFrame.astype with a column -> dtype mapping returns a
# frame whose columns really carry the requested dtype.
X = X.astype({
    'age': int,
    'fnlwgt': int,
    'capital-gain': int,
    'capital-loss': int,
    'hours-per-week': int,
})

In [None]:
# List every column's dtype to verify the casts and the one-hot encoded columns
X.dtypes

### Feature Engineering

In [None]:
# Combine the two capital columns into one signed feature: gain minus loss
X['net-capital-gain'] = X['capital-gain'].sub(X['capital-loss'])

# The source columns are no longer needed, so remove them from X in place
X.drop(columns=['capital-gain', 'capital-loss'], inplace=True)

In [None]:
# Remove `education-num` in place since it seems redundant
X.drop(columns=['education-num'], inplace=True)

## Handling Outliers

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Draw a boxplot of 'hours-per-week' (built from the unfiltered data) to spot
# extreme values, and render it
sns.boxplot(x=X['hours-per-week'])
plt.show()

# Keep only the rows reporting at most 80 hours per week
X = X.loc[X['hours-per-week'] <= 80]

In [None]:
# Function to remove outliers based on IQR
def remove_outliers_iqr(df, columns, factor=1.5):
    """Drop rows whose values lie outside the IQR fences of the given columns.

    Columns are processed one after another, and each column's quartiles are
    computed on the rows that survived the previous columns, so the result can
    depend on the order of ``columns``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified; a filtered frame is returned.
    columns : iterable of str
        Names of the columns to screen.
    factor : float, default 1.5
        Multiplier applied to the IQR to place the fences at
        ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``.

    Returns
    -------
    pandas.DataFrame
        Rows of ``df`` that lie within the fences (inclusive) for every listed
        column. Rows with a missing value in a screened column are dropped too,
        because NaN never satisfies the range check. When ``columns`` is empty
        ``df`` itself is returned.

    Notes
    -----
    If a column's IQR is 0 (Q1 == Q3) both fences collapse onto that single
    value, so every row holding a different value in that column is removed.
    """
    for col in columns:
        # Quartiles of the rows still present at this point
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        iqr = q3 - q1

        # Fences beyond which a value counts as an outlier
        lower_bound = q1 - factor * iqr
        upper_bound = q3 + factor * iqr

        # between() is inclusive on both ends, matching `>= lower & <= upper`
        df = df[df[col].between(lower_bound, upper_bound)]

    return df

# Columns screened for outliers (adjust as needed).
# NOTE(review): remove_outliers_iqr keeps only values inside
# [Q1 - 1.5*IQR, Q3 + 1.5*IQR]; if 'net-capital-gain' is 0 for most rows its
# IQR is 0 and every row with a non-zero value is dropped. Confirm that this is
# intended before relying on the feature.
columns_to_check = ['age', 'fnlwgt', 'net-capital-gain']

# Filter X column by column; the result is a new frame, X itself is kept
X_cleaned = remove_outliers_iqr(df=X, columns=columns_to_check)

# Preview the filtered rows
X_cleaned.head(n=5)

In [None]:
# Largest 'hours-per-week' value left after filtering
X_cleaned.loc[:, 'hours-per-week'].max()

In [None]:
# Shapes of features and targets before the targets are aligned to the kept rows
print(X_cleaned.shape, y.shape, sep="\n")

In [None]:
# Keep only the targets whose index labels survived the feature filtering,
# so X_cleaned and y describe the same rows in the same order
kept_rows = X_cleaned.index
y = y.loc[kept_rows]

In [None]:
# Shapes of features and targets after alignment
print(X_cleaned.shape, y.shape, sep="\n")

### Splitting the Data

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X_cleaned,
    y,
    test_size=0.2,
    random_state=42,
)

# Shapes of the four resulting pieces
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

## **Random Forest**

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random Forest with 100 trees and a fixed seed for repeatable results
rf_params = dict(n_estimators=100, random_state=42)
rf_model = RandomForestClassifier(**rf_params)



In [None]:
# squeeze() collapses the target frame so the estimator receives a 1D target
y_train_1d = y_train.squeeze()
rf_model.fit(X_train, y_train_1d)

In [None]:
# Predict labels for the held-out test features
y_pred = rf_model.predict(X=X_test)

In [None]:
# Evaluation
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Overall share of correctly predicted test labels
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.4f}')

# Per-class precision, recall and F1
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")

# Counts of true vs. predicted labels
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")