In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.impute import SimpleImputer


In [2]:
df = pd.read_csv('archive/adult_train.csv')

df.head(5)


Unnamed: 0,Age,Workclass,fnlwgt,Education,Education_Num,Martial_Status,Occupation,Relationship,Race,Sex,Capital_Gain,Capital_Loss,Hours_per_week,Country,Target
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [3]:
df.isnull().sum()

Age                  0
Workclass         1836
fnlwgt               0
Education            0
Education_Num        0
Martial_Status       0
Occupation        1843
Relationship         0
Race                 0
Sex                  0
Capital_Gain         0
Capital_Loss         0
Hours_per_week       0
Country            583
Target               0
dtype: int64

In [6]:
 # Group rare categories in 'native.country' into 'Other'
threshold = 50  
country_counts = df['Country'].value_counts() 
rare_countries = country_counts[country_counts < threshold].index  # Identify countries with less than 50 occurrences
df['Country'] = df['Country'].replace(rare_countries, 'Other')  # Replace rare countries with 'Other


In [8]:
X = df.drop(columns=['Target'])
y = df['Target'].map({' <=50K':0, ' >50K':1})


In [11]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

numerical_columns = X.select_dtypes(exclude="object").columns  # Select numerical columns (numbers)

# Numerical preprocessing: fill missing values with the mean
numerical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean"))
])

categorical_columns = X.select_dtypes(include="object").columns  # Select categorical columns (strings)
categorical_transformer = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),  # Replace missing categorical values with the most frequent category
        ("onehot", OneHotEncoder(handle_unknown="ignore"))  # Convert categorical values into one-hot encoded variables
    ])
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_transformer, numerical_columns),
        ("cat", categorical_transformer, categorical_columns)
    ]
)
X_processed = preprocessor.fit_transform(X)


In [12]:
X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2, random_state=42)

In [13]:
RFC = RandomForestClassifier(random_state=42, n_estimators=37, max_depth=32)
RFC.fit(X_train, y_train)
y_pred = RFC.predict(X_test)
print("\nclassification Report:")
print(classification_report(y_test, y_pred))
print("\nAccuracy Score:")
print(f"{accuracy_score(y_test, y_pred):.2f}")
cm = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:")
print(pd.DataFrame(cm, index=["<=50K", ">50K"], columns=["<=50K", ">50K"]))


classification Report:
              precision    recall  f1-score   support

           0       0.89      0.94      0.91      4942
           1       0.77      0.63      0.69      1571

    accuracy                           0.87      6513
   macro avg       0.83      0.79      0.80      6513
weighted avg       0.86      0.87      0.86      6513


Accuracy Score:
0.87

Confusion Matrix:
       <=50K  >50K
<=50K   4640   302
>50K     577   994


In [126]:
test_df = pd.read_csv("archive/adult_test.csv")
test_df

Unnamed: 0,Age,Workclass,fnlwgt,Education,Education_Num,Martial_Status,Occupation,Relationship,Race,Sex,Capital_Gain,Capital_Loss,Hours_per_week,Country,Target
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K.
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K.
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K.
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K.
4,18,,103497,Some-college,10,Never-married,,Own-child,White,Female,0,0,30,United-States,<=50K.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16276,39,Private,215419,Bachelors,13,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,36,United-States,<=50K.
16277,64,,321403,HS-grad,9,Widowed,,Other-relative,Black,Male,0,0,40,United-States,<=50K.
16278,38,Private,374983,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,50,United-States,<=50K.
16279,44,Private,83891,Bachelors,13,Divorced,Adm-clerical,Own-child,Asian-Pac-Islander,Male,5455,0,40,United-States,<=50K.


In [130]:
# Load the test dataset
test_df = pd.read_csv("archive/adult_test.csv")

# Fix target column: strip spaces and remove trailing period
test_df["Target"] = test_df["Target"].str.strip().str.replace(".", "", regex=False)
print(test_df["Target"])

0        <=50K
1        <=50K
2         >50K
3         >50K
4        <=50K
         ...  
16276    <=50K
16277    <=50K
16278    <=50K
16279    <=50K
16280     >50K
Name: Target, Length: 16281, dtype: object


In [133]:
# Encode target
y_test = test_df["Target"].map({"<=50K": 0, ">50K": 1})
y_test

0        0
1        0
2        1
3        1
4        0
        ..
16276    0
16277    0
16278    0
16279    0
16280    1
Name: Target, Length: 16281, dtype: int64

In [134]:
# Features
X_test = test_df.drop(columns=["Target"])

# Preprocess test features using the SAME preprocessor fitted on train data
X_test_processed = preprocessor.transform(X_test)

In [135]:
# Predict with your trained RandomForest model
y_predict = RFC.predict(X_test_processed)

In [137]:
# Check accuracy
acc = accuracy_score(y_test, y_predict)
print("Test Accuracy:", acc)

Test Accuracy: 0.8539401756648854
