In [25]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem so the CSV files stored there can be read
drive.mount(mountpoint='/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [26]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# Absolute location of the Drive folder holding the CSV files. The Drive is
# mounted at '/content/gdrive', so anchoring the reads to that path (instead
# of a path relative to the current working directory) keeps them working
# even if the notebook's working directory is changed.
DRIVE_DIR = '/content/gdrive/My Drive'

training_df = pd.read_csv(f'{DRIVE_DIR}/training set.csv')
test_df = pd.read_csv(f'{DRIVE_DIR}/test set.csv')

# Preprocessing the data

In [27]:
# Work on an independent copy so that later preprocessing steps (such as
# renaming the columns) do not silently modify the raw frame loaded from disk.
trainingData = training_df.copy()

In [28]:
# Strip leading/trailing whitespace from every column name so the columns can be looked up by their plain names
clean_names = trainingData.columns.str.strip()
trainingData.columns = clean_names

In [29]:
# One-hot encode every object-typed column (including the income label) so the
# data is fully numeric and can be used for the classification task.
col_ohe = ["workclass", "education", "marital-status", "occupation", "relationship", "race", "sex", "native-country", "<=50K/>50K."]

# Fit a single encoder on all of the columns at once rather than re-fitting it
# per column and growing a DataFrame with pd.concat inside a loop. The produced
# indicator columns (names and order) are the same, and `ohe` stays fitted on
# every encoded column afterwards.
ohe = OneHotEncoder()
encoded = ohe.fit_transform(trainingData[col_ohe]).toarray()

# Indicator columns are named "<original column>_<category>". Reusing the
# source index guarantees the concat below lines the rows up one-to-one.
ohe_df = pd.DataFrame(
    encoded,
    columns=ohe.get_feature_names_out(col_ohe),
    index=trainingData.index,
)

# Replacing the old data with the new data
trans_df = pd.concat([trainingData, ohe_df], axis=1).drop(columns=col_ohe)
print(trans_df)


       age  education-num  capital-gain  capital-loss  hours-per-week  \
0       30             10             0             0              40   
1       30              9             0             0              25   
2       18              9             0             0              35   
3       74              2             0             0              15   
4       61              9             0             0              52   
...    ...            ...           ...           ...             ...   
29995   37             13             0             0              50   
29996   76              4             0             0              12   
29997   38              9             0             0              40   
29998   19             10             0             0              40   
29999   18              8             0             0              40   

       workclass_ ?  workclass_ Federal-gov  workclass_ Local-gov  \
0               0.0                     0.0           

In [30]:
# Collapse the two one-hot income columns into a single binary label:
# 1 means they earn more than 50k, 0 means they earn 50k or less.
# A single vectorised comparison replaces the row-by-row apply followed by a
# string -> int mapping; any row whose "<=50K" indicator is not 1.0 is labelled 1,
# exactly as before.
trans_df['target'] = (trans_df['<=50K/>50K._ <=50K'] != 1.0).astype('int64')

# The one-hot income columns are redundant once 'target' exists
trans_df = trans_df.drop(columns=["<=50K/>50K._ <=50K", "<=50K/>50K._ >50K"])


# Binary Classification Task

In [31]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

seed = 24324
target_column = "target"

# Separate the label column from the predictor columns
features = trans_df.drop(columns=[target_column])
labels = trans_df[target_column]

# Hold out 20% of the rows for evaluation; the seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=seed,
)

# Random forest of 100 trees, each limited to a depth of 8, trained on the training split
rf = RandomForestClassifier(n_estimators=100, max_depth=8, random_state=seed)
rf.fit(X_train, y_train)

# Predict on both splits so held-out and training accuracy can be compared
y_pred = rf.predict(X_test)
y_pred_train = rf.predict(X_train)

test_accuracy = accuracy_score(y_test, y_pred)
train_accuracy = accuracy_score(y_train, y_pred_train)

# Report the evaluation metrics
print("Accuracy:", test_accuracy*100, "%")
print("Training accuracy:", train_accuracy*100, "%")
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 85.2 %
Training accuracy: 85.65 %

Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.96      0.91      4549
           1       0.81      0.51      0.62      1451

    accuracy                           0.85      6000
   macro avg       0.83      0.74      0.77      6000
weighted avg       0.85      0.85      0.84      6000


Confusion Matrix:
 [[4373  176]
 [ 712  739]]


In [32]:
# Doing a cross validation of the model created
from sklearn.model_selection import cross_val_score

# Score the classifier with 10-fold cross validation on the training split
cross_val = cross_val_score(estimator=rf, X=X_train, y=y_train, cv=10)

# Average the per-fold accuracies into a single summary figure
mean_accuracy = cross_val.mean()
print(cross_val)
print("Mean accuracy of cross validation:", mean_accuracy)

[0.85041667 0.85041667 0.86291667 0.84791667 0.8425     0.85333333
 0.85541667 0.85625    0.86208333 0.85375   ]
Mean accuracy of cross validation: 0.8535
