In [36]:
import pandas as pd
import numpy as np
from sklearn.utils import resample
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score


In [37]:
# Mount Google Drive so the census CSV files can be read from a Drive folder.
from google.colab import drive

drive.mount('/content/drive')

# Both files are read from the same Drive folder; to use manually placed copies
# instead, point these two paths at the local files.
train_csv_path = "/content/drive/My Drive/data-mining-csv-files/census-income.data.csv"
test_csv_path = "/content/drive/My Drive/data-mining-csv-files/census-income.test.csv"

train_data = pd.read_csv(train_csv_path)
test_data = pd.read_csv(test_csv_path)

# Displaying results:


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## **Addressing the imbalanced data**

1. Undersampling

2. SMOTE

In [38]:
# Integer-encode the categorical columns of the training frame, including the
# 'income' target, overwriting each column in place.
columns_to_encode = ['marital-status', 'relationship', 'race', 'sex', 'income']

# A single encoder instance is re-fitted for every column, so once the loop
# finishes it only remembers the classes of the last column ('income').
label_encoder = LabelEncoder()
for column_name in columns_to_encode:
    train_data[column_name] = label_encoder.fit_transform(train_data[column_name])

In [39]:
# Integer-encode the same categorical columns (and the 'income' target) on the
# test frame, overwriting each column in place.
#
# NOTE(review): the encoder is re-fitted on the test data, so the integer codes
# match the training codes only if both files contain the same set of values in
# each column -- confirm, or reuse encoders fitted on the training data.

# Fresh encoder for the test set. The original line assigned it to the
# misspelled name `abel_encoder`, which left this cell silently depending on the
# `label_encoder` object created by the previous cell.
label_encoder = LabelEncoder()

test_data['marital-status'] = label_encoder.fit_transform(test_data['marital-status'])
test_data['relationship'] = label_encoder.fit_transform(test_data['relationship'])
test_data['race'] = label_encoder.fit_transform(test_data['race'])
test_data['sex'] = label_encoder.fit_transform(test_data['sex'])
# Convert the 'income' target to numerical values as well.
test_data['income'] = label_encoder.fit_transform(test_data['income'])

In [40]:
# Fill the '?' placeholder in two categorical columns with a fixed replacement
# value (intended to be each column's mode), then integer-encode the column.
train_fill_values = {
    'native-country': 'United-States',
    'work-class': 'Private',
}

label_encoder = LabelEncoder()
for column_name, fill_value in train_fill_values.items():
    filled_column = train_data[column_name].replace('?', fill_value)
    train_data[column_name] = label_encoder.fit_transform(filled_column)

# Target is the encoded 'income' column; features are everything except the
# target and the 'education' column.
y_train = train_data["income"]
x_train = train_data.drop(columns=['education', 'income'])


In [41]:
# Fill the '?' placeholder in two categorical columns of the test frame with the
# same fixed replacement values used for the training frame, then integer-encode.
# NOTE(review): the encoder is re-fitted on the test data here, so the codes
# agree with the training codes only if both files contain the same set of
# values in each column -- confirm.
test_fill_values = {
    'native-country': 'United-States',
    'work-class': 'Private',
}

label_encoder = LabelEncoder()
for column_name, fill_value in test_fill_values.items():
    filled_column = test_data[column_name].replace('?', fill_value)
    test_data[column_name] = label_encoder.fit_transform(filled_column)

# Target is the encoded 'income' column; features are everything except the
# target and the 'education' column.
y_test = test_data["income"]
x_test = test_data.drop(columns=['education', 'income'])


In [45]:
from sklearn.impute import KNNImputer
from sklearn.preprocessing import OneHotEncoder

# Mark the '?' placeholder in 'occupation' as missing.
# Assign the result back instead of calling .replace(..., inplace=True) on the
# column selection: that chained in-place form is deprecated in pandas and no
# longer updates the parent DataFrame once Copy-on-Write is enabled.
x_train['occupation'] = x_train['occupation'].replace('?', np.nan)

# Split 'occupation' off from the remaining feature columns.
X_train = x_train.drop('occupation', axis=1)
y_occupation_train = x_train['occupation']

# Run the KNN imputer over the remaining feature columns.
# NOTE(review): 'occupation' is not part of the matrix given to the imputer, so
# its missing values are not filled here; they are handed to LabelEncoder below
# as-is. Confirm that this is the intended treatment.
knn_imputer = KNNImputer()
X_imputed_train = knn_imputer.fit_transform(X_train)

# Convert the imputed array back to a DataFrame. The original index is kept so
# that re-attaching 'occupation' below aligns row by row even if the frame does
# not carry a default 0..n-1 index.
X_imputed_df_train = pd.DataFrame(
    X_imputed_train, columns=X_train.columns, index=X_train.index
)

# Re-attach the 'occupation' column and integer-encode it.
imputed_x_train = X_imputed_df_train.copy()
imputed_x_train['occupation'] = y_occupation_train

imputed_x_train['occupation'] = label_encoder.fit_transform(imputed_x_train['occupation'])


In [46]:


# Mark the '?' placeholder in 'occupation' as missing.
# Assign the result back instead of calling .replace(..., inplace=True) on the
# column selection: that chained in-place form is deprecated in pandas and no
# longer updates the parent DataFrame once Copy-on-Write is enabled.
x_test['occupation'] = x_test['occupation'].replace('?', np.nan)

# Split 'occupation' off from the remaining feature columns.
X_test = x_test.drop('occupation', axis=1)
y_occupation_test = x_test['occupation']

# Run the KNN imputer over the remaining feature columns.
# NOTE(review): a new imputer is fitted on the test features here rather than
# reusing one fitted on the training features, and 'occupation' is not part of
# the matrix it sees, so its missing values are not filled. Confirm both are
# intended.
knn_imputer = KNNImputer()
X_imputed_test = knn_imputer.fit_transform(X_test)

# Convert the imputed array back to a DataFrame. The original index is kept so
# that re-attaching 'occupation' below aligns row by row even if the frame does
# not carry a default 0..n-1 index.
X_imputed_df_test = pd.DataFrame(
    X_imputed_test, columns=X_test.columns, index=X_test.index
)

# Re-attach the 'occupation' column and integer-encode it.
# NOTE(review): the encoder is re-fitted on the test values, so the codes match
# the training codes only if both sets contain the same occupation values.
imputed_x_test = X_imputed_df_test.copy()
imputed_x_test['occupation'] = y_occupation_test

imputed_x_test['occupation'] = label_encoder.fit_transform(imputed_x_test['occupation'])


In [47]:



from sklearn.metrics import confusion_matrix


def _print_confusion_matrix(matrix, heading):
    """Print a 2x2 confusion matrix under `heading` in the notebook's fixed table layout.

    `matrix` is indexed as matrix[actual, predicted], as returned by
    sklearn.metrics.confusion_matrix.
    """
    print(heading)
    print("                   Predicted Class")
    print("                 |  Negative (0) | Positive (1) |")
    print("Actual Class ---|---------------|---------------|")
    print(f"Negative (0)    |      {matrix[0, 0]}       |      {matrix[0, 1]}       |")
    print(f"Positive (1)    |      {matrix[1, 0]}       |      {matrix[1, 1]}       |")


def _report_split(y_true, pred_regular, pred_pruned, split_name):
    """Print accuracy, confusion matrices and classification reports for one data split.

    Parameters:
        y_true: true labels of the split.
        pred_regular: predictions of the unpruned tree on the split.
        pred_pruned: predictions of the pruned tree on the split.
        split_name: lower-case split label used in the printed headings
            ("train" or "test").

    Returns:
        (accuracy_regular, accuracy_pruned, conf_matrix_regular, conf_matrix_pruned)
    """
    acc_regular = accuracy_score(y_true, pred_regular)
    print(f"Accuracy on {split_name} data:", acc_regular)
    acc_pruned = accuracy_score(y_true, pred_pruned)
    print(f"Accuracy on pruned {split_name} data:", acc_pruned)

    cm_regular = confusion_matrix(y_true, pred_regular)
    _print_confusion_matrix(cm_regular, "Confusion Matrix for Regular Decision Tree:")
    cm_pruned = confusion_matrix(y_true, pred_pruned)
    _print_confusion_matrix(cm_pruned, "\nConfusion Matrix for Pruned Decision Tree:")

    split_title = split_name.title()
    print(f"\nClassification Report for Regular Decision Tree on {split_title} Data:")
    print(classification_report(y_true, pred_regular))
    print(f"\nClassification Report for Pruned Decision Tree on {split_title} Data:")
    print(classification_report(y_true, pred_pruned))

    return acc_regular, acc_pruned, cm_regular, cm_pruned


# The data is loaded from separate train and test files, so no train_test_split
# is performed here.

# Fit an unpruned decision tree and a cost-complexity-pruned one on the training data.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(imputed_x_train, y_train)

clf_pruned = DecisionTreeClassifier(random_state=42, ccp_alpha=0.01)  # Example value, you may need to tune this
clf_pruned.fit(imputed_x_train, y_train)

# Evaluate both trees on the data they were trained on.
y_pred_train = clf.predict(imputed_x_train)
y_pred_train_pruned = clf_pruned.predict(imputed_x_train)
(
    accuracy_train,
    accuracy_train_pruned,
    conf_matrix_regular,
    conf_matrix_pruned,
) = _report_split(y_train, y_pred_train, y_pred_train_pruned, "train")

# Evaluate both trees on the test data. As in the original cell,
# conf_matrix_regular / conf_matrix_pruned end up holding the test-set matrices.
y_pred_test = clf.predict(imputed_x_test)
y_pred_test_pruned = clf_pruned.predict(imputed_x_test)
(
    accuracy_test,
    accuracy_test_pruned,
    conf_matrix_regular,
    conf_matrix_pruned,
) = _report_split(y_test, y_pred_test, y_pred_test_pruned, "test")




Accuracy on train data: 0.9999692884125181
Accuracy on pruned train data: 0.8398083596941126
Confusion Matrix for Regular Decision Tree:
                   Predicted Class
                 |  Negative (0) | Positive (1) |
Actual Class ---|---------------|---------------|
Negative (0)    |      24720       |      0       |
Positive (1)    |      1       |      7840       |

Confusion Matrix for Pruned Decision Tree:
                   Predicted Class
                 |  Negative (0) | Positive (1) |
Actual Class ---|---------------|---------------|
Negative (0)    |      23601       |      1119       |
Positive (1)    |      4097       |      3744       |

Classification Report for Regular Decision Tree on Train Data:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     24720
           1       1.00      1.00      1.00      7841

    accuracy                           1.00     32561
   macro avg       1.00      1.00      1.00     32561
w