# In [None]:
### Decision Tree Classifier for Government Census Data

# Step 1: Import Required Libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt

# Step 2: Load and Preprocess the Dataset
# The file has no header row and separates fields with ", " (comma + space).
# skipinitialspace=True strips that leading space so categories are read as
# "Private" rather than " Private"; otherwise the label encoders below learn
# space-prefixed classes that never match plainly written values such as the
# ones passed to classify_individual().
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
column_names = [
    "age", "workclass", "fnlwgt", "education", "education_num", "marital_status",
    "occupation", "relationship", "race", "sex", "capital_gain", "capital_loss",
    "hours_per_week", "native_country", "income"
]

data = pd.read_csv(
    url,
    names=column_names,
    header=None,
    skipinitialspace=True,
    na_values="?",  # with the leading space stripped, missing entries read as "?"
)

# Drop rows with missing values
data.dropna(inplace=True)

# Encode every string-valued column (features and the "income" target) as
# integer codes, keeping each fitted encoder so values can be encoded and
# decoded again at prediction time.
label_encoders = {}
for col in data.select_dtypes(include=["object"]).columns:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

# Split into features (X) and target (y)
X = data.drop(columns=["income"])
y = data["income"]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: Train the Decision Tree Classifier
# Depth is capped at 3 levels to keep the tree small and readable. fit()
# returns the fitted estimator, so construction and training are chained.
model = DecisionTreeClassifier(max_depth=3, random_state=42).fit(X_train, y_train)

# Score the held-out split: overall accuracy first, then per-class metrics.
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

report = classification_report(y_test, y_pred)
print("\nClassification Report:\n", report)

# Step 4: Visualize the Decision Tree
plt.figure(figsize=(20, 10))
plot_tree(
    model,
    # Pass a plain list: some scikit-learn releases reject a pandas Index here.
    feature_names=list(X.columns),
    # LabelEncoder orders its classes by sorted value, so encoded class 0 is
    # the "<=50K" label and class 1 is the ">50K" label.
    class_names=["<=50K", ">50K"],
    filled=True,
    rounded=True,
    fontsize=10
)
plt.show()

# Step 5: Use the Model for Predictions
def classify_individual(features):
    """
    Classify an individual based on their features.

    Categorical values are converted with the encoders fitted on the training
    data; a value an encoder has never seen is encoded as -1. Columns are
    then arranged in the training column order before predicting.

    :param features: A dictionary of feature values, keyed by feature name.
        Categorical features are given as their original strings.
    :return: Predicted income class, decoded with the "income" label encoder.
    :raises ValueError: If a feature the model was trained on is missing.
    """
    input_df = pd.DataFrame([features])

    missing = [name for name in X.columns if name not in input_df.columns]
    if missing:
        raise ValueError(f"Missing feature(s) for prediction: {missing}")

    for col, encoder in label_encoders.items():
        if col not in input_df.columns:
            # e.g. the "income" target, which callers do not supply
            continue
        # LabelEncoder's code for a label is its position in classes_, so a
        # lookup table built once replaces a transform() call per value.
        codes = {label: code for code, label in enumerate(encoder.classes_)}
        input_df[col] = input_df[col].map(lambda value: codes.get(value, -1))

    # The model expects the columns it was trained on, in the same order;
    # the caller's dict may list them in any order.
    input_df = input_df[list(X.columns)]

    prediction = model.predict(input_df)[0]
    return label_encoders["income"].inverse_transform([prediction])[0]

# Example Prediction
# A single hand-written record, listed in the same order as the dataset
# columns; categorical fields are given as plain strings.
example_features = {
    "age": 35, "workclass": "Private", "fnlwgt": 215646,
    "education": "Bachelors", "education_num": 13,
    "marital_status": "Married-civ-spouse",
    "occupation": "Exec-managerial", "relationship": "Husband",
    "race": "White", "sex": "Male",
    "capital_gain": 0, "capital_loss": 0,
    "hours_per_week": 40, "native_country": "United-States",
}

predicted_class = classify_individual(example_features)
print("Predicted Income Class:", predicted_class)
