# In [None]:
# Import library yang dibutuhkan
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score, recall_score, precision_score, classification_report, confusion_matrix

# Show every column when printing wide DataFrames
pd.set_option('display.max_columns', None)

# Fix the NumPy random seed for reproducibility
np.random.seed(42)

# ===============================================================
# 1. LOAD DATA AND EXPLORATORY DATA ANALYSIS (EDA)
# ===============================================================
print("1. Loading data and performing EDA...")

# The file is read without a header row, so column names are supplied here
column_names = ['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status',
                'occupation', 'relationship', 'race', 'sex', 'capital_gain', 'capital_loss',
                'hours_per_week', 'native_country', 'income']

# Load dataset (adjust path as needed).
# skipinitialspace strips the blank after each comma; "?" is read as NaN.
df = pd.read_csv('adult.data', names=column_names, skipinitialspace=True, na_values="?")

# Dataset Overview
print("Dataset Overview:")
print(f"Shape: {df.shape}")
# DataFrame.info() writes its report itself and returns None, so it must not
# be wrapped in print() (that would emit a stray "None" line).
df.info()

# Descriptive Statistics (numeric columns)
print("\nDescriptive Statistics:")
print(df.describe())

# Categorical Statistics (object columns)
print("\nCategorical Statistics:")
print(df.describe(include=['object']))

# Check Missing Values: absolute count and percentage per column
missing_values = df.isnull().sum()
missing_percentage = (missing_values / len(df)) * 100

print("\nMissing Values:")
print(missing_values)
print("\nMissing Percentage:")
print(missing_percentage)

# Target Distribution (relative class frequencies)
print("\nTarget Distribution:")
print(df['income'].value_counts(normalize=True))

# ===============================================================
# 2. DATA VISUALIZATION
# ===============================================================
print("\n2. Data Visualization...")

# Histograms of all numeric variables, saved as a single image
numeric_features = df.select_dtypes(include=[np.number]).columns
df[numeric_features].hist(figsize=(15, 10))
hist_fig = plt.gcf()  # the figure that DataFrame.hist just created
hist_fig.tight_layout()
hist_fig.savefig('numeric_histograms.png')
plt.close(hist_fig)

# One bar chart of category counts per categorical variable
categorical_features = df.select_dtypes(include=['object']).columns
for feature in categorical_features:
    fig, ax = plt.subplots(figsize=(10, 5))
    counts = df[feature].value_counts()
    counts.plot(kind='bar', ax=ax)
    ax.set_title(f'Distribution of {feature}')
    ax.set_ylabel('Count')
    ax.set_xlabel(feature)
    plt.xticks(rotation=45)
    fig.tight_layout()
    fig.savefig(f'distribution_{feature}.png')
    plt.close(fig)

# Correlation between the numeric variables, drawn as an annotated heatmap
correlation_matrix = df[numeric_features].corr()
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix of Numeric Features')
fig.tight_layout()
fig.savefig('correlation_matrix.png')
plt.close(fig)

# ===============================================================
# 3. DATA CLEANING
# ===============================================================
print("\n3. Data Cleaning...")

# Handle missing values: mode for categorical columns, median for numeric ones.
# The filled column is assigned back to the frame instead of calling
# fillna(inplace=True) on the df[column] selection; that chained in-place form
# triggers a FutureWarning in pandas 2.x and does not modify df under
# Copy-on-Write.
for column in df.columns:
    if not df[column].isnull().any():
        continue  # nothing to impute in this column
    if df[column].dtype == 'object':
        fill_value = df[column].mode()[0]
    else:
        fill_value = df[column].median()
    df[column] = df[column].fillna(fill_value)

# Check for duplicates
duplicate_count = df.duplicated().sum()
print(f"Number of duplicate rows: {duplicate_count}")

# Remove duplicates
df.drop_duplicates(inplace=True)
print(f"Number of rows after removing duplicates: {len(df)}")

# ===============================================================
# 4. FEATURE ENGINEERING & ENCODING
# ===============================================================
print("\n4. Feature Engineering & Encoding...")

# Hand-assigned integer code for every category of each categorical column.
workclass_map = {'State-gov':5, 'Self-emp-not-inc':6, 'Private':7, 'Federal-gov':4, 'Local-gov':3,
                'Self-emp-inc':2, 'Without-pay':1, 'Never-worked':0}

marital_status_map = {'Never-married':0, 'Married-civ-spouse':1, 'Divorced':2, 'Married-spouse-absent':3,
                      'Separated':6, 'Married-AF-spouse':5, 'Widowed':4}

occupation_map = {'Adm-clerical':1, 'Exec-managerial':2, 'Handlers-cleaners':3, 'Prof-specialty':4,
                 'Other-service':5, 'Sales':6, 'Craft-repair':7, 'Transport-moving':8,
                 'Farming-fishing':9, 'Machine-op-inspct':10, 'Tech-support':11,
                 'Protective-serv':12, 'Armed-Forces':13, 'Priv-house-serv':14}

relationship_map = {'Not-in-family':1, 'Husband':2, 'Wife':3, 'Own-child':4, 'Unmarried':0, 'Other-relative':5}

race_map = {'White':1, 'Black':2, 'Asian-Pac-Islander':3, 'Amer-Indian-Eskimo':4, 'Other':5}

sex_map = {'Male':1, 'Female':0}

# 'South' previously shared code 6 with 'Puerto-Rico', which merged the two
# countries into one category. It now uses the otherwise unused code 41 so
# that every country has a distinct code and all other codes stay unchanged.
native_country_map = {'United-States':1, 'Cuba':2, 'Jamaica':3, 'India':4, 'Mexico':5, 'South':41,
                      'Puerto-Rico':6, 'Honduras':7, 'England':8, 'Canada':9, 'Germany':10, 'Iran':11,
                      'Philippines':12, 'Italy':13, 'Poland':14, 'Columbia':15, 'Cambodia':16, 'Thailand':17, 'Ecuador':18,
                      'Laos':19, 'Taiwan':20, 'Haiti':21, 'Portugal':22, 'Dominican-Republic':23, 'El-Salvador':24,
                      'France':25, 'Guatemala':26, 'China':27, 'Japan':28, 'Yugoslavia':29, 'Peru':30,
                      'Outlying-US(Guam-USVI-etc)':31, 'Scotland':32, 'Trinadad&Tobago':33, 'Greece':34,
                      'Nicaragua':35, 'Vietnam':36, 'Hong':37, 'Ireland':38, 'Hungary':39, 'Holand-Netherlands':40}

# Target encoding: 0 for '<=50K', 1 for '>50K'
income_map = {'<=50K':0, '>50K':1}

# Store encoding maps (column name -> mapping) so they can be saved with the model
encoding_maps = {
    'workclass': workclass_map,
    'marital_status': marital_status_map,
    'occupation': occupation_map,
    'relationship': relationship_map,
    'race': race_map,
    'sex': sex_map,
    'native_country': native_country_map,
    'income': income_map,
}

# Replace each categorical column by its integer codes
for column, mapping in encoding_maps.items():
    df[column] = df[column].map(mapping)

# Drop education column (since education_num is already available)
df = df.drop('education', axis=1)

# ===============================================================
# 5. CHECKING CORRELATIONS AFTER ENCODING
# ===============================================================
print("\n5. Checking correlations after encoding...")

# Pairwise correlations of all (now numeric) columns, drawn as a heatmap
correlation = df.corr()
fig, ax = plt.subplots(figsize=(15, 15))
sns.heatmap(
    correlation.round(2),
    annot=True,
    vmax=1,
    square=True,
    cmap='RdYlGn_r',
    ax=ax,
)
ax.set_title('Correlation Matrix After Encoding')
fig.tight_layout()
fig.savefig('correlation_after_encoding.png')
plt.close(fig)

# ===============================================================
# 6. FEATURE SELECTION
# ===============================================================
print("\n6. Feature Selection...")

# Keep only the columns whose number of distinct (non-null) values is not 1,
# i.e. drop constant features
is_not_constant = df.nunique() != 1
df = df.loc[:, is_not_constant]

# Find and remove highly correlated features
def correlation(dataset, threshold):
    """Return the names of columns that are strongly correlated with an earlier column.

    Parameters
    ----------
    dataset : DataFrame
        Data whose pairwise column correlations are computed with ``DataFrame.corr()``.
    threshold : float
        A column is flagged when the absolute correlation with at least one
        column positioned before it is strictly greater than this value.

    Returns
    -------
    set
        Names of the flagged columns (empty when none exceed the threshold).
    """
    abs_corr = dataset.corr().abs()
    flagged = set()
    for position, name in enumerate(abs_corr.columns):
        # Only the part of the row left of the diagonal: correlations with
        # the columns that come before this one.
        with_earlier = abs_corr.iloc[position, :position]
        if (with_earlier > threshold).any():
            flagged.add(name)
    return flagged

# Feature columns only (target excluded) for the correlation screening
data_tanpa_fitur = df.drop('income', axis=1)
corr_features = correlation(data_tanpa_fitur, 0.8)
print('Correlated features: ', len(corr_features))
print(corr_features)

# Remove highly correlated features (sorted list gives a deterministic label order)
df = df.drop(columns=sorted(corr_features))

# Final correlation check.
# Stored under its own name so the `correlation` function defined above is
# not overwritten by a DataFrame and stays callable afterwards.
final_correlation = df.corr()
plt.figure(figsize=(15, 15))
sns.heatmap(final_correlation.round(2),
           annot=True,
           vmax=1,
           square=True,
           cmap='RdYlGn_r')
plt.title('Final Correlation Matrix')
plt.tight_layout()
plt.savefig('final_correlation.png')
plt.close()

# ===============================================================
# 7. MODEL TRAINING
# ===============================================================
print("\n7. Model Training...")

# Separate the target column from the feature columns
y = df['income']
X = df.drop(columns='income')

# Hold out 10% of the rows for testing, stratified on the target
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
    stratify=y,
)

print(f"Training set size: {len(X_train)} samples")
print(f"Testing set size: {len(X_test)} samples")

# Hyper-parameter candidates explored by the grid search
param_grid = dict(
    max_depth=[None, 5, 10, 15, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    criterion=['gini', 'entropy'],
)

# Base estimator with a fixed seed
dt_classifier = DecisionTreeClassifier(random_state=42)

# Exhaustive search with 3-fold cross-validation, scored by accuracy,
# using all available cores
grid_search = GridSearchCV(
    dt_classifier,
    param_grid,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

# Keep the winning parameter combination and the estimator refit with it
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

print(f"Best Parameters: {best_params}")

# ===============================================================
# 8. MODEL EVALUATION
# ===============================================================
print("\n8. Model Evaluation...")

# Make predictions on the test set
y_pred = best_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Generate classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Generate confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:")
print(cm)

# Plot confusion matrix (tick labels are the encoded income classes)
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
           xticklabels=sorted(df['income'].unique()),
           yticklabels=sorted(df['income'].unique()))
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.savefig('confusion_matrix.png')
plt.close()

# Visualization of the Decision Tree.
# feature_names is passed as a plain list: some scikit-learn releases reject
# a pandas Index for this parameter.
plt.figure(figsize=(40, 30))
plot_tree(best_model,
         feature_names=list(X.columns),
         class_names=[str(i) for i in sorted(df['income'].unique())],
         filled=True,
         rounded=True,
         max_depth=3)  # Limiting depth for better visualization
plt.title('Decision Tree Visualization (Limited to Depth 3)')
plt.tight_layout()
plt.savefig('decision_tree.png')
plt.close()

# Plot feature importance, most important feature first
feature_importance = pd.DataFrame({
    'Feature': X.columns,
    'Importance': best_model.feature_importances_
}).sort_values('Importance', ascending=False)

plt.figure(figsize=(12, 8))
sns.barplot(x='Importance', y='Feature', data=feature_importance)
plt.title('Feature Importance')
plt.tight_layout()
plt.savefig('feature_importance.png')
plt.close()

# Precision and recall below use scikit-learn's default average='binary'
# (positive class = 1). With average='micro' on a single-label problem both
# values are identical to accuracy, so they carried no extra information.

# Check performance on training data (a large gap to the test scores hints at overfitting)
ori_y_pred_dt_train = best_model.predict(X_train)

ori_accuracy_dt_train = accuracy_score(y_train, ori_y_pred_dt_train)
print('Akurasi pada training set: ', ori_accuracy_dt_train)

ori_precision_dt_train = precision_score(y_train, ori_y_pred_dt_train)
print('Precision pada training set: ', ori_precision_dt_train)

ori_recall_dt_train = recall_score(y_train, ori_y_pred_dt_train)
print('Recall pada training set: ', ori_recall_dt_train)

# Recheck performance on testing data
ori_accuracy_dt_test = accuracy_score(y_test, y_pred)
print('Akurasi pada test set: ', ori_accuracy_dt_test)

ori_precision_dt_test = precision_score(y_test, y_pred)
print('Precision pada test set: ', ori_precision_dt_test)

ori_recall_dt_test = recall_score(y_test, y_pred)
print('Recall pada test set: ', ori_recall_dt_test)

# ===============================================================
# 9. CREATE MODEL COMPONENTS FOR SAVING
# ===============================================================
print("\n9. Creating and saving model components...")

# Bundle the fitted model together with everything needed to preprocess
# new records the same way as the training data
model_components = dict(
    model=best_model,
    feature_names=list(X.columns),
    encoding_maps=encoding_maps,
    model_params=best_params,
    removed_features=list(corr_features),  # empty list when nothing was removed
    target_map=income_map,
)

# Persist the bundle to disk as a single joblib file
components_path = 'income_prediction_components.joblib'
joblib.dump(model_components, components_path)
print("Model components saved successfully as 'income_prediction_components.joblib'")

# Function to predict with the model (for testing)
def _encode_column(values, mapping, column_name):
    """Encode one categorical column, accepting raw categories or existing codes."""
    valid_codes = set(mapping.values())

    def encode(value):
        if value in mapping:
            return mapping[value]
        if value in valid_codes:
            # Already encoded (e.g. a row taken from the training matrix)
            return value
        raise ValueError(
            f"Unknown value {value!r} for feature '{column_name}'"
        )

    return values.map(encode)


def predict_income(data, model_components):
    """
    Make an income prediction for one record using the trained model.

    Parameters:
    -----------
    data : dict or DataFrame
        Feature values for prediction. Categorical features may be given as
        raw category strings or as their integer codes. When a DataFrame with
        several rows is passed, only the first row is predicted. The input is
        not modified.
    model_components : dict
        Dictionary with the keys 'model' (fitted classifier exposing
        ``predict``, ``predict_proba`` and ``classes_``), 'encoding_maps'
        (column name -> {category: code}, including 'income') and
        'feature_names' (columns the model was trained on, in order).

    Returns:
    --------
    dict
        'prediction'       : predicted class as returned by the model
                             (0 for '<=50K', 1 for '>50K' with the stored income map)
        'prediction_label' : the income label belonging to that class
        'probability'      : probability the model assigns to the predicted class

    Raises:
    -------
    KeyError
        If a feature listed in 'feature_names' is missing from ``data``.
    ValueError
        If a categorical value is neither a known category nor a known code.
    """
    # Work on a private frame so the caller's data is never altered
    if isinstance(data, dict):
        data = pd.DataFrame([data])
    else:
        data = data.copy()

    # Get components
    model = model_components['model']
    encoding_maps = model_components['encoding_maps']
    feature_names = model_components['feature_names']

    missing = [name for name in feature_names if name not in data.columns]
    if missing:
        raise KeyError(f"Missing required feature(s): {missing}")

    # Apply encoding to categorical features (the target is never an input)
    for col in data.columns:
        if col in encoding_maps and col != 'income':
            data[col] = _encode_column(data[col], encoding_maps[col], col)

    # Ensure we only use features that the model was trained on, in training order
    data_for_pred = data[feature_names]

    # Make prediction for the first row
    prediction = model.predict(data_for_pred)[0]
    probabilities = model.predict_proba(data_for_pred)[0]

    # predict_proba columns follow model.classes_, so look up the position of
    # the predicted class instead of indexing by the class label itself
    class_position = list(model.classes_).index(prediction)

    # Return plain Python scalars rather than NumPy scalar types
    if isinstance(prediction, np.generic):
        prediction = prediction.item()

    # Get inverse mapping for income
    income_map_inverse = {v: k for k, v in encoding_maps['income'].items()}
    prediction_label = income_map_inverse[prediction]

    return {
        'prediction': prediction,
        'prediction_label': prediction_label,
        'probability': float(probabilities[class_position])
    }

# Test the prediction function with a sample.
# X_test holds already-encoded values, while predict_income expects raw
# categories and encodes them itself. The sample is therefore decoded back to
# its category labels first; feeding the integer codes straight in would map
# every categorical feature to NaN.
loaded_components = joblib.load('income_prediction_components.joblib')
test_sample = {}
for feature_name, encoded_value in X_test.iloc[0].to_dict().items():
    feature_map = loaded_components['encoding_maps'].get(feature_name)
    if feature_map is None:
        # Numeric feature: passed through unchanged
        test_sample[feature_name] = encoded_value
    else:
        code_to_label = {code: label for label, code in feature_map.items()}
        test_sample[feature_name] = code_to_label.get(encoded_value, encoded_value)
prediction_result = predict_income(test_sample, loaded_components)
print("\nTest prediction result:")
print(prediction_result)
print("Actual value:", y_test.iloc[0])