# Download the Dataset
Download the dataset from https://drive.google.com/file/d/1jlI2H9nXrJlrIcoL8PShjdwIV--UNM15/view?usp=sharing and save it as `/content/adult.data`, the path this notebook reads it from.

In [None]:
!pip install gradio tensorflow keras keras_tuner

Collecting gradio
  Downloading gradio-5.25.2-py3-none-any.whl.metadata (16 kB)
Collecting keras_tuner
  Downloading keras_tuner-1.4.7-py3-none-any.whl.metadata (5.4 kB)
Collecting aiofiles<25.0,>=22.0 (from gradio)
  Downloading aiofiles-24.1.0-py3-none-any.whl.metadata (10 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.8.0 (from gradio)
  Downloading gradio_client-1.8.0-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3 (from gradio)
  Downloading ruff-0.11.5-py3-none-manylinux_

In [None]:
!pip install -U langchain langchain-openai

Collecting langchain-openai
  Downloading langchain_openai-0.3.13-py3-none-any.whl.metadata (2.3 kB)
Collecting langchain-core<1.0.0,>=0.3.51 (from langchain)
  Downloading langchain_core-0.3.52-py3-none-any.whl.metadata (5.9 kB)
Collecting tiktoken<1,>=0.7 (from langchain-openai)
  Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading langchain_openai-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.7/61.7 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading langchain_core-0.3.52-py3-none-any.whl (433 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m433.6/433.6 kB[0m [31m20.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m59.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected p

In [None]:
import argparse
import gradio as gr
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, classification_report
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2  # Added for L2 regularization
import keras_tuner as kt
import io
from PIL import Image
import os

# Shared module-level state used by the training and prediction callbacks.
# The fitted estimator and the model-type label it was trained under:
trained_model = trained_model_type = None
# Ordered feature names of the training matrix:
feature_columns = None
# Train/test splits; placeholders until the data has been loaded:
x_train = x_test = y_train = y_test = None

# Load and preprocess data
def load_data(path="/content/adult.data"):
    """Read the census CSV, one-hot encode it and return a stratified train/test split.

    Args:
        path: Location of the CSV file (must contain a header row). Defaults to the
            path this notebook has always read from.

    Returns:
        A 5-tuple ``(x_train, x_test, y_train, y_test, feature_names)`` where the
        first four items come from ``train_test_split`` (80/20, stratified on the
        target, ``random_state=42``) and ``feature_names`` is the column index of
        the feature matrix.
    """
    df = pd.read_csv(path)

    # Missing values are encoded as "?" but the raw fields keep their leading
    # blank (" ?"), so an exact match on "?" never fires. Match the marker with
    # optional surrounding whitespace instead, then drop the incomplete rows.
    df = df.replace(r"^\s*\?\s*$", pd.NA, regex=True).dropna()

    categorical_columns = ["workclass", "education", "marital-status", "occupation",
                           "relationship", "race", "sex", "native-country", "income"]
    df = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

    # Dummy names look like "income_ >50K"; replacing the blank yields "income__>50K".
    df.columns = df.columns.str.strip().str.replace(" ", "_")
    df = df.astype(int)

    X = df.drop(columns=['income__>50K'])
    y = df['income__>50K']
    x_tr, x_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
    return x_tr, x_te, y_tr, y_te, X.columns

# Load the dataset once and publish the splits and feature names as module-level state.
(x_train, x_test, y_train, y_test, feature_columns) = load_data()

# Function to create LSTM model with hyperparameter tuning and L2 regularization
def create_lstm_model(hp=None, input_shape=None):
    """Build and compile the two-layer LSTM binary classifier.

    Args:
        hp: Optional Keras Tuner hyperparameter container. When given, the layer
            sizes, dropout rate, learning rate and L2 strength are sampled from
            it; otherwise fixed defaults are used.
        input_shape: Optional ``(timesteps, features)`` shape of one sample. When
            omitted the model is left unbuilt until it first sees data.

    Returns:
        A compiled ``Sequential`` model (Adam, binary cross-entropy, accuracy metric).
    """
    # Compare against None explicitly so the choice never depends on the
    # truthiness of the tuner object.
    if hp is not None:
        lstm1_units = hp.Int('lstm1_units', min_value=32, max_value=128, step=32)
        lstm2_units = hp.Int('lstm2_units', min_value=16, max_value=64, step=16)
        dropout_rate = hp.Float('dropout_rate', min_value=0.1, max_value=0.5, step=0.1)
        learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])
        l2_lambda = hp.Choice('l2_lambda', values=[0.01, 0.001, 0.0001])  # L2 regularization strength
    else:
        lstm1_units, lstm2_units, dropout_rate, learning_rate = 64, 32, 0.2, 1e-3
        l2_lambda = 0.01  # Default L2 regularization strength

    model = Sequential()
    # Declare the input with an explicit Input object rather than passing
    # `input_shape` to the first layer, which Keras warns against.
    if input_shape is not None:
        model.add(tf.keras.Input(shape=input_shape))
    model.add(LSTM(lstm1_units, return_sequences=True))
    model.add(Dropout(dropout_rate))
    model.add(LSTM(lstm2_units))
    model.add(Dropout(dropout_rate))
    model.add(Dense(16, activation='relu', kernel_regularizer=l2(l2_lambda)))  # L2 regularization
    model.add(Dense(1, activation='sigmoid', kernel_regularizer=l2(l2_lambda)))  # L2 regularization

    model.compile(optimizer=Adam(learning_rate=learning_rate),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model

# Function for XGBoost training with RandomizedSearchCV
def train_xgboost(tune_hyperparams, x_train, x_test, y_train, y_test):
    """Fit an XGBoost classifier, optionally after a randomized hyperparameter search.

    Args:
        tune_hyperparams: When truthy, run a 10-candidate, 3-fold randomized
            search (scored on accuracy) and continue with its best estimator.
        x_train, x_test: Feature matrices for fitting and evaluation.
        y_train, y_test: Matching binary targets.

    Returns:
        ``(model, results, y_pred, accuracy, best_params)``: the fitted
        estimator, its per-round evaluation log for the train and test sets,
        the test-set predictions, their accuracy, and either the best parameter
        dict or the string "Default parameters".
    """
    base_classifier = XGBClassifier(objective='binary:logistic', eval_metric='logloss', random_state=42)

    if not tune_hyperparams:
        model, best_params = base_classifier, "Default parameters"
    else:
        search_space = {
            'n_estimators': [110],
            'max_depth': [3, 4, 5],
            'learning_rate': [0.1, 0.12, 0.14],
            'subsample': [0.7, 0.8, 0.9],
            'colsample_bytree': [0.6, 0.7, 0.8],
            'alpha': [0, 0.1, 1.0],  # L1 regularization
            'lambda': [0.1, 1.0, 10.0]  # L2 regularization
        }
        searcher = RandomizedSearchCV(
            estimator=base_classifier,
            param_distributions=search_space,
            n_iter=10,
            cv=3,
            scoring='accuracy',
            n_jobs=-1,
            verbose=1,
            random_state=42
        )
        searcher.fit(x_train, y_train)
        model, best_params = searcher.best_estimator_, searcher.best_params_

    # Fit (again, in the tuned case) with both sets monitored so that the
    # per-round log-loss curves can be read back afterwards.
    monitored_sets = [(x_train, y_train), (x_test, y_test)]
    model.fit(x_train, y_train, eval_set=monitored_sets, verbose=False)
    results = model.evals_result()

    y_pred = model.predict(x_test)
    return model, results, y_pred, accuracy_score(y_test, y_pred), best_params

# Function to preprocess user input and predict
def predict_user_input(model_choice, *user_inputs):
    """Predict the income class for one person described by the UI inputs.

    Args:
        model_choice: Model type selected in the UI; it must equal the type of
            the currently trained model.
        *user_inputs: Fourteen values in this order: age, fnlwgt, education_num,
            capital_gain, capital_loss, hours_per_week, workclass, education,
            marital_status, occupation, relationship, race, sex, native_country.

    Returns:
        A human-readable string: the prediction, a request to train first, or
        an error description (exceptions are reported, never raised).
    """
    global trained_model, trained_model_type, feature_columns, x_train

    if trained_model is None or trained_model_type != model_choice:
        return "Please train the model first by clicking 'Train Model' with the selected model type."

    numeric_fields = ['age', 'fnlwgt', 'education_num', 'capital_gain',
                      'capital_loss', 'hours_per_week']
    categorical_fields = ['workclass', 'education', 'marital_status', 'occupation',
                          'relationship', 'race', 'sex', 'native_country']

    def canonical(name):
        # Training columns keep hyphens and a doubled underscore
        # ("marital-status__Never-married") while the UI field names use single
        # underscores; collapse every run of "-", "_" and blanks so both
        # spellings map to the same lookup key.
        return "_".join(str(name).replace("-", " ").replace("_", " ").split())

    try:
        expected_count = len(numeric_fields) + len(categorical_fields)
        if len(user_inputs) != expected_count:
            raise ValueError(f"expected {expected_count} input values, got {len(user_inputs)}")
        numeric_values = user_inputs[:len(numeric_fields)]
        categorical_values = user_inputs[len(numeric_fields):]

        columns = list(feature_columns)
        column_by_key = {canonical(col): col for col in columns}

        # Start from an all-zero row laid out exactly like the training matrix.
        row = dict.fromkeys(columns, 0)

        for field, value in zip(numeric_fields, numeric_values):
            target = column_by_key.get(canonical(field))
            if target is None:
                # Silently dropping a numeric input would produce a wrong prediction.
                raise ValueError(f"no training column matches numeric field '{field}'")
            row[target] = float(value)

        for field, value in zip(categorical_fields, categorical_values):
            # A category without a dummy column is either the baseline level
            # removed by drop_first or was never seen in training; both are
            # represented by leaving every dummy of that field at 0.
            target = column_by_key.get(canonical(f"{field}_{value}"))
            if target is not None:
                row[target] = 1

        user_data_encoded = pd.DataFrame([row], columns=columns).astype(float)

        if model_choice == "XGBoost":
            prediction = trained_model.predict(user_data_encoded)[0]
        else:  # LSTM
            user_data_lstm = np.reshape(user_data_encoded.values, (1, 1, user_data_encoded.shape[1]))
            if user_data_lstm.shape[2] != x_train.shape[1]:
                return f"Error: Input feature count ({user_data_lstm.shape[2]}) doesn’t match model’s expected count ({x_train.shape[1]})"
            raw_pred = trained_model.predict(user_data_lstm, verbose=0)
            prediction = (raw_pred > 0.5).astype(int)[0][0]

        result = ">50K" if prediction == 1 else "<=50K"
        return f"Prediction for your input: {result}\nModel used: {model_choice}"
    except Exception as e:
        return f"Error during prediction: {str(e)}"

# Training function with LSTM tuning
def _fit_lstm(tune_hyperparams, x_train, x_test, y_train, y_test):
    """Fit the LSTM (optionally via Keras Tuner); returns the same 5-tuple shape as train_xgboost."""
    # The tabular rows are presented to the LSTM as sequences of length one.
    X_train_lstm = np.reshape(x_train.values, (x_train.shape[0], 1, x_train.shape[1]))
    X_test_lstm = np.reshape(x_test.values, (x_test.shape[0], 1, x_test.shape[1]))
    input_shape = (1, x_train.shape[1])

    # NOTE(review): the test split doubles as validation data for tuning and
    # early stopping, so the reported test accuracy is optimistically biased.
    if tune_hyperparams:
        tuner = kt.RandomSearch(
            lambda hp: create_lstm_model(hp, input_shape),
            objective='val_accuracy',
            max_trials=5,
            executions_per_trial=2,
            directory='lstm_tuning',
            project_name='income_prediction'
        )
        early_stopping = EarlyStopping(monitor='val_loss', patience=14, restore_best_weights=True, mode='min')
        tuner.search(X_train_lstm, y_train, epochs=30, batch_size=64,
                     validation_data=(X_test_lstm, y_test), callbacks=[early_stopping], verbose=0)
        model = tuner.get_best_models(num_models=1)[0]
        best_params = dict(tuner.get_best_hyperparameters(num_trials=1)[0].values)
        # Continue training the best trial's model to obtain a plottable history.
        history = model.fit(X_train_lstm, y_train, epochs=40, batch_size=64,
                            validation_data=(X_test_lstm, y_test), callbacks=[early_stopping], verbose=1)
    else:
        model = create_lstm_model(input_shape=input_shape)
        early_stopping = EarlyStopping(monitor='val_loss', patience=25, restore_best_weights=True, mode='min')
        history = model.fit(X_train_lstm, y_train, epochs=50, batch_size=64,
                            validation_data=(X_test_lstm, y_test), callbacks=[early_stopping], verbose=1)
        best_params = "LSTM default parameters"

    # Mirror the layout of XGBoost's evals_result() so plotting code is shared.
    results = {
        'validation_0': {'logloss': history.history['loss'], 'acc': history.history['accuracy']},
        'validation_1': {'logloss': history.history['val_loss'], 'acc': history.history['val_accuracy']}
    }
    y_pred = (model.predict(X_test_lstm, verbose=0) > 0.5).astype(int).flatten()
    accuracy = accuracy_score(y_test, y_pred)
    return model, results, y_pred, accuracy, best_params


def _render_training_plot(results, train_acc, test_acc):
    """Draw loss/accuracy curves, save them to 'training_plot.png' and return them as a PIL image."""
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))

    ax1.plot(results['validation_0']['logloss'], label='Train Loss')
    ax1.plot(results['validation_1']['logloss'], label='Test Loss')
    ax1.set_title('Loss')
    ax1.set_xlabel('Epochs')
    ax1.set_ylabel('Log Loss')
    ax1.legend()
    ax1.grid()

    ax2.plot(train_acc, label='Train Acc')
    ax2.plot(test_acc, label='Test Acc')
    ax2.set_title('Accuracy')
    ax2.set_xlabel('Epochs')
    ax2.set_ylabel('Accuracy')
    ax2.legend()
    ax2.grid()

    fig.tight_layout()
    buf = io.BytesIO()
    fig.savefig(buf, format='png')
    buf.seek(0)
    img = Image.open(buf)
    img.load()  # decode now so the image no longer depends on the buffer position
    fig.savefig('training_plot.png')
    plt.close(fig)  # close this specific figure so repeated training does not leak figures
    return img


def train_model(model_choice, tune_hyperparams):
    """Train the selected model, persist it, and summarise the run.

    Args:
        model_choice: Either "XGBoost" or "LSTM".
        tune_hyperparams: Whether to run a hyperparameter search first.

    Returns:
        ``(result_text, image)``: a textual summary (accuracy, best parameters,
        classification report) and a PIL image of the training curves. On
        failure the text holds an error message and the image is ``None``.

    Side effects:
        Updates the module-level ``trained_model`` / ``trained_model_type``,
        writes the model to 'model.xgb' or 'model_lstm.keras', and writes
        'training_plot.png'.
    """
    global trained_model, trained_model_type, x_train, x_test, y_train, y_test

    # Reject an unset or unknown choice explicitly: falling through would train
    # an LSTM and record a model type that was never selected.
    if model_choice not in ("XGBoost", "LSTM"):
        return f"Error: Unknown model '{model_choice}'. Please select 'XGBoost' or 'LSTM'.", None

    if x_train is None or x_test is None or y_train is None or y_test is None:
        return "Error: Training data not loaded. Please ensure 'adult.data' is available and reload the script.", None

    is_xgboost = model_choice == "XGBoost"
    fit = train_xgboost if is_xgboost else _fit_lstm
    model, results, y_pred, accuracy, best_params = fit(
        tune_hyperparams, x_train, x_test, y_train, y_test
    )

    if is_xgboost:
        model.save_model('model.xgb')
    else:
        model.save('model_lstm.keras')
    trained_model = model
    trained_model_type = model_choice

    if is_xgboost:
        # Only log-loss is recorded per boosting round, so accuracy is drawn as
        # a flat line at the final train/test value.
        train_accuracy = accuracy_score(y_train, model.predict(x_train))
        train_acc = [train_accuracy] * len(results['validation_0']['logloss'])
        test_acc = [accuracy] * len(results['validation_1']['logloss'])
    else:
        train_acc = results['validation_0']['acc']
        test_acc = results['validation_1']['acc']

    img = _render_training_plot(results, train_acc, test_acc)

    # zero_division=0 reports 0 for a class that was never predicted, without
    # emitting an undefined-metric warning for each affected score.
    report = classification_report(y_test, y_pred, target_names=['<=50K', '>50K'], zero_division=0)
    result_text = (f"Model trained successfully!\n"
                   f"Accuracy: {accuracy:.4f}\n"
                   f"Best parameters: {best_params}\n\n"
                   f"Classification Report:\n{report}")
    return result_text, img

# Command-line training function
def train_from_terminal(model_choice, tune_hyperparams):
    """Train from a non-UI context and print the textual summary.

    Args:
        model_choice: Model type forwarded to ``train_model``.
        tune_hyperparams: Whether to run a hyperparameter search first.
    """
    result_text, plot_image = train_model(model_choice, tune_hyperparams)
    print(result_text)
    # train_model returns no image when it fails; only announce the plot file
    # when one was actually produced.
    if plot_image is not None:
        print("Training plots saved as 'training_plot.png'")

# Gradio interface
def create_gradio_interface():
    """Assemble the Gradio Blocks UI for training a model and predicting income.

    The left column holds the model selector, the tuning switch, the personal
    data inputs and the two action buttons; the right column shows the training
    summary, the training plot and the prediction.

    Returns:
        The constructed (not yet launched) ``gr.Blocks`` application.
    """
    with gr.Blocks() as demo:
        gr.Markdown("# Income Prediction Model")
        with gr.Row():
            with gr.Column(scale=1):
                # Pre-select a model so the callbacks never receive an empty
                # selection when a button is clicked right after loading.
                model_dropdown = gr.Dropdown(choices=["XGBoost", "LSTM"], label="Select Model",
                                             value="XGBoost")
                tune_checkbox = gr.Checkbox(label="Tune Hyperparameters")

                gr.Markdown("### Enter Your Information")
                age = gr.Number(label="Age", value=30)
                fnlwgt = gr.Number(label="Final Weight (fnlwgt)", value=77516)
                education_num = gr.Number(label="Education Number", value=13)
                capital_gain = gr.Number(label="Capital Gain", value=0)
                capital_loss = gr.Number(label="Capital Loss", value=0)
                hours_per_week = gr.Number(label="Hours per Week", value=40)
                workclass = gr.Dropdown(choices=["Private", "Self-emp-not-inc", "Self-emp-inc", "Federal-gov",
                                                "Local-gov", "State-gov", "Without-pay", "Never-worked"],
                                       label="Workclass", value="Private")
                education = gr.Dropdown(choices=["Bachelors", "Some-college", "11th", "HS-grad", "Prof-school",
                                                "Assoc-acdm", "Assoc-voc", "9th", "7th-8th", "12th", "Masters",
                                                "1st-4th", "10th", "Doctorate", "5th-6th", "Preschool"],
                                       label="Education", value="Bachelors")
                marital_status = gr.Dropdown(choices=["Married-civ-spouse", "Divorced", "Never-married",
                                                     "Separated", "Widowed", "Married-spouse-absent",
                                                     "Married-AF-spouse"],
                                            label="Marital Status", value="Never-married")
                occupation = gr.Dropdown(choices=["Tech-support", "Craft-repair", "Other-service", "Sales",
                                                 "Exec-managerial", "Prof-specialty", "Handlers-cleaners",
                                                 "Machine-op-inspct", "Adm-clerical", "Farming-fishing",
                                                 "Transport-moving", "Priv-house-serv", "Protective-serv",
                                                 "Armed-Forces"],
                                        label="Occupation", value="Adm-clerical")
                relationship = gr.Dropdown(choices=["Wife", "Own-child", "Husband", "Not-in-family",
                                                   "Other-relative", "Unmarried"],
                                          label="Relationship", value="Not-in-family")
                race = gr.Dropdown(choices=["White", "Asian-Pac-Islander", "Amer-Indian-Eskimo", "Other", "Black"],
                                  label="Race", value="White")
                sex = gr.Dropdown(choices=["Male", "Female"], label="Sex", value="Male")
                native_country = gr.Dropdown(choices=["United-States", "Cambodia", "England", "Puerto-Rico",
                                                     "Canada", "Germany", "Outlying-US(Guam-USVI-etc)", "India",
                                                     "Japan", "Greece", "South", "China", "Cuba", "Iran", "Honduras",
                                                     "Philippines", "Italy", "Poland", "Jamaica", "Vietnam", "Mexico",
                                                     "Portugal", "Ireland", "France", "Dominican-Republic", "Laos",
                                                     "Ecuador", "Taiwan", "Haiti", "Columbia", "Hungary", "Guatemala",
                                                     "Nicaragua", "Scotland", "Thailand", "Yugoslavia", "El-Salvador",
                                                     "Trinadad&Tobago", "Peru", "Hong", "Holand-Netherlands"],
                                            label="Native Country", value="United-States")

                predict_btn = gr.Button("Predict Your Income")
                train_btn = gr.Button("Train Model")

            with gr.Column(scale=3):
                output_text = gr.Textbox(label="Training Results", lines=15)
                output_plot = gr.Image(label="Training Plots")
                prediction_output = gr.Textbox(label="Your Income Prediction")

        train_btn.click(fn=train_model, inputs=[model_dropdown, tune_checkbox],
                        outputs=[output_text, output_plot])
        # The input order must match the positional order predict_user_input expects.
        predict_btn.click(fn=predict_user_input,
                         inputs=[model_dropdown, age, fnlwgt, education_num, capital_gain, capital_loss,
                                hours_per_week, workclass, education, marital_status, occupation,
                                relationship, race, sex, native_country],
                         outputs=prediction_output)
    return demo

# Main execution
demo = create_gradio_interface()

# Start the server only when this code runs as the main program (which includes
# a notebook cell), so importing it elsewhere does not block on a running app.
if __name__ == "__main__":
    demo.launch(debug=True)

It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://0893b126bd1f03f9b4.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


  super().__init__(**kwargs)


Epoch 1/50
[1m407/407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 9ms/step - accuracy: 0.7571 - loss: 0.7005 - val_accuracy: 0.7593 - val_loss: 0.5622
Epoch 2/50
[1m407/407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 6ms/step - accuracy: 0.7593 - loss: 0.5608 - val_accuracy: 0.7593 - val_loss: 0.5520
Epoch 3/50
[1m407/407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - accuracy: 0.7565 - loss: 0.5617 - val_accuracy: 0.7593 - val_loss: 0.5549
Epoch 4/50
[1m407/407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step - accuracy: 0.7587 - loss: 0.5573 - val_accuracy: 0.7593 - val_loss: 0.5538
Epoch 5/50
[1m407/407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - accuracy: 0.7602 - loss: 0.5543 - val_accuracy: 0.7593 - val_loss: 0.5530
Epoch 6/50
[1m407/407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - accuracy: 0.7577 - loss: 0.5561 - val_accuracy: 0.7593 - val_loss: 0.5526
Epoch 7/50
[1m407/407[0m 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 1/50


  super().__init__(**kwargs)


[1m407/407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 8ms/step - accuracy: 0.7478 - loss: 0.7096 - val_accuracy: 0.7593 - val_loss: 0.5621
Epoch 2/50
[1m407/407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 7ms/step - accuracy: 0.7576 - loss: 0.5645 - val_accuracy: 0.7593 - val_loss: 0.5357
Epoch 3/50
[1m407/407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 9ms/step - accuracy: 0.7594 - loss: 0.5469 - val_accuracy: 0.7593 - val_loss: 0.5527
Epoch 4/50
[1m407/407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 7ms/step - accuracy: 0.7585 - loss: 0.5586 - val_accuracy: 0.7593 - val_loss: 0.5548
Epoch 5/50
[1m407/407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 6ms/step - accuracy: 0.7577 - loss: 0.5578 - val_accuracy: 0.7593 - val_loss: 0.5532
Epoch 6/50
[1m407/407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 6ms/step - accuracy: 0.7591 - loss: 0.5544 - val_accuracy: 0.7593 - val_loss: 0.5523
Epoch 7/50
[1m407/407[0m [32m━━━━━━━

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
