# Part 1: Data Extraction

### Importing libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns

from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam

import imblearn
from imblearn.over_sampling import RandomOverSampler, SMOTE

import xgboost
from xgboost import XGBClassifier

### Loading and Compiling Data

In [None]:
# Load one LCA disclosure extract per fiscal year (FY2020-FY2024).
# low_memory=False lets pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids mixed-type columns.
df2020 = pd.read_csv('Combined_LCA_Disclosure_Data_FY2020.csv', low_memory=False)
df2021 = pd.read_csv('Combined_LCA_Disclosure_Data_FY2021.csv', low_memory=False)
df2022 = pd.read_csv('Combined_LCA_Disclosure_Data_FY2022.csv', low_memory=False)
df2023 = pd.read_csv('Combined_LCA_Disclosure_Data_FY2023.csv', low_memory=False)
df2024 = pd.read_csv('Combined_LCA_Disclosure_Data_FY2024.csv', low_memory=False)
all_data = [df2020, df2021, df2022, df2023, df2024]

# Stack the yearly extracts into one frame. ignore_index=True assigns a fresh
# 0..n-1 index; without it each file keeps its own 0-based labels and the
# combined frame ends up with duplicated index labels, which makes any later
# label-based alignment ambiguous.
df = pd.concat(all_data, ignore_index=True)
df.head(5)

In [None]:
# The combined frame `df` now holds all rows, so drop the per-year frames and
# the list that referenced them to let their memory be reclaimed.
del df2020, df2021, df2022, df2023, df2024, all_data

### Data Overview

In [None]:
# Overview: summary statistics of the combined frame (cell output)
df.describe()

In [None]:
# Column names, dtypes and non-null counts of the combined frame
df.info()

In [None]:
# Checking for missing values: count the nulls in every column and list the
# most incomplete columns first. (A bare df.isnull() only displays a truncated
# True/False frame, from which the extent of missing data cannot be read.)
df.isnull().sum().sort_values(ascending=False)

The dataset contains several missing values, which will be addressed appropriately after selecting the features for the model.

# Part 2: Exploratory Data Analysis & Feature Engineering

### Case Status Distribution

In [None]:
# Tally how many applications carry each CASE_STATUS value
case_status_counts = df['CASE_STATUS'].value_counts()

# Pie chart of the status shares on an explicit 8x8 figure/axes pair
pie_fig, pie_ax = plt.subplots(figsize=(8, 8))
case_status_counts.plot.pie(
    ax=pie_ax,
    startangle=90,
    cmap='Set3',
    autopct='%1.1f%%',
    textprops={'rotation': 45},
    legend=True,
)
pie_ax.set_title('Case Status Distribution', fontsize=16)
pie_ax.set_ylabel('')  # blank y-label for a cleaner chart
pie_ax.legend(loc='right', bbox_to_anchor=(1.4, 0.5))
plt.show()

### Total Number of Applications Over the Years 

In [None]:
# Applications per year: YEAR is the calendar year taken from DECISION_DATE
decision_dates = pd.to_datetime(df['DECISION_DATE'])
df['YEAR'] = decision_dates.dt.year

# Count rows per year and order the result chronologically
yearly_applications = df['YEAR'].value_counts().sort_index()
print(yearly_applications)  # sanity check of the yearly totals

In [None]:
# Bar chart of the yearly application counts computed in the previous cell
year_fig, year_ax = plt.subplots(figsize=(10, 6))
year_ax.bar(yearly_applications.index, yearly_applications.values, color='skyblue')
year_ax.set_title('Total H1B Applications by Year', fontsize=16)
year_ax.set_xlabel('Year', fontsize=12)
year_ax.set_ylabel('Number of Applications', fontsize=12)
year_ax.set_xticks(yearly_applications.index)  # one tick per year present in the data
year_ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

### Number of H1B Applicants by State

In [None]:
# Number of applications per employer state, as a two-column frame
# ('State' = value of EMPLOYER_STATE, 'Applicants' = row count)
state_counts = (
    df['EMPLOYER_STATE']
    .value_counts()
    .rename_axis('State')
    .reset_index(name='Applicants')
)

# Choropleth of the counts with Plotly Express
fig = px.choropleth(
    state_counts,
    title='Number of H1B Applicants by State',
    locations='State',                 # column holding the state codes
    locationmode='USA-states',         # interpret the codes as US states
    scope='usa',                       # restrict the map to the USA
    color='Applicants',                # shade each state by its count
    color_continuous_scale='Viridis',
    width=1000,
    height=600,
)

# Render the map
fig.show()

### Top 10 Occupations in H1B Applications

In [None]:
# Ten most frequent SOC_TITLE values
top_occupations = df['SOC_TITLE'].value_counts().head(10)

# Horizontal bars with the most frequent occupation at the top
occ_fig, occ_ax = plt.subplots(figsize=(10, 6))
top_occupations.plot.barh(ax=occ_ax, color='lightgreen', edgecolor='black')
occ_ax.set_title('Top 10 Occupations in H1B Applications', fontsize=16)
occ_ax.set_xlabel('Number of Applications', fontsize=12)
occ_ax.set_ylabel('Occupation', fontsize=12)
occ_ax.invert_yaxis()
occ_ax.grid(axis='x', linestyle='--', alpha=0.7)
plt.show()

### Salary Analysis

In [None]:
# List the distinct pay-period labels found in PW_UNIT_OF_PAY; these are the
# keys the annual-wage conversion in the next cell has to cover.
unique_units = df['PW_UNIT_OF_PAY'].unique()
print(unique_units)

In [None]:
# Convert salaries to 'Year'
# Non-numeric wage entries become NaN (errors='coerce') and are removed by the
# positivity filter below, since NaN > 0 is False.
df['PREVAILING_WAGE'] = pd.to_numeric(df['PREVAILING_WAGE'], errors='coerce')
# Keep strictly positive wages; .copy() makes the result an independent frame
# so the new column below is not written onto a slice of the old one.
df = df[df['PREVAILING_WAGE'] > 0].copy()

# Multiplier that turns a wage quoted per pay period into a yearly amount
conversion_factors = {
    'Year': 1,            # No Normalization
    'Month': 12,          # 12 months in a year
    'Bi-Weekly': 26,      # 26 bi-weekly periods in a year
    'Week': 52,           # 52 weeks in a year
    'Hour': 2080          # 52 weeks in a year x 40 hrs a week
}

# Vectorised lookup instead of a row-by-row apply: units missing from the
# table (including NaN) map to NaN and then fall back to a factor of 1,
# exactly as dict.get(unit, 1) did.
pay_period_factor = df['PW_UNIT_OF_PAY'].map(conversion_factors).fillna(1)
df['ANNUAL_WAGE'] = df['PREVAILING_WAGE'] * pay_period_factor

In [None]:
# Histogram (50 bins) with a KDE overlay of the annualised wage
wage_fig, wage_ax = plt.subplots(figsize=(12, 6))
sns.histplot(df['ANNUAL_WAGE'], bins=50, kde=True, color='blue', ax=wage_ax)
wage_ax.set_title('Distribution of Annual Wage', fontsize=16)
wage_ax.set_xlabel('Annual Wage', fontsize=12)
wage_ax.set_ylabel('Frequency', fontsize=12)
wage_ax.grid(True, linestyle='--', alpha=0.7)
plt.show()

In [None]:
# 99th-percentile wage used as the cut-off for this plot
wage_cap = df['ANNUAL_WAGE'].quantile(0.99)
# Rows above the cut-off are left out of filtered_df (they are dropped, not
# clipped); df itself is not modified.
within_cap = df['ANNUAL_WAGE'] <= wage_cap
filtered_df = df[within_cap]

# Same histogram as before, restricted to wages at or below the cut-off
capped_fig, capped_ax = plt.subplots(figsize=(12, 6))
sns.histplot(filtered_df['ANNUAL_WAGE'], bins=50, kde=True, color='blue', ax=capped_ax)
capped_ax.set_title('Distribution of Annual Wage (Capped at 99th Percentile)', fontsize=16)
capped_ax.set_xlabel('Annual Wage', fontsize=12)
capped_ax.set_ylabel('Frequency', fontsize=12)
capped_ax.grid(True, linestyle='--', alpha=0.7)
plt.show()

### Top 10 Occupations by Average Annual Wage

In [None]:
# Mean annual wage per occupation, then the ten highest means
mean_wage_by_title = df.groupby('SOC_TITLE')['ANNUAL_WAGE'].mean()
avg_salaries = mean_wage_by_title.sort_values(ascending=False).head(10)

# Horizontal bars with the best-paid occupation at the top
salary_fig, salary_ax = plt.subplots(figsize=(10, 6))
avg_salaries.plot.barh(ax=salary_ax, color='purple', edgecolor='black')
salary_ax.set_title('Top 10 Occupations by Average Annual Wage', fontsize=16)
salary_ax.set_xlabel('Average Annual Wage', fontsize=12)
salary_ax.set_ylabel('Occupation', fontsize=12)
salary_ax.invert_yaxis()
salary_ax.grid(axis='x', linestyle='--', alpha=0.7)
plt.show()

### Top 10 Employers for H1B Applications 

In [None]:
# Frequency encoding for EMPLOYER_NAME: each employer is represented by its
# share of all rows (value_counts with normalize=True)
employer_names = df['EMPLOYER_NAME']
frequency_encoding = employer_names.value_counts(normalize=True)
df['EMPLOYER_NAME_FREQUENCY'] = employer_names.map(frequency_encoding)

# value_counts sorts by descending frequency, so the first ten entries are
# the ten most frequent employers
top_10_employers = frequency_encoding.iloc[:10]

# Show them
print("Top 10 Employers by Frequency:")
print(top_10_employers)

### Finding Correlation Between Variables

In [None]:
# Parse the three date columns and derive Decision_Duration from them
for date_column in ('BEGIN_DATE', 'DECISION_DATE', 'RECEIVED_DATE'):
    df[date_column] = pd.to_datetime(df[date_column])

# Frequency of each distinct begin / decision date, printed for inspection
print(df['BEGIN_DATE'].value_counts())
print(df['DECISION_DATE'].value_counts())

# Decision_Duration is a timedelta: BEGIN_DATE minus DECISION_DATE.
# NOTE(review): the original comment described this as a processing time for
# certified cases, but no CASE_STATUS filter is applied and RECEIVED_DATE is
# not used here - confirm that BEGIN_DATE - DECISION_DATE is the intended
# quantity (a received-to-decision time would use RECEIVED_DATE).
df['Decision_Duration'] = df['BEGIN_DATE'] - df['DECISION_DATE']

df['Decision_Duration']

##### Filtering Dataframe for Variables Possibly Related to Case Status

In [None]:
# Working frame for the correlation study: the target plus candidate features.
# .copy() makes df1 independent of df, because later cells overwrite df1's
# columns in place (label encoding, duration in days); without it those
# writes target a slice of df and pandas emits SettingWithCopyWarning.
df1 = df[['CASE_STATUS','Decision_Duration','RECEIVED_DATE','SOC_TITLE','FULL_TIME_POSITION','EMPLOYER_NAME','EMPLOYER_CITY','EMPLOYER_STATE','AGENT_REPRESENTING_EMPLOYER','TOTAL_WORKER_POSITIONS','WORKSITE_CITY','WORKSITE_STATE','WORKSITE_WORKERS','WORKSITE_POSTAL_CODE','AGENT_ATTORNEY_CITY','AGENT_ATTORNEY_STATE','ANNUAL_WAGE','H_1B_DEPENDENT','SUPPORT_H1B']].copy()

In [None]:
# Preprocessing plan for the correlation study
#   left numeric  : 'Decision_Duration', 'TOTAL_WORKER_POSITIONS', 'ANNUAL_WAGE'
#   label-encoded : every column listed in cat_columns below
cat_columns = [
    'CASE_STATUS',
    'RECEIVED_DATE',
    'SOC_TITLE',
    'FULL_TIME_POSITION',
    'EMPLOYER_NAME',
    'EMPLOYER_CITY',
    'EMPLOYER_STATE',
    'AGENT_REPRESENTING_EMPLOYER',
    'WORKSITE_CITY',
    'WORKSITE_STATE',
    'WORKSITE_WORKERS',
    'WORKSITE_POSTAL_CODE',
    'AGENT_ATTORNEY_CITY',
    'AGENT_ATTORNEY_STATE',
    'H_1B_DEPENDENT',
    'SUPPORT_H1B',
]

# Overwrite each listed column with the integer codes of a freshly fitted
# LabelEncoder (a new encoder per column; only the last one stays bound)
for column_name in cat_columns:
    label_encoder = LabelEncoder()
    encoded_values = label_encoder.fit_transform(df1[column_name])
    df1[column_name] = encoded_values

In [None]:
# Preview the first rows of df1 after label encoding
df1.head(5)

#### Finding Correlation Variables

In [None]:
# Spearman works on ranks, so it captures monotonic (not only linear)
# relationships and suits the ordinal label codes better than Pearson; both
# are computed for comparison.

# Decision_Duration is still a timedelta at this point, and DataFrame.corr
# does not handle timedelta columns consistently across pandas versions.
# Correlate on whole days instead, using a scratch frame so df1 is unchanged.
corr_input = df1
if pd.api.types.is_timedelta64_dtype(df1['Decision_Duration']):
    corr_input = df1.assign(Decision_Duration=df1['Decision_Duration'].dt.days)

#spearman correlation
corr = corr_input.corr(method='spearman')
corr_matrix = corr['CASE_STATUS']  # correlation of every column with the target
print(corr_matrix.sort_values(ascending=False))

#pearson correlation
pearson_corr = corr_input.corr()
pearson_corr_matrix = pearson_corr['CASE_STATUS']
print(pearson_corr_matrix.sort_values(ascending=False))

In [None]:
# Correlation of each feature with CASE_STATUS, strongest positive first.
# The target's own entry (its self-correlation) is dropped.
target_corr = corr['CASE_STATUS'].drop('CASE_STATUS').sort_values(ascending=False)
target_pearcorr = pearson_corr['CASE_STATUS'].drop('CASE_STATUS').sort_values(ascending=False)

In [None]:
# Side-by-side bar charts of the feature/target correlations
fig, axes = plt.subplots(1, 2, figsize=(16, 6), sharey=True)

# (axes, correlation series, panel title, y-axis label) for each panel:
# Pearson on the left, Spearman on the right
correlation_panels = [
    (axes[0], target_pearcorr, "Feature Correlation with Case Status (Pearson)", "Features"),
    (axes[1], target_corr, "Feature Correlation with Case Status (Spearman)", ""),
]
for panel_ax, scores, panel_title, y_label in correlation_panels:
    sns.barplot(
        x=scores.values,
        y=scores.index,
        hue=scores.index,
        palette="coolwarm",
        legend=False,
        ax=panel_ax,
    )
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel("Correlation with Case Status")
    panel_ax.set_ylabel(y_label)

# Adjust layout
plt.tight_layout()
plt.show()

In [None]:
# Annotated heat map of the full Spearman correlation matrix. The figure is
# enlarged and the annotation font reduced so the per-cell values stay legible
# with this many columns.
plt.figure(figsize=(16, 12))
sns.heatmap(corr, annot=True, fmt=".2f", cmap="coolwarm", annot_kws={"size": 7})
plt.title("Spearman Correlation Matrix")
plt.show()

# Feature/target Spearman correlations, strongest positive first
target_corr = corr['CASE_STATUS'].drop('CASE_STATUS').sort_values(ascending=False)

#### Mutual Information Classification

In [None]:
# Mutual information between every feature and CASE_STATUS.

# Express the duration in whole days. The dtype guard keeps the cell
# re-runnable: once converted, the column is no longer a timedelta and .dt
# would fail.
if pd.api.types.is_timedelta64_dtype(df1['Decision_Duration']):
    df1['Decision_Duration'] = df1['Decision_Duration'].dt.days

# mutual_info_classif rejects NaN, so score only the complete rows
# (df1 itself keeps all rows).
mi_frame = df1.dropna()
X = mi_frame.drop(columns=['CASE_STATUS'])
y = mi_frame['CASE_STATUS']

# Only these three columns are true numeric measurements; every other column
# holds label codes and must be treated as discrete. Scoring label codes as
# continuous values gives misleading results.
continuous_features = ['Decision_Duration', 'TOTAL_WORKER_POSITIONS', 'ANNUAL_WAGE']
discrete_mask = ~X.columns.isin(continuous_features)

# random_state fixes the small random noise the estimator adds to continuous
# features, so the scores are reproducible.
mi_scores = mutual_info_classif(X, y, discrete_features=discrete_mask, random_state=42)
mi_scores_df = pd.DataFrame({'Feature': X.columns, 'Mutual Information Score': mi_scores})
mi_scores_df = mi_scores_df.sort_values(by="Mutual Information Score", ascending=False)
print(mi_scores_df)

In [None]:
# Plot mutual information scores, highest first (mi_scores_df is already sorted).
# `hue` is set to the same variable as `y` with the legend disabled: seaborn
# deprecates passing `palette` without `hue`, and this matches the correlation
# bar charts earlier in the notebook.
plt.figure(figsize=(10, 6))
sns.barplot(
    data=mi_scores_df,
    x='Mutual Information Score',
    y='Feature',
    hue='Feature',
    palette="viridis",
    legend=False,
)
plt.xlabel("Mutual Information Score")
plt.ylabel("Features")
plt.title("Feature Importance Based on Mutual Information")
plt.show()

# Part 3: Data Pre-processing

## Filtering Out DataFrame

### 1. Selecting Features

In [None]:
# Columns carried into modelling: the target (CASE_STATUS) followed by the
# features chosen from the correlation analysis in the previous section
selected_columns = [
    'CASE_STATUS',
    'SOC_TITLE',
    'ANNUAL_WAGE',
    'SUPPORT_H1B',
    'H_1B_DEPENDENT',
    'EMPLOYER_NAME',
    'EMPLOYER_STATE',
    'EMPLOYER_CITY',
    'AGENT_REPRESENTING_EMPLOYER',
    'AGENT_ATTORNEY_STATE',
    'AGENT_ATTORNEY_CITY',
    'FULL_TIME_POSITION',
    'WORKSITE_STATE',
    'WORKSITE_CITY',
    'WORKSITE_POSTAL_CODE',
    'WORKSITE_WORKERS',
    'TOTAL_WORKER_POSITIONS',
    'RECEIVED_DATE',
    'Decision_Duration',
]
selectdf = df.loc[:, selected_columns]

# dtypes and non-null counts of the selection
selectdf.info()

In [None]:
# Missing worker counts are treated as 0 so the column can be stored as int
workers_filled = selectdf["WORKSITE_WORKERS"].fillna(0)
selectdf["WORKSITE_WORKERS"] = workers_filled.astype(int)

# Frequency of each worker count after the fill
selectdf["WORKSITE_WORKERS"].value_counts()

In [None]:
# Preview the selected columns
selectdf.head(5)

### 2. Filtering "CASE_STATUS" for Certified and Denied

In [None]:
# Status counts before filtering
print(selectdf['CASE_STATUS'].value_counts())

# Keep only rows whose status is Certified or Denied; every other status is
# left out. The result is stored in selectdf_filtered.
kept_statuses = ['Certified', 'Denied']
status_mask = selectdf['CASE_STATUS'].isin(kept_statuses)
selectdf_filtered = selectdf[status_mask]

# Status counts after filtering
print(selectdf_filtered['CASE_STATUS'].value_counts())

In [None]:
# Free memory: from here on only selectdf_filtered is needed, so drop the full
# frame and the unfiltered selection.
del df, selectdf

### 3. Transforming DataFrame

In [None]:
# Preprocessing plan for the modelling frame
#   left numeric  : 'Decision_Duration', 'TOTAL_WORKER_POSITIONS', 'ANNUAL_WAGE'
#   label-encoded : every column listed in cat_columns below

# Work on an independent copy so the column writes below do not target a
# slice of the (already deleted) selection frame
selectdf_filtered = selectdf_filtered.copy()

# Columns to label-encode (the target CASE_STATUS included)
cat_columns = [
    'CASE_STATUS',
    'SOC_TITLE',
    'SUPPORT_H1B',
    'H_1B_DEPENDENT',
    'EMPLOYER_NAME',
    'EMPLOYER_STATE',
    'EMPLOYER_CITY',
    'AGENT_REPRESENTING_EMPLOYER',
    'AGENT_ATTORNEY_STATE',
    'AGENT_ATTORNEY_CITY',
    'FULL_TIME_POSITION',
    'WORKSITE_STATE',
    'WORKSITE_CITY',
    'WORKSITE_POSTAL_CODE',
    'RECEIVED_DATE',
]

# One encoder per column, kept in a dict keyed by column name so each
# column's fitted encoder remains available after the loop
label_encoder = {col: LabelEncoder() for col in cat_columns}
for col, column_encoder in label_encoder.items():
    selectdf_filtered[col] = column_encoder.fit_transform(selectdf_filtered[col])

In [None]:
# Class counts of the encoded target
print(selectdf_filtered.CASE_STATUS.value_counts())
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() appended a stray "None" line.
selectdf_filtered.info()

### 4. Defining target and features

In [None]:
# Define target and features
target = 'CASE_STATUS'
features = ['SOC_TITLE', 'ANNUAL_WAGE', 'SUPPORT_H1B', 'H_1B_DEPENDENT', 'EMPLOYER_NAME', 'EMPLOYER_STATE',
            'EMPLOYER_CITY', 'AGENT_REPRESENTING_EMPLOYER', 'AGENT_ATTORNEY_STATE', 'AGENT_ATTORNEY_CITY',
            'FULL_TIME_POSITION', 'WORKSITE_STATE', 'WORKSITE_CITY', 'WORKSITE_POSTAL_CODE',
            'WORKSITE_WORKERS', 'TOTAL_WORKER_POSITIONS', 'RECEIVED_DATE', 'Decision_Duration']

# Independent copy of the feature columns so the conversion below does not
# touch selectdf_filtered.
X = selectdf_filtered[features].copy()

# Decision_Duration was built as a date difference and is still a timedelta
# here (only the correlation frame was converted to days). The scaler and the
# models need plain numbers, so express it in whole days. The dtype guard keeps
# the cell re-runnable.
if pd.api.types.is_timedelta64_dtype(X['Decision_Duration']):
    X['Decision_Duration'] = X['Decision_Duration'].dt.days
# NOTE(review): any missing values in the numeric columns are passed through
# unchanged - confirm none remain, or impute, before fitting models that reject NaN.

#Ensuring y is integer
y = selectdf_filtered[target].astype(int)

#Test,train,split
# stratify=y keeps the class proportions of the imbalanced target the same in
# the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Part 4: Models

### 1. Logistic Regression

In [None]:
# Standardise the features for Logistic Regression: the scaler is fitted on
# the training part only and then applied to both parts
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Logistic regression fitted on the original (imbalanced) training data
lg_imbalanced = LogisticRegression(solver='newton-cg', max_iter=1000)
lg_imbalanced.fit(X_train_scaled, y_train)

# Class predictions for the held-out test part
lg_imbalanced_y_pred = lg_imbalanced.predict(X_test_scaled)

# Accuracy plus per-class precision / recall / F1
lg_imbalanced_accuracy = accuracy_score(y_test, lg_imbalanced_y_pred)
print("Accuracy of Logistic Regression(imbalanced):", lg_imbalanced_accuracy)
print(classification_report(y_test, lg_imbalanced_y_pred))

In [None]:
# Balanced training data for Logistic Regression

# Oversample the training part with SMOTE (the test part is left untouched)
# NOTE(review): SMOTE creates synthetic rows by interpolating feature values,
# and most features here are label codes - confirm this is acceptable.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Refit the shared scaler on the resampled training data and rescale the test
# part with it. This overwrites the earlier fit and X_test_scaled.
scaler.fit(X_train_smote)
X_train_smote_scaled = scaler.transform(X_train_smote)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Logistic regression fitted on the SMOTE-balanced, scaled training data
lg = LogisticRegression(solver='newton-cg', max_iter=1000)
lg.fit(X_train_smote_scaled, y_train_smote)

# Class predictions for the test part (scaled with the refitted scaler)
lg_y_pred = lg.predict(X_test_scaled)

# Accuracy and per-class report, kept in variables and then printed
lg_accuracy, lg_report = (
    accuracy_score(y_test, lg_y_pred),
    classification_report(y_test, lg_y_pred),
)

print("Accuracy of Logistic Regression(balanced):", lg_accuracy)
print(lg_report)

In [None]:
#compute confusion matrix and plot
def plot_confusion_matrix(y_true, y_pred, title, ax, class_names=('Certified', 'Denied')):
    """Draw a row-normalised confusion matrix (in percent) as a heat map on `ax`.

    Each cell shows which percentage of the samples of an actual class (row)
    was assigned to a predicted class (column).

    Parameters
    ----------
    y_true : array-like
        Actual class codes.
    y_pred : array-like
        Predicted class codes.
    title : str
        Title placed above the heat map.
    ax : matplotlib Axes
        Axes to draw on.
    class_names : sequence of str, optional
        Tick labels; entry i names class code i. The default matches the
        notebook's encoding, where LabelEncoder sorts the two remaining
        statuses so that Certified is 0 and Denied is 1.

    Returns
    -------
    None
    """
    class_names = list(class_names)
    # Pin the label set so the matrix always has one row/column per class name,
    # even when a class is missing from y_true and y_pred.
    class_codes = list(range(len(class_names)))
    cm = confusion_matrix(y_true, y_pred, labels=class_codes)

    # Row-wise percentages. A class with no actual samples has a row total of
    # zero; its row is left at 0 instead of producing NaN from a 0/0 division.
    row_totals = cm.sum(axis=1, keepdims=True)
    cm_percentage = np.divide(
        cm * 100.0,
        row_totals,
        out=np.zeros(cm.shape, dtype=float),
        where=row_totals != 0,
    )

    sns.heatmap(cm_percentage, annot=True, fmt='.2f', cmap='Blues', ax=ax,
                xticklabels=class_names, yticklabels=class_names)
    ax.set_title(title)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')


# Logistic regression: imbalanced model on the left, balanced on the right
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
lg_panels = (
    (lg_imbalanced_y_pred, 'LG Confusion Matrix - Imbalanced Model (%)'),
    (lg_y_pred, 'LG Confusion Matrix - Balanced Model (%)'),
)
for panel_ax, (panel_pred, panel_title) in zip(axes, lg_panels):
    plot_confusion_matrix(y_test, panel_pred, panel_title, panel_ax)
plt.show()

### 2. Neural Network

In [None]:
#set up nn
def create_nn(input_dim=None):
    """Build and compile the feed-forward binary classifier used in this notebook.

    Architecture: Dense(128, relu) -> BatchNormalization -> Dropout(0.3)
    -> Dense(64, relu) -> BatchNormalization -> Dropout(0.3)
    -> Dense(32, relu) -> Dense(1, sigmoid).
    Compiled with the Adam optimizer, binary cross-entropy loss and accuracy
    as the reported metric.

    Parameters
    ----------
    input_dim : int, optional
        Number of input features. When omitted, it is read from the notebook's
        global `X_train` (its column count), which is what the function always
        did before; passing it explicitly removes that hidden dependency.

    Returns
    -------
    The compiled Sequential model (untrained).
    """
    if input_dim is None:
        # Backward-compatible default: width of the global training matrix
        input_dim = X_train.shape[1]

    model = Sequential([
        Dense(128, activation='relu', input_shape=(input_dim,)),
        BatchNormalization(),
        Dropout(0.3),
        Dense(64, activation='relu'),
        BatchNormalization(),
        Dropout(0.3),
        Dense(32, activation='relu'),
        Dense(1, activation='sigmoid')  # one probability per sample
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Instance built right after the definition; the training cells below create
# their own models with create_nn().
model = create_nn()

In [None]:
#Train with imbalanced dataset
# Standardise the inputs with a scaler fitted on the original training data.
# The raw features sit on very different scales (wages next to label codes),
# and the balanced network below is trained on standardised inputs; training
# this one on raw values would mix the effect of scaling into the
# balanced-vs-imbalanced comparison. A dedicated scaler is used so the shared
# `scaler` and `X_test_scaled` are not disturbed.
nn_scaler = StandardScaler()
X_train_nn = nn_scaler.fit_transform(X_train)
X_test_nn = nn_scaler.transform(X_test)

nn_imbalanced = create_nn()
# NOTE(review): the test part doubles as validation data here. It is only
# reported per epoch (no early stopping or model selection uses it).
nn_imbalanced.fit(X_train_nn, y_train, epochs=10, batch_size=32, validation_data=(X_test_nn, y_test), verbose=1)

# Evaluate the imbalanced model: threshold the sigmoid output at 0.5 and
# flatten the (n, 1) prediction array to a 1-D label vector
nn_y_pred_imbalanced = (nn_imbalanced.predict(X_test_nn) > 0.5).astype(int).ravel()

print("Imbalanced Neural Network:")
print("Accuracy of imbalanced NN:", accuracy_score(y_test, nn_y_pred_imbalanced))
print(classification_report(y_test, nn_y_pred_imbalanced))

In [None]:
# Balanced training data for the neural network

# SMOTE oversampling of the training part, same seed as the Logistic
# Regression section
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Standardise: refit the shared scaler on the resampled training data, then
# apply it to both the resampled training part and the test part
scaler.fit(X_train_smote)
X_train_smote_scaled = scaler.transform(X_train_smote)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Neural network trained on the SMOTE-balanced, scaled data
nn_balanced = create_nn()
nn_balanced.fit(
    X_train_smote_scaled,
    y_train_smote,
    epochs=10,
    batch_size=32,
    validation_data=(X_test_scaled, y_test),
    verbose=1,
)

# Threshold the predicted probabilities at 0.5 to obtain class labels
balanced_probabilities = nn_balanced.predict(X_test_scaled)
nn_y_pred_balanced = (balanced_probabilities > 0.5).astype(int)

print("Balanced Neural Network with SMOTE:")
print("Accuracy of balanced NN:", accuracy_score(y_test, nn_y_pred_balanced))
print(classification_report(y_test, nn_y_pred_balanced))

In [None]:
# Neural network: imbalanced model on the left, balanced on the right
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
nn_panels = (
    (nn_y_pred_imbalanced, 'NN Confusion Matrix - Imbalanced Model (%)'),
    (nn_y_pred_balanced, 'NN Confusion Matrix - Balanced Model (%)'),
)
for panel_ax, (panel_pred, panel_title) in zip(axes, nn_panels):
    plot_confusion_matrix(y_test, panel_pred, panel_title, panel_ax)
plt.show()

### 3. XGBoost

In [None]:
# XGBoost fitted on the original (imbalanced, unscaled) training data
xgb_model_imbalanced = XGBClassifier(eval_metric='logloss')
xgb_model_imbalanced.fit(X_train, y_train)

# Test-set predictions, accuracy and per-class report
yXGB_pred_imbalanced = xgb_model_imbalanced.predict(X_test)
xgb_imbalanced_accuracy = accuracy_score(y_test, yXGB_pred_imbalanced)
print("XGBoost_Imbalanced_Accuracy:", xgb_imbalanced_accuracy)
print(classification_report(y_test, yXGB_pred_imbalanced))

In [None]:
# Balance the training part by random oversampling. X_resampled and
# y_resampled are reused by the Random Forest section below.
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

# XGBoost fitted on the oversampled training data
xgb_model_balanced = XGBClassifier(eval_metric='logloss')
xgb_model_balanced.fit(X_resampled, y_resampled)

# Test-set predictions, accuracy and per-class report
yXGB_pred_balanced = xgb_model_balanced.predict(X_test)
xgb_balanced_accuracy = accuracy_score(y_test, yXGB_pred_balanced)
print("XGBoost_Balanced_Accuracy_:", xgb_balanced_accuracy)
print(classification_report(y_test, yXGB_pred_balanced))

In [None]:
# XGBoost: imbalanced model on the left, balanced on the right
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
xgb_panels = (
    (yXGB_pred_imbalanced, 'XGB Confusion Matrix - Imbalanced Model (%)'),
    (yXGB_pred_balanced, 'XGB Confusion Matrix - Balanced Model (%)'),
)
for panel_ax, (panel_pred, panel_title) in zip(axes, xgb_panels):
    plot_confusion_matrix(y_test, panel_pred, panel_title, panel_ax)
plt.show()

### 4. Random Forest

In [None]:
# Random Forest (100 trees, fixed seed) fitted on the original training data
rf_model_imbalanced = RandomForestClassifier(random_state=42, n_estimators=100)
rf_model_imbalanced.fit(X_train, y_train)

# Test-set predictions, accuracy and per-class report
yRF_pred_imbalanced = rf_model_imbalanced.predict(X_test)
rf_imbalanced_accuracy = accuracy_score(y_test, yRF_pred_imbalanced)
print("Accuracy_Random Forest:", rf_imbalanced_accuracy)
print(classification_report(y_test, yRF_pred_imbalanced))

In [None]:
# Initialize and train Random Forest model with balanced train dataset.
# X_resampled / y_resampled are the randomly oversampled training data
# produced in the XGBoost section above.
rf_model_balanced = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model_balanced.fit(X_resampled, y_resampled)

# Predictions and evaluation
yRF_pred_balanced = rf_model_balanced.predict(X_test)
# The label names the balanced model explicitly; it previously printed the
# same text as the imbalanced model, so the two results could not be told apart.
print("Accuracy_Random Forest (balanced):", accuracy_score(y_test, yRF_pred_balanced))
print(classification_report(y_test, yRF_pred_balanced))

In [None]:
# Random Forest: imbalanced model on the left, balanced on the right
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
rf_panels = (
    (yRF_pred_imbalanced, 'RF Confusion Matrix - Imbalanced Model (%)'),
    (yRF_pred_balanced, 'RF Confusion Matrix - Balanced Model (%)'),
)
for panel_ax, (panel_pred, panel_title) in zip(axes, rf_panels):
    plot_confusion_matrix(y_test, panel_pred, panel_title, panel_ax)
plt.show()