# Part 2:Data Models

## Importing libraries

In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler,OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import backend as K
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from sklearn.utils.class_weight import compute_class_weight

## Importing Data

In [12]:
#importing data
#df = pd.read_csv('Combined_LCA_Disclosure_Data_FY2020_to_FY2024.csv', low_memory=False)
df2020 = pd.read_csv('Combined_LCA_Disclosure_Data_FY2020.csv', low_memory=False)
df2021 = pd.read_csv('Combined_LCA_Disclosure_Data_FY2021.csv', low_memory=False)
df2022 = pd.read_csv('Combined_LCA_Disclosure_Data_FY2022.csv', low_memory=False)
df2023 = pd.read_csv('Combined_LCA_Disclosure_Data_FY2023.csv', low_memory=False)
df2024 = pd.read_csv('Combined_LCA_Disclosure_Data_FY2024.csv', low_memory=False)
all_data = [df2020, df2021,df2022,df2023,df2024]

df = pd.concat(all_data)
df.head(5)

Unnamed: 0,CASE_NUMBER,CASE_STATUS,RECEIVED_DATE,DECISION_DATE,ORIGINAL_CERT_DATE,VISA_CLASS,JOB_TITLE,SOC_CODE,SOC_TITLE,FULL_TIME_POSITION,...,WILLFUL_VIOLATOR,SUPPORT_H1B,STATUTORY_BASIS,APPENDIX_A_ATTACHED,PUBLIC_DISCLOSURE,PREPARER_LAST_NAME,PREPARER_FIRST_NAME,PREPARER_MIDDLE_INITIAL,PREPARER_BUSINESS_NAME,PREPARER_EMAIL
0,I-200-19268-393467,Certified,2019-09-25,2019-10-01,,H-1B,"APPLICATION ENGINEER, OMS [15-1199.02]",15-1199,"COMPUTER OCCUPATIONS, ALL OTHER",Y,...,N,,,,Disclose Business,,,,,
1,I-200-19268-638983,Certified,2019-09-25,2019-10-01,,H-1B,BI DEVELOPER II,15-1132,"SOFTWARE DEVELOPERS, APPLICATIONS",Y,...,N,Y,BOTH,,Disclose Business,,,,,
2,I-200-19268-177184,Certified,2019-09-25,2019-10-01,,H-1B,QUALITY ENGINEER,17-2141,MECHANICAL ENGINEERS,Y,...,N,Y,BOTH,,Disclose Business,,,,,
3,I-200-19268-936403,Certified,2019-09-25,2019-10-01,,H-1B,"SOFTWARE DEVELOPER, APPLICATIONS",15-1132,"SOFTWARE DEVELOPERS, APPLICATIONS",Y,...,N,Y,BOTH,,Disclose Business,,,,,
4,I-200-19268-394079,Certified,2019-09-25,2019-10-01,,H-1B,QUALITY ENGINEER LEVEL II,15-1199,"COMPUTER OCCUPATIONS, ALL OTHER",Y,...,N,Y,BOTH,,Disclose Business,,,,,LEGAL@THEEGIANTS.COM


In [13]:
# Delete previous dataframes to free memory
del df2020, df2021, df2022, df2023, df2024, all_data

## Basic Data Processing 

### 1. Finding Annual Wage

In [14]:
#Salary Analysis
#Clean and convert salaries to numeric, handling outliers

unique_units = df['PW_UNIT_OF_PAY'].unique()
print(unique_units)

['Year' 'Hour' 'Bi-Weekly' nan 'Month' 'Week']


In [15]:
#"Prevailing Wage" * conversion factors to find annual wage
df['PREVAILING_WAGE'] = pd.to_numeric(df['PREVAILING_WAGE'], errors='coerce')
df = df[df['PREVAILING_WAGE'] > 0]  # Filter out non-positive values

conversion_factors = {
    'Year': 1,            # No Normalization
    'Month': 12,          # 12 months in a year
    'Bi-Weekly': 26,      # 26 bi-weekly periods in a year
    'Week': 52,           # 52 weeks in a year
    'Hour': 2080          # 52 weeks in a year x 40 hrs a week
}

df['ANNUAL_WAGE'] = df.apply(
    lambda row: row['PREVAILING_WAGE'] * conversion_factors.get(row['PW_UNIT_OF_PAY'], 1),
    axis=1
)

In [6]:
print(df[df['PW_UNIT_OF_PAY'] == 'Hour'][['PREVAILING_WAGE', 'PW_UNIT_OF_PAY', 'ANNUAL_WAGE']].head())

    PREVAILING_WAGE PW_UNIT_OF_PAY  ANNUAL_WAGE
1              39.0           Hour      81120.0
2              39.0           Hour      81120.0
3              53.0           Hour     110240.0
59             28.0           Hour      58240.0
60             29.0           Hour      60320.0


### 2. Finding Decision Duration of Applications

In [16]:
#get the processing time from subtracting decision date and start date and case status is certified
#processing the time into usable format

df['BEGIN_DATE'] = pd.to_datetime(df['BEGIN_DATE'])
df['DECISION_DATE'] = pd.to_datetime(df['DECISION_DATE'])
df['RECEIVED_DATE'] = pd.to_datetime(df['RECEIVED_DATE'])

df['Decision_Duration'] = df['BEGIN_DATE'] - df['DECISION_DATE']

df.Decision_Duration.head(3)

0    6 days
1   99 days
2    2 days
Name: Decision_Duration, dtype: timedelta64[ns]

In [17]:
#selecting features based on correlation coefficients from last section
selectdf = df.loc[:, ['CASE_STATUS','ANNUAL_WAGE','SUPPORT_H1B','EMPLOYER_NAME','AGENT_ATTORNEY_CITY','WORKSITE_WORKERS','TOTAL_WORKER_POSITIONS','RECEIVED_DATE','Decision_Duration']]

selectdf.CASE_STATUS.value_counts()

CASE_STATUS
Certified                3292311
Certified - Withdrawn     184634
Withdrawn                  64014
Denied                     21777
Name: count, dtype: int64

In [18]:
'''
Preprocessing features
#:'Decision_Duration','TOTAL_WORKER_POSITIONS','ANNUAL_WAGE'
Categorical: 'CASE_STATUS','SUPPORT_H1B','EMPLOYER_NAME','AGENT_ATTORNEY_CITY','WORKSITE_WORKERS','RECEIVED_DATE']]
'''
cat_columns = ['CASE_STATUS','Decision_Duration','SUPPORT_H1B','EMPLOYER_NAME','AGENT_ATTORNEY_CITY','WORKSITE_WORKERS','RECEIVED_DATE']
for col in cat_columns:
    label_encoder = LabelEncoder()
    selectdf[col] = label_encoder.fit_transform(selectdf[col])

selectdf.head(5)


Unnamed: 0,CASE_STATUS,ANNUAL_WAGE,SUPPORT_H1B,EMPLOYER_NAME,AGENT_ATTORNEY_CITY,WORKSITE_WORKERS,TOTAL_WORKER_POSITIONS,RECEIVED_DATE,Decision_Duration
0,0,95118.0,4,89786,494,82,1,223,1349
1,0,81120.0,2,47200,1296,82,1,223,1442
2,0,81120.0,2,55202,2742,82,1,223,1345
3,0,110240.0,2,150953,3155,82,1,223,1349
4,0,65333.0,2,53537,3155,82,1,223,1351


In [19]:
selectdf.CASE_STATUS.value_counts()

CASE_STATUS
0    3292311
1     184634
3      64014
2      21777
Name: count, dtype: int64

In [20]:
#target variable and the selected features from above
X = selectdf[['RECEIVED_DATE','Decision_Duration', 'TOTAL_WORKER_POSITIONS', 'ANNUAL_WAGE', 'SUPPORT_H1B', 'EMPLOYER_NAME','AGENT_ATTORNEY_CITY', 'WORKSITE_WORKERS', 'RECEIVED_DATE']]
y = selectdf['CASE_STATUS']

# Standardize numerical features
scaler = StandardScaler()
X[['Decision_Duration', 'TOTAL_WORKER_POSITIONS', 'ANNUAL_WAGE']] = scaler.fit_transform(X[['Decision_Duration', 'TOTAL_WORKER_POSITIONS', 'ANNUAL_WAGE']])

# Split dataset into train and test sets (80/20)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


## Logistic Regression

In [12]:
#Setting up model
lgmodel = LogisticRegression(max_iter=100, solver='saga', class_weight='balanced')
lgmodel.fit(X_train, y_train)

# Predictions
y_pred = lgmodel.predict(X_test)




In [13]:
# Evaluate the model
lgaccuracy = accuracy_score(y_test, y_pred)
lgreport_dict = classification_report(y_test, y_pred,output_dict=True)
lg_report = pd.DataFrame(lgreport_dict).T

lg_report

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


(0.6709148015291602,
 '              precision    recall  f1-score   support\n\n           0       0.92      0.72      0.81    658452\n           1       0.16      0.00      0.01     37037\n           2       0.01      0.27      0.01      4339\n           3       0.00      0.00      0.00     12720\n\n    accuracy                           0.67    712548\n   macro avg       0.27      0.25      0.21    712548\nweighted avg       0.86      0.67      0.75    712548\n')

In [94]:
print(f"y_test shape: {y_test.shape}, dtype: {y_test.dtype}")
print(f"y_pred shape: {y_pred.shape}, dtype: {y_pred.dtype}")

print("Unique values in y_test:", np.unique(y_test))
print("Unique values in y_pred:", np.unique(y_pred))



y_test shape: (712548, 4), dtype: float64
y_pred shape: (712548,), dtype: int64
Unique values in y_test: [0. 1.]
Unique values in y_pred: [0 1 2]


In [None]:
# Convert one-hot to single-label encoding
if y_test.ndim == 2:
    y_test = np.argmax(y_test, axis=1)

if y_pred.ndim == 2:
    y_pred = np.argmax(y_pred, axis=1)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report_dict = classification_report(y_test, y_pred, output_dict=True)

# Convert to DataFrame
report_df = pd.DataFrame(report_dict).T


print(report_df)
print(f"Model Accuracy: {accuracy:.4f}")


              precision    recall  f1-score        support
0              0.923894  0.725801  0.812954  658273.000000
1              0.049197  0.001328  0.002585   36911.000000
2              0.006121  0.273941  0.011974    4344.000000
3              0.000000  0.000000  0.000000   13020.000000
accuracy       0.672255  0.672255  0.672255       0.672255
macro avg      0.244803  0.250267  0.206878  712548.000000
weighted avg   0.856106  0.672255  0.751238  712548.000000
Model Accuracy: 0.6723


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Neural Network

In [21]:
#One Hot Encode y for neural network
OneHot = OneHotEncoder(sparse_output=False)
y_encoded = OneHot.fit_transform(selectdf[['CASE_STATUS']])

#test train split again
X_train, X_test, y_train, y_test = train_test_split(selectdf.drop(columns=['CASE_STATUS']), y_encoded, test_size=0.2, random_state=42)

print(X_train.shape)

(2850188, 8)


In [33]:
#neural network architecture - modified to be higher since the original 64 predicted class 0 dominantly. also adding l2 as weight to try to prevent overfit and underfit further. Adding L2 still predicts class 0 only. 

#using class weights and l2 yields class 3 only instead of class 0. adjusting classweights to clip extreme class weights

#using focals loss and class weights. Also, adding another layer

#now trying to use focal loss
def focal_loss(alpha=0.5, gamma=1.5):
    def loss(y_true, y_pred):
        epsilon = K.epsilon()  
        y_pred = K.clip(y_pred, epsilon, 1.0 - epsilon)  
        cross_entropy = -y_true * K.log(y_pred) 
        weights = alpha * K.pow(1 - y_pred, gamma) 
        return K.mean(weights * cross_entropy)
    return loss

# Compute class weights
classes = np.array([0,1,2,3])
class_weights = compute_class_weight('balanced', classes=classes, y=y_train.argmax(axis=1))

#adjusting classweights to clip extreme class weights, still didnt work, adjusting l2 further
class_weights = np.clip(class_weights, 0.5, 2.0)
class_weight_dict = {i: class_weights[i] for i in range(4)}



In [34]:
#model
model_multi = Sequential([
    Dense(256, activation='relu', input_shape=(X_train.shape[1],), kernel_regularizer=l2(0.001)),  # More neurons
    Dropout(0.4),
    Dense(128, activation='relu', kernel_regularizer=l2(0.001)),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(4, activation='softmax')
])

model_multi.compile(optimizer=Adam(learning_rate=0.001), loss=focal_loss(), metrics=['accuracy'])

# Train the model
history = model_multi.fit(X_train, y_train, epochs=30, batch_size=64, validation_split=0.2, class_weight=class_weight_dict, verbose=1)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/30
[1m35628/35628[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 1ms/step - accuracy: 0.9116 - loss: 0.1170 - val_accuracy: 0.9240 - val_loss: 0.1533
Epoch 2/30
[1m35628/35628[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 1ms/step - accuracy: 0.9244 - loss: 0.0807 - val_accuracy: 0.9240 - val_loss: 0.0215
Epoch 3/30
[1m35628/35628[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 1ms/step - accuracy: 0.9244 - loss: 0.0135 - val_accuracy: 0.9240 - val_loss: 0.0215
Epoch 4/30
[1m35628/35628[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 1ms/step - accuracy: 0.9242 - loss: 0.0136 - val_accuracy: 0.9240 - val_loss: 0.0215
Epoch 5/30
[1m35628/35628[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 1ms/step - accuracy: 0.9243 - loss: 0.0135 - val_accuracy: 0.9240 - val_loss: 0.0215
Epoch 6/30
[1m35628/35628[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 1ms/step - accuracy: 0.9244 - loss: 0.0135 - val_accuracy: 0.9240 - val_loss: 0.021

In [41]:
#Evaluating model
y_prob_multi = model_multi.predict(X_test)

#Converting labels back for better interpretation
y_pred_multi = np.argmax(y_prob_multi, axis=1)
y_test_labels = np.argmax(y_test, axis=1)


print("Shape of y_test:", y_test.shape)
#checking for correct conversions
print(np.unique(y_train.argmax(axis=1), return_counts=True))
print(np.unique(y_test_labels, return_counts=True))


print("Unique values in y_test_labels:", np.unique(y_test_labels))
print("Unique values in y_pred_multi:",np.unique(y_pred_multi))
print("y_test_labels shape:", y_test_labels.shape)
print("y_pred_multi shape:", y_pred_multi.shape)


[1m22268/22268[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 232us/step
Shape of y_test: (712548, 4)
(array([0, 1, 2, 3]), array([2634038,  147723,   17433,   50994]))
(array([0, 1, 2, 3]), array([658273,  36911,   4344,  13020]))
Unique values in y_test_labels: [0 1 2 3]
Unique values in y_pred_multi: [0]
y_test_labels shape: (712548,)
y_pred_multi shape: (712548,)


In [40]:
print(history.history['accuracy'][-1], history.history['val_accuracy'][-1])


0.9241966605186462 0.9240278601646423


In [87]:
'''
y_train_labels = encoder.fit_transform(y_train_original)  # Use the correct multi-class labels
y_test_labels = encoder.transform(y_test_original)
'''

'\ny_train_labels = encoder.fit_transform(y_train_original)  # Use the correct multi-class labels\ny_test_labels = encoder.transform(y_test_original)\n'

In [35]:
#Classification
class_report = classification_report(y_test_labels, y_pred_multi, target_names=["Certified", "Certified-Withdrawn", "Withdrawn", "Denied"], output_dict=True)

# Convert to DataFrame
metrics_df = pd.DataFrame(class_report).T

metrics_df


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Unnamed: 0,precision,recall,f1-score,support
Certified,0.92383,1.0,0.960407,658273.0
Certified-Withdrawn,0.0,0.0,0.0,36911.0
Withdrawn,0.0,0.0,0.0,4344.0
Denied,0.0,0.0,0.0,13020.0
accuracy,0.92383,0.92383,0.92383,0.92383
macro avg,0.230957,0.25,0.240102,712548.0
weighted avg,0.853461,0.92383,0.887252,712548.0


In [80]:
# Count occurrences of each class in y_train_labels
unique, counts = np.unique(y_train, return_counts=True)
print(dict(zip(unique, counts)))


{np.float64(0.0): np.int64(8550564), np.float64(1.0): np.int64(2850188)}
