# Part 2:Data Models

## Importing libraries

In [2]:
pip install tensorflow

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

2025-02-12 09:47:41.927109: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-02-12 09:47:41.927162: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-02-12 09:47:41.928394: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-02-12 09:47:41.936057: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Importing Data

In [2]:
#importing data
#df = pd.read_csv('Combined_LCA_Disclosure_Data_FY2020_to_FY2024.csv', low_memory=False)
df2020 = pd.read_csv('Combined_LCA_Disclosure_Data_FY2020.csv', low_memory=False)
df2021 = pd.read_csv('Combined_LCA_Disclosure_Data_FY2021.csv', low_memory=False)
df2022 = pd.read_csv('Combined_LCA_Disclosure_Data_FY2022.csv', low_memory=False)
df2023 = pd.read_csv('Combined_LCA_Disclosure_Data_FY2023.csv', low_memory=False)
df2024 = pd.read_csv('Combined_LCA_Disclosure_Data_FY2024.csv', low_memory=False)
all_data = [df2020, df2021,df2022,df2023,df2024]

df = pd.concat(all_data)
df.head(5)

Unnamed: 0,CASE_NUMBER,CASE_STATUS,RECEIVED_DATE,DECISION_DATE,ORIGINAL_CERT_DATE,VISA_CLASS,JOB_TITLE,SOC_CODE,SOC_TITLE,FULL_TIME_POSITION,...,WILLFUL_VIOLATOR,SUPPORT_H1B,STATUTORY_BASIS,APPENDIX_A_ATTACHED,PUBLIC_DISCLOSURE,PREPARER_LAST_NAME,PREPARER_FIRST_NAME,PREPARER_MIDDLE_INITIAL,PREPARER_BUSINESS_NAME,PREPARER_EMAIL
0,I-200-19268-393467,Certified,2019-09-25,2019-10-01,,H-1B,"APPLICATION ENGINEER, OMS [15-1199.02]",15-1199,"COMPUTER OCCUPATIONS, ALL OTHER",Y,...,N,,,,Disclose Business,,,,,
1,I-200-19268-638983,Certified,2019-09-25,2019-10-01,,H-1B,BI DEVELOPER II,15-1132,"SOFTWARE DEVELOPERS, APPLICATIONS",Y,...,N,Y,BOTH,,Disclose Business,,,,,
2,I-200-19268-177184,Certified,2019-09-25,2019-10-01,,H-1B,QUALITY ENGINEER,17-2141,MECHANICAL ENGINEERS,Y,...,N,Y,BOTH,,Disclose Business,,,,,
3,I-200-19268-936403,Certified,2019-09-25,2019-10-01,,H-1B,"SOFTWARE DEVELOPER, APPLICATIONS",15-1132,"SOFTWARE DEVELOPERS, APPLICATIONS",Y,...,N,Y,BOTH,,Disclose Business,,,,,
4,I-200-19268-394079,Certified,2019-09-25,2019-10-01,,H-1B,QUALITY ENGINEER LEVEL II,15-1199,"COMPUTER OCCUPATIONS, ALL OTHER",Y,...,N,Y,BOTH,,Disclose Business,,,,,LEGAL@THEEGIANTS.COM


In [3]:
# Delete previous dataframes to free memory
del df2020, df2021, df2022, df2023, df2024, all_data

## Basic Data Processing 

### 1. Finding Annual Wage

In [4]:
#Salary Analysis
#Clean and convert salaries to numeric, handling outliers

unique_units = df['PW_UNIT_OF_PAY'].unique()
print(unique_units)

['Year' 'Hour' 'Bi-Weekly' nan 'Month' 'Week']


In [5]:
#"Prevailing Wage" * conversion factors to find annual wage
df['PREVAILING_WAGE'] = pd.to_numeric(df['PREVAILING_WAGE'], errors='coerce')
df = df[df['PREVAILING_WAGE'] > 0]  # Filter out non-positive values

conversion_factors = {
    'Year': 1,            # No Normalization
    'Month': 12,          # 12 months in a year
    'Bi-Weekly': 26,      # 26 bi-weekly periods in a year
    'Week': 52,           # 52 weeks in a year
    'Hour': 2080          # 52 weeks in a year x 40 hrs a week
}

df['ANNUAL_WAGE'] = df.apply(
    lambda row: row['PREVAILING_WAGE'] * conversion_factors.get(row['PW_UNIT_OF_PAY'], 1),
    axis=1
)

In [6]:
print(df[df['PW_UNIT_OF_PAY'] == 'Hour'][['PREVAILING_WAGE', 'PW_UNIT_OF_PAY', 'ANNUAL_WAGE']].head())

    PREVAILING_WAGE PW_UNIT_OF_PAY  ANNUAL_WAGE
1              39.0           Hour      81120.0
2              39.0           Hour      81120.0
3              53.0           Hour     110240.0
59             28.0           Hour      58240.0
60             29.0           Hour      60320.0


### 2. Finding Decision Duration of Applications

In [7]:
#get the processing time from subtracting decision date and start date and case status is certified
#processing the time into usable format

df['BEGIN_DATE'] = pd.to_datetime(df['BEGIN_DATE'])
df['DECISION_DATE'] = pd.to_datetime(df['DECISION_DATE'])
df['RECEIVED_DATE'] = pd.to_datetime(df['RECEIVED_DATE'])

df['Decision_Duration'] = df['BEGIN_DATE'] - df['DECISION_DATE']

df.Decision_Duration.head(3)

0    6 days
1   99 days
2    2 days
Name: Decision_Duration, dtype: timedelta64[ns]

In [8]:
#selecting features based on correlation coefficients from last section
selectdf = df.loc[:, ['CASE_STATUS','ANNUAL_WAGE','SUPPORT_H1B','EMPLOYER_NAME','AGENT_ATTORNEY_CITY','WORKSITE_WORKERS','TOTAL_WORKER_POSITIONS','RECEIVED_DATE','Decision_Duration']]

In [9]:
'''
Preprocessing features
#:'Decision_Duration','TOTAL_WORKER_POSITIONS','ANNUAL_WAGE'
Categorical: 'CASE_STATUS','SUPPORT_H1B','EMPLOYER_NAME','AGENT_ATTORNEY_CITY','WORKSITE_WORKERS','RECEIVED_DATE']]
'''
cat_columns = ['CASE_STATUS','Decision_Duration','SUPPORT_H1B','EMPLOYER_NAME','AGENT_ATTORNEY_CITY','WORKSITE_WORKERS','RECEIVED_DATE']
for col in cat_columns:
    label_encoder = LabelEncoder()
    selectdf[col] = label_encoder.fit_transform(selectdf[col])

selectdf.head(5)


Unnamed: 0,CASE_STATUS,ANNUAL_WAGE,SUPPORT_H1B,EMPLOYER_NAME,AGENT_ATTORNEY_CITY,WORKSITE_WORKERS,TOTAL_WORKER_POSITIONS,RECEIVED_DATE,Decision_Duration
0,0,95118.0,4,89786,494,82,1,223,1349
1,0,81120.0,2,47200,1296,82,1,223,1442
2,0,81120.0,2,55202,2742,82,1,223,1345
3,0,110240.0,2,150953,3155,82,1,223,1349
4,0,65333.0,2,53537,3155,82,1,223,1351


In [10]:
selectdf.CASE_STATUS.value_counts()

CASE_STATUS
0    3292311
1     184634
3      64014
2      21777
Name: count, dtype: int64

In [11]:
#target variable and the selected features from above
X = selectdf[['RECEIVED_DATE','Decision_Duration', 'TOTAL_WORKER_POSITIONS', 'ANNUAL_WAGE', 'SUPPORT_H1B', 'EMPLOYER_NAME',
        'AGENT_ATTORNEY_CITY', 'WORKSITE_WORKERS', 'RECEIVED_DATE']]
y = selectdf['CASE_STATUS']

# Standardize numerical features
scaler = StandardScaler()
X[['Decision_Duration', 'TOTAL_WORKER_POSITIONS', 'ANNUAL_WAGE']] = scaler.fit_transform(X[['Decision_Duration', 'TOTAL_WORKER_POSITIONS', 'ANNUAL_WAGE']])

# Split dataset into train and test sets (80/20)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


## Logistic Regression

In [None]:
#Setting up model
lgmodel = LogisticRegression(max_iter=100, solver='saga', class_weight='balanced')
lgmodel.fit(X_train, y_train)

# Predictions
y_pred = lgmodel.predict(X_test)


In [None]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

accuracy, report

## Neural Network

In [12]:
# Define the neural network architecture
model_nn = Sequential([
    Dense(32, activation='relu', input_shape=(X.shape[1],)),
    Dropout(0.2),  # Regularization to prevent overfitting
    Dense(16, activation='relu'),  
    Dropout(0.2),
    Dense(1, activation='sigmoid')  
])

2025-02-12 09:50:19.023346: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1929] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 10534 MB memory:  -> device: 0, name: NVIDIA GeForce GTX 1080 Ti, pci bus id: 0000:8a:00.0, compute capability: 6.1


In [None]:
#model
model_nn.compile(optimizer=Adam(learning_rate=0.001),
                 loss='binary_crossentropy',
                 metrics=['accuracy'])

# Train the model
history = model_nn.fit(X_train, y_train, epochs=20, batch_size=32, validation_split=0.2, verbose=1)

Epoch 1/20


2025-02-12 09:50:28.702167: I external/local_xla/xla/service/service.cc:168] XLA service 0x7f671862ecb0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2025-02-12 09:50:28.702218: I external/local_xla/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA GeForce GTX 1080 Ti, Compute Capability 6.1
2025-02-12 09:50:28.716123: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2025-02-12 09:50:28.751043: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:454] Loaded cuDNN version 8902
I0000 00:00:1739353828.877089   23931 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.




In [None]:
# Evaluate the model
test_loss, test_accuracy = model_nn.evaluate(X_test, y_test, verbose=0)

test_accuracy


In [None]:
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc

# Get probability predictions from the neural network
y_prob_nn = model_nn.predict(X_test).flatten()

# Compute ROC-AUC Score
roc_auc_nn = roc_auc_score(y_test, y_prob_nn)

# Compute Precision-Recall AUC Score
precision_nn, recall_nn, _ = precision_recall_curve(y_test_opt, y_prob_nn)
pr_auc_nn = auc(recall_nn, precision_nn)