# Import necessary libraries

In [1]:
# Common libraries
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import copy

In [2]:
# Scikit-learn
import sklearn as skl
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import confusion_matrix, roc_auc_score
from sklearn import svm

In [3]:
# Torch
import torch
import torch.nn as nn
import torch.optim as optim

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Data Preparation

In [5]:
# Path to the data
data_folder = "/content/drive/MyDrive/ML - Bài tập nhóm/Datasets"

# -------> Change name of dataset HERE <-------
data_file = "job_descriptions.csv"

# Join path
data_path = os.path.join(data_folder, data_file)
data_path

'/content/drive/MyDrive/ML - Bài tập nhóm/Datasets/job_descriptions.csv'

In [6]:
df = pd.read_csv(data_path)

In [7]:
df.columns

Index(['Job Id', 'Experience', 'Qualifications', 'Salary Range', 'location',
       'Country', 'latitude', 'longitude', 'Work Type', 'Company Size',
       'Job Posting Date', 'Preference', 'Contact Person', 'Contact',
       'Job Title', 'Role', 'Job Portal', 'Job Description', 'Benefits',
       'skills', 'Responsibilities', 'Company', 'Company Profile'],
      dtype='object')

In [8]:
df.head()

Unnamed: 0,Job Id,Experience,Qualifications,Salary Range,location,Country,latitude,longitude,Work Type,Company Size,...,Contact,Job Title,Role,Job Portal,Job Description,Benefits,skills,Responsibilities,Company,Company Profile
0,1089843540111562,5 to 15 Years,M.Tech,$59K-$99K,Douglas,Isle of Man,54.2361,-4.5481,Intern,26801,...,001-381-930-7517x737,Digital Marketing Specialist,Social Media Manager,Snagajob,Social Media Managers oversee an organizations...,"{'Flexible Spending Accounts (FSAs), Relocatio...","Social media platforms (e.g., Facebook, Twitte...","Manage and grow social media accounts, create ...",Icahn Enterprises,"{""Sector"":""Diversified"",""Industry"":""Diversifie..."
1,398454096642776,2 to 12 Years,BCA,$56K-$116K,Ashgabat,Turkmenistan,38.9697,59.5563,Intern,100340,...,461-509-4216,Web Developer,Frontend Web Developer,Idealist,Frontend Web Developers design and implement u...,"{'Health Insurance, Retirement Plans, Paid Tim...","HTML, CSS, JavaScript Frontend frameworks (e.g...","Design and code user interfaces for websites, ...",PNC Financial Services Group,"{""Sector"":""Financial Services"",""Industry"":""Com..."
2,481640072963533,0 to 12 Years,PhD,$61K-$104K,Macao,"Macao SAR, China",22.1987,113.5439,Temporary,84525,...,9687619505,Operations Manager,Quality Control Manager,Jobs2Careers,Quality Control Managers establish and enforce...,"{'Legal Assistance, Bonuses and Incentive Prog...",Quality control processes and methodologies St...,Establish and enforce quality control standard...,United Services Automobile Assn.,"{""Sector"":""Insurance"",""Industry"":""Insurance: P..."
3,688192671473044,4 to 11 Years,PhD,$65K-$91K,Porto-Novo,Benin,9.3077,2.3158,Full-Time,129896,...,+1-820-643-5431x47576,Network Engineer,Wireless Network Engineer,FlexJobs,"Wireless Network Engineers design, implement, ...","{'Transportation Benefits, Professional Develo...",Wireless network design and architecture Wi-Fi...,"Design, configure, and optimize wireless netwo...",Hess,"{""Sector"":""Energy"",""Industry"":""Mining, Crude-O..."
4,117057806156508,1 to 12 Years,MBA,$64K-$87K,Santiago,Chile,-35.6751,-71.5429,Intern,53944,...,343.975.4702x9340,Event Manager,Conference Manager,Jobs2Careers,A Conference Manager coordinates and manages c...,"{'Flexible Spending Accounts (FSAs), Relocatio...",Event planning Conference logistics Budget man...,Specialize in conference and convention planni...,Cairn Energy,"{""Sector"":""Energy"",""Industry"":""Energy - Oil & ..."


In [None]:
df["degree"].value_counts()

bachelor    1984
master      1984
phd          247
Name: degree, dtype: int64

In [None]:
sensitive_features = ["gender"]

In [None]:
def one_hot_enc(dataset, sensitive=False):
  continuous = ["GPA"]

  enc = OneHotEncoder(handle_unknown='ignore', drop='if_binary')
  transformed = enc.fit_transform(dataset.drop(continuous, axis=1))

  feature_names = enc.get_feature_names_out([x for x in dataset.columns if x not in continuous])

  df_categorical = pd.DataFrame(transformed.toarray(), columns=feature_names, index=dataset.index)

  df_continuous = dataset[continuous]

  df_onehoted = pd.concat([df_continuous, df_categorical], axis=1)

  return df_onehoted

In [None]:
X_train_orig, X_test_orig, y_train_orig, y_test_orig = train_test_split(df.loc[:, df.columns != "decision"], df["decision"], test_size=0.2, random_state=100)
X_train_orig.head()

Unnamed: 0,gender,GPA,degree
1551,male,68.0,bachelor
1833,female,56.0,bachelor
3582,male,54.0,master
3041,male,59.0,bachelor
1993,male,60.0,bachelor


In [None]:
X_train_orig_no_sensitive = X_train_orig.drop(sensitive_features, axis=1)
X_test_orig_no_sensitive = X_test_orig.drop(sensitive_features, axis=1)
X_train_orig_no_sensitive.head()

Unnamed: 0,GPA,degree
1551,68.0,bachelor
1833,56.0,bachelor
3582,54.0,master
3041,59.0,bachelor
1993,60.0,bachelor


In [None]:
one_hot_enc(X_train_orig)

Unnamed: 0,GPA,gender_female,gender_male,gender_other,degree_bachelor,degree_master,degree_phd
1551,68.0,0.0,1.0,0.0,1.0,0.0,0.0
1833,56.0,1.0,0.0,0.0,1.0,0.0,0.0
3582,54.0,0.0,1.0,0.0,0.0,1.0,0.0
3041,59.0,0.0,1.0,0.0,1.0,0.0,0.0
1993,60.0,0.0,1.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...
4149,55.0,0.0,1.0,0.0,0.0,1.0,0.0
1890,63.0,0.0,1.0,0.0,1.0,0.0,0.0
350,61.0,0.0,1.0,0.0,0.0,1.0,0.0
79,56.0,0.0,1.0,0.0,1.0,0.0,0.0


In [None]:
X_train = np.array(one_hot_enc(X_train_orig).values)
X_test = np.array(one_hot_enc(X_test_orig).values)
y_train = np.array(y_train_orig.values)
y_test = np.array(y_test_orig.values)

y_train

array([False, False, False, ...,  True,  True,  True])

In [None]:
X_train_no_sensitive = np.array(one_hot_enc(X_train_orig_no_sensitive, sensitive=True).values)
X_test_no_sensitive = np.array(one_hot_enc(X_test_orig_no_sensitive, sensitive=True).values)

# Model Definition

Input:

X_train, X_test, y_train, y_test: numpy array

X_train_no_sensitive, X_test_no_sensitive: numpy array (without sensitive features)

Output:

y_prob, y_prob_no_sensitive: numpy array

## Basic Deep Neural Network

In [None]:
# Model definition
model = nn.Sequential(
    nn.Linear(29, 12).double(),
    nn.ReLU().double(),
    nn.Linear(12, 6).double(),
    nn.ReLU().double(),
    nn.Linear(6, 1).double(),
    nn.Sigmoid()
)

print(model)

# Model definition
model_no_sensitive = nn.Sequential(
    nn.Linear(22, 12).double(),
    nn.ReLU().double(),
    nn.Linear(12, 6).double(),
    nn.ReLU().double(),
    nn.Linear(6, 1).double(),
    nn.Sigmoid()
)

print(model_no_sensitive)

Sequential(
  (0): Linear(in_features=29, out_features=12, bias=True)
  (1): ReLU()
  (2): Linear(in_features=12, out_features=6, bias=True)
  (3): ReLU()
  (4): Linear(in_features=6, out_features=1, bias=True)
  (5): Sigmoid()
)
Sequential(
  (0): Linear(in_features=22, out_features=12, bias=True)
  (1): ReLU()
  (2): Linear(in_features=12, out_features=6, bias=True)
  (3): ReLU()
  (4): Linear(in_features=6, out_features=1, bias=True)
  (5): Sigmoid()
)


In [None]:
# Loss function
criterion = nn.BCELoss()  # binary cross entropy
optimizer = optim.Adam(model.parameters(), lr=0.001)
optimizer_no_sensitive = optim.Adam(model_no_sensitive.parameters(), lr=0.001)

## Basic Logistic Regression

In [None]:
clf = LogisticRegression(random_state=0)
clf.fit(X_train, y_train)
y_prob = clf.predict_proba(X_test)[:, 1]

In [None]:
clf_no_sensitive = LogisticRegression(random_state=0)
clf_no_sensitive.fit(X_train_no_sensitive, y_train)
y_prob_no_sensitive = clf_no_sensitive.predict_proba(X_test_no_sensitive)[:, 1]

## Decision tree - Kat

Remember the input and output forms are above

In [None]:
# Input:

# X_train, X_test, y_train, y_test: numpy array

# X_train_no_sensitive, X_test_no_sensitive: numpy array (without sensitive features)

# Output:

# y_prob, y_prob_no_sensitive: numpy array

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

In [None]:
# clf = DecisionTreeClassifier()
# Accuracy: 0.6737841043890866

In [None]:
# clf = DecisionTreeClassifier(criterion="entropy", max_depth=3)
# Accuracy: 0.7022538552787663
# Note: log_loss yield same accuracy

In [None]:
# clf.fit(X_train, y_train)
# y_pred = clf.predict(X_test)
# print(y_pred)

# print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

In [None]:
param_dist = {'n_estimators': randint(50,500),
              'max_depth': randint(1,20)}

# Create a random forest classifier
clf = RandomForestClassifier(criterion="entropy")

# Use random search to find the best hyperparameters
rand_search = RandomizedSearchCV(clf,
                                 param_distributions = param_dist,
                                 n_iter=5,
                                 cv=5)

rand_search.fit(X_train, y_train)

best_rf = rand_search.best_estimator_

print('Best hyperparameters:',  rand_search.best_params_)

Best hyperparameters: {'max_depth': 8, 'n_estimators': 480}


In [None]:
y_pred = best_rf.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.6951364175563464


## Support vector machine - Andy

Remember the input and output forms are above

In [None]:
from sklearn.metrics import accuracy_score
from sklearn import metrics
from sklearn.model_selection import GridSearchCV

In [None]:
# param_grid = {'C': [10, 100, 1000],
#               'gamma': [0.001, 0.01, 0.1, 1]}

# grid_search = GridSearchCV(svm.SVC(kernel='rbf'), param_grid, cv=5)
# grid_search.fit(X_train, y_train)

In [None]:
# best_params = grid_search.best_params_

# print("Best C:", best_params['C'])
# print("Best gamma:", best_params['gamma'])

# best_model = grid_search.best_estimator_

# y_pred = best_model.predict(X_test)
# y_prob = best_model.decision_function(X_test)

# accuracy = accuracy_score(y_test, y_pred)
# print("Accuracy:", accuracy)

# Best C: 100
# Best gamma: 0.01

In [None]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

In [None]:
clf = svm.SVC(kernel='rbf', C=100, gamma=0.01)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
dist = clf.decision_function(X_test)
y_prob = sigmoid(dist)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

NameError: ignored

In [None]:
clf_no_sensitive = svm.SVC(kernel='rbf', C=100, gamma=0.01)
clf_no_sensitive.fit(X_train_no_sensitive, y_train)

y_pred = clf_no_sensitive.predict(X_test_no_sensitive)
dist = clf_no_sensitive.decision_function(X_test_no_sensitive)
y_prob_no_sensitive = sigmoid(dist)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

In [None]:
param_grid = {'C': [10, 100],
              'gamma': [0.001, 0.01, 0.1]}

grid_search = GridSearchCV(svm.SVC(kernel='rbf'), param_grid, cv=5)
grid_search.fit(X_train_no_sensitive, y_train)
best_params = grid_search.best_params_

print("Best C:", best_params['C'])
print("Best gamma:", best_params['gamma'])

best_model = grid_search.best_estimator_

y_pred = best_model.predict(X_test_no_sensitive)
y_prob = best_model.decision_function(X_test_no_sensitive)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

## another model of your choice

Remember whatever model you want to use, the input and output forms are above

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
lr_list = [0.025, 0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1]
for learning_rate in lr_list:
  clf = GradientBoostingClassifier(n_estimators=100, learning_rate=learning_rate, max_depth=3, random_state=0)
  clf.fit(X_train, y_train)

  y_pred = clf.predict(X_test)
  print("Learning Rate: ", learning_rate)
  print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

# Learning Rate:  0.05
# Accuracy: 0.7105575326215896
# Learning Rate:  0.075
# Accuracy: 0.7046263345195729
# Learning Rate:  0.1
# Accuracy: 0.6986951364175563
# Learning Rate:  0.25
# Accuracy: 0.6927639383155397
# Learning Rate:  0.5
# Accuracy: 0.6832740213523132
# Learning Rate:  0.75
# Accuracy: 0.6856465005931198
# Learning Rate:  1
# Accuracy: 0.685646500593119

Learning Rate:  0.025
Accuracy: 0.7069988137603797
Learning Rate:  0.05
Accuracy: 0.7105575326215896
Learning Rate:  0.075
Accuracy: 0.7046263345195729
Learning Rate:  0.1
Accuracy: 0.6986951364175563
Learning Rate:  0.25
Accuracy: 0.6927639383155397
Learning Rate:  0.5
Accuracy: 0.6832740213523132
Learning Rate:  0.75
Accuracy: 0.6856465005931198
Learning Rate:  1
Accuracy: 0.6856465005931198


In [None]:
lr_list = [0.025, 0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1]
for learning_rate in lr_list:
  clf = GradientBoostingClassifier(n_estimators=100, learning_rate=learning_rate, max_depth=3, random_state=0)
  clf.fit(X_train_no_sensitive, y_train)

  y_pred = clf.predict(X_test_no_sensitive)
  print("Learning Rate: ", learning_rate)
  print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

Learning Rate:  0.025
Accuracy: 0.6797153024911032
Learning Rate:  0.05
Accuracy: 0.6797153024911032
Learning Rate:  0.075
Accuracy: 0.6761565836298933
Learning Rate:  0.1
Accuracy: 0.6785290628706999
Learning Rate:  0.25
Accuracy: 0.6737841043890866
Learning Rate:  0.5
Accuracy: 0.6702253855278766
Learning Rate:  0.75
Accuracy: 0.6666666666666666
Learning Rate:  1
Accuracy: 0.6666666666666666


# Training

Including concatenate prediction to original test data

In [None]:
n_epochs = 2000
batch_size = 10

In [None]:
def training(X, y, model=model, optimizer=optimizer, n_epochs=2000):
  for layer in model.children():
   if hasattr(layer, 'reset_parameters'):
       layer.reset_parameters()

  for epoch in range(n_epochs):
    # rest of your training code

    # each batch for training
    Xtorch = X
    ytorch = y.unsqueeze(1)

    optimizer.zero_grad()
    # Get the output of current training step
    cur_pred = model(Xtorch)

    # Get the loss
    loss = criterion(cur_pred, ytorch)

    #torch.nn.utils.clip_grad_norm_(model.parameters(), 5)

    # Train
    loss.backward()
    optimizer.step()

    #print(model[0].weight)
    # Add to loss of current epoch
    if epoch % 50:
      continue
    print(f'Finished epoch {epoch}, latest loss {loss.item()}')

  return model

In [None]:
# compute prediction (no_grad is optional)
model_torch = training(X=X_train, y=y_train, model=model, optimizer=optimizer, n_epochs=2000)
with torch.no_grad():
    y_pred = model_torch(X_test)

y_pred_df = pd.DataFrame(y_pred, columns=["probability"], index=X_test_orig.index)

test_orig = pd.concat([X_test_orig, y_test_orig, y_pred_df], axis=1)

test_orig.to_csv(os.path.join(data_folder, "model_output.csv"))

test_orig

In [None]:
# compute prediction (no_grad is optional)
model_torch_no_sensitive = training(X=X_train_no_sensitive, y=y_train, model=model_no_sensitive, optimizer=optimizer_no_sensitive, n_epochs=2000)
with torch.no_grad():
    y_pred = model_torch_no_sensitive(X_test_no_sensitive)

y_pred_df_no_sensitive = pd.DataFrame(y_pred, columns=["probability"], index=X_test_orig.index)

test_orig_no_sensitive = pd.concat([X_test_orig, y_test_orig, y_pred_df_no_sensitive], axis=1)

test_orig_no_sensitive.to_csv(os.path.join(data_folder, "model_no_sensitive_output.csv"))

test_orig_no_sensitive

# Data Combination for Evaluation

# New Section

In [None]:
y_prob_df = pd.DataFrame(y_prob, columns=["probability"], index=y_test_orig.index)
df_comb = pd.concat([X_test_orig, y_test_orig, y_prob_df], axis=1)

df_comb.to_csv(os.path.join(data_folder, "final_job_output.csv"))

In [None]:
y_prob_df_no_sensitive = pd.DataFrame(y_prob_no_sensitive, columns=["probability"], index=y_test_orig.index)
df_comb_no_sensitive = pd.concat([X_test_orig, y_test_orig, y_prob_df_no_sensitive], axis=1)

df_comb_no_sensitive.to_csv(os.path.join(data_folder, "final_job_output_no_sensitive.csv"))