In [280]:
# as Data Scientist
import pandas as pd
import numpy as np

In [281]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [282]:
TRAIN_PATH = '/content/drive/MyDrive/Public/DS102 - Machine Learning/data/train.csv'
DEV_PATH = '/content/drive/MyDrive/Public/DS102 - Machine Learning/data/dev.csv'
TEST_PATH = '/content/drive/MyDrive/Public/DS102 - Machine Learning/data/test.csv'

JOB_LABELS_PATH = '/content/drive/MyDrive/Public/DS102 - Machine Learning/data/job_labels.csv'

# Load Data

In [283]:
train = pd.read_csv(TRAIN_PATH)
dev = pd.read_csv(DEV_PATH)
test = pd.read_csv(TEST_PATH)

# job labels
labels = pd.read_csv(JOB_LABELS_PATH)

In [284]:
X_train = train['description'] + ' ' + train['requirement'].fillna('')
X_dev = dev['description'] + ' ' + dev['requirement'].fillna('')
X_test = test['description'] + ' ' + test['requirement'].fillna('')

y_train = train['mapped_industry']
y_dev = dev['mapped_industry']
y_test = test['mapped_industry']

# Prepare data for training

In [285]:
labels = labels['0'].tolist()
num_labels = len(labels)

In [286]:
# Make onehot label
def create_onehot_labels(y):
    full_labels = []

    for val in y:
        seperated = val.split(' / ')
        lbl = np.zeros(num_labels)

        for i in range(0, num_labels):
            if labels[i] in seperated:
                lbl[i] = 1

        full_labels.append(lbl)

    return (full_labels)

# Return label
def return_label(y):
    y = y.flatten()
    job_labels = []
    for i in range(0, len(y)):
        if y[i] == 1:
            job_labels.append(labels[i])

    return job_labels

In [287]:
y_train_onehot = create_onehot_labels(y_train)
y_dev_onehot = create_onehot_labels(y_dev)
y_test_onehot = create_onehot_labels(y_test)

In [288]:
X_train = X_train.tolist()
X_dev = X_dev.tolist()
X_test = X_test.tolist()

In [289]:
X_full = X_train + X_dev + X_test

# Hyperparameters

In [290]:
MAX_WORDS = 5000
ANALYZER = 'word'

BATCH_SIZE = 16
EPOCH = 20

# TFIDF

In [291]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [292]:
tfidf = TfidfVectorizer(analyzer=ANALYZER, max_features=MAX_WORDS)
tfidf.fit(X_full)

In [293]:
X_train_ = tfidf.transform(X_train)
X_dev_ = tfidf.transform(X_dev)
X_test_ = tfidf.transform(X_test)

In [294]:
X_train_.shape, X_train_.shape, X_train_.shape

((10969, 5000), (10969, 5000), (10969, 5000))

# Machine Learning Models

In [295]:
from sklearn.linear_model import SGDClassifier # Stochastic Gradient Descent
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier

In [296]:
sgd = SGDClassifier()
lr = LogisticRegression(solver='lbfgs')
svc = LinearSVC()

In [297]:
classifiers = [sgd, lr, svc]
model_names = ['SGD', 'LR', 'SVC']

In [298]:
models = {}
for c, models_name in zip(classifiers, model_names):
    clf = OneVsRestClassifier(c)
    clf.fit(X_train_, y_train_onehot)
    models[models_name] = clf

# Evaluation

In [299]:
def accuracy_score(y_true, y_pred):
    temp = 0
    for i in range(0, len(y_true)):
        temp += sum(np.logical_and(y_true[i], y_pred[i])) / sum(np.logical_or(y_true[i], y_pred[i]))
    return temp / len(y_true)


def f1_score(y_true, y_pred):
    temp = 0
    for i in range(len(y_true)):
        if (sum(y_true[i]) == 0) and (sum(y_pred[i]) == 0):
            continue
        temp+= (2*sum(np.logical_and(y_true[i], y_pred[i]))) / (sum(y_true[i])+sum(y_pred[i]))
    return temp/ len(y_true)

def em_score(y_true, y_pred):
    MR = np.all(y_pred == y_true, axis=1).mean()
    return MR

In [300]:
def score(X, y, clf, name):
    y_pred = clf.predict(X)

    print('Model name: ', name)
    print('Accuracy score: ', accuracy_score(y, y_pred))
    print('F1 score: ', f1_score(y, y_pred))
    print('Em score: ', em_score(y, y_pred))

In [301]:
for name, clf in models.items():
    score(X_dev_, y_dev_onehot, clf, name)
    print()

Model name:  SGD
Accuracy score:  0.372665390342481
F1 score:  0.4574467438538901
Em score:  0.13592852584556478

Model name:  LR
Accuracy score:  0.32261752818549466
F1 score:  0.3990784635487872
Em score:  0.11455009572431397

Model name:  SVC
Accuracy score:  0.4199000212720697
F1 score:  0.5093452760810768
Em score:  0.16177409061901724



In [302]:
for name, clf in models.items():
    score(X_test_, y_test_onehot, clf, name)
    print()

Model name:  SGD
Accuracy score:  0.37257653061224466
F1 score:  0.45635781098153355
Em score:  0.13584183673469388

Model name:  LR
Accuracy score:  0.31205357142857076
F1 score:  0.38692450194363437
Em score:  0.10459183673469388

Model name:  SVC
Accuracy score:  0.4197704081632655
F1 score:  0.5132637876579164
Em score:  0.15051020408163265



# Error Analysis

In [303]:
dev_result_df = pd.DataFrame({'description': X_dev, 'true_label': y_dev.map(lambda x: x.split(' / '))})
test_result_df = pd.DataFrame({'description': X_test, 'true_label': y_test.map(lambda x: x.split(' / '))})

In [304]:
dev_sgd_pred = models['SGD'].predict(X_dev_)
dev_lr_pred = models['LR'].predict(X_dev_)
dev_svc_pred = models['SVC'].predict(X_dev_)

test_sgd_pred = models['SGD'].predict(X_test_)
test_lr_pred = models['LR'].predict(X_test_)
test_svc_pred = models['SVC'].predict(X_test_)

In [305]:
dev_sgd_pred = pd.Series([return_label(i) for i in dev_sgd_pred])
dev_lr_pred = pd.Series([return_label(i) for i in dev_lr_pred])
dev_svc_pred = pd.Series([return_label(i) for i in dev_svc_pred])

test_sgd_pred = pd.Series([return_label(i) for i in test_sgd_pred])
test_lr_pred = pd.Series([return_label(i) for i in test_lr_pred])
test_svc_pred = pd.Series([return_label(i) for i in test_svc_pred])

In [306]:
dev_result_df['predicted_label_SGD'] = dev_sgd_pred
dev_result_df['predicted_label_LR'] = dev_lr_pred
dev_result_df['predicted_label_SVC'] = dev_svc_pred

test_result_df['predicted_label_SGD'] = test_sgd_pred
test_result_df['predicted_label_LR'] = test_lr_pred
test_result_df['predicted_label_SVC'] = test_svc_pred

In [307]:
dev_result_df['count'] = dev_result_df['true_label'].map(lambda x: len(x))
test_result_df['count'] = test_result_df['true_label'].map(lambda x: len(x))

In [308]:
dev_result_df['sgd_correct'] = dev_result_df['true_label'] == dev_result_df['predicted_label_SGD']
dev_result_df['lr_correct'] = dev_result_df['true_label'] == dev_result_df['predicted_label_LR']
dev_result_df['svc_correct'] = dev_result_df['true_label'] == dev_result_df['predicted_label_SVC']

test_result_df['sgd_correct'] = test_result_df['true_label'] == test_result_df['predicted_label_SGD']
test_result_df['lr_correct'] = test_result_df['true_label'] == test_result_df['predicted_label_LR']
test_result_df['svc_correct'] = test_result_df['true_label'] == test_result_df['predicted_label_SVC']

In [309]:
dev_result_df[['count', 'sgd_correct', 'lr_correct', 'svc_correct']].groupby('count').sum()

Unnamed: 0_level_0,sgd_correct,lr_correct,svc_correct
count,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,213,174,193
2,61,53,78
3,21,18,44


In [310]:
test_result_df[['count', 'sgd_correct', 'lr_correct', 'svc_correct']].groupby('count').sum()

Unnamed: 0_level_0,sgd_correct,lr_correct,svc_correct
count,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,105,73,90
2,25,21,33
3,15,12,21
