# Perform the ranking tasks

- randomly select 20 candidate jobs and rank them.

In [204]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm
import random
import warnings
warnings.filterwarnings("ignore")

from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import accuracy_score, roc_auc_score, recall_score, precision_score, f1_score, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB

In [317]:
def show_result(y_true, y_prob):
    y_prediction = [0 if i<=0.5 else 1 for i in y_prob]
    report = classification_report(y_true,y_prediction,digits=4)
    report = report.splitlines()
    columns = ['class'] + report[0].split()
    col_1, col_2, col_3, col_4, col_5 = [], [], [], [], []
    for row in report[1:]:
        if len(row.split()) != 0:
            row = row.split()
            if len(row) < 5:
                col_1.append(row[0])
                col_2.append('')
                col_3.append('')
                col_4.append(row[1])
                col_5.append(row[2])
            elif len(row) > 5:
                col_1.append(row[0] + ' ' + row[1])
                col_2.append(row[2])
                col_3.append(row[3])
                col_4.append(row[4])
                col_5.append(row[5])
            else:
                col_1.append(row[0])
                col_2.append(row[1])
                col_3.append(row[2])
                col_4.append(row[3])
                col_5.append(row[4])
    col_1.append("overall")
    col_2.append(precision_score(y_true, y_prediction))
    col_3.append(recall_score(y_true, y_prediction))
    col_4.append(f1_score(y_true, y_prediction))
    col_5.append(roc_auc_score(y_true, y_prob))
    result = pd.DataFrame()
    result[columns[0]] = col_1
    result[columns[1]] = col_2
    result[columns[2]] = col_3
    result[columns[3]] = col_4
    result[columns[4]] = col_5
    print("——————Test——————")
    print(result)

In [2]:
user_set = pd.read_csv("user_set_cleaned.csv")
job_set = pd.read_csv("job_set_cleaned.csv")
work_history = pd.read_csv("work_history_cleaned.csv")
dataset = pd.read_csv("dataset_cleaned.csv")

In [4]:
X_train = np.load("X_train.npy")
Y_train = np.load("Y_train.npy")
X_test = np.load("X_test.npy")
Y_test = np.load("Y_test.npy")

# 1. Build datasets

In [37]:
# about 1 min
job_set = job_set.fillna(" ")
job_set["word"] = job_set.Title + job_set.Description + job_set.Requirements
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=5, max_features=100, stop_words='english')
tfidf_matrix = tf.fit_transform(job_set['word'])

In [38]:
word_history_tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, max_features=50, stop_words='english')
word_history_tf_matrix = word_history_tf.fit_transform(work_history.groupby("UserID").JobTitle.sum().values)

In [39]:
test_user = user_set[user_set.Split=="Test"].UserID.values
test_data = dataset[dataset.UserID.isin(test_user)]

In [251]:
ranking_data = pd.DataFrame(columns = ["UserID","JobID","label", "City", "State"])
job_id = job_set.JobID.unique().tolist()
groups = test_data.groupby("UserID")
user_ids = []
job_ids = []
labels = []
City = []
State = []
for idx, group in tqdm(groups):
    size = 99
    exist_job = group.JobID.unique().tolist()
    candidate_job = [i for i in job_id if i not in exist_job ]
    sample_job = random.sample(range(0,len(candidate_job)),size)
    user_ids.extend([idx] * (size+1))
    job_ids.append(exist_job[0])
    job_ids.extend([candidate_job[i] for i in sample_job])
    labels.append(1)
    labels.extend([0] * (size))
    City.append(group.City.values[0])
    State.append(group.State.values[0])
    jobs = job_set[job_set.JobID.isin([candidate_job[i] for i in sample_job])]
    
    City.extend([0 if i!=group.City.values[0] else a for i in jobs.City.values.tolist()])
    State.extend([0 if i!=group.State.values[0] else a for i in jobs.State.values.tolist()])
    
ranking_data.UserID = user_ids
ranking_data.JobID = job_ids
ranking_data.label = labels
ranking_data.City = City
ranking_data.State = State
# ranking_data.to_csv("ranking_data.csv",index=False)

100%|██████████| 260/260 [00:04<00:00, 57.42it/s]


# 2. Define the evaluation function

In [246]:
def test_hit_rate(model, N):
    hit = 0
    groups = ranking_data.groupby("UserID")
    for u_id, group in tqdm(groups):
        X = np.zeros((1,158))
        user = user_set[user_set.UserID==u_id][["DegreeType", "WorkHistoryCount", "TotalYearsExperience", "CurrentlyEmployed", 
                                                "ManagedOthers", "ManagedHowMany"]]
        u_idx = user.index.values[0]
        user_feature = np.concatenate((user.values, word_history_tf_matrix[u_idx,:].toarray()),axis=1)
        job_id_list = group.JobID.values
        jobs = job_set[job_set.JobID.isin(job_id_list)]
        j_idx = jobs.index.values
        f = []
        for i in j_idx:
            feature = np.concatenate((user_feature, tfidf_matrix[i,:].toarray()), axis=1).reshape(156,).tolist()
            f.append(feature)
        feature = np.concatenate((group[["City","State"]].values, np.array(f)),axis=1)
        X = np.concatenate((X, feature), axis=0)
        result = model.predict_proba(X[1:])
#         result = model.predict(X[1:])
        a = -np.sort(-result[:,1])
        idx = np.argwhere(a==result[0,1])[0][0]
        if idx <= N-1:
            hit += 1
    return hit/len(test_user)

# 3. Test models
- Random Forest

In [318]:
rf = RandomForestClassifier(random_state=0)
rf.fit(X_train,Y_train)
y_pred = rf.predict_proba(X_test)
show_result(Y_test, y_pred[:,1])

——————Test——————
          class precision    recall  f1-score   support
0             0    0.6395    0.6262    0.6328       527
1             1    0.6338    0.6471    0.6404       527
2      accuracy                        0.6366      1054
3     macro avg    0.6367    0.6366    0.6366      1054
4  weighted avg    0.6367    0.6366    0.6366      1054
5       overall  0.633829  0.647059  0.640376  0.702132


In [274]:
test_hit_rate(rf,1), test_hit_rate(rf,5), test_hit_rate(rf,10), test_hit_rate(rf,20)

100%|██████████| 260/260 [00:07<00:00, 32.61it/s]
100%|██████████| 260/260 [00:07<00:00, 32.54it/s]
100%|██████████| 260/260 [00:07<00:00, 32.87it/s]
100%|██████████| 260/260 [00:07<00:00, 33.57it/s]


(0.026923076923076925,
 0.12307692307692308,
 0.22692307692307692,
 0.4307692307692308)

- Linear Regression

In [319]:
def test_hit_rate_linearRegr(model, N):
    hit = 0
    groups = ranking_data.groupby("UserID")
    for u_id, group in tqdm(groups):
        X = np.zeros((1,158))
        user = user_set[user_set.UserID==u_id][["DegreeType", "WorkHistoryCount", "TotalYearsExperience", "CurrentlyEmployed", 
                                                "ManagedOthers", "ManagedHowMany"]]
        u_idx = user.index.values[0]
        user_feature = np.concatenate((user.values, word_history_tf_matrix[u_idx,:].toarray()),axis=1)
        job_id_list = group.JobID.values
        jobs = job_set[job_set.JobID.isin(job_id_list)]
        j_idx = jobs.index.values
        f = []
        for i in j_idx:
            feature = np.concatenate((user_feature, tfidf_matrix[i,:].toarray()), axis=1).reshape(156,).tolist()
            f.append(feature)
        feature = np.concatenate((group[["City","State"]].values, np.array(f)),axis=1)
        X = np.concatenate((X, feature), axis=0)
#         result = model.predict_proba(X[1:])
        result = model.predict(X[1:])
        a = -np.sort(-result)
        idx = np.argwhere(a==result[0])[0][0]
        if idx <= N-1:
            hit += 1
    return hit/len(test_user)
linear_r = LinearRegression()
linear_r.fit(X_train,Y_train)
y_pred = linear_r.predict(X_test)
show_result(Y_test, y_pred)

——————Test——————
          class precision    recall  f1-score   support
0             0    0.5418    0.5655    0.5534       527
1             1    0.5456    0.5218    0.5335       527
2      accuracy                        0.5436      1054
3     macro avg    0.5437    0.5436    0.5434      1054
4  weighted avg    0.5437    0.5436    0.5434      1054
5       overall  0.545635  0.521822  0.533463  0.548738


In [273]:
test_hit_rate_linearRegr(linear_r,1), test_hit_rate_linearRegr(linear_r,5), test_hit_rate_linearRegr(linear_r,10), test_hit_rate_linearRegr(linear_r, 20)

100%|██████████| 260/260 [00:04<00:00, 64.74it/s]
100%|██████████| 260/260 [00:03<00:00, 65.60it/s]
100%|██████████| 260/260 [00:03<00:00, 68.62it/s]
100%|██████████| 260/260 [00:03<00:00, 67.53it/s]


(0.007692307692307693, 0.1, 0.1576923076923077, 0.28846153846153844)

- Logistic Regression

In [320]:
lr = LogisticRegression()
lr.fit(X_train,Y_train)
y_pred = lr.predict_proba(X_test)
show_result(Y_test, y_pred[:,1])

——————Test——————
          class precision    recall  f1-score  support
0             0    0.5326    0.5579    0.5449      527
1             1    0.5359    0.5104    0.5228      527
2      accuracy                        0.5342     1054
3     macro avg    0.5342    0.5342    0.5339     1054
4  weighted avg    0.5342    0.5342    0.5339     1054
5       overall  0.535857  0.510436  0.522838  0.55033


In [271]:
test_hit_rate(lr,1), test_hit_rate(lr,5), test_hit_rate(lr,10), test_hit_rate(lr,20)

100%|██████████| 260/260 [00:03<00:00, 66.40it/s]
100%|██████████| 260/260 [00:03<00:00, 65.50it/s]
100%|██████████| 260/260 [00:03<00:00, 66.03it/s]
100%|██████████| 260/260 [00:04<00:00, 61.75it/s]


(0.007692307692307693, 0.09230769230769231, 0.1423076923076923, 0.3)

- Decision Tree

In [321]:
dt = DecisionTreeClassifier(max_leaf_nodes=1500,random_state=0)
dt.fit(X_train,Y_train)
y_pred = dt.predict_proba(X_test)
show_result(Y_test, y_pred[:,1])

——————Test——————
          class precision    recall  f1-score   support
0             0    0.6022    0.6205    0.6112       527
1             1    0.6086    0.5901    0.5992       527
2      accuracy                        0.6053      1054
3     macro avg    0.6054    0.6053    0.6052      1054
4  weighted avg    0.6054    0.6053    0.6052      1054
5       overall  0.608611  0.590133  0.599229  0.630908


In [270]:
test_hit_rate(dt,1), test_hit_rate(dt,5), test_hit_rate(dt,10), test_hit_rate(dt,20)

100%|██████████| 260/260 [00:04<00:00, 59.88it/s]
100%|██████████| 260/260 [00:03<00:00, 66.21it/s]
100%|██████████| 260/260 [00:03<00:00, 68.08it/s]
100%|██████████| 260/260 [00:04<00:00, 64.94it/s]


(0.03076923076923077,
 0.08076923076923077,
 0.16153846153846155,
 0.3038461538461538)

- Naive Bayes

In [322]:
nb = GaussianNB()
nb.fit(X_train,Y_train)
y_pred = nb.predict_proba(X_test)
show_result(Y_test, y_pred[:,1])

——————Test——————
          class precision   recall  f1-score   support
0             0    0.5125   0.5825    0.5453       527
1             1    0.5165   0.4459    0.4786       527
2      accuracy                       0.5142      1054
3     macro avg    0.5145   0.5142    0.5120      1054
4  weighted avg    0.5145   0.5142    0.5120      1054
5       overall  0.516484  0.44592  0.478615  0.530636


In [269]:
test_hit_rate(nb,1), test_hit_rate(nb,5), test_hit_rate(nb,10), test_hit_rate(nb,20)

100%|██████████| 260/260 [00:03<00:00, 66.77it/s]
100%|██████████| 260/260 [00:05<00:00, 45.99it/s]
100%|██████████| 260/260 [00:04<00:00, 62.23it/s]
100%|██████████| 260/260 [00:03<00:00, 67.78it/s]


(0.015384615384615385,
 0.08461538461538462,
 0.2076923076923077,
 0.36923076923076925)

- AdaBoost

In [324]:
ada = AdaBoostClassifier(random_state=0)
ada.fit(X_train,Y_train)
y_pred = ada.predict_proba(X_test)
show_result(Y_test, y_pred[:,1])

——————Test——————
          class precision    recall  f1-score   support
0             0    0.5228    0.5863    0.5528       527
1             1    0.5292    0.4649    0.4949       527
2      accuracy                        0.5256      1054
3     macro avg    0.5260    0.5256    0.5239      1054
4  weighted avg    0.5260    0.5256    0.5239      1054
5       overall  0.529158  0.464896  0.494949  0.534969


In [293]:
test_hit_rate(ada,1), test_hit_rate(ada,5), test_hit_rate(ada,10), test_hit_rate(ada,20)

100%|██████████| 260/260 [00:05<00:00, 44.07it/s]
100%|██████████| 260/260 [00:06<00:00, 38.27it/s]
100%|██████████| 260/260 [00:05<00:00, 43.52it/s]
100%|██████████| 260/260 [00:06<00:00, 40.42it/s]


(0.019230769230769232,
 0.12307692307692308,
 0.21153846153846154,
 0.3269230769230769)

- Gradient Boosting

In [284]:
gbdt = GradientBoostingClassifier(max_depth=10, random_state=0, verbose=1)
gbdt.fit(X_train,Y_train)
y_pred = gbdt.predict_proba(X_test)
show_result(Y_test, y_pred[:,1])

      Iter       Train Loss   Remaining Time 
         1           1.3806            4.76m
         2           1.3771            5.20m
         3           1.3720            5.37m
         4           1.3695            5.30m
         5           1.3652            5.15m
         6           1.3574            5.05m
         7           1.3522            4.97m
         8           1.3505            4.98m
         9           1.3450            4.93m
        10           1.3371            4.81m
        20           1.3059            4.25m
        30           1.2817            3.70m
        40           1.2592            3.21m
        50           1.2361            2.69m
        60           1.2155            2.15m
        70           1.1986            1.63m
        80           1.1873            1.09m
        90           1.1682           32.63s
       100           1.1548            0.00s
——————Test——————
          class precision    recall  f1-score   support
0             0    0.6278 

In [285]:
test_hit_rate(gbdt,1),  test_hit_rate(gbdt,5), test_hit_rate(gbdt,10), test_hit_rate(gbdt,20)

100%|██████████| 260/260 [00:04<00:00, 59.66it/s]
100%|██████████| 260/260 [00:04<00:00, 64.58it/s]
100%|██████████| 260/260 [00:03<00:00, 68.54it/s]
100%|██████████| 260/260 [00:03<00:00, 68.10it/s]


(0.038461538461538464, 0.15, 0.23461538461538461, 0.40384615384615385)