## 读取数据

In [1]:
import pandas as pd
import numpy as np
import json
import re
from tqdm import tqdm

In [2]:
train = json.load(open('dataset/train.json'))
test = json.load(open('dataset/test.json'))
job_list = json.load(open('dataset/job_list.json'))

In [3]:
len(job_list), len(train), len(test)

(51, 20000, 6500)

In [4]:
job_list[15]

{'positionName': '77 102 113 102 14859143 14846102 14856354 14725004 14860175',
 'positionDescription': '54 14976902 14846085 14925705 15710888 14924977 15640254 14720169 15706553 15706534 15252363 14788542 14781097 15710888 14924977 14720387 15637383 14792325 14854539 14782359 14784945 14784899 14849419 14847384 15697082 15706553 15706534 15252380 53 14976902 14783927 14844318 15704195 15698820 14924964 14722950 15252363 14859953 14854539 14782359 14720191 14859667 14720387 14781831 14785448 14782848 14788260 15252380 52 14976902 15709080 15709092 15640254 14720169 14720387 15638449 14789042 14780320 14791308 15706258 14849419 14858370 15704454 14720387 14781831 14785448 14783400 14781318 14976901',
 'positionRequirements': '54 14976902 15630474 14714302 15638181 14788518 14785451 14721174 14846093 14924962 14925709 14854817 14846337 15252363 15706534 14723472 14785469 14976902 15632285 14923942 14976902 14923942 14780072 14856354 14725004 14723726 14720191 14844596 14925716 14925725 

In [5]:
train[0]

{'positionName': '77 102 113 102 14859143 14846102 14856354 14725004 14860175',
 'resumeData': {'profileEduExps': [{'schoolLevel': ['62 63 50', '53 54 54'],
    'schoolOriginal': '14925738 14857910 14853024 14854817',
    'courses': '',
    'education': '14722706 14853292',
    'majorOriginal': '14717874 14854807 14925705 15632285 14923942 14856354 14725004',
    'endTimeOriginal': '53 55 53 53 14859955 55 49 14785423',
    'duration': '53 54',
    'educationOriginal': '14722706 14853292',
    'major': '14717874 14854807 14925705 15632285 14923942 14856354 14725004',
    'school': '14925738 14857910 14853024 14854817',
    'startTimeOriginal': '53 55 53 55 14859955 55 62 14785423',
    'GPA': '',
    'ranking': '',
    'startTime': '53 55 53 55 42 55 62',
    'endTime': '53 55 53 53 42 55 49',
    'department': '14717874 14854807 14925705 14923942 14780072 14856354 14725004 14854817 15638181'},
   {'schoolLevel': ['14844587 14847385 15641759 14788518'],
    'schoolOriginal': '14792344 

In [6]:
train[0].keys()

dict_keys(['positionName', 'resumeData', 'positionID', 'resumeRecordId'])

In [7]:
train[0]['resumeData'].keys()

dict_keys(['profileEduExps', 'profileSocialExps', 'profileLanguage', 'profileProjectExps', 'profileSkills', 'profileAwards', 'profileWorkExps', 'profileDesire'])

In [8]:
train[0]['resumeData']['profileDesire']

{'expectFunction': [],
 'expectIndustry': [],
 'expectIndustryOriginal': [],
 'expectPlace': [],
 'currentStatusOriginal': '',
 'expectSalary': ''}

## 特征工程

In [9]:
# 正则表达式模式，匹配单词，其中单词由空格、标点符号或字符串的开始和结束进行分割
pattern = re.compile(r'\b\w+\b')

train_feat = []
for train_sample in tqdm(train):
    cv_sample_word = {}
    for key in ['profileEduExps', 'profileSocialExps', 'profileLanguage', 'profileProjectExps', 'profileSkills', 'profileAwards', 'profileWorkExps', 'profileDesire']:
        cv_sample_word[key] = pattern.findall(str(train_sample['resumeData'][key]))
                           
    for job_sample in job_list:
        positionName_word = re.findall(pattern, job_sample['positionName'])
        positionDescription_word = pattern.findall(job_sample['positionDescription'])
        positionRequirements_word = pattern.findall(job_sample['positionRequirements'])

        feat = [
            len(train_sample['resumeData']['profileEduExps']),
        ]

        for key in ['profileEduExps', 'profileSocialExps', 'profileLanguage', 'profileProjectExps', 'profileSkills', 'profileAwards', 'profileWorkExps', 'profileDesire']:
            feat.append(len(set(cv_sample_word[key]) & set(positionName_word)))
            feat.append(len(set(cv_sample_word[key]) & set(positionDescription_word)))
            feat.append(len(set(cv_sample_word[key]) & set(positionRequirements_word)))

            feat.append(len(set(cv_sample_word[key]) & set(positionName_word)) / len(set(positionName_word)))
            feat.append(len(set(cv_sample_word[key]) & set(positionDescription_word)) / len(set(positionDescription_word)))
            feat.append(len(set(cv_sample_word[key]) & set(positionRequirements_word)) / len(set(positionRequirements_word)))

            feat.append(len(set(cv_sample_word[key]) & set(positionName_word)) / (len(set(cv_sample_word[key])) + 1))
            feat.append(len(set(cv_sample_word[key]) & set(positionDescription_word)) / (len(set(cv_sample_word[key])) + 1))
            feat.append(len(set(cv_sample_word[key]) & set(positionRequirements_word)) / (len(set(cv_sample_word[key])) + 1))


        if train_sample['positionID'] == job_sample['positionID']:
            feat.append(1)
        else:
            feat.append(0)


        train_feat.append(feat)

100%|██████████| 20000/20000 [10:55<00:00, 30.50it/s]


In [10]:
train_feat = np.array(train_feat).astype(float)

## 模型训练与验证

In [11]:
from sklearn.linear_model import SGDClassifier

model = SGDClassifier(loss='log_loss')
model.fit(train_feat[:-20000, :-1], train_feat[:-20000, -1])

In [12]:
model.score(train_feat[-20000:, :-1], train_feat[-20000:, -1])

0.9804

In [15]:
from lightgbm import LGBMClassifier

model = LGBMClassifier()
model.fit(train_feat[:-20000, :-1], train_feat[:-20000, -1])

[LightGBM] [Info] Number of positive: 19607, number of negative: 980393
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.084207 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11475
[LightGBM] [Info] Number of data points in the train set: 1000000, number of used features: 73
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.019607 -> initscore=-3.912067
[LightGBM] [Info] Start training from score -3.912067


In [16]:
model.score(train_feat[-20000:, :-1], train_feat[-20000:, -1])

0.98045

# 测试集预测

In [17]:
test_submit = []
for train_sample in tqdm(test):
    cv_sample_word = {}
    for key in ['profileEduExps', 'profileSocialExps', 'profileLanguage', 'profileProjectExps', 'profileSkills', 'profileAwards', 'profileWorkExps', 'profileDesire']:
        cv_sample_word[key] = pattern.findall(str(train_sample['resumeData'][key]))

    test_feat = []
    for job_sample in job_list:
        positionName_word = re.findall(pattern, job_sample['positionName'])
        positionDescription_word = pattern.findall(job_sample['positionDescription'])
        positionRequirements_word = pattern.findall(job_sample['positionRequirements'])

        feat = [
            train_sample['resumeRecordId'],
            job_sample['positionID'],
            len(train_sample['resumeData']['profileEduExps']),
        ]

        for key in ['profileEduExps', 'profileSocialExps', 'profileLanguage', 'profileProjectExps', 'profileSkills', 'profileAwards', 'profileWorkExps', 'profileDesire']:
            feat.append(len(set(cv_sample_word[key]) & set(positionName_word)))
            feat.append(len(set(cv_sample_word[key]) & set(positionDescription_word)))
            feat.append(len(set(cv_sample_word[key]) & set(positionRequirements_word)))

            feat.append(len(set(cv_sample_word[key]) & set(positionName_word)) / len(set(positionName_word)))
            feat.append(len(set(cv_sample_word[key]) & set(positionDescription_word)) / len(set(positionDescription_word)))
            feat.append(len(set(cv_sample_word[key]) & set(positionRequirements_word)) / len(set(positionRequirements_word)))

            feat.append(len(set(cv_sample_word[key]) & set(positionName_word)) / (len(set(cv_sample_word[key])) + 1))
            feat.append(len(set(cv_sample_word[key]) & set(positionDescription_word)) / (len(set(cv_sample_word[key])) + 1))
            feat.append(len(set(cv_sample_word[key]) & set(positionRequirements_word)) / (len(set(cv_sample_word[key])) + 1))
        
        test_feat.append(feat)

    test_feat = np.array(test_feat)
    test_submit.append(
        [
            train_sample['resumeRecordId'],
            test_feat[model.predict_proba(test_feat[:, 2:].astype(float))[:, 1].argmax(), 1]
        ]
    )

100%|██████████| 6500/6500 [04:12<00:00, 25.71it/s]


In [18]:
test_submit = pd.DataFrame(test_submit)

In [19]:
test_submit.columns = ['resumeRecordId', 'positionID']

In [20]:
test_submit.to_csv('submit.csv', index=None)