In [2]:
import pandas as pd
import numpy as np
import json
from sklearn.feature_extraction.text import TfidfVectorizer 
from scipy.spatial.distance import cosine
from collections import defaultdict
import warnings
from IPython.display import clear_output
warnings.filterwarnings("ignore")


In [3]:
with open('train_data.json', 'r', encoding='utf-8') as file:
    train_data_json = json.load(file)
train_data = pd.DataFrame(train_data_json)

with open('test_data.json', 'r', encoding='utf-8') as file:
    test_data = json.load(file)

Функции в коде ниже принимают на вход или json формат или DataFrame по образу `train_data`. Формат прописан в документации функций.

# О решении

## Идея

Известные вакансии можно кластеризовать и для центра каждого кластера сделать отдельный классификатор, подходит ли вакансия. На вход этим классификатором подаётся схожесть между описаниями вакансий и резюме, названий вакансий и описанием резюме и т.д. Поскольку в наших данных всего 29 вакансий, то имеет смысл обучить классификатор под каждую из них без дополнительной кластеризации: она понадобиться при выводе решения в production. 

Когда приходит новая вакансия, считаются схожести её с известными нам вакансиями, и находятся подходящие классификаторы. Для каждого резюме мы используем обученные классификаторы k ближайших соседей новой вакансии. Ответы усредняются обратно-пропорционально расстоянию до этих объектов. Резюме считаем подходящим, если его score > заданного порога.

При этом, кандидатов можно ранжировать по score, чтобы сначала выдавать самых релевантных.


<a href="https://ibb.co/S0Mfd22"><img src="https://i.ibb.co/pZMhRgg/01-03-2024-08-31-16.png" alt="01-03-2024-08-31-16" border="0"></a>

<a href="https://ibb.co/G3N2DFN"><img src="https://i.ibb.co/Vp5SG35/01-03-2024-08-31-30.png" alt="01-03-2024-08-31-30" border="0"></a>



## Преимущества

- Модульное решение: можно менять

      1. классификаторы,
      2. кодировщики текста,
      3. модель, находящую ближайших соседей
      4. кластеризующую модель
- Настраиваемые гиперпараметры:
  
      1. гиперпараметры классификаторов
      2. количество ближайших соседей
      3. способ взвешивания scores моделей
      4. гиперпараметры кодировщика (к примеру, учитывание контекста)
- При условии схожести сферы использования (например, только IT-вакансии) малый размер модели 

# Создание корпуса текстов

Нужно создать корпус всех тренировочных текстов для обучения TF-IDF vectorizer. Дополнительно полезно сделать нормализацию и очистку текста, которые не делались в этом варианте кода.

In [7]:
def create_corpus(data):
    '''
    data: pd.DataFrame
    returns
    list of descriptions in vacancies and resumes
    '''
    corpus = []
    for vacancy in data.loc[:, 'vacancy']:
        if vacancy['description'] != None:
            corpus.append(vacancy['description'].strip())

        if vacancy['name'] != None:
            corpus.append(vacancy['name'].strip())

        if vacancy['keywords'] != None:
            corpus.append(vacancy['keywords'].strip())

    for failed_resume in data.loc[:, 'failed_resumes']:
        for item in failed_resume[0]['experienceItem']:
            if item['description'] != None:
                corpus.append(item['description'].strip())
            if item['position'] != None:
                corpus.append(item['position'].strip())
        
        if failed_resume[0]['key_skills'] != None:
            corpus.append(failed_resume[0]['key_skills'].strip())

    for confirmed_resume in data.loc[: , 'confirmed_resumes']:
        for item in confirmed_resume[0]['experienceItem']:
            if item['description'] != None:
                corpus.append(item['description'].strip())
            if item['position'] != None:
                corpus.append(item['position'].strip())
                
        if confirmed_resume[0]['key_skills'] != None:
            corpus.append(confirmed_resume[0]['key_skills'].strip())

    return corpus
    

corpus = create_corpus(train_data)
print('Всего записей в корпусе:', len(corpus))

Всего записей в корпусе: 556


In [8]:
# на экспериментах выяснилось, что ngram_range = (1,2) 
# лучше всего подходит для понимания контекста 

vectorizer = TfidfVectorizer(ngram_range = (1,2))
vectorizer.fit(corpus)

In [9]:
# 24101 слово в словаре кодификатора
len(vectorizer.vocabulary_)

24101

# Вспомогательные функции

In [10]:
def calculcate_cosine_simularity(sent1, sent2, vectorizer):
    '''
    sent1, sent2: str to encode
    vectorizer: class with transform method

    returns 
    float - cosine simularity
    '''
    vec1 = vectorizer.transform([sent1, sent2])[0]
    vec2 = vectorizer.transform([sent1, sent2])[1]
    vec1 = np.array(vec1.todense()).reshape(-1)
    vec2 = np.array(vec2.todense()).reshape(-1)

    cosine_simularity = 1 - cosine(vec1, vec2)
    del vec2, vec1 
    return cosine_simularity

# эвристическая функция, для полной функциональности 
# нужно провести анализ, какие вузы более предпочтительны
# для каждой вакансии
def check_university_level(university):
    '''
    university: str

    returns
    int 1 or 0 - is university among best universities 
    '''
    TOP_UNIVERSITIES = ['ВШЭ', 'МГУ', 'МФТИ', 
                        'МИЭМ', 'Финансовый университет', 'МИСИС']

    for top_university in TOP_UNIVERSITIES:
        if top_university in university:
            return 1
    return 0

Датасет резюме для каждой вакансии создаётся следующим образом:
1. Отдельно парсятся тексты описания резюме, должностей, навыков
2. Считается косинусная схожесть каждого из текстов с названием вакансии и её описанием
3. Добавляются доп. признаки: количество дней работы, возраст, является ли вуз топовым.

In [11]:
def create_resume_dataset(data, vectorizer):
    '''
    data: pd.Series 
    returns
    pd.Dataframe for particular vacancy (n_resumes x n_features)
    '''
    dataset = pd.DataFrame({})

    # rejected resumes
    for index, resume in enumerate(data['failed_resumes']):

        # age
        if resume['birth_date'] != None:
            dataset.loc[index, 'age'] = pd.to_datetime('2024-02-28').year - pd.to_datetime(resume['birth_date']).year
        else:
            dataset.loc[index, 'age'] = None

        
        # common for vacancy
        vacancy_desc = data['vacancy']['description']
        if data['vacancy']['keywords'] != None:
            vacancy_decs += ' ' +  data['vacancy']['keywords']
        vacancy_name = data['vacancy']['name']
        
            
        # skills-decription, skills-name simularity
        if resume['key_skills'] != None:
            skills = resume['key_skills']

            dataset.loc[index, 'skills-description_sim'] = calculcate_cosine_simularity(skills, vacancy_desc, vectorizer)
            dataset.loc[index, 'skills-name_sim'] = calculcate_cosine_simularity(skills, vacancy_name, vectorizer)
        else:
            dataset.loc[index, 'skills-description_sim'] = None
            dataset.loc[index, 'skills-name_sim'] = None

        # description-description, description-name simularity, days of working experience
        desc = ''
        num_experience = 0
        if 'experienceItem' in resume.keys():
            for item in resume['experienceItem']:
                if item['description'] != None:
                    desc += ' ' + item['description'].strip()
                    
                    start = pd.to_datetime(item['starts'])
                    end = pd.to_datetime(item['ends']) if item['ends'] != None else pd.to_datetime('2024-02-28')
                    num_experience += (end - start).days
    
            dataset.loc[index, 'description-description_sim'] = calculcate_cosine_simularity(desc, vacancy_desc, vectorizer)
            dataset.loc[index, 'description-name_sim'] = calculcate_cosine_simularity(desc, vacancy_name, vectorizer)
            dataset.loc[index, 'num_experience'] = num_experience
        else:
            dataset.loc[index, 'description-description_sim'] = None
            dataset.loc[index, 'description-name_sim'] = None
            dataset.loc[index, 'num_experience'] = None


        # name-name, name-description similarity
        positions = ''
        if 'experienceItem' in resume.keys():
            for item in resume['experienceItem']:
                if item['position'] != None:
                    positions += ' ' + item['position']
    
            dataset.loc[index, 'name-name_sim'] = calculcate_cosine_simularity(positions, vacancy_name, vectorizer)
            dataset.loc[index, 'name-description_sim'] = calculcate_cosine_simularity(positions, vacancy_desc, vectorizer)

        else:
            dataset.loc[index, 'name-name_sim'] = None
            dataset.loc[index, 'name-description_sim'] = None

        # education experience (is university among best universities)
        top_university = 0
        if 'educationItem' in resume.keys():
            for item in resume['educationItem']:
                if check_university_level(item['organization']):
                    top_university = 1            
        dataset.loc[index, 'university_level'] = top_university

        dataset.loc[index, 'target'] = 0

    # confirmed resumes
    for index, resume in enumerate(data['confirmed_resumes']):
        index += dataset.shape[0] #so that vacancies are not overwritten

        # age
        if resume['birth_date'] != None:
            dataset.loc[index, 'age'] = pd.to_datetime('2024-02-28').year - pd.to_datetime(resume['birth_date']).year
        else:
            dataset.loc[index, 'age'] = None

        
        # common for vacancy
        vacancy_desc = data['vacancy']['description']
        if data['vacancy']['keywords'] != None:
            vacancy_decs += ' ' +  data['vacancy']['keywords']
        vacancy_name = data['vacancy']['name']
        
            
        # skills-decription, skills-name simularity
        if resume['key_skills'] != None:
            skills = resume['key_skills']

            dataset.loc[index, 'skills-description_sim'] = calculcate_cosine_simularity(skills, vacancy_desc, vectorizer)
            dataset.loc[index, 'skills-name_sim'] = calculcate_cosine_simularity(skills, vacancy_name, vectorizer)
        else:
            dataset.loc[index, 'skills-description_sim'] = None
            dataset.loc[index, 'skills-name_sim'] = None

        # description-description, description-name simularity
        desc = ''
        num_experience = 0
        if 'experienceItem' in resume.keys():
            for item in resume['experienceItem']:
                if item['description'] != None:
                    desc += ' ' + item['description'].strip()
                    
                    start = pd.to_datetime(item['starts'])
                    end = pd.to_datetime(item['ends']) if item['ends'] != None else pd.to_datetime('2024-02-28')
                    num_experience += (end - start).days
    
            dataset.loc[index, 'description-description_sim'] = calculcate_cosine_simularity(desc, vacancy_desc, vectorizer)
            dataset.loc[index, 'description-name_sim'] = calculcate_cosine_simularity(desc, vacancy_name, vectorizer)
            dataset.loc[index, 'num_experience'] = num_experience
        else:
            dataset.loc[index, 'description-description_sim'] = None
            dataset.loc[index, 'description-name_sim'] = None
            dataset.loc[index, 'num_experience'] = None
            


        # name-name, name-description similarity
        positions = ''
        if 'experienceItem' in resume.keys():
            for item in resume['experienceItem']:
                if item['position'] != None:
                    positions += ' ' + item['position']
    
            dataset.loc[index, 'name-name_sim'] = calculcate_cosine_simularity(positions, vacancy_name, vectorizer)
            dataset.loc[index, 'name-description_sim'] = calculcate_cosine_simularity(positions, vacancy_desc, vectorizer)

        else:
            dataset.loc[index, 'name-name_sim'] = None
            dataset.loc[index, 'name-description_sim'] = None
            

        # education experience (is university among best universities)
        top_university = 0
        if 'educationItem' in resume.keys():
            for item in resume['educationItem']:
                if check_university_level(item['organization']):
                    top_university = 1 
        dataset.loc[index, 'university_level'] = top_university
        
        dataset.loc[index, 'target'] = 1

    return dataset

In [12]:
# как выглядит итоговый датасет для каждой вакансии 
# (на нём будут обучаться классификаторы) 

create_resume_dataset(train_data.loc[10], vectorizer).head()

Unnamed: 0,age,skills-description_sim,skills-name_sim,description-description_sim,description-name_sim,num_experience,name-name_sim,name-description_sim,university_level,target
0,,,,0.083785,0.083386,953.0,0.0,0.0,0.0,0.0
1,,,,0.009051,0.0,2037.0,0.0,0.0,0.0,0.0
2,35.0,0.026395,0.064787,0.149975,0.027516,4655.0,0.395897,0.049107,0.0,0.0
3,33.0,0.054423,0.055209,0.088955,0.070518,1153.0,0.200915,0.04262,0.0,0.0
4,24.0,0.037654,0.061845,0.079083,0.059952,910.0,0.1364,0.053961,0.0,0.0


In [13]:
# итоговая сборка датасетов для каждой вакансии
from tqdm.notebook import tqdm

def create_final_dataset(data, vectorizer):
    '''
    data: pd.DataFrame

    returns
    dict {vacancy uuid: resume dataset}
    '''
    dataset = {}
    for index in tqdm(range(data.shape[0])):
        uuid_vacancy = data.loc[index, 'vacancy']['uuid']
        dataset[uuid_vacancy] = create_resume_dataset(data.loc[index], vectorizer)

    return dataset
final_data = create_final_dataset(train_data, vectorizer)

  0%|          | 0/29 [00:00<?, ?it/s]

In [14]:
# создание датасета вакансий для нахождения ближайших соседей

def parse_vacancy(data):
    '''
    data: dict json-like
    '''
    dataset = pd.DataFrame({})
    for index, item in enumerate(data):
        name = item['vacancy']['name']
        desc = item['vacancy']['description']
        uuid = item['vacancy']['uuid']

        dataset.loc[index, ['uuid', 'name', 'description']] = uuid, name, desc
        dataset.loc[index, 'uuid'] = uuid

    return dataset
        
vacancy_dataset = parse_vacancy(train_data_json)
vacancy_dataset.to_csv('vacancy_dataset.csv')

In [15]:
vacancy_dataset.head()

Unnamed: 0,uuid,name,description
0,779f3a59-206a-3241-adc4-d7db504f960b,Java разработчик команда Инвестиции,Описание Мы расширяем команды и ищем разработ...
1,7a4813fc-43bc-3896-a607-4c8682b01002,Системный аналитик,Уровень: СА уровня Middle+/Senior от 3х лет В...
2,c03085c3-9b1e-3564-bb1e-59aa72e5fbca,Ведущий/ Главный аналитик DWH,Желательно знания Oracle сегодня — не прост...
3,a8dd83c3-178d-3c70-90c2-7c3648f6b96a,Системный аналитик,Знания и опыт • Умение управлять ожиданиями з...
4,9d98eba0-13bb-38d3-b742-4fd445954b3d,Product manager,"- Продактов в компании сейчас порядка 250, вс..."


# Обучаем классификаторы

В качестве классификатора мы выбрали Catboost по результатам экспериментов. Также пытались заменить в нём стандартную кросс-энтропию на [FocalLoss](https://paperswithcode.com/method/focal-loss), который лучше работает с дисбалансом классов, но этот с этим лоссом итоговая модель не показала результатов.

In [16]:
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import math
from six.moves import xrange

In [17]:
class FocalLossObjective(object):
    def calc_ders_range(self, approxes, targets, weights):
        # approxes, targets, weights are indexed containers of floats
        # (containers with only __len__ and __getitem__ defined).
        # weights parameter can be None.
        # Returns list of pairs (der1, der2)
        gamma = 0.8
        # alpha = 1.
        assert len(approxes) == len(targets)
        if weights is not None:
            assert len(weights) == len(approxes)
        
        exponents = []
        for index in xrange(len(approxes)):
            exponents.append(math.exp(approxes[index]))

        result = []
        for index in xrange(len(targets)):
            p = exponents[index] / (1 + exponents[index])

            if targets[index] > 0.0:
                der1 = -((1-p)**(gamma-1))*(gamma * math.log(p) * p + p - 1)/p
                der2 = gamma*((1-p)**gamma)*((gamma*p-1)*math.log(p)+2*(p-1))
            else:
                der1 = (p**(gamma-1)) * (gamma * math.log(1 - p) - p)/(1 - p)
                der2 = p**(gamma-2)*((p*(2*gamma*(p-1)-p))/(p-1)**2 + (gamma-1)*gamma*math.log(1 - p))

            if weights is not None:
                der1 *= weights[index]
                der2 *= weights[index]

            result.append((der1, der2))

        return result

In [21]:
# эксперименты с FocalLoss
uuid = np.random.choice(vacancy_dataset['uuid'],1)[0]
df = final_data[uuid]

x_train, x_test, y_train, y_test = train_test_split(df.drop('target', axis = 1), 
                                                    df['target'], test_size = 0.3, shuffle = True)
clf = CatBoostClassifier(silent=True, loss_function = FocalLossObjective(), 
                         eval_metric = 'Precision').fit(x_train, y_train)
predictions = clf.predict(x_test)
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

         0.0       1.00      0.75      0.86         4
         1.0       0.75      1.00      0.86         3

    accuracy                           0.86         7
   macro avg       0.88      0.88      0.86         7
weighted avg       0.89      0.86      0.86         7



In [53]:
params = {'silent':True, 'loss_function': FocalLossObjective(), 'eval_metric': 'Precision'}
#params = {'silent' : True}
model = CatBoostClassifier

def train_models(model, params):
    models = {}
    for uuid in tqdm(vacancy_dataset['uuid']):
        df = final_data[uuid]
        
        clf = model(**params)
        clf.fit(df.drop('target', axis = 1), df['target'])
    
        models[uuid] = clf
    return models

models = train_models(model, params)

  0%|          | 0/29 [00:00<?, ?it/s]

# Поиск ближайших вакансий к данной

В этом разделе определяется функция, которая по заданной вакансии ищет похожие к ней на основе сконкатенированного вектора [название вакансии; текст описания вакансии].

In [22]:
def get_full_text_vacancy_representation(vacancy_dataset):
    '''
    vacancy_dataset: pd.DataFrame

    returns
    pd.DataFrame with concated names and vacancy description
    '''
    
    vacancy_dataset['full_text'] = vacancy_dataset.apply(
                                                        lambda x: x['name'] + ' ' \
                                                        + x['description'], axis = 1)

    return vacancy_dataset[['uuid','full_text']]

def find_knearest_neighbours(vacancy, vacancy_dataset, vectorizer, k = 2):
    '''
    vacancy: dict-like vacancy
    vacancy_dataset: pd.DataFrame
    vectorizer: transformer class with transform method
    k: int - number of nearest neighbours

    returns 
    pd.DataFrame with [uuid, sim_score] columns 
    '''
    # parsing full vacancy text
    vacancy_text = ''
    
    vacancy_text += ' ' + vacancy['name']
    
    if vacancy['keywords'] != None:
        vacancy_text += ' ' + vacancy['keywords']

    vacancy_text += ' ' + vacancy['description']
    
    vacancy_dataset_transformed = get_full_text_vacancy_representation(vacancy_dataset)
    vacancy_dataset_transformed['sim_score'] = vacancy_dataset_transformed['full_text'] \
                                                .apply(calculcate_cosine_simularity, args = (vacancy_text, vectorizer))

    knearest_neighbours = vacancy_dataset_transformed.sort_values(by = 'sim_score', ascending = False) \
                                                        .head(k)[['uuid', 'sim_score']]
    return knearest_neighbours

In [23]:
knearest_neighbours = find_knearest_neighbours(test_data['vacancy'], vacancy_dataset, vectorizer, k = 2)
knearest_neighbours

Unnamed: 0,uuid,sim_score
26,aecfdaf6-e12c-3309-8f1b-157028ef63d5,0.20884
24,b2315867-73a2-3d43-acac-cbb92bd793b3,0.202114


In [24]:
# одна из найденных похожих вакансий 
vacancy_dataset[vacancy_dataset['uuid'] == 'aecfdaf6-e12c-3309-8f1b-157028ef63d5']

Unnamed: 0,uuid,name,description,full_text
26,aecfdaf6-e12c-3309-8f1b-157028ef63d5,Java-разработчик,Опыт работы с java от 3 лет Уверенные знания ...,Java-разработчик Опыт работы с java от 3 лет ...


In [25]:
def create_test_dataset(data):
    '''
    data: dict json-like
    '''

    dataset = pd.DataFrame({})
    for index, resume in enumerate(data['resumes']):

        # age
        if resume['birth_date'] != None:
            dataset.loc[index, 'age'] = pd.to_datetime('2024-02-28').year - pd.to_datetime(resume['birth_date']).year
        else:
            dataset.loc[index, 'age'] = None

        
        # common for vacancy
        vacancy_desc = data['vacancy']['description']
        if data['vacancy']['keywords'] != None:
            vacancy_desc += ' ' +  data['vacancy']['keywords']
        vacancy_name = data['vacancy']['name']
        
            
        # skills-decription, skills-name simularity
        if resume['key_skills'] != None:
            skills = resume['key_skills']

            dataset.loc[index, 'skills-description_sim'] = calculcate_cosine_simularity(skills, vacancy_desc, vectorizer)
            dataset.loc[index, 'skills-name_sim'] = calculcate_cosine_simularity(skills, vacancy_name, vectorizer)
        else:
            dataset.loc[index, 'skills-description_sim'] = None
            dataset.loc[index, 'skills-name_sim'] = None

        # description-description, description-name simularity, days of working experience
        desc = ''
        num_experience = 0
        if 'experienceItem' in resume.keys():
            for item in resume['experienceItem']:
                if item['description'] != None:
                    desc += ' ' + item['description'].strip()
                    
                    start = pd.to_datetime(item['starts'])
                    end = pd.to_datetime(item['ends']) if item['ends'] != None else pd.to_datetime('2024-02-28')
                    num_experience += (end - start).days
    
            dataset.loc[index, 'description-description_sim'] = calculcate_cosine_simularity(desc, vacancy_desc, vectorizer)
            dataset.loc[index, 'description-name_sim'] = calculcate_cosine_simularity(desc, vacancy_name, vectorizer)
            dataset.loc[index, 'num_experience'] = num_experience
        else:
            dataset.loc[index, 'description-description_sim'] = None
            dataset.loc[index, 'description-name_sim'] = None
            dataset.loc[index, 'num_experience'] = None


        # name-name, name-description similarity
        positions = ''
        if 'experienceItem' in resume.keys():
            for item in resume['experienceItem']:
                if item['position'] != None:
                    positions += ' ' + item['position']
    
            dataset.loc[index, 'name-name_sim'] = calculcate_cosine_simularity(positions, vacancy_name, vectorizer)
            dataset.loc[index, 'name-description_sim'] = calculcate_cosine_simularity(positions, vacancy_desc, vectorizer)

        else:
            dataset.loc[index, 'name-name_sim'] = None
            dataset.loc[index, 'name-description_sim'] = None

        # education experience (is university among best universities)
        top_university = 0
        if 'educationItem' in resume.keys():
            for item in resume['educationItem']:
                if check_university_level(item['organization']):
                    top_university = 1            
        dataset.loc[index, 'university_level'] = top_university


    return dataset

In [26]:
test_dataset = create_test_dataset(test_data)
test_dataset.head()

Unnamed: 0,age,skills-description_sim,skills-name_sim,description-description_sim,description-name_sim,num_experience,name-name_sim,name-description_sim,university_level
0,33.0,0.063237,0.047638,0.170534,0.054532,1611.0,0.550199,0.018682,0.0
1,34.0,0.082959,0.068509,0.117585,0.033226,4233.0,0.118201,0.015664,0.0
2,34.0,0.033481,0.050157,0.049972,0.051185,1826.0,0.457527,0.019084,0.0
3,34.0,0.029594,0.074611,0.096917,0.046442,3632.0,0.666033,0.014082,0.0
4,29.0,,,0.190342,0.052352,3498.0,0.203875,0.0,0.0


# Формируем итоговое предсказание 

In [27]:
def get_final_prediction(test_data, models, knearest_neighbours, threshold = 0.5, df_resume = None):
    if df_resume is None: # for validation part
        df_resume =  create_test_dataset(test_data)
    
    # forming probs
    probs = np.zeros((knearest_neighbours.shape[0], df_resume.shape[0]))
    for i, uuid in enumerate(knearest_neighbours['uuid']):
        probs[i, :] += models[uuid].predict_proba(df_resume)[:, 1]

    # scaling probs to distance
    sim_scores = knearest_neighbours.loc[:,'sim_score'].to_numpy()**-1
    sim_scores /= sim_scores.sum()
    final_scores = (sim_scores.reshape(len(sim_scores), 1) * probs).sum(0)

    indx = np.where(final_scores >= threshold)[0]
    uuids_prediction = []

    for item in np.array(test_data['resumes'])[list(indx)]:
        uuids_prediction.append(item['uuid'])

    return uuids_prediction

In [28]:
get_final_prediction(test_data, models, knearest_neighbours, threshold = 0.35)

['9a9c3ff1-49f8-30dd-a294-e56fc60cae64',
 '6f48fd66-a056-3172-af60-632f22844934',
 'd9fffe2b-cba9-3ff2-bd47-b8bfc48cbe89']

# Валидация

Валидация - тонкое место этой модели, с ней есть несколько проблем. Во-первых, базовые классификаторы склонны отдавать предпочтения мажорному классу и выставлять низкий score (<0.5), поэтому необходимо подбирать порог.

Важной метрикой при подборке количества соседей (k) и порога (threshold) была accuracy, посчитанная как процент угаданных резюме среди общего пулла подходящих резюме. Естественно, это не единственный показатель, ведь достаточно поставить `threshold = 0`, чтобы максимизировать этот показатель в 1. 

Эвристическими соображениями были выбраны `k=4, threshold = 0.4`

In [46]:
def get_validation_vacancy(train_data_json, n = 1):
    '''
    Randomly choose one vacancy from dataset
    
    train_data_json: dict json-like
    returns 
    index: int
    val_data: dict with vacancy and resumes for vacancy
    
    '''
    indx = np.random.choice(range(len(train_data_json)), n)

    val_data = np.array(train_data_json)[indx][0]
    val_data['resumes'] = val_data['confirmed_resumes'] + val_data['failed_resumes']
    return indx[0], val_data

def check_accuracy(indx, train_data_json, predictions):
    '''
    indx: int
    train_data_json: dict json-like
    predictions: str[] - list of uuids

    returns
    int - percentage of right predictions
    '''
    real_uuids = []
    for item in train_data_json[indx]['confirmed_resumes']:
        real_uuids.append(item['uuid'])

    predictions, real_uuids = set(predictions), set(real_uuids)
    accuracy = len(real_uuids.intersection(predictions)) / len(real_uuids)

    return accuracy

def make_classification_report(indx, train_data_json, predictions):
    '''
    indx: int
    train_data_json: dict json-like
    predictions: str[] - list of uuids

    returns
    classification report in sklearn format
    '''
    
    resumes = pd.DataFrame({})
    for item in train_data_json[indx]['confirmed_resumes']:
        resumes.loc[item['uuid'], 'target'] = 1
    for item in train_data_json[indx]['failed_resumes']:
        resumes.loc[item['uuid'], 'target'] = 0
    resumes['predicted_label'] = resumes.apply(lambda x: 1 if x.index.values in predictions else 0, axis = 1)

    return classification_report(resumes['target'], resumes['predicted_label'])

        

In [65]:
def find_best_params(k_list, threshold_list, n = 5):
    '''
    k_list: int[]  
    threshold_list: int[]
    n: int

    returns
    dict with mean accuracy for each k and threshold
    '''
    final_accuracies = defaultdict(lambda: {})

    for k in tqdm(k_list):
        for threshold in threshold_list:

            mean_score = 0
            for _ in range(n):
    
                indx, val_data = get_validation_vacancy(train_data_json, n = 1)
                train_data_transformed_json = train_data_json[:indx] + train_data_json[indx+1:]
                
                train_dataset_transformed = pd.DataFrame(train_data_transformed_json)
                
                # dataset of resumes for deleted vacancy
                eval_dataset = create_resume_dataset(val_data, vectorizer)
                
                # dataset of vacancies without evaluated one
                df_vac = parse_vacancy(train_data_transformed_json)
                                
                knearest_neighbours = find_knearest_neighbours(val_data['vacancy'], df_vac, vectorizer, k = k)
            
                predictions = get_final_prediction(val_data, models, knearest_neighbours, threshold = threshold, 
                                                   df_resume = eval_dataset)
                score = check_accuracy(indx, train_data_json, predictions)
                mean_score += score / n
                                    
            
            final_accuracies[k][threshold] = mean_score

    return final_accuracies

In [66]:
# catboost with focal loss + n_grams = (1,2)
k_list, threshold_list = [2,3,4], [0.2, 0.3, 0.4]
final_accuracies = find_best_params(k_list, threshold_list)
final_accuracies

  0%|          | 0/3 [00:00<?, ?it/s]

defaultdict(<function __main__.find_best_params.<locals>.<lambda>()>,
            {2: {0.2: 0.8722222222222222,
              0.3: 0.6366666666666667,
              0.4: 0.7273913043478262},
             3: {0.2: 0.9538461538461539,
              0.3: 0.7057142857142856,
              0.4: 0.21666666666666667},
             4: {0.2: 0.9777777777777776,
              0.3: 0.45,
              0.4: 0.24239130434782608}})

In [157]:
# catboost + n_grams = (1,2), mean by five evaluations
final_accuracies

defaultdict(<function __main__.find_best_params.<locals>.<lambda>()>,
            {2: {0.2: 0.728623188405797, 0.3: 0.7927536231884058, 0.4: 0.5},
             3: {0.2: 1.0, 0.3: 0.6936507936507936, 0.4: 0.38666666666666666},
             4: {0.2: 1.0, 0.3: 0.8857142857142859, 0.4: 0.2658385093167702}})

In [150]:
# catboost + n_grams = (1,2), one evaluation
final_accuracies

defaultdict(<function __main__.find_best_params.<locals>.<lambda>()>,
            {1: {0.1: 1.0,
              0.2: 0.75,
              0.3: 0.0,
              0.4: 0.2222222222222222,
              0.5: 0.0},
             2: {0.1: 1.0,
              0.2: 1.0,
              0.3: 0.8333333333333334,
              0.4: 1.0,
              0.5: 0.0},
             3: {0.1: 1.0,
              0.2: 1.0,
              0.3: 0.125,
              0.4: 0.2222222222222222,
              0.5: 0.0},
             4: {0.1: 1.0, 0.2: 1.0, 0.3: 1.0, 0.4: 0.75, 0.5: 0.0},
             5: {0.1: 1.0,
              0.2: 1.0,
              0.3: 0.5,
              0.4: 0.14285714285714285,
              0.5: 0.0}})

In [143]:
# catboost + n_grams = (1,1), one_evaluation
final_accuracies

defaultdict(<function __main__.find_best_params.<locals>.<lambda>()>,
            {1: {0.1: 1.0,
              0.2: 0.9090909090909091,
              0.3: 0.0,
              0.4: 0.0,
              0.5: 0.2727272727272727},
             2: {0.1: 1.0, 0.2: 1.0, 0.3: 1.0, 0.4: 0.7, 0.5: 0.0},
             3: {0.1: 1.0,
              0.2: 1.0,
              0.3: 0.6666666666666666,
              0.4: 0.0,
              0.5: 0.08333333333333333},
             4: {0.1: 1.0,
              0.2: 1.0,
              0.3: 1.0,
              0.4: 0.6,
              0.5: 0.3333333333333333},
             5: {0.1: 1.0, 0.2: 1.0, 0.3: 1.0, 0.4: 1.0, 0.5: 0.0}})

# Предсказания для ансамбля

Здесь подсчитываются scores для каждого кандидата из тестового датасета, и формируется датафрейм, содержащий `uuid` резюме и его `resume`.

В итоге модель не использовалась в ансамбле, но сам датасет полезен для оценки алгоритма.

In [84]:
def get_final_prediction_score(test_data, models, knearest_neighbours, df_resume = None):
    if df_resume is None: # for validation part
        df_resume =  create_test_dataset(test_data)
    
    # forming probs
    probs = np.zeros((knearest_neighbours.shape[0], df_resume.shape[0]))
    for i, uuid in enumerate(knearest_neighbours['uuid']):
        probs[i, :] += models[uuid].predict_proba(df_resume)[:, 1]

    # scaling probs to distance
    sim_scores = knearest_neighbours.loc[:,'sim_score'].to_numpy()**-1
    sim_scores /= sim_scores.sum()
    final_scores = (sim_scores.reshape(len(sim_scores), 1) * probs).sum(0)

    final_df = pd.DataFrame({})

    for index, item in enumerate(np.array(test_data['resumes'])):
        final_df.loc[index, 'uuid'] = item['uuid']
        final_df.loc[index, 'score'] = final_scores[index]

    return final_df

In [85]:
knearest_neighbours = find_knearest_neighbours(test_data['vacancy'], vacancy_dataset, vectorizer, k = 4)
final_predictions = get_final_prediction_score(test_data, models, knearest_neighbours)

In [88]:
# 17 подходящих резюме с заданным порогом
len(final_predictions[final_predictions['score'] >= 0.4])

17

In [166]:
final_predictions.to_csv('final_score_1.csv')