In [1]:
import pickle

from sklearn.metrics.pairwise import cosine_similarity

import datetime

In [20]:
path_vacancies = r"/content/drive/MyDrive/hakathon/vacancies_train.pickle"

with open(path_vacancies, 'rb') as file:
    vacancies = pickle.load(file)

In [21]:
def simillarity_scores(resume: dict, vacancy: dict) -> dict:
    scores = {
        'description_about': 0.0,
        'keywords_key_skills': 0.0,
        'description_description': 0.0,
        'name_position': 0.0,
        'age_fits': 0.0,
        'experience_fits': 0.0,
        'country_fits': 0.0,
        'language_fits': 0.0,
        'education_fits': 0.0,
    }

    # Check description-about pair
    if 'description_embedded' in vacancy and 'about_embedded' in resume:
        scores['description_about'] = cosine_similarity([vacancy['description_embedded']], [resume['about_embedded']])[0][0]

    # Check keywords-key_skills pair
    if 'keywords_embedded' in vacancy and 'key_skills_embedded' in resume:
        scores['keywords_key_skills'] = cosine_similarity([vacancy['keywords_embedded']], [resume['key_skills_embedded']])[0][0]

    # Iterate over experienceItem
    if 'experienceItem' in resume and resume['experienceItem']:
        max_des_des_sim_score = -2
        max_name_pos_sim_score = -2
        total_experience = 0

        for experience in resume['experienceItem']:
            # Calculate maximal simillarity for description-description pair
            if 'description_embedded' in vacancy and 'description_embedded' in experience:
                sim_score = cosine_similarity([vacancy['description_embedded']], [experience['description_embedded']])[0][0]
                max_des_des_sim_score = max(max_des_des_sim_score, sim_score)

            # Calculate maximal simillarity for name-position pair
            if 'name_embedded' in vacancy and 'position_embedded' in experience:
                sim_score = cosine_similarity([vacancy['name_embedded']], [experience['position_embedded']])[0][0]
                max_name_pos_sim_score = max(max_name_pos_sim_score, sim_score)

            # Accumulate years of experience
            if 'years_at_work' in experience and experience['years_at_work'] is not None:
                total_experience += experience['years_at_work']

        # Similarities
        scores['description_description'] = max_des_des_sim_score if max_des_des_sim_score != -2 else 0
        scores['name_position'] = max_name_pos_sim_score if max_name_pos_sim_score != -2 else 0

        # Experience
        if vacancy['extra_features']['experience'] is not None:
            scores['experience_fits'] = float(total_experience >= vacancy['extra_features']['experience'])
        else:
            scores['experience_fits'] = 1

    # Calculate if candidates age is appropriate
    if resume['birth_date'] is not None:
        min_age = vacancy['extra_features']['min age'] if vacancy['extra_features']['min age'] is not None else 18
        max_age = vacancy['extra_features']['max age'] if vacancy['extra_features']['max age'] is not None else 99

        age = (datetime.datetime.now() - datetime.datetime.strptime(resume['birth_date'], "%Y-%m-%d")).days // 365

        scores['age_fits'] = float(min_age <= age <= max_age)

    # Check if the country is matching
    resumes_country = resume['country']

    if vacancy['extra_features']['country'] is not None:
        scores['country_fits'] = 1.0 if resumes_country in vacancy['extra_features']['country'] else 0.0

    # Check if the languages are matching
    if 'languageItems' in resume:
        if vacancy['extra_features']['languages'] is None or len(vacancy['extra_features']['languages']) == 0:
            scores['language_fits'] = 1.0
        else:
            for language in resume['languageItems']:
               if language in vacancy['extra_features']['languages']:
                    scores['language_fits'] = 1.0
                    break
    elif vacancy['extra_features']['languages'] is not None and 'Русский' in vacancy['extra_features']['languages']:
        scores['language_fits'] = 1.0
    else:
        scores['language_fits'] = 0.0

    # Check if the education matches
    if vacancy['extra_features']['education level'] is not None:
        if 'educationItem' in resume:
            for education in resume['educationItem']:
                education_level = education['education_level']
                if education_level == 'Высшее' or education_level == 'Бакалавр' or education_level == 'Магистр':
                    scores['education_fits'] = 1.0
                    break
    else:
        scores['education_fits'] = 1.0


    return scores

In [36]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader

class Preceptron(nn.Module):
    def __init__(self, num_features: int):
        super().__init__()

        self.linear = nn.Linear(num_features, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        return self.sigmoid(self.linear(x))

In [90]:
num_features = 9
lr = 1e-3

model = Preceptron(num_features)

loss_fn = nn.BCELoss()

optimizer = torch.optim.SGD(model.parameters(), lr=lr)

In [None]:
from tqdm.auto import tqdm
from random import shuffle

num_iters = 1

for i in tqdm(range(num_iters)):

    for entry in vacancies:
        vacancy = entry['vacancy']

        failed = [*entry['failed_resumes']]
        confirmed = [*entry['confirmed_resumes']]
        shuffle(failed)
        shuffle(confirmed)

        clip = min(len(failed), len(confirmed))

        failed = failed[:clip]
        confirmed = confirmed[:clip]

        train_set = failed + confirmed

        shuffle(train_set)

        for resume in train_set:
            optimizer.zero_grad()

            target = torch.tensor([resume['accepted']], dtype=torch.float32)

            scores = simillarity_scores(resume, vacancy)

            input = torch.tensor([*scores.values()], dtype=torch.float32)

            output = model(input)

            loss = loss_fn(output, target)

            print(loss.item())

            loss.backward()

            optimizer.step()

In [109]:
def get_total_score(raw_scores: dict) -> float:
    scores_weights = {
      'description_about': 0.63205,
      'keywords_key_skills': 0.947156,
      'description_description': 0.862351,
      'name_position': 0.782359,
      'age_fits': 0.626857,
      'experience_fits': 0.68093,
      'country_fits': 0.23586,
      'language_fits': 0.758273,
      'education_fits': 0.41525,
    }

    weights_sum = sum(scores_weights.values())

    total = 0

    for key in scores_weights:
        total += scores_weights[key] * raw_scores[key]

    return total / weights_sum

In [110]:
def uuid_scores(resumes: list, vacancy: dict) -> list[tuple]:
    uuid_score_pairs = []

    for resume in resumes:
        scores_raw = simillarity_scores(resume, vacancy)
        total_score = get_total_score(scores_raw)
        uuid_score_pairs.append((resume['uuid'], total_score))

    return sorted(uuid_score_pairs, key=lambda x: -x[-1])

In [111]:
uuid_scores(vacancies[0]['confirmed_resumes'] + vacancies[0]['failed_resumes'], vacancies[0]['vacancy'])

[('9d7eae36-11f8-3cac-9cb2-4cb0ff9d0ae7', 0.7345311601420307),
 ('2bfafd4b-592f-361e-a4d9-54472e4db85c', 0.7297444528837738),
 ('da4c44dd-7c00-3f75-98b4-096b533488a4', 0.7262895470095746),
 ('73d59615-b5b2-35fd-a15d-28963fe143d1', 0.7258135772152774),
 ('0d17d82a-bea2-3b7c-82ab-6852ce5ad754', 0.72576212863346),
 ('2b5ad5e1-1f31-3f3f-8a66-43cd89233672', 0.6986513143657087),
 ('aff6b6bd-89c2-3b2c-ab2e-0b9f76ac367c', 0.6794881168583148),
 ('02c0d043-72b0-35be-b21b-513d5a4c7bc8', 0.6776999219711829),
 ('dc28a8cc-14d0-367a-8d87-6969f83ae793', 0.6761152926954043),
 ('8c8cf797-2c6b-3f4b-b28b-20d57bd88b82', 0.6757982825281744),
 ('8196f47f-310f-3cbd-b40c-bc52c44a791e', 0.66878299804181),
 ('74392e00-ecfb-335b-9fc1-c2652dca06e5', 0.6628756231330444),
 ('ecfc02a1-592c-3ed0-a801-1ad9ab3d30b8', 0.6577919947873367),
 ('f015f782-526a-3e23-99c7-fa23ae2b5912', 0.645511701115726),
 ('74221d62-5ea6-3a68-8849-25acd97e208b', 0.6424344099516073),
 ('26f77e3e-2318-3158-9952-9bd6063ff580', 0.6270640655789685