In [21]:
import os
import argparse
import pickle as pkl
import ipdb
import yaml
from tqdm import tqdm
import re
from collections import Counter
import pandas as pd
import unidecode
import fasttext
from nltk.tokenize import word_tokenize
import json
from datetime import datetime
import numpy as np

In [12]:
# Read the shared project settings (data and model directories) from the YAML config.
with open("../../config.yaml", "r") as ymlfile:
    CFG = yaml.safe_load(ymlfile)

In [17]:
# File-name stem of the input JSON dump; a split suffix such as "_TRAIN.json" is appended when building the path.
base_file = "bp_3jobs_desc_edu_skills_industry_date_company_FR"
# Names of the dataset partitions. NOTE(review): only the TRAIN file is read in the cells below.
splits = ["TEST", "VALID", "TRAIN"]
# Minimum number of jobs a profile must have, both in the raw record and after cleaning, to be kept.
MIN_JOB_COUNT = 3

In [18]:
# Input dump for the TRAIN split and the language classifier model, both located via the config.
current_file = os.path.join(CFG["gpudatadir"], f"{base_file}_TRAIN.json")
lid_model_path = os.path.join(CFG["modeldir"], "lid.176.bin")
language_classifier = fasttext.load_model(lid_model_path)

# One pass over the file to count records, used as the progress-bar total later on.
with open(current_file, 'r') as f:
    num_lines = sum(1 for _ in f)

In [40]:
def word_seq_into_list(position):
    """Turn a job-title string into a list of normalised tokens.

    The title is lower-cased and split with NLTK's ``word_tokenize``. Any token
    whose beginning matches a digit sequence (optionally followed by a comma
    and more digits) is replaced by the placeholder ``"NUM"``; because the
    pattern is only anchored at the start, tokens such as ``"3d"`` are also
    replaced. Empty tokens are dropped.

    Args:
        position: Raw job title.

    Returns:
        List of lower-cased tokens with numeric tokens replaced by ``"NUM"``.
    """
    numeric_prefix = re.compile(r'\d+(,\d+)?')
    normalised = [
        "NUM" if numeric_prefix.match(word) else word.lower()
        for word in word_tokenize(position.lower())
    ]
    return [word for word in normalised if word != ""]


def handle_date(job):
    """Convert a job's ``"to"`` field into an integer epoch timestamp.

    Parsing rules, in order:
      * ``"Present"`` maps to 2018-04-12.
      * A two-word value is parsed as ``"<month name> <year>"``; if that fails,
        only the last word is parsed as a year.
      * Anything else has its last word parsed as a year; if that fails the
        value maps to 2018-04-13.

    NOTE(review): the two hard-coded dates differ by one day although both were
    described as the file-creation date -- confirm which one is intended.

    Args:
        job: Mapping with a string under the key ``"to"``.

    Returns:
        Epoch seconds (int) of the parsed date, rounded to the nearest day.

    Raises:
        ValueError: If a two-word value fails both the month-year and the
            year-only parse.
    """
    def _epoch(text, fmt):
        # Seconds since the epoch for `text` parsed with `fmt` (naive, local time).
        return datetime.timestamp(datetime.strptime(text, fmt))

    end_label = job["to"]
    if end_label == "Present":
        seconds = _epoch('2018-04-12', '%Y-%m-%d')  # date of files creation
    else:
        words = end_label.split(" ")
        year_only = words[-1]
        if len(words) == 2:
            try:
                seconds = _epoch(end_label, "%B %Y")
            except ValueError:
                seconds = _epoch(year_only, "%Y")
        else:
            try:
                seconds = _epoch(year_only, "%Y")
            except ValueError:
                seconds = _epoch('2018-04-13', '%Y-%m-%d')  # date of files creation

    # Snap to the nearest day and convert back to whole epoch seconds.
    nearest_day = pd.Timestamp.fromtimestamp(seconds).round("D")
    return round(datetime.timestamp(nearest_day.to_pydatetime()))

def identify_language(job, ft_model):
    """Run the language classifier on a tokenised job title.

    The tokens are joined with single spaces and passed to ``ft_model.predict``
    as a one-element batch. The model is queried exactly once per title.

    Args:
        job: List of string tokens making up the title.
        ft_model: Object exposing ``predict(list_of_str)``.

    Returns:
        Whatever ``ft_model.predict`` returns for the one-element batch. The
        caller in this file reads the label at ``[0][0][0]`` and its
        probability at ``[1][0][0]``.
    """
    jobs_str = " ".join(job)
    return ft_model.predict([jobs_str])

def build_new_person(person, language_classifier):
    """Build a cleaned profile ``[person_id, industry, jobs]`` from a raw record.

    Only jobs that carry a ``'company'`` key are considered. A job is kept when
    its start and end timestamps are both strictly positive and its tokenised
    title is classified as French or English with probability above 0.6.
    Jobs whose processing raises an ordinary exception are skipped
    (best-effort cleaning).

    Args:
        person: Sequence whose first item is the person id, whose second item
            is an iterable of job dicts, and whose last item is the industry.
        language_classifier: Model passed on to ``identify_language``.

    Returns:
        ``[person_id, industry, jobs]`` when at least ``MIN_JOB_COUNT`` jobs
        survive, otherwise just ``[person_id, industry]``. Each kept job is a
        dict with keys ``'from'``, ``'to'`` (epoch seconds) and ``'job'``
        (token list).
    """
    person_id = person[0]
    industry = person[-1]
    new_p = [person_id, industry]
    jobs = []
    for job in person[1]:
        if 'company' not in job.keys():
            continue
        try:
            end = handle_date(job)
            tstmp = pd.Timestamp.fromtimestamp(job["from_ts"])
            start = round(datetime.timestamp(tstmp.round("D").to_pydatetime()))
            # A non-positive value corresponds to the epoch (01/01/1970) or earlier.
            if end <= 0 or start <= 0:
                continue
            # Separate name so the raw job dict is not overwritten by its tokens.
            title_tokens = word_seq_into_list(job["position"])
            predicted_lang = identify_language(title_tokens, language_classifier)
            label = predicted_lang[0][0][0]
            confidence = predicted_lang[1][0][0]
            if label in ("__label__fr", "__label__en") and confidence > .6:
                jobs.append({'from': start,
                             'to': end,
                             'job': title_tokens})
        except Exception:
            # Best-effort: skip malformed jobs, but let KeyboardInterrupt and
            # SystemExit propagate so a long run can still be interrupted.
            continue
    if len(jobs) >= MIN_JOB_COUNT:
        new_p.append(jobs)
    return new_p

In [41]:
# Stream the TRAIN dump one JSON record per line and keep the profiles that
# still have at least MIN_JOB_COUNT jobs after cleaning.
titles_len = []  # number of kept jobs per retained profile
dataset = []
with open(current_file, 'r') as f:
    # Iterating over the tqdm wrapper already advances the bar once per line,
    # so no manual pbar.update() call is made (it would count each line twice).
    pbar = tqdm(f, total=num_lines)
    for line in pbar:
        current_person = json.loads(line)
        jobs = current_person[1]
        skills = current_person[2]
        # Cheap pre-filter: enough raw jobs and at least one skill.
        if len(jobs) >= MIN_JOB_COUNT and len(skills) > 0:
            new_p = build_new_person(current_person, language_classifier)
            # build_new_person only appends the job list when enough jobs survive.
            if len(new_p) > 2 and len(new_p[-1]) >= MIN_JOB_COUNT:
                dataset.append(new_p)
                titles_len.append(len(new_p[-1]))

100%|██████████| 487649/487649 [28:19<00:00, 286.88it/s]


In [61]:
# Recompute titles_len as the token count of every kept job title.
titles_len = []
for person in tqdm(dataset):
    titles_len.extend(len(jobs["job"]) for jobs in person[-1])

100%|██████████| 372889/372889 [00:01<00:00, 360774.01it/s]


In [83]:
# 95th percentile of the values in titles_len.
# NOTE(review): the trailing ".8" annotation is unexplained -- confirm what it records.
np.percentile(titles_len, 95) # .8

9.0

In [78]:
# Load a pickled TRAIN profile list from the data directory for inspection.
# NOTE: unpickling can execute arbitrary code -- only load files from a trusted source.
ppl_file = os.path.join(CFG["gpudatadir"], "profiles_jobs_ind_title_TRAIN.pkl")
with open(ppl_file, mode='rb') as fp:
    toy_data = pkl.load(fp)

In [80]:
# Peek at the second-to-last record of the loaded pickle.
toy_data[-2]

[850898,
 'Internet',
 [{'from': 1465596000, 'to': 1523484000, 'job': ['développeur', 'web']},
  {'from': 1444514400, 'to': 1523484000, 'job': ['développeur', 'web']},
  {'from': 1394492400, 'to': 1523484000, 'job': ['editor', 'chief']}]]

In [47]:
# Show the last profile returned by build_new_person in the processing loop.
# new_p is the loop's leftover variable, so it is not necessarily a record kept in `dataset`.
new_p

[850901,
 'Financial Services',
 [{'from': 1431295200,
   'to': 1433109600,
   'job': ['assistante',
    'de',
    'la',
    'division',
    'des',
    'relations',
    'avec',
    'les',
    'elus',
    'et',
    'les',
    'acteurs',
    'économiques']},
  {'from': 1412978400,
   'to': 1388530800,
   'job': ['vendeuse', 'en', 'boulangerie']},
  {'from': 1192053600,
   'to': 1388530800,
   'job': ['global', 'client', 'services', 'representative']}]]