In [29]:
import os
import argparse
import pickle as pkl
import scipy.stats as stats
import ipdb
import yaml
from tqdm import tqdm
import numpy as np
import re
from collections import Counter
import pandas as pd
import unidecode
from nltk.tokenize import word_tokenize
import json
from datetime import datetime

In [4]:
# Name of the input file; it is joined with CFG["prevdatadir"] and read as one JSON document per line.
base_file = "bp_3jobs_desc_edu_skills_industry_date_company_FR.json"
# Minimum number of jobs a profile must have, both before and after filtering, for its education entries to be kept.
MIN_JOB_COUNT = 3
# Not referenced by any cell visible in this notebook.
MAX_SEQ_LENGTH = 64

In [5]:
# Read the project configuration, then load the three pickled company lookups
# (company list, synonym mapping, blacklist) stored under CFG["datadir"].
with open("../../config.yaml", "r") as config_stream:
    CFG = yaml.load(config_stream, Loader=yaml.SafeLoader)


def _read_pickle(path):
    """Return the object unpickled from the file at `path`."""
    # NOTE(review): unpickling runs arbitrary code from the file; only load trusted files.
    with open(path, "rb") as handle:
        return pkl.load(handle)


cie_file = os.path.join(CFG["datadir"], "cie_list.pkl")
cie_list = _read_pickle(cie_file)
synonym_file = os.path.join(CFG["datadir"], "cie_synonyms.pkl")
syn_cie = _read_pickle(synonym_file)
blacklist_file = os.path.join(CFG["datadir"], "blacklist.pkl")
blacklist = _read_pickle(blacklist_file)


In [6]:
def word_seq_into_list(position, description, cie_list, syn_cie):
    """Turn a job title and description into a list of normalised tokens.

    Both texts are lower-cased and joined with a space, company names are
    rewritten to their handle, and the result is tokenised with
    ``word_tokenize``. A token that starts with a number becomes ``"NUM"``,
    a token that is a known company name or synonym becomes ``"CIE"``, and
    every other token is kept lower-cased.

    Args:
        position: job title (str).
        description: free-text job description (str).
        cie_list: iterable of company names, matched as substrings of the text.
        syn_cie: mapping from a company-name synonym to its replacement handle.

    Returns:
        list of str: the processed tokens, with empty strings removed.
    """
    number_regex = re.compile(r'\d+(,\d+)?')
    whole_job = position.lower() + ' ' + description.lower()

    # Rewrite each listed company name to its handle (itself when it has no synonym entry).
    for cie in cie_list:
        if cie in whole_job.lower():
            whole_job = whole_job.replace(cie, syn_cie.get(cie, cie))

    # Rewrite each synonym found in the text to its handle. The text to replace
    # is the synonym itself (`name`), not the leftover `cie` of the loop above.
    for name, handle in syn_cie.items():
        if name in whole_job.lower():
            whole_job = whole_job.replace(name, handle)

    # Built once so the per-token membership test does not rescan cie_list.
    known_names = set(cie_list) | set(syn_cie)

    tokens = []
    for tok in word_tokenize(whole_job):
        lowered = tok.lower()
        if number_regex.match(tok):
            tokens.append("NUM")
        elif lowered in known_names:
            tokens.append("CIE")
        else:
            tokens.append(lowered)
    return [tok for tok in tokens if tok != ""]


In [7]:
def handle_date(job):
    """Return the end date of `job` as an integer timestamp rounded to the day.

    `job["to"]` is interpreted as follows:
      * ``"Present"``: a fixed date, 2018-04-12;
      * two space-separated words: parsed as "<month name> <year>", falling
        back to the last word parsed as a year;
      * anything else: the last word parsed as a year, falling back to a
        fixed date, 2018-04-13.

    Args:
        job: mapping with a string under the key ``"to"``.

    Returns:
        int: seconds since the epoch, after rounding to the nearest day.

    Raises:
        ValueError: if a two-word value ends in something that is not a year.
    """
    def _epoch(text, fmt):
        # Parse `text` with `fmt` and convert the (naive) result to epoch seconds.
        return datetime.timestamp(datetime.strptime(text, fmt))

    end_label = job["to"]
    if end_label == "Present":
        seconds = _epoch('2018-04-12', '%Y-%m-%d')  # date of files creation
    elif len(end_label.split(" ")) == 2:
        try:
            seconds = _epoch(end_label, "%B %Y")
        except ValueError:
            seconds = _epoch(end_label.split(" ")[-1], "%Y")
    else:
        try:
            seconds = _epoch(end_label.split(" ")[-1], "%Y")
        except ValueError:
            # NOTE(review): one day later than the "Present" fallback above - confirm which date is intended.
            seconds = _epoch('2018-04-13', '%Y-%m-%d')  # date of files creation
    whole_day = pd.Timestamp.fromtimestamp(seconds).round("D")
    return round(datetime.timestamp(whole_day.to_pydatetime()))

In [8]:
def get_edu_info(person, cie_list, syn_cie, blacklist):
    """Return a person's education info and their usable, normalised jobs.

    Each job's company name is reduced to a key: the first five (at most)
    space-separated words, lower-cased, passed through ``unidecode`` and
    concatenated without separator. Jobs are only collected when at least one
    key appears in `cie_list`; otherwise the job list is empty.

    Args:
        person: indexable record; ``person[1]`` is iterated as the job dicts
            and ``person[-2]`` is returned unchanged as the education info.
        cie_list: container of company keys that qualify a profile.
        syn_cie: mapping from a company key to the key stored in its place.
        blacklist: container of company keys whose jobs are dropped.

    Returns:
        tuple: ``(education, jobs)`` where each job is a dict with the keys
        ``'from'``, ``'to'``, ``'company'`` and ``'job'``.
    """
    def _company_key(raw_name):
        # Lower-case and transliterate the leading words, then glue them together.
        words = raw_name.split(" ")
        leading = words[:min(len(words), 5)]
        return "".join([unidecode.unidecode(word.lower()) for word in leading])

    education = person[-2]
    # Jobs without a 'company' entry are ignored altogether.
    keyed_jobs = [(job, _company_key(job["company"]))
                  for job in person[1] if 'company' in job.keys()]

    jobs = []
    if not any(key in cie_list for _, key in keyed_jobs):
        return education, jobs

    for job, company_name in keyed_jobs:
        if company_name in blacklist:
            continue
        end = handle_date(job)
        start_day = pd.Timestamp.fromtimestamp(job["from_ts"]).round("D")
        start = round(datetime.timestamp(start_day.to_pydatetime()))
        cie = syn_cie.get(company_name, company_name)
        # A timestamp of 0 corresponds to 01/01/1970; both bounds must be later than that.
        if (end > 0) and (start > 0):
            jobs.append({
                'from': start,
                'to': end,
                'company': cie,
                'job': word_seq_into_list(job["position"], job["description"],
                                          cie_list, syn_cie),
            })

    return education, jobs

In [None]:
# Stream the profile file (one JSON document per line) and collect the
# education entries of every profile that keeps at least MIN_JOB_COUNT jobs
# after filtering, then pickle the result into CFG["datadir"].
current_file = os.path.join(CFG["prevdatadir"], base_file)
with open(current_file, 'r') as f:
    # First pass: the line count is only used as the progress-bar total.
    num_lines = sum(1 for line in f)
with open(current_file, 'r') as f:
    # Iterating over the tqdm wrapper already advances the bar once per line,
    # so there is no explicit pbar.update() call.
    pbar = tqdm(f, total=num_lines)
    edu_backgrounds = []
    for line in pbar:
        current_person = json.loads(line)
        jobs = current_person[1]
        skills = current_person[2]
        if len(jobs) >= MIN_JOB_COUNT and len(skills) > 0:
            try:
                edu_info, new_jobs = get_edu_info(current_person, cie_list, syn_cie, blacklist)
            except pd.errors.OutOfBoundsDatetime:
                # A date pandas cannot represent as a Timestamp: skip this profile.
                continue
            if len(new_jobs) >= MIN_JOB_COUNT:
                edu_backgrounds.extend(edu_info)
tgt_file = "unprocessed_educations.pkl"
with open(os.path.join(CFG["datadir"], tgt_file), "wb") as f:
    pkl.dump(edu_backgrounds, f)

  5%|▍         | 39682/850902 [01:59<42:15, 319.92it/s]  

In [None]:
# Pickle the collected education entries into CFG["datadir"].
tgt_file = "unprocessed_educations.pkl"
output_path = os.path.join(CFG["datadir"], tgt_file)
with open(output_path, "wb") as sink:
    pkl.dump(edu_backgrounds, sink)

In [10]:
# Re-read the configuration and load the pickled education entries into `data`.
with open("../../config.yaml", "r") as config_stream:
    CFG = yaml.load(config_stream, Loader=yaml.SafeLoader)

tgt_file = "unprocessed_educations.pkl"
educations_path = os.path.join(CFG["datadir"], tgt_file)
# NOTE(review): unpickling runs arbitrary code from the file; only load trusted files.
with open(educations_path, "rb") as pickled:
    data = pkl.load(pickled)

In [98]:
import re
# Accented characters that are kept, in addition to a-z, 0-9, whitespace and '-'.
accentedCharacters = "àèìòùÀÈÌÒÙáéíóúýÁÉÍÓÚÝâêîôûÂÊÎÔÛãñõÃÑÕäëïöüÿÄËÏÖÜŸçÇßØøÅåÆæœ"
# Matches every character that is NOT kept. The pieces holding '\s' and '\-'
# are raw strings so the backslashes reach the regex engine without relying on
# Python passing unknown string escapes through.
regex = re.compile(r'[^a-z0-9' + accentedCharacters + r'\s\-]')


In [100]:
def _clean_field(records, field):
    """Return `field` of every record, lower-cased, with rejected characters replaced by spaces."""
    return [regex.sub(' ', record[field].lower()) for record in records]


degrees = _clean_field(data, "degree")
institutions = _clean_field(data, "institution")

In [101]:
# Number of occurrences of each distinct cleaned degree / institution string.
count_degrees = Counter(degrees)
count_institutions = Counter(institutions)

In [64]:
# Summary statistics of the occurrence counts of the distinct degree strings.
stats.describe(list(count_degrees.values()))

DescribeResult(nobs=463406, minmax=(1, 1736), mean=1.4687746813809057, variance=68.71964193230454, skewness=99.22586158959993, kurtosis=14855.39417105405)

In [65]:
# Summary statistics of the occurrence counts of the distinct institution strings.
stats.describe(list(count_institutions.values()))

DescribeResult(nobs=143947, minmax=(1, 5210), mean=4.728400036124407, variance=2933.6574654927012, skewness=46.350406061269425, kurtosis=2889.1022316311432)

In [66]:
# Fraction of degree strings that are distinct (number of unique values / total number).
len(set(degrees)) / len(degrees)

0.6808396227662534

In [102]:
# Keep, in sorted order, the distinct strings whose first character is a
# lower-case ASCII letter, and strip surrounding whitespace from them.
# NOTE(review): this rebinds `regex`, which previously held the cleaning
# pattern; re-running the cleaning cell afterwards would use this pattern.
regex = re.compile('[a-z]')
clean_degrees = [text.strip() for text in sorted(count_degrees) if regex.match(text)]
clean_institutions = [text.strip() for text in sorted(count_institutions) if regex.match(text)]

In [105]:
# Preview the first ten cleaned institution names.
clean_institutions[:10]


['a',
 'a  cherioux',
 'a  f  p  a champs sur marne',
 'a  renoir',
 'a  schweitzer - le raincy',
 'a a a  - associations des amitiés asiatiques',
 'a a c  académie des arts chorégraphiques cité véron  paris',
 'a bouquinet formation',
 'a c e',
 'a c e p']

In [82]:
# First ten distinct degree strings in sorted order (empty and whitespace-only strings sort first).
sorted(list(count_degrees.keys()))[:10]

['',
 ' ',
 '  ',
 '   ',
 '    ',
 '      ',
 '       ',
 '          ',
 '   annes de groupe de pratique professionnelle conseil accompagnemnt dveloppement professionnel',
 '   certificat de formation analyse transactionnelle obtenu']

In [106]:
# Write the cleaned degree and institution names, one per line.
# The names contain accented characters, so the encoding is pinned to UTF-8
# rather than left to the platform default.
with open("../../raw_degrees.txt", 'w', encoding="utf-8") as f:
    f.writelines(deg + '\n' for deg in clean_degrees)

with open("../../raw_institutions.txt", 'w', encoding="utf-8") as f:
    f.writelines(ins + '\n' for ins in clean_institutions)

In [49]:
# Sample string fed to the ASCII filter in the next cell.
faulty = "***"

In [53]:
# Keep only the characters of `faulty` whose code point is below 128 (ASCII).
''.join(i for i in faulty if ord(i)<128)

'***'

'***'