In [None]:
import threading
import openai
import os
import random
import datetime
import json
import jsonlines
import sqlite3
from os.path import join
import time

In [None]:
# Input locations. The defaults are the original absolute paths; set the
# HR_DATA_PATH / HR_DATABASE_PATH environment variables to run the notebook
# on another machine without editing this cell.
# Directory holding the `<name>.json` resume files read by read_and_decode().
DATA_PATH = os.environ.get('HR_DATA_PATH', '/media/data/hr/data/resumes_json/')
# SQLite database that is opened with sqlite3.connect() in the next cell.
DATABASE_PATH = os.environ.get('HR_DATABASE_PATH', '/media/data/hr/data/resumes.sqlite')

In [None]:
# Open the resume database and keep one module-level cursor. read_and_decode()
# iterates over `c` directly, so both names must stay defined for later cells.
# NOTE(review): sqlite3.connect() creates an empty database file when the path
# does not exist, so a wrong DATABASE_PATH only shows up later as a
# "no such table" error — confirm the path before running.
# NOTE(review): nothing in the visible cells closes `conn`; confirm it is
# closed once the export has finished.
conn = sqlite3.connect(DATABASE_PATH)
c = conn.cursor()

In [None]:
def sorted_positions(positions):
    """Attach a comparable end date to each position and sort oldest-first.

    Each position's ``EndDate`` mapping is resolved to a ``datetime`` using
    the first recognised key, checked in this order:

    * ``Year``       -> 1 January of that year
    * ``YearMonth``  -> first day of that month (``YYYY-MM``)
    * ``StringDate`` -> the current time, only for the value ``'current'``
    * ``AnyDate``    -> that exact day (``YYYY-MM-DD``)

    The resolved value is stored on the position dict itself under
    ``'RealDate'`` (the input dicts are mutated).

    Positions whose end date cannot be resolved (a ``StringDate`` other than
    ``'current'``, or none of the keys above) are left out of the result.
    Previously such a position either made the final sort fail with
    ``KeyError: 'RealDate'`` or silently inherited the date of the position
    processed just before it.

    Args:
        positions: iterable of dicts, each carrying an ``'EndDate'`` mapping.

    Returns:
        A new list with the datable positions in ascending ``'RealDate'``
        order.

    Raises:
        KeyError: a position has no ``'EndDate'`` entry.
        ValueError: a recognised date string is malformed.
    """
    # One snapshot for every 'current' position: they compare equal, and the
    # stable sort below keeps them in their input order.
    now = datetime.datetime.now()

    dated = []
    for pos in positions:
        end_date = pos['EndDate']
        if 'Year' in end_date:
            real_date = datetime.datetime(int(end_date['Year']), 1, 1)
        elif 'YearMonth' in end_date:
            year, month = [int(x) for x in end_date['YearMonth'].split('-')]
            real_date = datetime.datetime(year, month, 1)
        elif 'StringDate' in end_date:
            if end_date['StringDate'] != 'current':
                # Free-text date we cannot interpret: drop the position.
                continue
            real_date = now
        elif 'AnyDate' in end_date:
            year, month, day = [int(x) for x in end_date['AnyDate'].split('-')]
            real_date = datetime.datetime(year, month, day)
        else:
            # No recognised date key at all: drop the position rather than
            # reuse a stale date from an earlier iteration.
            continue

        pos['RealDate'] = real_date
        dated.append(pos)

    return sorted(dated, key=lambda x: x['RealDate'])

In [None]:
import os

def decode(contents):
    """Return the bytes in ``contents`` decoded to ``str`` as Latin-1."""
    return contents.decode("latin_1")

def read_and_decode(directory):
    """Yield the sorted position history of every resume in ``llm_batch_1``.

    For each row returned by ``SELECT * from llm_batch_1`` on the module-level
    cursor ``c``, the second column (``row[1]``) is used as the file stem of
    ``<directory>/<stem>.json``. That file is read as bytes, decoded with
    ``decode()`` and parsed as JSON. Documents whose ``Info.Code`` is not
    ``"Success"`` are skipped. ``Value.ParsedDocument`` is itself a JSON
    string; its ``Resume.StructuredXMLResume.EmploymentHistory.EmployerOrg``
    list is walked and every position that has ``Title``, ``EndDate`` and
    ``Description`` is collected, then ordered with ``sorted_positions()``.

    Any exception raised while opening, reading or parsing one resume is
    printed and that resume is skipped, so one missing or malformed file no
    longer aborts the whole batch (``open()`` used to sit outside the
    ``try``). The file is closed before the result is yielded.

    Args:
        directory: directory that contains the ``<stem>.json`` resume files.

    Yields:
        ``(positions, filepath)`` where ``positions`` is the list returned by
        ``sorted_positions()`` and ``filepath`` is the path that was read.
    """
    # NOTE(review): the generator iterates the shared cursor `c` lazily;
    # running another query on `c` while consuming it would presumably
    # disturb this iteration — confirm no consumer does that.
    for row in c.execute('SELECT * from llm_batch_1'):
        filepath = os.path.join(directory, row[1] + '.json')
        try:
            with open(filepath, "rb") as file:
                contents = file.read()

            outer_json = json.loads(decode(contents))
            if outer_json['Info']['Code'] != "Success":
                continue

            # The parsed resume is stored as a JSON string inside the outer JSON.
            parsed_document = json.loads(outer_json["Value"]["ParsedDocument"])
            employment_history = parsed_document["Resume"]["StructuredXMLResume"]["EmploymentHistory"]["EmployerOrg"]

            positions = []
            for emp in employment_history:
                for pos in emp['PositionHistory']:
                    # Keep only positions that carry all three required fields.
                    if 'Description' not in pos:
                        continue
                    if 'EndDate' not in pos or 'Title' not in pos:
                        continue
                    positions.append({'Title': pos['Title'], 'EndDate': pos['EndDate'], 'Description': pos['Description']})

            positions = sorted_positions(positions)
        except Exception as e:
            print(f'Error reading JSON: {e}')
            continue

        yield positions, filepath

# Not used by the visible cells; kept so the notebook namespace is unchanged.
dataset_chunk = []

# Export one line per resume: the JSON-encoded list of its sorted positions.
with open('batch_1_job_titles.json', 'w') as f:
    for i, (positions, filepath) in enumerate(read_and_decode(DATA_PATH)):
        # default=str turns the datetime stored under 'RealDate' into text;
        # print() appends the trailing newline that separates the records.
        print(json.dumps(positions, default=str), file=f)

# Prepare for training

In [None]:
# Build (previous titles -> latest job description) examples from the export.
train_data = []
val_data = []
with open('batch_1_job_titles.json', 'r') as f:
    for i, line in enumerate(f):
        experiences = json.loads(line)
        # Only resumes with at least three positions are used.
        if len(experiences) > 2:
            # Positions are stored oldest-first, so the last one is the target.
            *earlier_jobs, last_job = experiences
            titles = '</s><s>'.join([job["Title"] for job in earlier_jobs])

            last_job_description = last_job["Description"]
            # Descriptions shorter than 30 characters are discarded.
            if len(last_job_description) >= 30:
                datapoint = {"prev": titles, "cur": last_job_description}
                # The split is by line index in the export file, so lines
                # filtered out above still count towards the 5000 threshold.
                (val_data if i <= 5000 else train_data).append(datapoint)

In [None]:
# Persist both splits as JSON Lines files, training set first.
for out_name, records in (('titles_train.jsonl', train_data),
                          ('titles_val.jsonl', val_data)):
    with jsonlines.open(out_name, mode='w') as writer:
        writer.write_all(records)