In [1]:
import glob
import gzip
import json
import re

In [2]:
class RegexDict(dict):
    """A ``dict`` whose entries can be selected by matching a regex against the keys.

    Patterns are applied with ``re.match`` semantics: anchored at the start of
    the key, but not at the end unless the pattern itself says so.
    """

    def get_dict_matching(self, regex):
        """Return a plain ``dict`` with the entries whose key matches ``regex``.

        The values are the very objects stored in this mapping (no copy is made).
        """
        pattern = re.compile(regex)
        return {key: value for key, value in self.items() if pattern.match(key)}

    def get_matching(self, regex, value_field=None):
        """Return a list of the values whose key matches ``regex``.

        Values are returned in the mapping's iteration order.

        Args:
            regex: Pattern (string or compiled) matched against every key.
            value_field: Optional field name. When truthy, each returned value
                carries its own key under ``value_field``.

        Returns:
            A list of matching values. Without ``value_field`` these are the
            stored objects themselves. With ``value_field`` each one is a
            shallow ``dict`` copy carrying the extra field, so the lookup no
            longer modifies the data held by this mapping.

        Raises:
            TypeError: If ``value_field`` is given and a matching value is not
                a mapping.
        """
        pattern = re.compile(regex)
        results = []
        for key, value in self.items():
            if not pattern.match(key):
                continue
            if value_field:
                # Annotate a copy instead of writing into the stored value:
                # a read-only query must not leave a side effect behind.
                value = {**value, value_field: key}
            results.append(value)
        return results

In [3]:
# Flatten the raw crawl files into a single JSON-Lines file of job records and
# collect the distinct categories / subcategories seen along the way.
categories = {}
subcategories = {}

files = "harvester/occ/20230724/sneaky_spider*.jsonl"
num_files = 0
records = 0
num_exceptions = 0
total_lines = 0
exceptions = []

# Pin UTF-8 explicitly so reading/writing does not depend on the locale default.
with open("occ-20230724.jsonl", "w", encoding="utf-8") as fo:
    # glob returns paths in arbitrary order; sort so the output order is reproducible.
    for infile in sorted(glob.glob(files)):
        num_files += 1
        print(".", end="")
        # The file name is the same for every line, so match it once per file.
        # A failed match (None) is still surfaced per line inside the try below.
        scraper_uuid = re.match(r"^.*-.*-(.*)-rv.*-b.*.jsonl$", infile)
        with open(infile, encoding="utf-8") as fin:
            for line_num, line in enumerate(fin):
                total_lines += 1
                try:
                    j = json.loads(line)
                    # "jobposting" is itself a JSON string; the job data sits
                    # under props -> pageProps -> initialApolloState.
                    jobp = json.loads(j["jobposting"])
                    jobp = jobp["props"]["pageProps"]["initialApolloState"]
                    jobext = RegexDict(jobp)
                    job = jobext.get_matching(r"Job:.*", value_field="Job:value")[0]
                    job['scraped_at'] = j['scraped_at']
                    job['scraped_url'] = j['url']
                    job['scraped_uuid'] = j['uuid']
                    job['scraped_record_version'] = j['version']
                    job['scraped_identifier'] = j['identifier']
                    job['scraped_crawler'] = j['crawler']
                    job['scraped_by_uuid'] = scraper_uuid.group(1)

                    fo.write(json.dumps(job))
                    fo.write("\n")

                    # NOTE(review): the job line is written before these lookups,
                    # so a line without a JobCategory/JobSubcategory entry ends up
                    # in the output file but is counted as an exception rather
                    # than as a record -- confirm this is intended.
                    cat = jobext.get_matching("JobCategory:.*")[0]
                    subcat = jobext.get_matching("JobSubcategory:.*")[0]
                    categories[cat['id']] = cat
                    subcategories[subcat['id']] = subcat
                    records += 1
                except Exception as ex:
                    # Best effort: remember where and why the line failed, keep going.
                    exceptions.append({'file': infile, 'line': line_num, 'exception': str(ex)})
                    num_exceptions += 1

................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

In [4]:
# One-line run summary: files seen, lines read, records kept, failures logged.
print(
    f"\nFiles: {num_files}"
    f"\tTotal lines: {total_lines} "
    f"\tRecords: {records}"
    f"\tExceptions: {num_exceptions}"
)


Files: 560	Total lines: 137344 	Records: 137284	Exceptions: 60


In [5]:
# Dump the collected failures as JSON Lines: one object per failed input line.
with open("occ-20230724-exceptions.jsonl", "w") as fo:
    for ex in exceptions:
        # print() appends the newline that terminates each JSON-Lines record.
        print(json.dumps(ex), file=fo)

In [6]:
# Persist the distinct categories and subcategories, each file holding one JSON array.
with open("occ-20230724-categories.json", "w") as fo:
    fo.write(json.dumps(list(categories.values())))

with open("occ-20230724-subcategories.json", "w") as fo:
    fo.write(json.dumps(list(subcategories.values())))

# Cell result: number of distinct categories and subcategories collected.
len(categories), len(subcategories)

(24, 326)

In [7]:
# Inspect a sample job record. `jobp` is not defined in this cell: it is the
# value most recently assigned to that name by the extraction loop, so this
# cell only works after that loop has run in the same kernel.
jobext = RegexDict(jobp)
jobext.get_matching(r"Job:.*", value_field="Job:value")[0]

{'__typename': 'Job',
 'id': '17080449',
 'url': '/empleo/oferta/17080449-ejecutivo-telefonico-sin-experiencia-contratacion-i?rank=1&page=1&sessionid=&userid=&uuid=accebb12-c763-4ef9-b43a-e06792b4f402&origin=unknown&type=0&ai=false&ais=&showseo=true&returnURL=%2Fempleos-en-mexico-y-el-mundo%3Fsessionid%3D%26userid%3D%231',
 'title': 'Ejecutivo Telefónico - Sin Experiencia (Contratación I',
 'description': 'MEGA DIRECT, Empresa en desarrollo con un portafolio de marcas líderes y consumo masivo a nivel nacional. te invita a formar parte de su gran familia como:     Ejecutivo de Telefónicos (Contamos con c ...',
 'jobType': 'CLASSIC',
 'salary': {'__typename': 'JobSalary',
  'show': True,
  'from': 7000,
  'to': 10000,
  'time': 0,
  'performanceCompensation': 1,
  'variableCompensation': 0},
 'location': {'__typename': 'JobLocation',
  'description': 'Álvaro Obregón, Ciudad de México',
  'locations': [{'__typename': 'JobLocationData',
    'city': {'__typename': 'CityLocation',
     'desc

In [8]:
# First JobCategory / JobSubcategory entries found in `jobext`; the trailing
# tuple is the value the cell displays.
cat = jobext.get_matching("JobCategory:.*")[0]
subcat = jobext.get_matching("JobSubcategory:.*")[0]
cat, subcat

({'__typename': 'JobCategory',
  'description': 'Atención a clientes - Call Center',
  'id': '21',
  'url': 'empleos/trabajo-en-atencion-a-clientes-call-center/',
  'rel': None},
 {'__typename': 'JobSubcategory',
  'id': '435',
  'description': 'Call center',
  'url': 'empleos/trabajo-en-atencion-a-clientes-call-center-call-center/'})