# Structure Check

Cómo tener acceso a los JSON internos de los registros. 

Es necesario definir un schema. Abajo se muestra cómo inferir el schema a partir de un registro de ejemplo.

In [1]:
import json

import matplotlib.pyplot as plt
import pandas as pd
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Obtain the notebook's Spark session (getOrCreate reuses one if it already exists).
spark = (
    SparkSession.builder
    .appName("Learning Spark")
    .getOrCreate()
)

# Alternative input file, kept for reference:
#df = spark.read.json("harvester/occ/tests/eduardofv-air-2.local-sneaky-20230627_200714.jsonl.gz")

# Load the crawl sample. Every top-level field is read as a string, including
# `jobposting`, which holds a nested JSON document serialized as text.
df = spark.read.json(
    "harvester/occ/tests/test-cdmx/sneaky_spider-20230707_1825-73d8e52733c04c82b8b2a29a3921a483-rv0_0_10-b0001.jsonl.gz"
)
df

DataFrame[crawler: string, identifier: string, jobposting: string, scraped_at: string, search_engine_type: string, url: string, uuid: string, version: string]

In [2]:
df.show()

+------------+----------+--------------------+--------------------+------------------+--------------------+--------------------+-------+
|     crawler|identifier|          jobposting|          scraped_at|search_engine_type|                 url|                uuid|version|
+------------+----------+--------------------+--------------------+------------------+--------------------+--------------------+-------+
|SneakySpider|  16970913|{"props":{"pagePr...|2023-07-07 18:26:...|               JDP|https://www.occ.c...|6452900c-6858-414...| 0.0.10|
|SneakySpider|  16970919|{"props":{"pagePr...|2023-07-07 18:26:...|               JDP|https://www.occ.c...|3e13378f-3a26-464...| 0.0.10|
|SneakySpider|  16970920|{"props":{"pagePr...|2023-07-07 18:26:...|               JDP|https://www.occ.c...|cfc283a4-17a2-485...| 0.0.10|
|SneakySpider|  16970921|{"props":{"pagePr...|2023-07-07 18:26:...|               JDP|https://www.occ.c...|fbd45aa1-2231-496...| 0.0.10|
|SneakySpider|  16970922|{"props":{"pageP

In [3]:
# Decode the first record's raw `jobposting` string into Python objects so the
# nested structure can be inspected. (`json` is imported in the top import cell.)
json.loads(df.select("jobposting").first()["jobposting"])

{'props': {'pageProps': {'initialApolloState': {'CountryLocation:MX': {'__typename': 'CountryLocation',
     'id': 'MX',
     'description': 'México'},
    'JobLevel:4': {'__typename': 'JobLevel', 'id': '4', 'url': None},
    'JobCategory:1': {'__typename': 'JobCategory',
     'description': 'Administrativo',
     'id': '1',
     'url': 'empleos/trabajo-en-administrativo/',
     'rel': None},
    'JobSubcategory:29': {'__typename': 'JobSubcategory',
     'id': '29',
     'description': 'Administración',
     'url': 'empleos/trabajo-en-administrativo-administracion/'},
    'JobRequisition:0': {'__typename': 'JobRequisition',
     'id': '0',
     'hasIdRequisicion': True},
    'CompanyProfile:876781': {'__typename': 'CompanyProfile',
     'id': '876781',
     'name': 'citibanamex',
     'url': '/perfiles-empresas/876781/opiniones',
     'summary': {'__typename': 'SummaryCompanyProfile',
      'reviews': {'__typename': 'CompanyProfileReviews', 'count': 256},
      'companyRating': {'__typ

In [4]:
# Persist one raw `jobposting` document to disk so Spark can infer a schema from it.
# The encoding is pinned to UTF-8 because the payload contains non-ASCII text
# (e.g. "México"); without it open() falls back to the platform's locale
# encoding, which can fail on write or produce a file Spark decodes incorrectly.
with open("occ_jobposting_sample.json", "w", encoding="utf-8") as file:
    file.write(df.select("jobposting").first()["jobposting"])

In [5]:
# Infer a Spark schema from the single sample document written to disk.
# multiLine makes the reader treat the file as one JSON document instead of
# one record per line.
# NOTE(review): the schema comes from one record only, so it contains
# record-specific keys (e.g. 'CompanyProfile:876781') and will lack any field
# that only appears in other records -- confirm this is acceptable downstream.
jsonobj = (
    spark.read
    .option("multiLine", True)
    .json("occ_jobposting_sample.json")
)
schema = jsonobj.schema
schema

StructType([StructField('assetPrefix', StringType(), True), StructField('buildId', StringType(), True), StructField('dynamicIds', ArrayType(LongType(), True), True), StructField('gssp', BooleanType(), True), StructField('isFallback', BooleanType(), True), StructField('page', StringType(), True), StructField('props', StructType([StructField('__N_SSP', BooleanType(), True), StructField('pageProps', StructType([StructField('applyFromEmail', StringType(), True), StructField('initialApolloState', StructType([StructField('CompanyProfile:876781', StructType([StructField('__typename', StringType(), True), StructField('id', StringType(), True), StructField('name', StringType(), True), StructField('summary', StructType([StructField('__typename', StringType(), True), StructField('benefitsAndPerks', StructType([StructField('__typename', StringType(), True), StructField('average', DoubleType(), True), StructField('averagePercentage', DoubleType(), True)]), True), StructField('careerOpportunity', St

In [6]:
# Parse the raw JSON string into a typed struct column `jp` using the inferred
# schema; the original `jobposting` string column is kept alongside it.
dfexp = df.withColumn(
    "jp",
    F.from_json("jobposting", schema),
)
dfexp.printSchema()

root
 |-- crawler: string (nullable = true)
 |-- identifier: string (nullable = true)
 |-- jobposting: string (nullable = true)
 |-- scraped_at: string (nullable = true)
 |-- search_engine_type: string (nullable = true)
 |-- url: string (nullable = true)
 |-- uuid: string (nullable = true)
 |-- version: string (nullable = true)
 |-- jp: struct (nullable = true)
 |    |-- assetPrefix: string (nullable = true)
 |    |-- buildId: string (nullable = true)
 |    |-- dynamicIds: array (nullable = true)
 |    |    |-- element: long (containsNull = true)
 |    |-- gssp: boolean (nullable = true)
 |    |-- isFallback: boolean (nullable = true)
 |    |-- page: string (nullable = true)
 |    |-- props: struct (nullable = true)
 |    |    |-- __N_SSP: boolean (nullable = true)
 |    |    |-- pageProps: struct (nullable = true)
 |    |    |    |-- applyFromEmail: string (nullable = true)
 |    |    |    |-- initialApolloState: struct (nullable = true)
 |    |    |    |    |-- CompanyProfile:876781:

In [7]:
# Nested fields of the parsed struct can be addressed with dot notation.
job_ids = dfexp.select("jp.props.pageProps.jobId")
job_ids.show()

+--------+
|   jobId|
+--------+
|16970913|
|16970919|
|16970920|
|16970921|
|16970922|
|16970937|
|16970939|
|16994937|
|16970947|
|16857294|
|16966477|
|16966486|
|16966497|
|16966500|
|16966503|
|16966507|
|16708999|
|16966521|
|16553358|
|16996019|
+--------+
only showing top 20 rows

