# Structure Check

Cómo tener acceso a los JSON internos de los registros. 

Es necesario definir un schema. Abajo se ve cómo inferir schema a partir de un ejemplo

In [1]:
import pandas as pd
import matplotlib.pyplot as plt

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as F


spark = SparkSession \
    .builder \
    .appName("Learning Spark") \
    .getOrCreate()

date_str = "20230724"

#df = spark.read.json("harvester/occ/tests/test-gdl.jsonl")
#df = spark.read.json("harvester/occ/tests/test-cdmx/sneaky*.jsonl.gz")
#df = spark.read.json("test-cdmx.jsonl") #from ETL
#df = spark.read.json("test-full-occ.jsonl") #from ETL
df = spark.read.json(f"occ-{date_str}.jsonl")
categories = spark.read.json(f"occ-{date_str}-categories.json")
subcategories = spark.read.json(f"occ-{date_str}-subcategories.json")

In [2]:
#categories.count(), categories.printSchema()
#subcategories.show()

In [3]:
df.count(), df.printSchema()

root
 |-- Job:value: string (nullable = true)
 |-- __typename: string (nullable = true)
 |-- applied: boolean (nullable = true)
 |-- autoinclusion: struct (nullable = true)
 |    |-- __typename: string (nullable = true)
 |    |-- autoinclusion: boolean (nullable = true)
 |    |-- source: string (nullable = true)
 |-- bullets: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- __ref: string (nullable = true)
 |-- category: struct (nullable = true)
 |    |-- __ref: string (nullable = true)
 |-- company: struct (nullable = true)
 |    |-- __typename: string (nullable = true)
 |    |-- confidential: boolean (nullable = true)
 |    |-- logoUrl: string (nullable = true)
 |    |-- name: string (nullable = true)
 |    |-- namePretty: string (nullable = true)
 |    |-- profile: struct (nullable = true)
 |    |    |-- __ref: string (nullable = true)
 |    |-- rel: string (nullable = true)
 |    |-- url: string (nullable = true)
 |-- contact: struct (nullable =

(137284, None)

In [4]:
#df.toPandas()
df.show(1, vertical=True, truncate=False)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [5]:
df.count(), df.select('id').distinct().count()

(137284, 133901)

In [6]:
df.groupby("redirect.type").count().show()

+----+-----+
|type|count|
+----+-----+
|   0|77150|
|   1| 3840|
|   2|56294|
+----+-----+



In [7]:
df.groupby("jobType").count().show()

+--------+------+
| jobType| count|
+--------+------+
| PREMIUM|  3170|
|STANDOUT| 28321|
| CLASSIC|105793|
+--------+------+



In [8]:
df.groupby("jobType").pivot("redirect.type").count().show()

+--------+-----+----+-----+
| jobType|    0|   1|    2|
+--------+-----+----+-----+
| PREMIUM| 3145|  25| null|
|STANDOUT|27851| 312|  158|
| CLASSIC|46154|3503|56136|
+--------+-----+----+-----+



In [9]:
pdf = df.groupby("jobType").pivot("redirect.type").count().toPandas().fillna(0).set_index('jobType')
pdf.columns = ['NoRedir', 'Redir1', 'Redir2']
pdf.div(pdf.sum(axis=1), axis=0)

Unnamed: 0_level_0,NoRedir,Redir1,Redir2
jobType,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
PREMIUM,0.992114,0.007886,0.0
STANDOUT,0.983405,0.011017,0.005579
CLASSIC,0.436267,0.033112,0.530621


In [10]:
pdf.div(pdf.sum(axis=0), axis=1)

Unnamed: 0_level_0,NoRedir,Redir1,Redir2
jobType,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
PREMIUM,0.040765,0.00651,0.0
STANDOUT,0.360998,0.08125,0.002807
CLASSIC,0.598237,0.91224,0.997193


In [11]:
pdf.div(pdf.sum(axis=None))

Unnamed: 0_level_0,NoRedir,Redir1,Redir2
jobType,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
PREMIUM,0.040765,0.00651,0.0
STANDOUT,0.360998,0.08125,0.002807
CLASSIC,0.598237,0.91224,0.997193


In [12]:
cat_count = df.groupby("category").count().orderBy(F.col("count").desc())
split_col = F.split(cat_count["category.__ref"].cast("String"), ":")
cat_count = cat_count.withColumn("category_id", split_col.getItem(1).cast("INT"))
cat_count = cat_count.join(categories.select("id", "description"), cat_count.category_id == categories.id, how="inner")
cat_count = cat_count.select(["category_id", "description", "count"]).orderBy(F.col("count").desc())

In [13]:
sum = cat_count.select(F.sum("count")).collect()[0][0]
cat_count = cat_count.withColumn("percentage", F.col("count") / sum)
cat_count.show(cat_count.count(), truncate=False)

+-----------+------------------------------------------------+-----+---------------------+
|category_id|description                                     |count|percentage           |
+-----------+------------------------------------------------+-----+---------------------+
|19         |Ventas                                          |29980|0.21837941784913026  |
|1          |Administrativo                                  |16111|0.11735526354127211  |
|17         |Tecnologías de la Información - Sistemas        |15055|0.10966317997727339  |
|4          |Contabilidad - Finanzas                         |13767|0.10028116896360828  |
|10         |Logística - Transporte - Distribución - Almacén |11300|0.08231112147081962  |
|11         |Manufactura - Producción - Operación            |10276|0.07485213134815419  |
|9          |Ingeniería                                      |7938 |0.057821741790740365 |
|21         |Atención a clientes - Call Center               |6989 |0.05090906442119985  |

In [14]:
split_col = F.split(df["category.__ref"].cast("String"), ":")
cat_count_redir = df.withColumn("category_id", split_col.getItem(1).cast("INT"))
cat_count_redir = cat_count_redir.groupby("category_id").pivot("redirect.type").count()
cat_count_redir = cat_count_redir.join(categories.select("id", "description"), cat_count_redir.category_id == categories.id, how="inner")
cat_count_redir = cat_count_redir.select(["description", "0", "1", "2"]).sort(F.col("0").desc()).toPandas().fillna(0)
cat_count_redir.columns = ["Category", "NoRedir", "Redir1", "Redir2(AGG)"]
cat_count_redir = cat_count_redir.set_index("Category")
cat_count_redir = cat_count_redir.div(cat_count_redir.sum(axis=1), axis=0)
cat_count_redir

Unnamed: 0_level_0,NoRedir,Redir1,Redir2(AGG)
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ventas,0.546898,0.037525,0.415577
Contabilidad - Finanzas,0.650033,0.029128,0.32084
Tecnologías de la Información - Sistemas,0.534773,0.039987,0.425241
Logística - Transporte - Distribución - Almacén,0.576637,0.011062,0.412301
Administrativo,0.403637,0.016883,0.57948
Manufactura - Producción - Operación,0.536687,0.016933,0.44638
Ingeniería,0.628874,0.025699,0.345427
Recursos humanos,0.738235,0.031801,0.229963
Atención a clientes - Call Center,0.432823,0.023036,0.544141
Construcción - Inmobiliaria - Arquitectura,0.830593,0.018585,0.150822


In [15]:
cat_count_redir.sort_values("Redir2(AGG)", ascending=False)

Unnamed: 0_level_0,NoRedir,Redir1,Redir2(AGG)
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
,0.0,0.0,1.0
Servicios generales - Oficios - Seguridad,0.349676,0.034843,0.61548
Administrativo,0.403637,0.016883,0.57948
Atención a clientes - Call Center,0.432823,0.023036,0.544141
Turismo - Hospitalidad - Gastronomía,0.477372,0.009138,0.51349
Manufactura - Producción - Operación,0.536687,0.016933,0.44638
Tecnologías de la Información - Sistemas,0.534773,0.039987,0.425241
Ventas,0.546898,0.037525,0.415577
Logística - Transporte - Distribución - Almacén,0.576637,0.011062,0.412301
Ingeniería,0.628874,0.025699,0.345427


In [16]:
subcat_count = df.groupby("subcategory").count().orderBy(F.col("count").desc())
split_col = F.split(subcat_count["subcategory.__ref"].cast("String"), ":")
subcat_count = subcat_count.withColumn("subcategory_id", split_col.getItem(1).cast("INT"))
subcat_count = subcat_count.join(subcategories.select("id", "description"), subcat_count.subcategory_id == subcategories.id, how="inner")
subcat_count.orderBy(F.col("count").desc()).show(25)

+--------------------+-----+--------------+---+--------------------+
|         subcategory|count|subcategory_id| id|         description|
+--------------------+-----+--------------+---+--------------------+
| {JobSubcategory:29}| 9429|            29| 29|      Administración|
|{JobSubcategory:319}| 6166|           319|319|   Ventas en general|
|{JobSubcategory:260}| 5943|           260|260| Atención al cliente|
|{JobSubcategory:242}| 4001|           242|242|Desarrollo de sof...|
|{JobSubcategory:149}| 3998|           149|149|Almacén - Inventario|
|{JobSubcategory:275}| 3470|           275|275|Administración de...|
| {JobSubcategory:25}| 3268|            25| 25|  Crédito y cobranza|
|{JobSubcategory:286}| 3191|           286|286|     Ventas de campo|
| {JobSubcategory:78}| 3059|            78| 78|        Contabilidad|
|{JobSubcategory:190}| 2107|           190|190|Reclutamiento y s...|
|{JobSubcategory:424}| 1950|           424|424|              Cajero|
| {JobSubcategory:30}| 1856|      

In [17]:
split_col = F.split(df["subcategory.__ref"].cast("String"), ":")
subcat_count_redir = df.withColumn("subcategory_id", split_col.getItem(1).cast("INT"))
subcat_count_redir = subcat_count_redir.groupby("subcategory_id").pivot("redirect.type").count()
subcat_count_redir = subcat_count_redir.join(subcategories.select("id", "description"), subcat_count_redir.subcategory_id == subcategories.id, how="inner")
subcat_count_redir = subcat_count_redir.select(["description", "0", "1", "2"]).sort(F.col("0").desc()).toPandas().fillna(0)
subcat_count_redir.columns = ["subcategory", "NoRedir", "Redir1", "Redir2(AGG)"]
subcat_count_redir = subcat_count_redir.set_index("subcategory")
subcat_count_redir = subcat_count_redir.div(subcat_count_redir.sum(axis=1), axis=0)
pd.set_option('display.max_rows', None)
subcat_count_redir

Unnamed: 0_level_0,NoRedir,Redir1,Redir2(AGG)
subcategory,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ventas en general,0.570386,0.046221,0.383393
Contabilidad,0.787839,0.010461,0.2017
Administración de ventas,0.641499,0.039193,0.319308
Atención al cliente,0.347131,0.020528,0.632341
Administración,0.212324,0.009969,0.777707
Ventas de campo,0.621122,0.036666,0.342212
Almacén - Inventario,0.417959,0.011756,0.570285
Reclutamiento y selección de personal,0.789274,0.019459,0.191267
Desarrollo de software - Programador,0.408648,0.069733,0.52162
Crédito y cobranza,0.418911,0.018666,0.562424


In [18]:
subcat_count_redir.sort_values("Redir2(AGG)", ascending=False)

Unnamed: 0_level_0,NoRedir,Redir1,Redir2(AGG)
subcategory,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
,0.0,0.0,1.0
Ingeniería genética,0.0,0.0,1.0
Ilustración,0.0,0.0,1.0
Taquillero,0.0,0.0,1.0
Plataformas,0.0,0.0,1.0
Pedicurista,0.0,0.0,1.0
Mesero,0.066667,0.0,0.933333
Biotecnología,0.071429,0.0,0.928571
Música,0.090909,0.0,0.909091
Camarista,0.115385,0.0,0.884615


In [19]:
df.groupby("googleForJobs").count().sort(F.col("count").desc()).show(truncate=False)

+-----------------------------+------+
|googleForJobs                |count |
+-----------------------------+------+
|{GoogleForJobs, true, true}  |135814|
|{GoogleForJobs, false, false}|1470  |
+-----------------------------+------+



In [20]:
subcategories.where("description = 'Flebotomía'").show(truncate=False)

+--------------+-----------+---+-------------------------------------------+
|__typename    |description|id |url                                        |
+--------------+-----------+---+-------------------------------------------+
|JobSubcategory|Flebotomía |401|empleos/trabajo-en-sector-salud-flebotomia/|
+--------------+-----------+---+-------------------------------------------+



In [35]:
company_count = df.groupby("company.url").count().sort(F.col("count").desc())
print(company_count.count())
company_count.show(truncate=False)

14380
+--------------------------------------------------+-----+
|url                                               |count|
+--------------------------------------------------+-----+
|null                                              |16764|
|empleos/bolsa-de-trabajo-Test-and-QA-Corporation/ |9207 |
|empleos/bolsa-de-trabajo-grupo-salinas/           |7591 |
|empleos/bolsa-de-trabajo-Un-Mejor-Empleo/         |3190 |
|empleos/bolsa-de-trabajo-Trabajos-Diarios-MX/     |2985 |
|empleos/bolsa-de-trabajo-Grupo-Salinas/           |1886 |
|empleos/bolsa-de-trabajo-gepp/                    |1572 |
|empleos/bolsa-de-trabajo-truper/                  |1452 |
|empleos/bolsa-de-trabajo-Adecco/                  |952  |
|empleos/bolsa-de-trabajo-Manpower--S-A--de-C-V-/  |916  |
|empleos/bolsa-de-trabajo-Banamex/                 |844  |
|empleos/bolsa-de-trabajo-Santander/               |763  |
|empleos/bolsa-de-trabajo-Autofinauto/             |710  |
|empleos/bolsa-de-trabajo-CitiGroup/              

In [22]:
sum = company_count.select(F.sum(F.col("count"))).collect()[0][0]
sum

137284

In [23]:
from pyspark.sql.window import Window
ccp = company_count.withColumn("perc", F.col("count") / sum).orderBy(F.col("perc").desc())
window = Window.orderBy(F.col("perc").desc()).rowsBetween(Window.unboundedPreceding, Window.currentRow)
ccp.withColumn("cumsum", F.sum(F.col("perc")).over(window)).show(50, truncate=False)

+------------------------------------------------------------------------+-----+---------------------+-------------------+
|url                                                                     |count|perc                 |cumsum             |
+------------------------------------------------------------------------+-----+---------------------+-------------------+
|null                                                                    |16764|0.12211182657847965  |0.12211182657847965|
|empleos/bolsa-de-trabajo-Test-and-QA-Corporation/                       |9207 |0.06706535357361382  |0.18917718015209345|
|empleos/bolsa-de-trabajo-grupo-salinas/                                 |7591 |0.05529413478628245  |0.2444713149383759 |
|empleos/bolsa-de-trabajo-Un-Mejor-Empleo/                               |3190 |0.023236502432912795 |0.2677078173712887 |
|empleos/bolsa-de-trabajo-Trabajos-Diarios-MX/                           |2985 |0.021743247574371377 |0.2894510649456601 |
|empleos/bolsa-d

In [24]:
df.where("company.url='empleos/bolsa-de-trabajo-Manpower--S-A--de-C-V-/'").count()

916

In [25]:
df.select("company.name").where("company.confidential=TRUE").count()

16764

In [26]:
company_count = df.where("redirect.type != 2").groupby("company.url").count().sort(F.col("count").desc())
company_count.show(truncate=False)

+-----------------------------------------------------------------------+-----+
|url                                                                    |count|
+-----------------------------------------------------------------------+-----+
|null                                                                   |16188|
|empleos/bolsa-de-trabajo-Grupo-Salinas/                                |1886 |
|empleos/bolsa-de-trabajo-Adecco/                                       |952  |
|empleos/bolsa-de-trabajo-Manpower--S-A--de-C-V-/                       |916  |
|empleos/bolsa-de-trabajo-Banamex/                                      |844  |
|empleos/bolsa-de-trabajo-Autofinauto/                                  |710  |
|empleos/bolsa-de-trabajo-Banorte/                                      |492  |
|empleos/bolsa-de-trabajo-Santander/                                    |466  |
|empleos/bolsa-de-trabajo-Michael-Page/                                 |360  |
|empleos/bolsa-de-trabajo-BBVA-Bancomer/

In [27]:
company_count = df.where("redirect.type == 2").groupby("company.url").count().sort(F.col("count").desc())
company_count.show(truncate=False)

+---------------------------------------------------------+-----+
|url                                                      |count|
+---------------------------------------------------------+-----+
|empleos/bolsa-de-trabajo-Test-and-QA-Corporation/        |9207 |
|empleos/bolsa-de-trabajo-grupo-salinas/                  |7591 |
|empleos/bolsa-de-trabajo-Un-Mejor-Empleo/                |3190 |
|empleos/bolsa-de-trabajo-Trabajos-Diarios-MX/            |2985 |
|empleos/bolsa-de-trabajo-gepp/                           |1572 |
|empleos/bolsa-de-trabajo-truper/                         |1452 |
|empleos/bolsa-de-trabajo-CitiGroup/                      |700  |
|empleos/bolsa-de-trabajo-bairesdev/                      |619  |
|null                                                     |576  |
|empleos/bolsa-de-trabajo-Workable-ATS/                   |439  |
|empleos/bolsa-de-trabajo-marriott-international/         |349  |
|empleos/bolsa-de-trabajo-EnsenadaHoy-com/                |342  |
|empleos/b

In [28]:
df.where("company.url == 'empleos/bolsa-de-trabajo-bairesdev/'").first()

Row(Job:value='Job:17036943', __typename='Job', applied=False, autoinclusion=Row(__typename='JobAutoinclusion', autoinclusion=True, source='asp'), bullets=[], category=Row(__ref='JobCategory:17'), company=Row(__typename='JobCompany', confidential=False, logoUrl='https://cdn-h4.occ.com.mx/images/logos/300x300/01/01RfAPA_ZZ_STrdMpwQ-DQ2.jpg', name='bairesdev', namePretty='bairesdev', profile=None, rel=None, url='empleos/bolsa-de-trabajo-bairesdev/'), contact=Row(__typename='JobContact', emailAddress='', name='', phoneNumber='', showContactInfo=False), dates=Row(__typename='JobDates', active='2023-07-23 20:52:32', expires='2023-07-25 00:00:00', publish='2023-07-12 00:00:00'), description='Who We are   BairesDev is proud to be the fastest-growing company in America. With people in five continents and world-class clients, we are only as strong as the multicultural teams at the heart of o ...', education=Row(__typename='JobEducation', level='NOT_SPECIFIED', url='empleos/nivel-posgrado/'), fr

In [29]:
df.where("redirect.isRedirected == TRUE").where("autoinclusion.autoinclusion == TRUE").count()

56294

In [30]:
df.where("redirect.isRedirected == TRUE").where("autoinclusion.autoinclusion == FALSE").count()

2178

In [31]:
df.where("redirect.isRedirected == FALSE").where("autoinclusion.autoinclusion == TRUE").count()

0

In [32]:
df.where("redirect.isRedirected == FALSE").where("autoinclusion.autoinclusion == FALSE").count()

78812

In [33]:
df.where("redirect.isRedirected == TRUE")\
    .where("autoinclusion.autoinclusion == TRUE")\
    .select(["scraped_url", "redirect.externalUrl"])\
    .show(truncate=False)

+-------------------------------------------------------------------------------------+-----------------------------------------------------------+
|scraped_url                                                                          |externalUrl                                                |
+-------------------------------------------------------------------------------------+-----------------------------------------------------------+
|https://www.occ.com.mx/empleo/oferta/16979014-senior-contract-administrator/         |https://careers.wbd.com/global/en/job/R000070127           |
|https://www.occ.com.mx/empleo/oferta/17011509-senior-software-developer-web-sdk/     |https://paypal.eightfold.ai/careers/job?pid=274894789183   |
|https://www.occ.com.mx/empleo/oferta/16185010-medico-general/                        |https://empleo.gob.mx/resultados-ss-detalle/empleo/20220541|
|https://www.occ.com.mx/empleo/oferta/17074718-carboy/                                |https://empleo.gob.mx/res