# Structure Check

Cómo tener acceso a los JSON internos de los registros. 

Es necesario definir un schema. Abajo se ve cómo inferir schema a partir de un ejemplo

In [1]:
import pandas as pd
import matplotlib.pyplot as plt

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as F


spark = SparkSession \
    .builder \
    .appName("Learning Spark") \
    .getOrCreate()

date_str = "20230802"

#df = spark.read.json("harvester/occ/tests/test-gdl.jsonl")
#df = spark.read.json("harvester/occ/tests/test-cdmx/sneaky*.jsonl.gz")
#df = spark.read.json("test-cdmx.jsonl") #from ETL
#df = spark.read.json("test-full-occ.jsonl") #from ETL
df = spark.read.json(f"occ-{date_str}.jsonl")
categories = spark.read.json(f"occ-{date_str}-categories.json")
subcategories = spark.read.json(f"occ-{date_str}-subcategories.json")

In [2]:
#categories.count(), categories.printSchema()
#subcategories.show()

In [3]:
df.count(), df.printSchema()

root
 |-- Job:value: string (nullable = true)
 |-- __typename: string (nullable = true)
 |-- applied: boolean (nullable = true)
 |-- autoinclusion: struct (nullable = true)
 |    |-- __typename: string (nullable = true)
 |    |-- autoinclusion: boolean (nullable = true)
 |    |-- source: string (nullable = true)
 |-- bullets: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- __ref: string (nullable = true)
 |-- category: struct (nullable = true)
 |    |-- __ref: string (nullable = true)
 |-- company: struct (nullable = true)
 |    |-- __typename: string (nullable = true)
 |    |-- confidential: boolean (nullable = true)
 |    |-- logoUrl: string (nullable = true)
 |    |-- name: string (nullable = true)
 |    |-- namePretty: string (nullable = true)
 |    |-- profile: struct (nullable = true)
 |    |    |-- __ref: string (nullable = true)
 |    |-- rel: string (nullable = true)
 |    |-- url: string (nullable = true)
 |-- contact: struct (nullable =

(132275, None)

In [4]:
#df.toPandas()
df.show(1, vertical=True, truncate=False)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [5]:
df.count(), df.select('id').distinct().count()

(132275, 129699)

In [6]:
df.groupby("redirect.type").count().show()

+----+-----+
|type|count|
+----+-----+
|   0|76548|
|   1| 3117|
|   2|52610|
+----+-----+



In [7]:
df.groupby("jobType").count().show()

+--------+------+
| jobType| count|
+--------+------+
| PREMIUM|  3038|
|STANDOUT| 28511|
| CLASSIC|100726|
+--------+------+



In [8]:
df.groupby("jobType").pivot("redirect.type").count().show()

+--------+-----+----+-----+
| jobType|    0|   1|    2|
+--------+-----+----+-----+
| PREMIUM| 3005|  33| null|
|STANDOUT|27927| 442|  142|
| CLASSIC|45616|2642|52468|
+--------+-----+----+-----+



In [9]:
pdf = df.groupby("jobType").pivot("redirect.type").count().toPandas().fillna(0).set_index('jobType')
pdf.columns = ['NoRedir', 'Redir1', 'Redir2']
pdf.div(pdf.sum(axis=1), axis=0)

Unnamed: 0_level_0,NoRedir,Redir1,Redir2
jobType,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
PREMIUM,0.989138,0.010862,0.0
STANDOUT,0.979517,0.015503,0.004981
CLASSIC,0.452872,0.02623,0.520898


In [10]:
pdf.div(pdf.sum(axis=0), axis=1)

Unnamed: 0_level_0,NoRedir,Redir1,Redir2
jobType,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
PREMIUM,0.039256,0.010587,0.0
STANDOUT,0.36483,0.141803,0.002699
CLASSIC,0.595914,0.84761,0.997301


In [11]:
pdf.div(pdf.sum(axis=None))

Unnamed: 0_level_0,NoRedir,Redir1,Redir2
jobType,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
PREMIUM,0.039256,0.010587,0.0
STANDOUT,0.36483,0.141803,0.002699
CLASSIC,0.595914,0.84761,0.997301


In [12]:
cat_count = df.groupby("category").count().orderBy(F.col("count").desc())
split_col = F.split(cat_count["category.__ref"].cast("String"), ":")
cat_count = cat_count.withColumn("category_id", split_col.getItem(1).cast("INT"))
cat_count = cat_count.join(categories.select("id", "description"), cat_count.category_id == categories.id, how="inner")
cat_count = cat_count.select(["category_id", "description", "count"]).orderBy(F.col("count").desc())

In [13]:
sum = cat_count.select(F.sum("count")).collect()[0][0]
cat_count = cat_count.withColumn("percentage", F.col("count") / sum)
cat_count.show(cat_count.count(), truncate=False)

+-----------+------------------------------------------------+-----+---------------------+
|category_id|description                                     |count|percentage           |
+-----------+------------------------------------------------+-----+---------------------+
|19         |Ventas                                          |28685|0.21685881685881686  |
|1          |Administrativo                                  |15713|0.1187903987903988   |
|17         |Tecnologías de la Información - Sistemas        |14559|0.11006615006615007  |
|4          |Contabilidad - Finanzas                         |13380|0.10115290115290115  |
|10         |Logística - Transporte - Distribución - Almacén |10699|0.08088452088452089  |
|11         |Manufactura - Producción - Operación            |9936 |0.07511623511623512  |
|9          |Ingeniería                                      |7672 |0.058000378000378    |
|21         |Atención a clientes - Call Center               |6830 |0.05163485163485163  |

In [14]:
split_col = F.split(df["category.__ref"].cast("String"), ":")
cat_count_redir = df.withColumn("category_id", split_col.getItem(1).cast("INT"))
cat_count_redir = cat_count_redir.groupby("category_id").pivot("redirect.type").count()
cat_count_redir = cat_count_redir.join(categories.select("id", "description"), cat_count_redir.category_id == categories.id, how="inner")
cat_count_redir = cat_count_redir.select(["description", "0", "1", "2"]).sort(F.col("0").desc()).toPandas().fillna(0)
cat_count_redir.columns = ["Category", "NoRedir", "Redir1", "Redir2(AGG)"]
cat_count_redir = cat_count_redir.set_index("Category")
cat_count_redir = cat_count_redir.div(cat_count_redir.sum(axis=1), axis=0)
cat_count_redir

Unnamed: 0_level_0,NoRedir,Redir1,Redir2(AGG)
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ventas,0.568938,0.029284,0.401778
Contabilidad - Finanzas,0.665994,0.024813,0.309193
Tecnologías de la Información - Sistemas,0.547359,0.02926,0.423381
Logística - Transporte - Distribución - Almacén,0.608375,0.010655,0.38097
Administrativo,0.408961,0.01241,0.578629
Manufactura - Producción - Operación,0.554046,0.017512,0.428442
Ingeniería,0.629562,0.024244,0.346194
Recursos humanos,0.746005,0.029893,0.224102
Atención a clientes - Call Center,0.449048,0.015666,0.535286
Construcción - Inmobiliaria - Arquitectura,0.838757,0.014991,0.146252


In [15]:
cat_count_redir.sort_values("Redir2(AGG)", ascending=False)

Unnamed: 0_level_0,NoRedir,Redir1,Redir2(AGG)
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
,0.0,0.0,1.0
Administrativo,0.408961,0.01241,0.578629
Servicios generales - Oficios - Seguridad,0.385578,0.039184,0.575238
Atención a clientes - Call Center,0.449048,0.015666,0.535286
Turismo - Hospitalidad - Gastronomía,0.535577,0.010577,0.453846
Manufactura - Producción - Operación,0.554046,0.017512,0.428442
Tecnologías de la Información - Sistemas,0.547359,0.02926,0.423381
Ventas,0.568938,0.029284,0.401778
Logística - Transporte - Distribución - Almacén,0.608375,0.010655,0.38097
Deportes - Salud - Belleza,0.635294,0.011765,0.352941


In [16]:
subcat_count = df.groupby("subcategory").count().orderBy(F.col("count").desc())
split_col = F.split(subcat_count["subcategory.__ref"].cast("String"), ":")
subcat_count = subcat_count.withColumn("subcategory_id", split_col.getItem(1).cast("INT"))
subcat_count = subcat_count.join(subcategories.select("id", "description"), subcat_count.subcategory_id == subcategories.id, how="inner")
subcat_count.orderBy(F.col("count").desc()).show(25)

+--------------------+-----+--------------+---+--------------------+
|         subcategory|count|subcategory_id| id|         description|
+--------------------+-----+--------------+---+--------------------+
| {JobSubcategory:29}| 9203|            29| 29|      Administración|
|{JobSubcategory:319}| 5863|           319|319|   Ventas en general|
|{JobSubcategory:260}| 5764|           260|260| Atención al cliente|
|{JobSubcategory:242}| 3807|           242|242|Desarrollo de sof...|
|{JobSubcategory:149}| 3752|           149|149|Almacén - Inventario|
|{JobSubcategory:275}| 3350|           275|275|Administración de...|
| {JobSubcategory:25}| 3167|            25| 25|  Crédito y cobranza|
|{JobSubcategory:286}| 2991|           286|286|     Ventas de campo|
| {JobSubcategory:78}| 2915|            78| 78|        Contabilidad|
|{JobSubcategory:190}| 2101|           190|190|Reclutamiento y s...|
| {JobSubcategory:30}| 1872|            30| 30|  Control de Calidad|
|{JobSubcategory:424}| 1782|      

In [17]:
split_col = F.split(df["subcategory.__ref"].cast("String"), ":")
subcat_count_redir = df.withColumn("subcategory_id", split_col.getItem(1).cast("INT"))
subcat_count_redir = subcat_count_redir.groupby("subcategory_id").pivot("redirect.type").count()
subcat_count_redir = subcat_count_redir.join(subcategories.select("id", "description"), subcat_count_redir.subcategory_id == subcategories.id, how="inner")
subcat_count_redir = subcat_count_redir.select(["description", "0", "1", "2"]).sort(F.col("0").desc()).toPandas().fillna(0)
subcat_count_redir.columns = ["subcategory", "NoRedir", "Redir1", "Redir2(AGG)"]
subcat_count_redir = subcat_count_redir.set_index("subcategory")
subcat_count_redir = subcat_count_redir.div(subcat_count_redir.sum(axis=1), axis=0)
pd.set_option('display.max_rows', None)
subcat_count_redir

Unnamed: 0_level_0,NoRedir,Redir1,Redir2(AGG)
subcategory,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ventas en general,0.601057,0.026266,0.372676
Contabilidad,0.802058,0.01235,0.185592
Administración de ventas,0.658507,0.034925,0.306567
Atención al cliente,0.359473,0.013012,0.627516
Administración,0.214387,0.006846,0.778768
Ventas de campo,0.649281,0.037111,0.313607
Reclutamiento y selección de personal,0.804379,0.022846,0.172775
Almacén - Inventario,0.442964,0.010661,0.546375
Desarrollo de software - Programador,0.427108,0.04518,0.527712
Crédito y cobranza,0.430692,0.015472,0.553836


In [18]:
subcat_count_redir.sort_values("Redir2(AGG)", ascending=False)

Unnamed: 0_level_0,NoRedir,Redir1,Redir2(AGG)
subcategory,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
,0.0,0.0,1.0
Plataformas,0.0,0.0,1.0
Biología,0.0,0.0,1.0
Taquillero,0.0,0.0,1.0
Ilustración,0.0,0.0,1.0
Pedicurista,0.0,0.0,1.0
Mesero,0.050847,0.0,0.949153
Biotecnología,0.111111,0.0,0.888889
Cajero,0.107183,0.012907,0.87991
Lavandería,0.121212,0.0,0.878788


In [19]:
df.groupby("googleForJobs").count().sort(F.col("count").desc()).show(truncate=False)

+-----------------------------+------+
|googleForJobs                |count |
+-----------------------------+------+
|{GoogleForJobs, true, true}  |130150|
|{GoogleForJobs, false, false}|2125  |
+-----------------------------+------+



In [20]:
subcategories.where("description = 'Flebotomía'").show(truncate=False)

+--------------+-----------+---+-------------------------------------------+
|__typename    |description|id |url                                        |
+--------------+-----------+---+-------------------------------------------+
|JobSubcategory|Flebotomía |401|empleos/trabajo-en-sector-salud-flebotomia/|
+--------------+-----------+---+-------------------------------------------+



In [21]:
company_count = df.groupby("company.url").count().sort(F.col("count").desc())
print(company_count.count())
company_count.show(truncate=False)

14306
+--------------------------------------------------+-----+
|url                                               |count|
+--------------------------------------------------+-----+
|null                                              |16550|
|empleos/bolsa-de-trabajo-grupo-salinas/           |7315 |
|empleos/bolsa-de-trabajo-Test-and-QA-Corporation/ |6886 |
|empleos/bolsa-de-trabajo-Un-Mejor-Empleo/         |3012 |
|empleos/bolsa-de-trabajo-Trabajos-Diarios-MX/     |2892 |
|empleos/bolsa-de-trabajo-Grupo-Salinas/           |1817 |
|empleos/bolsa-de-trabajo-truper/                  |1478 |
|empleos/bolsa-de-trabajo-Adecco/                  |1007 |
|empleos/bolsa-de-trabajo-gepp/                    |999  |
|empleos/bolsa-de-trabajo-Manpower--S-A--de-C-V-/  |911  |
|empleos/bolsa-de-trabajo-Santander/               |726  |
|empleos/bolsa-de-trabajo-Autofinauto/             |723  |
|empleos/bolsa-de-trabajo-bairesdev/               |613  |
|empleos/bolsa-de-trabajo-CitiGroup/              

In [22]:
sum = company_count.select(F.sum(F.col("count"))).collect()[0][0]
sum

132275

In [23]:
from pyspark.sql.window import Window
ccp = company_count.withColumn("perc", F.col("count") / sum).orderBy(F.col("perc").desc())
window = Window.orderBy(F.col("perc").desc()).rowsBetween(Window.unboundedPreceding, Window.currentRow)
ccp.withColumn("cumsum", F.sum(F.col("perc")).over(window)).show(50, truncate=False)

+------------------------------------------------------------------------+-----+---------------------+-------------------+
|url                                                                     |count|perc                 |cumsum             |
+------------------------------------------------------------------------+-----+---------------------+-------------------+
|null                                                                    |16550|0.12511812511812512  |0.12511812511812512|
|empleos/bolsa-de-trabajo-grupo-salinas/                                 |7315 |0.055301455301455305 |0.18041958041958042|
|empleos/bolsa-de-trabajo-Test-and-QA-Corporation/                       |6886 |0.05205821205821206  |0.2324777924777925 |
|empleos/bolsa-de-trabajo-Un-Mejor-Empleo/                               |3012 |0.022770742770742772 |0.25524853524853525|
|empleos/bolsa-de-trabajo-Trabajos-Diarios-MX/                           |2892 |0.021863541863541863 |0.2771120771120771 |
|empleos/bolsa-d

In [24]:
df.where("company.url='empleos/bolsa-de-trabajo-Manpower--S-A--de-C-V-/'").count()

911

In [25]:
df.select("company.name").where("company.confidential=TRUE").count()

16548

In [26]:
company_count = df.where("redirect.type != 2").groupby("company.url").count().sort(F.col("count").desc())
company_count.show(truncate=False)

+-----------------------------------------------------------------------+-----+
|url                                                                    |count|
+-----------------------------------------------------------------------+-----+
|null                                                                   |16058|
|empleos/bolsa-de-trabajo-Grupo-Salinas/                                |1817 |
|empleos/bolsa-de-trabajo-Adecco/                                       |1007 |
|empleos/bolsa-de-trabajo-Manpower--S-A--de-C-V-/                       |911  |
|empleos/bolsa-de-trabajo-Autofinauto/                                  |723  |
|empleos/bolsa-de-trabajo-Banorte/                                      |441  |
|empleos/bolsa-de-trabajo-Santander/                                    |433  |
|empleos/bolsa-de-trabajo-BBVA-Bancomer/                                |348  |
|empleos/bolsa-de-trabajo-Michael-Page/                                 |347  |
|empleos/bolsa-de-trabajo-Femsa/        

In [27]:
company_count = df.where("redirect.type == 2").groupby("company.url").count().sort(F.col("count").desc())
company_count.show(truncate=False)

+---------------------------------------------------+-----+
|url                                                |count|
+---------------------------------------------------+-----+
|empleos/bolsa-de-trabajo-grupo-salinas/            |7315 |
|empleos/bolsa-de-trabajo-Test-and-QA-Corporation/  |6886 |
|empleos/bolsa-de-trabajo-Un-Mejor-Empleo/          |3012 |
|empleos/bolsa-de-trabajo-Trabajos-Diarios-MX/      |2892 |
|empleos/bolsa-de-trabajo-truper/                   |1478 |
|empleos/bolsa-de-trabajo-gepp/                     |999  |
|empleos/bolsa-de-trabajo-bairesdev/                |613  |
|empleos/bolsa-de-trabajo-CitiGroup/                |587  |
|null                                               |492  |
|empleos/bolsa-de-trabajo-continental/              |457  |
|empleos/bolsa-de-trabajo-Workable-ATS/             |431  |
|empleos/bolsa-de-trabajo-marriott-international/   |385  |
|empleos/bolsa-de-trabajo-EnsenadaHoy-com/          |329  |
|empleos/bolsa-de-trabajo-Michael-Page-M

In [28]:
df.where("company.url == 'empleos/bolsa-de-trabajo-bairesdev/'").first()

Row(Job:value='Job:17037114', __typename='Job', applied=False, autoinclusion=Row(__typename='JobAutoinclusion', autoinclusion=True, source='asp'), bullets=[], category=Row(__ref='JobCategory:17'), company=Row(__typename='JobCompany', confidential=False, logoUrl='https://cdn-h4.occ.com.mx/images/logos/300x300/01/01RfAPA_ZZ_STrdMpwQ-DQ2.jpg', name='bairesdev', namePretty='bairesdev', profile=None, rel=None, url='empleos/bolsa-de-trabajo-bairesdev/'), contact=Row(__typename='JobContact', emailAddress='', name='', phoneNumber='', showContactInfo=False), dates=Row(__typename='JobDates', active='2023-08-01 20:50:40', expires='2023-08-03 00:00:00', publish='2023-07-12 00:00:00'), description='BairesDev is proud to be one of the fastest-growing companies in Latin America and a welcoming, highly rated employer (Glassdoor Employee Score: 4.3). With more than 3500 employees in 27 countries and ...', education=Row(__typename='JobEducation', level='NOT_SPECIFIED', url='empleos/nivel-posgrado/'), fr

In [29]:
df.where("redirect.isRedirected == TRUE").where("autoinclusion.autoinclusion == TRUE").count()

52610

In [30]:
df.where("redirect.isRedirected == TRUE").where("autoinclusion.autoinclusion == FALSE").count()

1568

In [31]:
df.where("redirect.isRedirected == FALSE").where("autoinclusion.autoinclusion == TRUE").count()

0

In [32]:
df.where("redirect.isRedirected == FALSE").where("autoinclusion.autoinclusion == FALSE").count()

78097

In [33]:
df.where("redirect.isRedirected == TRUE")\
    .where("autoinclusion.autoinclusion == TRUE")\
    .select(["scraped_url", "redirect.externalUrl"])\
    .show(truncate=False)

+-------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------+
|scraped_url                                                                                      |externalUrl                                                                                                                                        |
+-------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------+
|https://www.occ.com.mx/empleo/oferta/17092509-databricks-engineer/                               |https://www.capgemini.com/jobs/e15kiokBVlin9iX11wIM/databricks-engineer/                                                                           |
|https:/