# Structure Check

Cómo tener acceso a los JSON internos de los registros. 

Es necesario definir un schema. Abajo se ve cómo inferir schema a partir de un ejemplo

In [1]:
import pandas as pd
import matplotlib.pyplot as plt

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as F


spark = SparkSession \
    .builder \
    .appName("Learning Spark") \
    .getOrCreate()

#df = spark.read.json("harvester/occ/tests/test-gdl.jsonl")
#df = spark.read.json("harvester/occ/tests/test-cdmx/sneaky*.jsonl.gz")
#df = spark.read.json("test-cdmx.jsonl") #from ETL
#df = spark.read.json("test-full-occ.jsonl") #from ETL
df = spark.read.json("occ-20230714.jsonl")
categories = spark.read.json("categories.json")
subcategories = spark.read.json("subcategories.json")

In [2]:
#categories.count(), categories.printSchema()
#subcategories.show()

In [3]:
df.count(), df.printSchema()

root
 |-- Job:value: string (nullable = true)
 |-- __typename: string (nullable = true)
 |-- applied: boolean (nullable = true)
 |-- autoinclusion: struct (nullable = true)
 |    |-- __typename: string (nullable = true)
 |    |-- autoinclusion: boolean (nullable = true)
 |    |-- source: string (nullable = true)
 |-- bullets: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- __ref: string (nullable = true)
 |-- category: struct (nullable = true)
 |    |-- __ref: string (nullable = true)
 |-- company: struct (nullable = true)
 |    |-- __typename: string (nullable = true)
 |    |-- confidential: boolean (nullable = true)
 |    |-- logoUrl: string (nullable = true)
 |    |-- name: string (nullable = true)
 |    |-- namePretty: string (nullable = true)
 |    |-- profile: struct (nullable = true)
 |    |    |-- __ref: string (nullable = true)
 |    |-- rel: string (nullable = true)
 |    |-- url: string (nullable = true)
 |-- contact: struct (nullable =

(142015, None)

In [4]:
#df.toPandas()
df.show(1, vertical=True, truncate=False)

-RECORD 0----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 Job:value              | Job:17038265                                                                                                                                                                                                                                                                                                                 
 __typename             | Job                                                                                                                                                                                                                                                                                           

In [5]:
df.count(), df.select('id').distinct().count()

(142015, 139142)

In [6]:
df.groupby("redirect.type").count().show()

+----+-----+
|type|count|
+----+-----+
|   0|81815|
|   1| 4046|
|   2|56154|
+----+-----+



In [7]:
df.groupby("jobType").count().show()

+--------+------+
| jobType| count|
+--------+------+
| PREMIUM|  3180|
|STANDOUT| 30031|
| CLASSIC|108804|
+--------+------+



In [8]:
df.groupby("jobType").pivot("redirect.type").count().show()

+--------+-----+----+-----+
| jobType|    0|   1|    2|
+--------+-----+----+-----+
| PREMIUM| 3157|  23| null|
|STANDOUT|29457| 350|  224|
| CLASSIC|49201|3673|55930|
+--------+-----+----+-----+



In [9]:
pdf = df.groupby("jobType").pivot("redirect.type").count().toPandas().fillna(0).set_index('jobType')
pdf.columns = ['NoRedir', 'Redir1', 'Redir2']
pdf.div(pdf.sum(axis=1), axis=0)

Unnamed: 0_level_0,NoRedir,Redir1,Redir2
jobType,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
PREMIUM,0.992767,0.007233,0.0
STANDOUT,0.980886,0.011655,0.007459
CLASSIC,0.452198,0.033758,0.514044


In [10]:
pdf.div(pdf.sum(axis=0), axis=1)

Unnamed: 0_level_0,NoRedir,Redir1,Redir2
jobType,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
PREMIUM,0.038587,0.005685,0.0
STANDOUT,0.360044,0.086505,0.003989
CLASSIC,0.601369,0.90781,0.996011


In [11]:
pdf.div(pdf.sum(axis=None))

Unnamed: 0_level_0,NoRedir,Redir1,Redir2
jobType,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
PREMIUM,0.038587,0.005685,0.0
STANDOUT,0.360044,0.086505,0.003989
CLASSIC,0.601369,0.90781,0.996011


In [12]:
cat_count = df.groupby("category").count().orderBy(F.col("count").desc())
split_col = F.split(cat_count["category.__ref"].cast("String"), ":")
cat_count = cat_count.withColumn("category_id", split_col.getItem(1).cast("INT"))
cat_count = cat_count.join(categories.select("id", "description"), cat_count.category_id == categories.id, how="inner")
cat_count = cat_count.select(["category_id", "description", "count"]).orderBy(F.col("count").desc())

In [13]:
sum = cat_count.select(F.sum("count")).collect()[0][0]
cat_count = cat_count.withColumn("percentage", F.col("count") / sum)
cat_count.show(cat_count.count(), truncate=False)

+-----------+------------------------------------------------+-----+---------------------+
|category_id|description                                     |count|percentage           |
+-----------+------------------------------------------------+-----+---------------------+
|19         |Ventas                                          |30356|0.21375206844347427  |
|1          |Administrativo                                  |17105|0.12044502341301976  |
|17         |Tecnologías de la Información - Sistemas        |15401|0.10844629088476569  |
|4          |Contabilidad - Finanzas                         |14145|0.09960215470196811  |
|10         |Logística - Transporte - Distribución - Almacén |11732|0.0826109917966412   |
|11         |Manufactura - Producción - Operación            |10718|0.07547090096116607  |
|9          |Ingeniería                                      |8323 |0.05860648523043341  |
|21         |Atención a clientes - Call Center               |7146 |0.0503186283139105   |

In [14]:
split_col = F.split(df["category.__ref"].cast("String"), ":")
cat_count_redir = df.withColumn("category_id", split_col.getItem(1).cast("INT"))
cat_count_redir = cat_count_redir.groupby("category_id").pivot("redirect.type").count()
cat_count_redir = cat_count_redir.join(categories.select("id", "description"), cat_count_redir.category_id == categories.id, how="inner")
cat_count_redir = cat_count_redir.select(["description", "0", "1", "2"]).sort(F.col("0").desc()).toPandas().fillna(0)
cat_count_redir.columns = ["Category", "NoRedir", "Redir1", "Redir2(AGG)"]
cat_count_redir = cat_count_redir.set_index("Category")
cat_count_redir = cat_count_redir.div(cat_count_redir.sum(axis=1), axis=0)
cat_count_redir

Unnamed: 0_level_0,NoRedir,Redir1,Redir2(AGG)
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ventas,0.569146,0.038081,0.392772
Contabilidad - Finanzas,0.672605,0.026935,0.30046
Tecnologías de la Información - Sistemas,0.546393,0.037335,0.416272
Administrativo,0.408769,0.023736,0.567495
Logística - Transporte - Distribución - Almacén,0.591885,0.012359,0.395755
Manufactura - Producción - Operación,0.543665,0.018567,0.437768
Ingeniería,0.640514,0.026433,0.333053
Recursos humanos,0.749468,0.02764,0.222892
Atención a clientes - Call Center,0.45382,0.02351,0.52267
Construcción - Inmobiliaria - Arquitectura,0.82696,0.014906,0.158134


In [15]:
cat_count_redir.sort_values("Redir2(AGG)", ascending=False)

Unnamed: 0_level_0,NoRedir,Redir1,Redir2(AGG)
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
,0.0,0.0,1.0
Servicios generales - Oficios - Seguridad,0.353835,0.042209,0.603956
Administrativo,0.408769,0.023736,0.567495
Atención a clientes - Call Center,0.45382,0.02351,0.52267
Turismo - Hospitalidad - Gastronomía,0.485954,0.010482,0.503564
Manufactura - Producción - Operación,0.543665,0.018567,0.437768
Tecnologías de la Información - Sistemas,0.546393,0.037335,0.416272
Logística - Transporte - Distribución - Almacén,0.591885,0.012359,0.395755
Ventas,0.569146,0.038081,0.392772
Deportes - Salud - Belleza,0.641176,0.005882,0.352941


In [16]:
subcat_count = df.groupby("subcategory").count().orderBy(F.col("count").desc())
split_col = F.split(subcat_count["subcategory.__ref"].cast("String"), ":")
subcat_count = subcat_count.withColumn("subcategory_id", split_col.getItem(1).cast("INT"))
subcat_count = subcat_count.join(subcategories.select("id", "description"), subcat_count.subcategory_id == subcategories.id, how="inner")
subcat_count.orderBy(F.col("count").desc()).show(25)

+--------------------+-----+--------------+---+--------------------+
|         subcategory|count|subcategory_id| id|         description|
+--------------------+-----+--------------+---+--------------------+
| {JobSubcategory:29}|10060|            29| 29|      Administración|
|{JobSubcategory:319}| 6311|           319|319|   Ventas en general|
|{JobSubcategory:260}| 5994|           260|260| Atención al cliente|
|{JobSubcategory:149}| 4135|           149|149|Almacén - Inventario|
|{JobSubcategory:242}| 3961|           242|242|Desarrollo de sof...|
|{JobSubcategory:275}| 3575|           275|275|Administración de...|
| {JobSubcategory:78}| 3271|            78| 78|        Contabilidad|
|{JobSubcategory:286}| 3198|           286|286|     Ventas de campo|
| {JobSubcategory:25}| 3132|            25| 25|  Crédito y cobranza|
|{JobSubcategory:190}| 2187|           190|190|Reclutamiento y s...|
| {JobSubcategory:30}| 1929|            30| 30|  Control de Calidad|
|{JobSubcategory:424}| 1907|      

In [17]:
split_col = F.split(df["subcategory.__ref"].cast("String"), ":")
subcat_count_redir = df.withColumn("subcategory_id", split_col.getItem(1).cast("INT"))
subcat_count_redir = subcat_count_redir.groupby("subcategory_id").pivot("redirect.type").count()
subcat_count_redir = subcat_count_redir.join(subcategories.select("id", "description"), subcat_count_redir.subcategory_id == subcategories.id, how="inner")
subcat_count_redir = subcat_count_redir.select(["description", "0", "1", "2"]).sort(F.col("0").desc()).toPandas().fillna(0)
subcat_count_redir.columns = ["subcategory", "NoRedir", "Redir1", "Redir2(AGG)"]
subcat_count_redir = subcat_count_redir.set_index("subcategory")
subcat_count_redir = subcat_count_redir.div(subcat_count_redir.sum(axis=1), axis=0)
pd.set_option('display.max_rows', None)
subcat_count_redir

Unnamed: 0_level_0,NoRedir,Redir1,Redir2(AGG)
subcategory,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ventas en general,0.587546,0.032008,0.380447
Contabilidad,0.803118,0.008254,0.188627
Administración de ventas,0.653427,0.033566,0.313007
Administración,0.220378,0.026143,0.753479
Atención al cliente,0.364031,0.021522,0.614448
Ventas de campo,0.65541,0.04065,0.30394
Almacén - Inventario,0.441354,0.014268,0.544377
Reclutamiento y selección de personal,0.798811,0.018747,0.182442
Desarrollo de software - Programador,0.422368,0.066145,0.511487
Crédito y cobranza,0.445402,0.01788,0.536718


In [18]:
subcat_count_redir.sort_values("Redir2(AGG)", ascending=False)

Unnamed: 0_level_0,NoRedir,Redir1,Redir2(AGG)
subcategory,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
,0.0,0.0,1.0
Ilustración,0.0,0.0,1.0
Hipoteca,0.0,0.0,1.0
Pedicurista,0.0,0.0,1.0
Ingeniería genética,0.0,0.0,1.0
Plataformas,0.0,0.0,1.0
Mesero,0.067797,0.0,0.932203
Camarista,0.118421,0.013158,0.868421
Lavandería,0.132075,0.0,0.867925
Biotecnología,0.142857,0.0,0.857143


In [19]:
df.groupby("googleForJobs").count().sort(F.col("count").desc()).show(truncate=False)

+-----------------------------+------+
|googleForJobs                |count |
+-----------------------------+------+
|{GoogleForJobs, true, true}  |139560|
|{GoogleForJobs, false, false}|2424  |
|{GoogleForJobs, false, true} |31    |
+-----------------------------+------+



In [20]:
subcategories.where("description = 'Flebotomía'").show(truncate=False)

+--------------+-----------+---+-------------------------------------------+
|__typename    |description|id |url                                        |
+--------------+-----------+---+-------------------------------------------+
|JobSubcategory|Flebotomía |401|empleos/trabajo-en-sector-salud-flebotomia/|
+--------------+-----------+---+-------------------------------------------+



In [1]:
company_count = df.groupby("company.url").count().sort(F.col("count").desc())
print(company_count.count())
company_count.show(truncate=False)

NameError: name 'df' is not defined

In [22]:
sum = company_count.select(F.sum(F.col("count"))).collect()[0][0]
sum

142015

In [23]:
from pyspark.sql.window import Window
ccp = company_count.withColumn("perc", F.col("count") / sum).orderBy(F.col("perc").desc())
window = Window.orderBy(F.col("perc").desc()).rowsBetween(Window.unboundedPreceding, Window.currentRow)
ccp.withColumn("cumsum", F.sum(F.col("perc")).over(window)).show(50, truncate=False)

+------------------------------------------------------------------------+-----+---------------------+-------------------+
|url                                                                     |count|perc                 |cumsum             |
+------------------------------------------------------------------------+-----+---------------------+-------------------+
|null                                                                    |17819|0.125472661338591    |0.125472661338591  |
|empleos/bolsa-de-trabajo-Test-and-QA-Corporation/                       |9886 |0.06961236489103263  |0.19508502622962365|
|empleos/bolsa-de-trabajo-grupo-salinas/                                 |6331 |0.04457979790867162  |0.23966482413829526|
|empleos/bolsa-de-trabajo-Trabajos-Diarios-MX/                           |3222 |0.02268774425236771  |0.26235256839066295|
|empleos/bolsa-de-trabajo-Un-Mejor-Empleo/                               |3174 |0.022349751786783088 |0.28470232017744607|
|empleos/bolsa-d

In [24]:
df.where("company.url='empleos/bolsa-de-trabajo-Manpower--S-A--de-C-V-/'").count()

934

In [25]:
df.select("company.name").where("company.confidential=TRUE").count()

17813

In [26]:
company_count = df.where("redirect.type != 2").groupby("company.url").count().sort(F.col("count").desc())
company_count.show(truncate=False)

+-----------------------------------------------------------------------+-----+
|url                                                                    |count|
+-----------------------------------------------------------------------+-----+
|null                                                                   |17261|
|empleos/bolsa-de-trabajo-Grupo-Salinas/                                |1919 |
|empleos/bolsa-de-trabajo-Adecco/                                       |958  |
|empleos/bolsa-de-trabajo-Manpower--S-A--de-C-V-/                       |934  |
|empleos/bolsa-de-trabajo-Banamex/                                      |912  |
|empleos/bolsa-de-trabajo-Autofinauto/                                  |744  |
|empleos/bolsa-de-trabajo-Santander/                                    |544  |
|empleos/bolsa-de-trabajo-Banorte/                                      |469  |
|empleos/bolsa-de-trabajo-Atento-Servicios--S--A--de-C-V-/              |427  |
|empleos/bolsa-de-trabajo-BBVA-Bancomer/

In [27]:
company_count = df.where("redirect.type == 2").groupby("company.url").count().sort(F.col("count").desc())
company_count.show(truncate=False)

+---------------------------------------------------------+-----+
|url                                                      |count|
+---------------------------------------------------------+-----+
|empleos/bolsa-de-trabajo-Test-and-QA-Corporation/        |9886 |
|empleos/bolsa-de-trabajo-grupo-salinas/                  |6331 |
|empleos/bolsa-de-trabajo-Trabajos-Diarios-MX/            |3222 |
|empleos/bolsa-de-trabajo-Un-Mejor-Empleo/                |3174 |
|empleos/bolsa-de-trabajo-gepp/                           |1604 |
|empleos/bolsa-de-trabajo-truper/                         |1412 |
|empleos/bolsa-de-trabajo-CitiGroup/                      |720  |
|empleos/bolsa-de-trabajo-bairesdev/                      |613  |
|null                                                     |558  |
|empleos/bolsa-de-trabajo-Workable-ATS/                   |503  |
|empleos/bolsa-de-trabajo-EnsenadaHoy-com/                |364  |
|empleos/bolsa-de-trabajo-Jabil/                          |362  |
|empleos/b

In [28]:
df.where("company.url == 'empleos/bolsa-de-trabajo-bairesdev/'").first()

Row(Job:value='Job:17037115', __typename='Job', applied=False, autoinclusion=Row(__typename='JobAutoinclusion', autoinclusion=True, source='asp'), bullets=[], category=Row(__ref='JobCategory:17'), company=Row(__typename='JobCompany', confidential=False, logoUrl='https://cdn-h4.occ.com.mx/images/logos/300x300/01/01RfAPA_ZZ_STrdMpwQ-DQ2.jpg', name='bairesdev', namePretty='bairesdev', profile=None, rel=None, url='empleos/bolsa-de-trabajo-bairesdev/'), contact=Row(__typename='JobContact', emailAddress='', name='', phoneNumber='', showContactInfo=False), dates=Row(__typename='JobDates', active='2023-07-13 20:49:58', expires='2023-07-15 00:00:00', publish='2023-07-12 00:00:00'), description='Who We are   BairesDev is proud to be the fastest-growing company in America. With people in five continents and world-class clients, we are only as strong as the multicultural teams at the heart of o ...', education=Row(__typename='JobEducation', level='NOT_SPECIFIED', url='empleos/nivel-posgrado/'), fr

In [29]:
df.where("redirect.isRedirected == TRUE").where("autoinclusion.autoinclusion == TRUE").count()

56154

In [30]:
df.where("redirect.isRedirected == TRUE").where("autoinclusion.autoinclusion == FALSE").count()

2326

In [31]:
df.where("redirect.isRedirected == FALSE").where("autoinclusion.autoinclusion == TRUE").count()

0

In [32]:
df.where("redirect.isRedirected == FALSE").where("autoinclusion.autoinclusion == FALSE").count()

83535

In [33]:
df.where("redirect.isRedirected == TRUE")\
    .where("autoinclusion.autoinclusion == TRUE")\
    .select(["scraped_url", "redirect.externalUrl"])\
    .show(truncate=False)

+---------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------+
|scraped_url                                                                                        |externalUrl                                                                                                                                              |
+---------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------+
|https://www.occ.com.mx/empleo/oferta/17025688-full-stack-engineer/                                 |https://paypal.eightfold.ai/careers/job?pid=274893560254                                                                           