# Structure Check

Cómo tener acceso a los JSON internos de los registros. 

Es necesario definir un schema. Abajo se ve cómo inferir schema a partir de un ejemplo

In [69]:
import pandas as pd
import matplotlib.pyplot as plt

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as F


spark = SparkSession \
    .builder \
    .appName("Learning Spark") \
    .getOrCreate()

#df = spark.read.json("harvester/occ/tests/test-gdl.jsonl")
df = spark.read.json("harvester/occ/20230714/sneaky*.jsonl")
#df = spark.read.json("test-cdmx.jsonl") #from ETL
#df = spark.read.json("test-full-occ.jsonl") #from ETL
categories = spark.read.json("categories.json")
subcategories = spark.read.json("subcategories.json")

In [79]:
#categories.count(), categories.printSchema()
subcategories.show()

+--------------+--------------------+---+--------------------+
|    __typename|         description| id|                 url|
+--------------+--------------------+---+--------------------+
|JobSubcategory|   Ventas en general|319|empleos/trabajo-e...|
|JobSubcategory|Reclutamiento y s...|190|empleos/trabajo-e...|
|JobSubcategory| Ventas corporativas|285|empleos/trabajo-e...|
|JobSubcategory|            Actuaría| 73|empleos/trabajo-e...|
|JobSubcategory|Promotor - Demost...|289|empleos/trabajo-e...|
|JobSubcategory|Máquinas y herram...|308|empleos/trabajo-e...|
|JobSubcategory|Administración de...| 32|empleos/trabajo-e...|
|JobSubcategory|              Cajero|424|empleos/trabajo-e...|
|JobSubcategory| Atención al cliente|260|empleos/trabajo-e...|
|JobSubcategory|    Ayudante general| 37|empleos/trabajo-e...|
|JobSubcategory|            Finanzas| 79|empleos/trabajo-e...|
|JobSubcategory|Organización de e...|175|empleos/trabajo-e...|
|JobSubcategory| Alimentos y bebidas|256|empleos/trabaj

In [23]:
df.count(), df.printSchema()

root
 |-- Job:value: string (nullable = true)
 |-- __typename: string (nullable = true)
 |-- applied: boolean (nullable = true)
 |-- autoinclusion: struct (nullable = true)
 |    |-- __typename: string (nullable = true)
 |    |-- autoinclusion: boolean (nullable = true)
 |    |-- source: string (nullable = true)
 |-- bullets: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- __ref: string (nullable = true)
 |-- category: struct (nullable = true)
 |    |-- __ref: string (nullable = true)
 |-- company: struct (nullable = true)
 |    |-- __typename: string (nullable = true)
 |    |-- confidential: boolean (nullable = true)
 |    |-- logoUrl: string (nullable = true)
 |    |-- name: string (nullable = true)
 |    |-- namePretty: string (nullable = true)
 |    |-- profile: struct (nullable = true)
 |    |    |-- __ref: string (nullable = true)
 |    |-- rel: string (nullable = true)
 |    |-- url: string (nullable = true)
 |-- contact: struct (nullable =

(125951, None)

In [24]:
#df.toPandas()
df.show(1, vertical=True, truncate=False)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [125]:
df.count(), df.select('id').distinct().count()

(125951, 122697)

In [25]:
df.groupby("redirect.type").count().show()

+----+-----+
|type|count|
+----+-----+
|   0|79293|
|   1| 3750|
|   2|42908|
+----+-----+



In [26]:
df.groupby("jobType").count().show()

+--------+-----+
| jobType|count|
+--------+-----+
| PREMIUM| 3098|
|STANDOUT|29015|
| CLASSIC|93838|
+--------+-----+



In [98]:
df.groupby("jobType").pivot("redirect.type").count().show()

+--------+-----+----+-----+
| jobType|    0|   1|    2|
+--------+-----+----+-----+
| PREMIUM| 3075|  23| null|
|STANDOUT|28379| 410|  226|
| CLASSIC|47839|3317|42682|
+--------+-----+----+-----+



In [118]:
pdf = df.groupby("jobType").pivot("redirect.type").count().toPandas().fillna(0).set_index('jobType')
pdf.columns = ['NoRedir', 'Redir1', 'Redir2']
pdf.div(pdf.sum(axis=1), axis=0)

Unnamed: 0_level_0,NoRedir,Redir1,Redir2
jobType,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
PREMIUM,0.992576,0.007424,0.0
STANDOUT,0.97808,0.014131,0.007789
CLASSIC,0.509804,0.035348,0.454848


In [119]:
pdf.div(pdf.sum(axis=0), axis=1)

Unnamed: 0_level_0,NoRedir,Redir1,Redir2
jobType,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
PREMIUM,0.03878,0.006133,0.0
STANDOUT,0.3579,0.109333,0.005267
CLASSIC,0.603319,0.884533,0.994733


In [121]:
pdf.div(pdf.sum(axis=None))

Unnamed: 0_level_0,NoRedir,Redir1,Redir2
jobType,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
PREMIUM,0.03878,0.006133,0.0
STANDOUT,0.3579,0.109333,0.005267
CLASSIC,0.603319,0.884533,0.994733


In [122]:
pd.__version__

'2.0.2'

In [66]:
cat_count = df.groupby("category").count().orderBy(F.col("count").desc())
split_col = F.split(cat_count["category.__ref"].cast("String"), ":")
cat_count = cat_count.withColumn("category_id", split_col.getItem(1).cast("INT"))
cat_count = cat_count.join(categories.select("id", "description"), cat_count.category_id == categories.id, how="inner")
cat_count.orderBy(F.col("count").desc()).show()

+----------------+-----+-----------+---+--------------------+
|        category|count|category_id| id|         description|
+----------------+-----+-----------+---+--------------------+
|{JobCategory:19}|27657|         19| 19|              Ventas|
| {JobCategory:1}|14521|          1|  1|      Administrativo|
|{JobCategory:17}|14516|         17| 17|Tecnologías de la...|
| {JobCategory:4}|12887|          4|  4|Contabilidad - Fi...|
|{JobCategory:10}| 9618|         10| 10|Logística - Trans...|
|{JobCategory:11}| 9118|         11| 11|Manufactura - Pro...|
| {JobCategory:9}| 7760|          9|  9|          Ingeniería|
|{JobCategory:21}| 6351|         21| 21|Atención a client...|
|{JobCategory:13}| 5370|         13| 13|    Recursos humanos|
| {JobCategory:5}| 2842|          5|  5|Construcción - In...|
|{JobCategory:12}| 2819|         12| 12|Mercadotecnia - P...|
|{JobCategory:15}| 2673|         15| 15|        Sector salud|
|{JobCategory:23}| 2468|         23| 23|Servicios general...|
|{JobCat

In [78]:
subcat_count = df.groupby("subcategory").count().orderBy(F.col("count").desc())
split_col = F.split(subcat_count["subcategory.__ref"].cast("String"), ":")
subcat_count = subcat_count.withColumn("subcategory_id", split_col.getItem(1).cast("INT"))
subcat_count = subcat_count.join(subcategories.select("id", "description"), subcat_count.subcategory_id == subcategories.id, how="inner")
subcat_count.orderBy(F.col("count").desc()).show(25)

+--------------------+-----+--------------+---+--------------------+
|         subcategory|count|subcategory_id| id|         description|
+--------------------+-----+--------------+---+--------------------+
| {JobSubcategory:29}| 8281|            29| 29|      Administración|
|{JobSubcategory:319}| 5661|           319|319|   Ventas en general|
|{JobSubcategory:260}| 5200|           260|260| Atención al cliente|
|{JobSubcategory:242}| 3606|           242|242|Desarrollo de sof...|
|{JobSubcategory:275}| 3234|           275|275|Administración de...|
|{JobSubcategory:286}| 3127|           286|286|     Ventas de campo|
|{JobSubcategory:149}| 3030|           149|149|Almacén - Inventario|
| {JobSubcategory:78}| 2977|            78| 78|        Contabilidad|
| {JobSubcategory:25}| 2805|            25| 25|  Crédito y cobranza|
|{JobSubcategory:190}| 2124|           190|190|Reclutamiento y s...|
| {JobSubcategory:30}| 1678|            30| 30|  Control de Calidad|
|{JobSubcategory:251}| 1662|      

In [89]:
df.groupby("googleForJobs").count().sort(F.col("count").desc()).show(truncate=False)

+-----------------------------+------+
|googleForJobs                |count |
+-----------------------------+------+
|{GoogleForJobs, true, true}  |124566|
|{GoogleForJobs, false, false}|1316  |
|{GoogleForJobs, false, true} |69    |
+-----------------------------+------+

