# CT daily EDA

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window

In [None]:
spark = SparkSession \
    .builder \
    .appName("Learning Spark") \
    .getOrCreate()

day = "20230724"

In [3]:
df = spark.read.json(f"harvester/ct/bigharvey-CT_MX-{day}_*-rv0_0_9.jsonl.gz")

In [4]:
df.printSchema()

root
 |-- crawler: string (nullable = true)
 |-- identifier: string (nullable = true)
 |-- jobposting: struct (nullable = true)
 |    |-- @context: string (nullable = true)
 |    |-- @type: string (nullable = true)
 |    |-- baseSalary: struct (nullable = true)
 |    |    |-- @context: string (nullable = true)
 |    |    |-- @type: string (nullable = true)
 |    |    |-- currency: string (nullable = true)
 |    |    |-- value: struct (nullable = true)
 |    |    |    |-- @context: string (nullable = true)
 |    |    |    |-- @type: string (nullable = true)
 |    |    |    |-- unitText: string (nullable = true)
 |    |    |    |-- value: string (nullable = true)
 |    |-- datePosted: string (nullable = true)
 |    |-- description: string (nullable = true)
 |    |-- directApply: boolean (nullable = true)
 |    |-- employmentType: string (nullable = true)
 |    |-- hiringOrganization: struct (nullable = true)
 |    |    |-- @context: string (nullable = true)
 |    |    |-- @type: string (

In [8]:
df.show(1, vertical=True, truncate=False)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

### Registros Totales

In [22]:
record_count = df.count()
df.count()

116414

### Registros Únicos por Identifier (asume único por jobad)

In [6]:
df.dropDuplicates(["identifier"]).count()

116414

## Ubicación

### Estados

#### Distribución de vacantes por estado con suma acumulada

In [24]:
state_count = df.groupby("jobposting.JobLocation.address.addressLocality").count()
state_count_p = state_count.withColumn("perc", F.col("count") / record_count)
window = Window.orderBy(F.col("perc").desc()).rowsBetween(Window.unboundedPreceding, Window.currentRow)
state_count_p = state_count_p.withColumn("cumsum", F.sum(F.col("perc")).over(window))
state_count_p.show(33)

+-------------------+-----+--------------------+-------------------+
|    addressLocality|count|                perc|             cumsum|
+-------------------+-----+--------------------+-------------------+
|Ciudad de México DF|25080| 0.21543800573814145|0.21543800573814145|
|   Estado de México|14851|   0.127570567113921|0.34300857285206243|
|            Jalisco|13081| 0.11236621024962633| 0.4553747831016888|
|         Nuevo León| 7866| 0.06756919270878073| 0.5229439758104695|
|          Querétaro| 6148|0.052811517515075505|  0.575755493325545|
|         Guanajuato| 5182|0.044513546480663836| 0.6202690398062088|
|             Puebla| 4417| 0.03794217190372292| 0.6582112117099318|
|       Quintana Roo| 4359| 0.03744395004037315|  0.695655161750305|
|            Yucatán| 3531|0.030331403439448863| 0.7259865651897538|
|           Veracruz| 3025|0.025984847183328465| 0.7519714123730823|
|    Baja California| 2758|0.023691308605494184| 0.7756627209785765|
|            Sinaloa| 2475| 0.0212

## Industria (Giro)

In [49]:
print(f"Industrias únicas: {df.select('jobposting.industry').distinct().count()}")

Industrias únicas: 25


In [48]:
cat_count = df.groupby("jobposting.industry").count()
cat_count = cat_count.withColumn("pct", F.col("count") / F.lit(record_count))\
            .sort(F.col("count").desc())
window = Window.orderBy(F.col("pct").desc()).rowsBetween(Window.unboundedPreceding, Window.currentRow)
cat_count = cat_count.withColumn("cumsum", F.sum(F.col("pct")).over(window))
cat_count.show(25, truncate=False)

+-------------------------------+-----+---------------------+-------------------+
|industry                       |count|pct                  |cumsum             |
+-------------------------------+-----+---------------------+-------------------+
|RRHH / Personal                |27628|0.23732540759702442  |0.23732540759702442|
|Venta al consumidor            |19262|0.1654611988248836   |0.402786606421908  |
|Servicios Profesionales        |13896|0.11936708643290325  |0.5221536928548113 |
|Finanzas / Banca               |8055 |0.06919270878073085  |0.5913464016355421 |
|Fabricación                    |6680 |0.05738141460649063  |0.6487278162420327 |
|Salud / Medicina               |6481 |0.055671998213273316 |0.704399814455306  |
|Venta al por mayor             |5135 |0.04410981497070799  |0.748509629426014  |
|Transporte                     |4516 |0.038792585084268215 |0.7873022145102822 |
|Construcción / obras           |4383 |0.037650110811414436 |0.8249523253216966 |
|Hostelería / Tu

## Compañias

In [55]:

print(f"Número de nombres de compañías: {df.select('jobposting.hiringOrganization.name').distinct().count()}")
company_count = df.groupby('jobposting.hiringOrganization.name').count()\
                .withColumn('pct', F.col('count') / record_count)\
                .withColumn('cumsum', F.sum(F.col('pct')).over(window))\
                .sort(F.col("count").desc())

company_count.show(30, truncate=False)

Número de nombres de compañías: 21311
+------------------------------------------------------------+-----+---------------------+-------------------+
|name                                                        |count|pct                  |cumsum             |
+------------------------------------------------------------+-----+---------------------+-------------------+
|Grupo Salinas                                               |4906 |0.04214269761368908  |0.04214269761368908|
|ManpowerGroup                                               |3474 |0.029841771608225813 |0.0719844692219149 |
|Atento Servicios SA de CV                                   |2870 |0.02465339220368684  |0.09663786142560174|
|Adecco                                                      |1734 |0.014895115707732746 |0.11153297713333449|
|GRUPO FINANCIERO INBURSA                                    |1045 |0.00897658357242256  |0.12050956070575705|
|Smart Jobs                                                  |1010 |0.0086

## Salarios

In [84]:
salary_f = "jobposting.baseSalary.value.value"
df.select(F.col(salary_f).cast("double")).summary().show()
df.select(salary_f).min()

AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `CAST(jobposting`.`baseSalary`.`value`.`value AS DOUBLE)` cannot be resolved. Did you mean one of the following? [`CAST(jobposting.baseSalary.value.value AS DOUBLE)`].;
'Aggregate [map(cast(count as string), cast(count('CAST(jobposting.baseSalary.value.value AS DOUBLE)) as string), cast(mean as string), cast(avg('CAST(jobposting.baseSalary.value.value AS DOUBLE)) as string), cast(stddev as string), cast(stddev_samp('CAST(jobposting.baseSalary.value.value AS DOUBLE)) as string), cast(min as string), cast(min('CAST(jobposting.baseSalary.value.value AS DOUBLE)) as string), cast(25% as string), cast(get(percentile_approx('CAST(jobposting.baseSalary.value.value AS DOUBLE), [0.25,0.5,0.75], 10000, 0, 0), 0) as string), cast(50% as string), cast(get(percentile_approx('CAST(jobposting.baseSalary.value.value AS DOUBLE), [0.25,0.5,0.75], 10000, 0, 0), 1) as string), cast(75% as string), cast(get(percentile_approx('CAST(jobposting.baseSalary.value.value AS DOUBLE), [0.25,0.5,0.75], 10000, 0, 0), 2) as string), cast(max as string), cast(max('CAST(jobposting.baseSalary.value.value AS DOUBLE)) as string)) AS CAST(jobposting.baseSalary.value.value AS DOUBLE)#2412]
+- Project [cast(jobposting#10.baseSalary.value.value as double) AS CAST(jobposting.baseSalary.value.value AS DOUBLE)#2396]
   +- Relation [crawler#8,identifier#9,jobposting#10,scraped_at#11,search_engine_type#12,url#13,uuid#14,version#15] json


In [72]:
df.where(f"{salary_f} is null").count()

37166

La proporción de vacantes sin salario es bastante independiente de directApply: 

(si directApply == False implica agregadas, estas tienen salario de todas formas, a diferencia de OCC)

In [76]:
df.withColumn("has_salary", F.when(F.col(salary_f).isNull(), False).otherwise(True)).groupby("jobposting.directApply").pivot("has_salary").count().show()

+-----------+-----+-----+
|directApply|false| true|
+-----------+-----+-----+
|       true|29998|63509|
|      false| 7168|15739|
+-----------+-----+-----+



In [80]:
#seleccionar rangos para eliminar outliers - los percentiles son ajustados a algo que permita comparar con OCC
outlier_limits = df.select(F.percentile_approx(salary_f, [0.05, 0.95]))
outlier_min, outlier_max = outlier_limits.collect()[0][0]
outlier_limits.show()

+------------------------------------------------------------------------------+
|percentile_approx(jobposting.baseSalary.value.value, array(0.05, 0.95), 10000)|
+------------------------------------------------------------------------------+
|                                                             [6200.0, 22145.0]|
+------------------------------------------------------------------------------+



In [81]:
salary = df.select(salary_f).where(f"{salary_f} >= {outlier_min} and {salary_f} <= {outlier_max}")
salary.summary().show()

+-------+-----------------+
|summary|            value|
+-------+-----------------+
|  count|            71877|
|   mean|10402.14161692892|
| stddev|3547.813984793715|
|    min|            10000|
|    25%|           8000.0|
|    50%|           9540.0|
|    75%|          12000.0|
|    max|             9999|
+-------+-----------------+

