# CT daily EDA

In [1]:
import pandas as pd
import matplotlib.pyplot as plt

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window

In [2]:
spark = SparkSession \
    .builder \
    .appName("Learning Spark") \
    .getOrCreate()

day = "20231211"

In [3]:
df = spark.read.json(f"harvester/hireline/bigharvey-HIRELINE_MX-{day}_*-rv0_0_1.jsonl.gz")
#df = spark.read.json(f"harvester/ct/bigharvey-CT_MX-{day}_*-rv0_0_9.jsonl.gz")

In [4]:
df.printSchema()

root
 |-- crawler: string (nullable = true)
 |-- identifier: string (nullable = true)
 |-- jobposting: struct (nullable = true)
 |    |-- @context: string (nullable = true)
 |    |-- @type: string (nullable = true)
 |    |-- applicantLocationRequirements: struct (nullable = true)
 |    |    |-- @type: string (nullable = true)
 |    |    |-- name: string (nullable = true)
 |    |-- baseSalary: struct (nullable = true)
 |    |    |-- @type: string (nullable = true)
 |    |    |-- currency: string (nullable = true)
 |    |    |-- value: struct (nullable = true)
 |    |    |    |-- @type: string (nullable = true)
 |    |    |    |-- maxValue: string (nullable = true)
 |    |    |    |-- minValue: string (nullable = true)
 |    |    |    |-- unitText: string (nullable = true)
 |    |-- datePosted: string (nullable = true)
 |    |-- description: string (nullable = true)
 |    |-- educationRequirements: string (nullable = true)
 |    |-- employmentType: string (nullable = true)
 |    |-- expe

In [5]:
df.show(1, vertical=True, truncate=False)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

### Registros Totales

In [6]:
record_count = df.count()
df.count()

1563

### Registros Únicos por Identifier (asume único por jobad)

In [7]:
df.dropDuplicates(["identifier"]).count()

1563

## Ubicación

### Estados

#### Distribución de vacantes por estado con suma acumulada

In [8]:
state_count = df.groupby("jobposting.JobLocation.address.addressLocality").count()
state_count_p = state_count.withColumn("perc", F.col("count") / record_count)
window = Window.orderBy(F.col("perc").desc()).rowsBetween(Window.unboundedPreceding, Window.currentRow)
state_count_p = state_count_p.withColumn("cumsum", F.sum(F.col("perc")).over(window))
state_count_p.show(33)

+--------------------+-----+--------------------+------------------+
|     addressLocality|count|                perc|            cumsum|
+--------------------+-----+--------------------+------------------+
|                    |  429|  0.2744721689059501|0.2744721689059501|
|       Benito Juárez|  290|  0.1855406269993602|0.4600127959053103|
|           Querétaro|  110| 0.07037747920665387|0.5303902751119641|
|      Miguel Hidalgo|  101| 0.06461932181701856|0.5950095969289827|
|           Monterrey|   94|0.060140754958413305| 0.655150351887396|
|             Tlalpan|   89| 0.05694177863083813|0.7120921305182342|
|         Guadalajara|   81| 0.05182341650671785| 0.763915547024952|
|          Cuauhtémoc|   52| 0.03326935380678183|0.7971849008317339|
|Cuajimalpa de Mor...|   42|0.026871401151631478|0.8240563019833653|
|      Álvaro Obregón|   41|0.026231605886116442|0.8502879078694817|
|            Culiacán|   27| 0.01727447216890595|0.8675623800383877|
|            Coyoacán|   22|0.0140

## Industria (Giro)

In [9]:
print(f"Industrias únicas: {df.select('jobposting.industry').distinct().count()}")

Industrias únicas: 1


In [10]:
cat_count = df.groupby("jobposting.industry").count()
cat_count = cat_count.withColumn("pct", F.col("count") / F.lit(record_count))\
            .sort(F.col("count").desc())
window = Window.orderBy(F.col("pct").desc()).rowsBetween(Window.unboundedPreceding, Window.currentRow)
cat_count = cat_count.withColumn("cumsum", F.sum(F.col("pct")).over(window))
cat_count.show(25, truncate=False)

+-----------------------------+-----+---+------+
|industry                     |count|pct|cumsum|
+-----------------------------+-----+---+------+
|Tecnologías de la Información|1563 |1.0|1.0   |
+-----------------------------+-----+---+------+



## Compañias

In [11]:

print(f"Número de nombres de compañías: {df.select('jobposting.hiringOrganization.name').distinct().count()}")
company_count = df.groupby('jobposting.hiringOrganization.name').count()\
                .withColumn('pct', F.col('count') / record_count)\
                .withColumn('cumsum', F.sum(F.col('pct')).over(window))\
                .sort(F.col("count").desc())

company_count.show(30, truncate=False)

Número de nombres de compañías: 184
+----------------------------------+-----+--------------------+-------------------+
|name                              |count|pct                 |cumsum             |
+----------------------------------+-----+--------------------+-------------------+
|Capital Empresarial Horizonte     |371  |0.23736404350607807 |0.23736404350607807|
|Confidencial                      |80   |0.05118362124120281 |0.2885476647472809 |
|Thomson Reuters                   |63   |0.04030710172744722 |0.3288547664747281 |
|Grupo Salinas                     |46   |0.02943058221369162 |0.3582853486884197 |
|EmployIT                          |44   |0.02815099168266155 |0.38643634037108127|
|OCDTECH                           |40   |0.025591810620601407|0.41202815099168266|
|EON IGNITING BUSINESS             |37   |0.023672424824056303|0.43570057581573896|
|DEINTEC                           |37   |0.023672424824056303|0.45937300063979525|
|TESYS 21                          |33  

## Salarios

In [13]:
salary_f = "jobposting.baseSalary.value"
df.select(F.col(salary_f).cast("double")).summary().show()
df.select(salary_f).min()

AnalysisException: [DATATYPE_MISMATCH.CAST_WITHOUT_SUGGESTION] Cannot resolve "CAST(jobposting.baseSalary.value AS DOUBLE)" due to data type mismatch: cannot cast "STRUCT<`@type`: STRING, maxValue: STRING, minValue: STRING, unitText: STRING>" to "DOUBLE".;
'Project [unresolvedalias(cast(jobposting#10.baseSalary.value as double), None)]
+- Relation [crawler#8,identifier#9,jobposting#10,scraped_at#11,search_engine_type#12,url#13,uuid#14,version#15] json


In [None]:
df.where(f"{salary_f} is null").count()

La proporción de vacantes sin salario es bastante independiente de directApply: 

(si directApply == False implica agregadas, estas tienen salario de todas formas, a diferencia de OCC)

In [None]:
df.withColumn("has_salary", F.when(F.col(salary_f).isNull(), False).otherwise(True)).groupby("jobposting.directApply").pivot("has_salary").count().show()

In [None]:
#seleccionar rangos para eliminar outliers - los percentiles son ajustados a algo que permita comparar con OCC
outlier_limits = df.select(F.percentile_approx(salary_f, [0.05, 0.95]))
outlier_min, outlier_max = outlier_limits.collect()[0][0]
outlier_limits.show()

In [None]:
salary = df.select(salary_f).where(f"{salary_f} >= {outlier_min} and {salary_f} <= {outlier_max}")
salary.summary().show()