In [1]:
import pandas as pd
import matplotlib.pyplot as plt

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Reuse the active Spark session if there is one, otherwise start a new one.
spark = (
    SparkSession.builder
    .appName("Learning Spark")
    .getOrCreate()
)

# Explicit schema of the harvester records; it mirrors what schema inference
# reports for this dataset. Passing it to the reader spares Spark the extra
# full scan of the gzipped JSONL files that inference would otherwise need.
# NOTE: top-level keys that are not listed here are dropped on read, so
# extend this schema if the harvester output gains new fields.
HARVEST_SCHEMA = (
    "crawler STRING, "
    "debug_steps ARRAY<STRING>, "
    "identifier STRING, "
    "jobposting STRING, "
    "scraped_at STRING, "
    "search_engine_type STRING, "
    "url STRING, "
    "uuid STRING, "
    "version STRING"
)

# `jobposting` stays a raw JSON string here; it is parsed later with from_json.
df = spark.read.schema(HARVEST_SCHEMA).json("harvester/ct/bigharvey-CT*-202306*.jsonl.gz")

In [2]:
# Show the column names and types of the loaded harvester records.
df.printSchema()

root
 |-- crawler: string (nullable = true)
 |-- debug_steps: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- identifier: string (nullable = true)
 |-- jobposting: string (nullable = true)
 |-- scraped_at: string (nullable = true)
 |-- search_engine_type: string (nullable = true)
 |-- url: string (nullable = true)
 |-- uuid: string (nullable = true)
 |-- version: string (nullable = true)



In [3]:
# Total number of records loaded into `df`.
df.count()

3154275

In [4]:
# Compare the session's default parallelism with the number of partitions
# the input was actually split into.
default_parallelism = spark.sparkContext.defaultParallelism
input_partitions = df.rdd.getNumPartitions()
default_parallelism, input_partitions

(4, 19)

In [5]:
# Check whether adaptive query execution is switched on for this session.
spark.conf.get("spark.sql.adaptive.enabled")

'true'

In [6]:
# Derive the schema of the embedded `jobposting` JSON from a sample document
# instead of writing the StructType by hand.
# NOTE(review): jobposting_sample.json was presumably dumped from a single
# version-0.0.9 record, along the lines of the snippet below — confirm:
#   import json
#   s = json.loads((df.select("jobposting").where("version = '0.0.9'").limit(1).first()['jobposting']))
jsonobj = spark.read.json("jobposting_sample.json", multiLine=True)
schema = jsonobj.schema
schema

StructType([StructField('@context', StringType(), True), StructField('@type', StringType(), True), StructField('baseSalary', StructType([StructField('@context', StringType(), True), StructField('@type', StringType(), True), StructField('currency', StringType(), True), StructField('value', StructType([StructField('@context', StringType(), True), StructField('@type', StringType(), True), StructField('unitText', StringType(), True), StructField('value', StringType(), True)]), True)]), True), StructField('datePosted', StringType(), True), StructField('description', StringType(), True), StructField('directApply', BooleanType(), True), StructField('employmentType', StringType(), True), StructField('hiringOrganization', StructType([StructField('@context', StringType(), True), StructField('@type', StringType(), True), StructField('logo', StringType(), True), StructField('name', StringType(), True)]), True), StructField('identifier', StructType([StructField('@context', StringType(), True), Stru

In [7]:
# Parse the raw `jobposting` string of the version-0.0.7 records into a
# struct column `jp` and print the resulting schema.
(
    df.where("version = '0.0.7'")
    .withColumn("jp", F.from_json(F.col("jobposting"), schema))
    .printSchema()
)

root
 |-- crawler: string (nullable = true)
 |-- debug_steps: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- identifier: string (nullable = true)
 |-- jobposting: string (nullable = true)
 |-- scraped_at: string (nullable = true)
 |-- search_engine_type: string (nullable = true)
 |-- url: string (nullable = true)
 |-- uuid: string (nullable = true)
 |-- version: string (nullable = true)
 |-- jp: struct (nullable = true)
 |    |-- @context: string (nullable = true)
 |    |-- @type: string (nullable = true)
 |    |-- baseSalary: struct (nullable = true)
 |    |    |-- @context: string (nullable = true)
 |    |    |-- @type: string (nullable = true)
 |    |    |-- currency: string (nullable = true)
 |    |    |-- value: struct (nullable = true)
 |    |    |    |-- @context: string (nullable = true)
 |    |    |    |-- @type: string (nullable = true)
 |    |    |    |-- unitText: string (nullable = true)
 |    |    |    |-- value: string (nullable = true)
 |  

In [8]:
# Number of version-0.0.9 records (duplicates included), counted on the
# parsed posting URL column.
(
    df.where("version = '0.0.9'")
    .withColumn("jp", F.from_json(F.col("jobposting"), schema))
    .select("jp.url")
    .count()
)

1497858

In [9]:
# Number of distinct posting URLs among the version-0.0.9 records.
(
    df.where("version = '0.0.9'")
    .withColumn("jp", F.from_json(F.col("jobposting"), schema))
    .select("jp.url")
    .distinct()
    .count()
)

175775

In [None]:
# Version-0.0.9 records with the `jobposting` JSON string parsed into the
# struct column `jp`.
jps = (
    df.where("version = '0.0.9'")
    .withColumn("jp", F.from_json(F.col("jobposting"), schema))
)

In [30]:
# Rows left after keeping a single record per `identifier`.
unique_by_identifier = jps.drop_duplicates(subset=["identifier"])
unique_by_identifier.count()

175795

In [28]:
# Number of distinct `identifier` values in `jps`.
distinct_identifiers = jps.select("identifier").distinct()
distinct_identifiers.count()

175795

In [10]:
# Number of distinct `uuid` values among the version-0.0.9 records.
(
    df.where("version = '0.0.9'")
    .withColumn("jp", F.from_json(F.col("jobposting"), schema))
    .select("uuid")
    .distinct()
    .count()
)

1497858

In [32]:
# Parse the version-0.0.9 postings, then keep one record per `identifier`.
parsed_v009 = df.where("version = '0.0.9'").withColumn(
    "jp", F.from_json(F.col("jobposting"), schema)
)
jps = parsed_v009.dropDuplicates(["identifier"])

# Count the de-duplicated postings per `directApply` flag and bring the small
# result to pandas, indexed by the flag value.
direct_apply_counts = jps.groupBy("jp.directApply").count()
dirApps = direct_apply_counts.toPandas().set_index("directApply")
dirApps

Unnamed: 0_level_0,count
directApply,Unnamed: 1_level_1
True,140693
False,35102


In [33]:
# Turn the per-flag counts into shares of the column total.
column_totals = dirApps.sum()
dirApps / column_totals

Unnamed: 0_level_0,count
directApply,Unnamed: 1_level_1
True,0.800324
False,0.199676


In [36]:
# URLs of postings whose `directApply` flag is false, printed untruncated.
no_direct_apply_urls = jps.filter("jp.directApply=0").select("jp.url")
no_direct_apply_urls.show(truncate=False)

+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|url                                                                                                                                                                               |
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|https://mx.computrabajo.com/ofertas-de-trabajo/oferta-de-trabajo-de-gerente-de-cedis-chihuahua-en-chihuahua-002E37196B79FFBE61373E686DCF3405                                      |
|https://mx.computrabajo.com/ofertas-de-trabajo/oferta-de-trabajo-de-capturista-zapopan-en-zapopan-014635CB94529D2761373E686DCF3405                                                |
|https://mx.computrabajo.com/ofertas-de-trabajo/oferta-de-trabajo-de-tecnico-laboratorista-para

In [37]:
# URLs of postings whose `directApply` flag is true, printed untruncated.
direct_apply_urls = jps.filter("jp.directApply=1").select("jp.url")
direct_apply_urls.show(truncate=False)

+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|url                                                                                                                                                                                                                   |
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|https://mx.computrabajo.com/ofertas-de-trabajo/oferta-de-trabajo-de-asesor-de-ventas--altabrisa-merida-en-merida-000DD49C3F808E3961373E686DCF3405                                                                     |
|https://mx.computrabajo.com/ofertas-de-trabajo/oferta-de-trabajo-de-bodegueroa-plaza-galerias-en-aguascalientes-000F3F16DF6495D1613

## Stats

In [None]:
# Load the crawl statistics files and inspect the inferred schema.
stats_df = spark.read.format("json").load("harvester/ct/*stats.json")
stats_df.printSchema()

In [None]:
# Plot the HTTP-200 response count of each stats record against its start time.
# NOTE(review): rows are plotted in whatever order Spark returns them and
# `start_time` is used as-is — confirm whether sorting / datetime parsing is wanted.
stats_pdf = stats_df.toPandas()
stats_pdf.plot(x="start_time", y="downloader/response_status_count/200")

In [None]:
def _partition_size(index, rows):
    """Return a one-element list holding (partition index, number of rows)."""
    return [(index, sum(1 for _ in rows))]


# Collect the row count of every partition of `df` to check how evenly the
# data is spread.
df.rdd.mapPartitionsWithIndex(_partition_size).collect()