In [1]:
import pandas as pd
import matplotlib.pyplot as plt

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

from pyspark_ai import SparkAI
import os

from langchain.chat_models import ChatOpenAI

# The OpenAI credential is intentionally not assigned in this notebook.
# Provide OPENAI_API_KEY through the environment before starting the kernel
# instead of pasting the key into a cell.

# Reuse the active Spark session if one exists, otherwise start one.
spark = (
    SparkSession.builder
    .appName("Learning Spark")
    .getOrCreate()
)

# Chat model that pyspark-ai uses to turn the English prompts into Spark code.
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

# Wrap the model and activate it; the cells below rely on the `.ai` accessor
# on DataFrames (`df.ai.transform`, `x.ai.explain`).
sa = SparkAI(llm=llm)
sa.activate()

In [None]:
# Read every gzipped JSON-lines dump matching the glob into one DataFrame.
# No explicit schema is passed to the reader.
raw_glob = "harvester/ct/*CT_MX*rv*jsonl.gz"
df = spark.read.json(raw_glob)

In [6]:
# Preview the raw records; the arguments are show()'s defaults written out
# (first 20 rows, long cell values truncated).
df.show(n=20, truncate=True)

+---------------+--------------------+--------------------+--------------------+------------------+--------------------+--------------------+-------+
|        crawler|          identifier|          jobposting|          scraped_at|search_engine_type|                 url|                uuid|version|
+---------------+--------------------+--------------------+--------------------+------------------+--------------------+--------------------+-------+
|CtCrawlerByCity|11C3B34F4F1343216...|{https://schema.o...|2023-06-24 12:01:...|               JDP|https://mx.comput...|d39aae32-0f28-4bc...|  0.0.9|
|CtCrawlerByCity|CF1CCD1FBC51927A6...|{https://schema.o...|2023-06-24 12:01:...|               JDP|https://mx.comput...|8ec50ec7-45fb-462...|  0.0.9|
|CtCrawlerByCity|FBCAA5F0426C1EBA6...|{https://schema.o...|2023-06-24 12:01:...|               JDP|https://mx.comput...|ab964d0f-e3b7-483...|  0.0.9|
|CtCrawlerByCity|BFFF4E38A682E75F6...|{https://schema.o...|2023-06-24 12:01:...|               JDP|h

In [10]:
# Ask the LLM to pull the nested `jobposting` struct out of `df` into its own
# DataFrame, then print the resulting schema to see which fields are available.
expand_prompt = "get the jobposting and expanded as a new dataframe"
jpdf = df.ai.transform(expand_prompt)
jpdf.printSchema()

root
 |-- @context: string (nullable = true)
 |-- @type: string (nullable = true)
 |-- baseSalary: struct (nullable = true)
 |    |-- @context: string (nullable = true)
 |    |-- @type: string (nullable = true)
 |    |-- currency: string (nullable = true)
 |    |-- value: struct (nullable = true)
 |    |    |-- @context: string (nullable = true)
 |    |    |-- @type: string (nullable = true)
 |    |    |-- unitText: string (nullable = true)
 |    |    |-- value: string (nullable = true)
 |-- datePosted: string (nullable = true)
 |-- description: string (nullable = true)
 |-- directApply: boolean (nullable = true)
 |-- employmentType: string (nullable = true)
 |-- hiringOrganization: struct (nullable = true)
 |    |-- @context: string (nullable = true)
 |    |-- @type: string (nullable = true)
 |    |-- logo: string (nullable = true)
 |    |-- name: string (nullable = true)
 |-- identifier: struct (nullable = true)
 |    |-- @context: string (nullable = true)
 |    |-- @type: string (nu

In [13]:
# Distinct addressLocality values found in the expanded job postings.
unique_localities = jpdf.ai.transform("get all the unique addressLocality")
unique_localities.show()

+-------------------+
|    addressLocality|
+-------------------+
|            Sinaloa|
|Ciudad de México DF|
|           Veracruz|
|            Durango|
|           Guerrero|
|            Chiapas|
|         Guanajuato|
|            Hidalgo|
|            Nayarit|
|          Chihuahua|
|   Estado de México|
|            Morelos|
|            Yucatán|
|             Colima|
|           Campeche|
|            Tabasco|
|          Michoacán|
|             Oaxaca|
|    San Luis Potosí|
|     Aguascalientes|
+-------------------+
only showing top 20 rows



In [23]:
# Number of postings for each datePosted value, in date order.
posted_per_day = jpdf.ai.transform(
    "count the number of records per day in datePosted, ordered by dateposted"
)
posted_per_day.show()

+----------+-----+
|datePosted|count|
+----------+-----+
|2023-01-27|   24|
|2023-02-23|    1|
|2023-02-24|  106|
|2023-02-27| 1023|
|2023-02-28| 1301|
|2023-03-01| 1566|
|2023-03-02| 1632|
|2023-03-03| 1033|
|2023-03-06| 1754|
|2023-03-07|  510|
|2023-03-08| 2476|
|2023-03-09| 1506|
|2023-03-10| 1535|
|2023-03-13| 1956|
|2023-03-14| 1842|
|2023-03-15| 1227|
|2023-03-16|  357|
|2023-03-17| 2059|
|2023-03-20|  341|
|2023-03-21| 1458|
+----------+-----+
only showing top 20 rows



In [25]:
# Number of records per scrape day. This runs on the raw `df`, not on the
# expanded `jpdf`.
scraped_per_day = df.ai.transform(
    "count the number of records per day scraped, order by day"
)
scraped_per_day.show()

+----------+------+
|       day| count|
+----------+------+
|2023-06-24|122478|
|2023-06-25|120437|
|2023-06-26|116018|
|2023-06-27|118909|
|2023-06-28|120633|
|2023-06-29|121092|
+----------+------+



In [30]:
# Record count for each addressLocality, kept in `per_loc`.
per_loc_prompt = "give the number of records per addressLocality"
per_loc = jpdf.ai.transform(per_loc_prompt)
per_loc.show()

+-------------------+------------+
|    addressLocality|record_count|
+-------------------+------------+
|            Sinaloa|       16740|
|Ciudad de México DF|      157658|
|           Veracruz|       18545|
|            Durango|        2882|
|           Guerrero|        5044|
|            Chiapas|        7094|
|         Guanajuato|       32750|
|            Hidalgo|        6879|
|            Nayarit|        3110|
|          Chihuahua|       12111|
|   Estado de México|       94233|
|            Morelos|        6658|
|            Yucatán|       21115|
|             Colima|        4745|
|           Campeche|        2484|
|            Tabasco|        5285|
|          Michoacán|       11659|
|             Oaxaca|        8025|
|    San Luis Potosí|       13206|
|     Aguascalientes|       10728|
+-------------------+------------+
only showing top 20 rows



In [38]:
# Percentage of all records that fall in each addressLocality, ordered by that
# percentage. The name `x` is kept because the following cells refer to it.
pct_prompt = "give the percentage of records per addressLocality order by percentage"
x = jpdf.ai.transform(pct_prompt)
x.show()

+-------------------+------------------+
|    addressLocality|        percentage|
+-------------------+------------------+
|Ciudad de México DF|21.910120947736626|
|   Estado de México|13.095792330665526|
|            Jalisco|11.445772249144277|
|         Nuevo León| 6.484872152280468|
|          Querétaro| 4.930187182013627|
|         Guanajuato| 4.551348241372937|
|             Puebla| 3.764347169895229|
|       Quintana Roo|3.7051448996410343|
|            Yucatán| 2.934403606613422|
|           Veracruz|2.5772443705728585|
|            Sinaloa|2.3263990705521516|
|    Baja California| 2.312501824013608|
|             Sonora|1.8518081012608971|
|    San Luis Potosí|1.8352703778800308|
|          Chihuahua|1.6830955282829814|
|          Michoacán|1.6202799739287657|
|           Coahuila|1.5413436135898395|
|     Aguascalientes|1.4908966086549271|
|             Oaxaca|1.1152540347181015|
|Baja California Sur|1.0816226980948265|
+-------------------+------------------+
only showing top

In [34]:
# Ask the LLM for a plain-English description of how `x` is computed.
# NOTE(review): this cell's execution count (In [34]) is lower than that of the
# cell defining `x` (In [38]), so the saved output may describe an earlier `x`;
# re-run after the cell above to confirm.
x_explanation = x.ai.explain()
x_explanation

'In summary, this dataframe is retrieving the percentage of job postings in each locality (addressLocality) based on the total number of job postings in the dataset. It presents the results grouped by locality and calculates the percentage using the count of job postings in each locality divided by the total count of job postings in the dataset.'

In [37]:
# Sanity check: the per-locality percentages in `x` should add up to ~100
# (small floating-point error is expected).
# NOTE(review): last run as In [37], before `x` was last recomputed (In [38]);
# re-run to validate the current `x`.
pct_sum = F.sum("percentage")
pct_total = x.select(pct_sum)
pct_total.show()

+-----------------+
|  sum(percentage)|
+-----------------+
|99.99999999999999|
+-----------------+



In [42]:
# Same percentage breakdown, but asked of the original `df` by naming the full
# nested path to the field instead of going through the expanded `jpdf`.
nested_pct_prompt = "give the percentage of records per jobposting.joblocation.address.addressLocality order by percentage"
y = df.ai.transform(nested_pct_prompt)
y.show()

+-------------------+------------------+
|    addressLocality|        percentage|
+-------------------+------------------+
|Ciudad de México DF|21.910120947736626|
|   Estado de México|13.095792330665526|
|            Jalisco|11.445772249144277|
|         Nuevo León| 6.484872152280468|
|          Querétaro| 4.930187182013627|
|         Guanajuato| 4.551348241372937|
|             Puebla| 3.764347169895229|
|       Quintana Roo|3.7051448996410343|
|            Yucatán| 2.934403606613422|
|           Veracruz|2.5772443705728585|
|            Sinaloa|2.3263990705521516|
|    Baja California| 2.312501824013608|
|             Sonora|1.8518081012608971|
|    San Luis Potosí|1.8352703778800308|
|          Chihuahua|1.6830955282829814|
|          Michoacán|1.6202799739287657|
|           Coahuila|1.5413436135898395|
|     Aguascalientes|1.4908966086549271|
|             Oaxaca|1.1152540347181015|
|Baja California Sur|1.0816226980948265|
+-------------------+------------------+
only showing top