In [56]:
#https://medium.com/geekculture/how-to-execute-a-rest-api-call-on-apache-spark-the-right-way-in-python-4367f2740e78
import requests
import json
from pyspark.sql import *
from pyspark.context import SparkContext, SparkConf

conf = SparkConf().setAppName('learning-pyspark').setMaster('local[4]') 
sc = SparkContext.getOrCreate(conf) 
    
def executeRestApi(verb, url, headers, body):
    #
    headers = {
      'content-type': "application/json"
    }

    res = None
    # Make API request, get response object back, create dataframe from above schema.
    try:
        if verb == "get":
            res = requests.get(url, data=body, headers=headers)
        else:
            res = requests.post(url, data=body, headers=headers)
            requests.get(api)
    except Exception as e:
        return e
    
    if res != None  and res.status_code == 200: # all ok
        return json.loads(res.content)
    return None

url = 'https://www.airquality.dli.mlsi.gov.cy/all_stations_data_PM'
json_txt = executeRestApi("get", url, None, None)

df = spark.read.json(sc.parallelize([json_txt]))
json_schema = df.schema
df.printSchema()

root
 |-- data: struct (nullable = true)
 |    |-- station_1: struct (nullable = true)
 |    |    |-- latitude: string (nullable = true)
 |    |    |-- longitude: string (nullable = true)
 |    |    |-- name_el: string (nullable = true)
 |    |    |-- name_en: string (nullable = true)
 |    |    |-- pollutants: struct (nullable = true)
 |    |    |    |-- date_time: string (nullable = true)
 |    |    |    |-- pollutant_10: struct (nullable = true)
 |    |    |    |    |-- code: string (nullable = true)
 |    |    |    |    |-- description: string (nullable = true)
 |    |    |    |    |-- notation: string (nullable = true)
 |    |    |    |    |-- value: string (nullable = true)
 |    |    |    |-- pollutant_20: struct (nullable = true)
 |    |    |    |    |-- code: string (nullable = true)
 |    |    |    |    |-- description: string (nullable = true)
 |    |    |    |    |-- notation: string (nullable = true)
 |    |    |    |    |-- value: string (nullable = true)
 |    |    |    

In [75]:
df.select('data').columns

['data']

In [80]:
#https://stackoverflow.com/questions/38753898/how-to-flatten-a-struct-in-a-spark-dataframe
import pyspark.sql.functions as F

def flatten_df(nested_df):
    flat_cols = [c[0] for c in nested_df.dtypes if c[1][:6] != 'struct']
    nested_cols = [c[0] for c in nested_df.dtypes if c[1][:6] == 'struct']
    #print(nested_cols)
    flat_df = nested_df.select(flat_cols +
                               [F.col(nc+'.'+c).alias(nc+'_'+c)
                                for nc in nested_cols
                                for c in nested_df.select(nc+'.*').columns])
    return flat_df

d = flatten_df(df)
d = flatten_df(d)
d = flatten_df(d)
#d = flatten_df(d)
d.printSchema()

root
 |-- method: string (nullable = true)
 |-- data_station_1_latitude: string (nullable = true)
 |-- data_station_1_longitude: string (nullable = true)
 |-- data_station_1_name_el: string (nullable = true)
 |-- data_station_1_name_en: string (nullable = true)
 |-- data_station_1_type_el: string (nullable = true)
 |-- data_station_1_type_en: string (nullable = true)
 |-- data_station_10_latitude: string (nullable = true)
 |-- data_station_10_longitude: string (nullable = true)
 |-- data_station_10_name_el: string (nullable = true)
 |-- data_station_10_name_en: string (nullable = true)
 |-- data_station_10_type_el: string (nullable = true)
 |-- data_station_10_type_en: string (nullable = true)
 |-- data_station_11_latitude: string (nullable = true)
 |-- data_station_11_longitude: string (nullable = true)
 |-- data_station_11_name_el: string (nullable = true)
 |-- data_station_11_name_en: string (nullable = true)
 |-- data_station_11_type_el: string (nullable = true)
 |-- data_station_1

In [81]:
d.groupby(F.col("data_station_1_name_el")).count().show()



+----------------------+-----+
|data_station_1_name_el|count|
+----------------------+-----+
|  Λευκωσία - Κυκλοφ...|    1|
+----------------------+-----+



In [71]:
d.groupby('data_station_1_pollutants_pollutant_10')

AttributeError: 'function' object has no attribute 'toDF'

In [18]:
df.write.mode('overwrite').parquet("df.parquet")

In [19]:
parquetDF = spark.read.parquet('df.parquet')
display(parquetDF)

DataFrame[data: struct<station_1:struct<latitude:string,longitude:string,name_el:string,name_en:string,pollutants:struct<date_time:string,pollutant_10:struct<code:string,description:string,notation:string,value:string>,pollutant_20:struct<code:string,description:string,notation:string,value:string>,pollutant_38:struct<code:string,description:string,notation:string,value:string>,pollutant_5:struct<code:string,description:string,notation:string,value:string>,pollutant_6001:struct<code:string,description:string,notation:string,value:string>,pollutant_7:struct<code:string,description:string,notation:string,value:string>,pollutant_8:struct<code:string,description:string,notation:string,value:string>,pollutant_9:struct<code:string,description:string,notation:string,value:string>>,type_el:string,type_en:string>,station_10:struct<latitude:string,longitude:string,name_el:string,name_en:string,pollutants:struct<date_time:string,pollutant_1:struct<code:string,description:string,notation:string,va

In [25]:
parquetDF.createOrReplaceTempView("ParquetTable")
parkSQL = spark.sql("select * from ParquetTable where ")
parkSQL.show()

+--------------------+------+
|                data|method|
+--------------------+------+
|[[35.151922469502...|   GET|
+--------------------+------+



In [22]:
from pyspark.sql.functions import explode

explodeDF = parquetDF.select(explode("data").alias("d"))
#flattenDF = explodeDF.selectExpr("e.firstName", "e.lastName", "e.email", "e.salary")

explodeDF.show()

AnalysisException: "cannot resolve 'explode(`data`)' due to data type mismatch: input to function explode should be array or map type, not struct<station_1:struct<latitude:string,longitude:string,name_el:string,name_en:string,pollutants:struct<date_time:string,pollutant_10:struct<code:string,description:string,notation:string,value:string>,pollutant_20:struct<code:string,description:string,notation:string,value:string>,pollutant_38:struct<code:string,description:string,notation:string,value:string>,pollutant_5:struct<code:string,description:string,notation:string,value:string>,pollutant_6001:struct<code:string,description:string,notation:string,value:string>,pollutant_7:struct<code:string,description:string,notation:string,value:string>,pollutant_8:struct<code:string,description:string,notation:string,value:string>,pollutant_9:struct<code:string,description:string,notation:string,value:string>>,type_el:string,type_en:string>,station_10:struct<latitude:string,longitude:string,name_el:string,name_en:string,pollutants:struct<date_time:string,pollutant_1:struct<code:string,description:string,notation:string,value:string>,pollutant_10:struct<code:string,description:string,notation:string,value:string>,pollutant_20:struct<code:string,description:string,notation:string,value:string>,pollutant_38:struct<code:string,description:string,notation:string,value:string>,pollutant_5:struct<code:string,description:string,notation:string,value:string>,pollutant_6001:struct<code:string,description:string,notation:string,value:string>,pollutant_7:struct<code:string,description:string,notation:string,value:string>,pollutant_8:struct<code:string,description:string,notation:string,value:string>,pollutant_9:struct<code:string,description:string,notation:string,value:string>>,type_el:string,type_en:string>,station_11:struct<latitude:string,longitude:string,name_el:string,name_en:string,pollutants:struct<date_time:string,pollutant_1:struct<code:string,description:string,notation:string,value:string>,pollutant_10:struct<code:string,description:string,notation:string,value:string>,pollutant_38:struct<code:string,description:string,notation:string,value:string>,pollutant_5:struct<code:string,description:string,notation:string,value:string>,pollutant_6001:struct<code:string,description:string,notation:string,value:string>,pollutant_7:struct<code:string,description:string,notation:string,value:string>,pollutant_8:struct<code:string,description:string,notation:string,value:string>,pollutant_9:struct<code:string,description:string,notation:string,value:string>>,type_el:string,type_en:string>,station_12:struct<latitude:string,longitude:string,name_el:string,name_en:string,pollutants:struct<date_time:string,pollutant_1:struct<code:string,description:string,notation:string,value:string>,pollutant_38:struct<code:string,description:string,notation:string,value:string>,pollutant_5:struct<code:string,description:string,notation:string,value:string>,pollutant_7:struct<code:string,description:string,notation:string,value:string>,pollutant_8:struct<code:string,description:string,notation:string,value:string>,pollutant_9:struct<code:string,description:string,notation:string,value:string>>,type_el:string,type_en:string>,station_13:struct<latitude:string,longitude:string,name_el:string,name_en:string,pollutants:struct<date_time:string,pollutant_1:struct<code:string,description:string,notation:string,value:string>,pollutant_38:struct<code:string,description:string,notation:string,value:string>,pollutant_5:struct<code:string,description:string,notation:string,value:string>,pollutant_7:struct<code:string,description:string,notation:string,value:string>,pollutant_8:struct<code:string,description:string,notation:string,value:string>,pollutant_9:struct<code:string,description:string,notation:string,value:string>>,type_el:string,type_en:string>,station_2:struct<latitude:string,longitude:string,name_el:string,name_en:string,pollutants:struct<date_time:string,pollutant_1:struct<code:string,description:string,notation:string,value:string>,pollutant_10:struct<code:string,description:string,notation:string,value:string>,pollutant_38:struct<code:string,description:string,notation:string,value:string>,pollutant_7:struct<code:string,description:string,notation:string,value:string>,pollutant_8:struct<code:string,description:string,notation:string,value:string>,pollutant_9:struct<code:string,description:string,notation:string,value:string>>,type_el:string,type_en:string>,station_3:struct<latitude:string,longitude:string,name_el:string,name_en:string,pollutants:struct<date_time:string,pollutant_1:struct<code:string,description:string,notation:string,value:string>,pollutant_10:struct<code:string,description:string,notation:string,value:string>,pollutant_38:struct<code:string,description:string,notation:string,value:string>,pollutant_5:struct<code:string,description:string,notation:string,value:string>,pollutant_6001:struct<code:string,description:string,notation:string,value:string>,pollutant_7:struct<code:string,description:string,notation:string,value:string>,pollutant_8:struct<code:string,description:string,notation:string,value:string>,pollutant_9:struct<code:string,description:string,notation:string,value:string>>,type_el:string,type_en:string>,station_5:struct<latitude:string,longitude:string,name_el:string,name_en:string,pollutants:struct<date_time:string,pollutant_1:struct<code:string,description:string,notation:string,value:string>,pollutant_10:struct<code:string,description:string,notation:string,value:string>,pollutant_38:struct<code:string,description:string,notation:string,value:string>,pollutant_5:struct<code:string,description:string,notation:string,value:string>,pollutant_6001:struct<code:string,description:string,notation:string,value:string>,pollutant_7:struct<code:string,description:string,notation:string,value:string>,pollutant_8:struct<code:string,description:string,notation:string,value:string>,pollutant_9:struct<code:string,description:string,notation:string,value:string>>,type_el:string,type_en:string>,station_7:struct<latitude:string,longitude:string,name_el:string,name_en:string,pollutants:struct<date_time:string,pollutant_1:struct<code:string,description:string,notation:string,value:string>,pollutant_10:struct<code:string,description:string,notation:string,value:string>,pollutant_20:struct<code:string,description:string,notation:string,value:string>,pollutant_38:struct<code:string,description:string,notation:string,value:string>,pollutant_5:struct<code:string,description:string,notation:string,value:string>,pollutant_6001:struct<code:string,description:string,notation:string,value:string>,pollutant_7:struct<code:string,description:string,notation:string,value:string>,pollutant_8:struct<code:string,description:string,notation:string,value:string>,pollutant_9:struct<code:string,description:string,notation:string,value:string>>,type_el:string,type_en:string>,station_8:struct<latitude:string,longitude:string,name_el:string,name_en:string,pollutants:struct<date_time:string,pollutant_20:struct<code:string,description:string,notation:string,value:string>,pollutant_5:struct<code:string,description:string,notation:string,value:string>,pollutant_6001:struct<code:string,description:string,notation:string,value:string>>,type_el:string,type_en:string>,station_9:struct<latitude:string,longitude:string,name_el:string,name_en:string,pollutants:struct<date_time:string,pollutant_1:struct<code:string,description:string,notation:string,value:string>,pollutant_20:struct<code:string,description:string,notation:string,value:string>,pollutant_38:struct<code:string,description:string,notation:string,value:string>,pollutant_8:struct<code:string,description:string,notation:string,value:string>,pollutant_9:struct<code:string,description:string,notation:string,value:string>>,type_el:string,type_en:string>>;;\n'Project [explode(data#31) AS d#43]\n+- Relation[data#31,method#32] parquet\n"

In [None]:
request_df = spark.createDataFrame([RestApiRequestRow("get", url, None, None)])

from pyspark.sql import Row
headers = {
    'content-type': "application/json"
}
body = json.dumps({
})

url = 'https://www.airquality.dli.mlsi.gov.cy/all_stations_data_PM'

RestApiRequestRow = Row("verb", "url", "headers", "body")
request_df = spark.createDataFrame([
            RestApiRequestRow("get", url, headers, body)
          ])

request_df.show()

request_df.take(20)
#request_df.write.json('gs://air_quality_cyprus/Data_Now.csv')

url = 'https://www.airquality.dli.mlsi.gov.cy/all_stations_data_PM'
from pyspark.sql.types import *
from pyspark.sql.functions import udf
            
udf_executeRestApi = udf(executeRestApi, schema)
#print(executeRestApi('get',url,None,None))
udf_executeRestApi

from pyspark.sql.functions import col
result_df = request_df.withColumn("result", udf_executeRestApi(col("verb"), col("url"), col("headers"), col("body")))