In [24]:
import re
from itertools import chain
from datetime import datetime
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, TimestampType, ArrayType

In [2]:
# Local Spark session for the OpenWeather clean-up, using all local cores.
# NOTE(review): this variable shares its name with the `pyspark` package; the
# name is kept unchanged here so any other cell that references it still works.
pyspark = SparkSession.builder.appName('OpenWeather').master('local[*]').getOrCreate()

# Explicit schema for the raw export. `_id` is declared as a plain string, so
# the nested {"$oid": ...} object is loaded as its raw JSON text and is parsed
# in the cleaning step.
schema = StructType([
    StructField('_id', StringType(), True),
    StructField('created_at', TimestampType(), True),
    StructField('city_id', IntegerType(), True),
    StructField('lat', DoubleType(), True),
    StructField('lon', DoubleType(), True),
    StructField('country', StringType(), True),
    StructField('temp', DoubleType(), True),
    StructField('max_temp', DoubleType(), True),
    StructField('min_temp', DoubleType(), True),
    StructField('feels_like', DoubleType(), True),
    StructField('humidity', IntegerType(), True)]
)

# Today's date as YYYYMMDD; it selects which daily export file is read.
today = datetime.today().strftime('%Y%m%d')

df_pyspark_schema = pyspark.read.schema(schema).json(f'../data/openweather_{today}.json')

# printSchema() and show() write to stdout themselves and return None, so they
# are called directly; wrapping them in print() only adds a stray "None" line.
df_pyspark_schema.printSchema()
df_pyspark_schema.show()

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
21/12/22 11:56:58 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
21/12/22 11:56:59 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
21/12/22 11:56:59 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
21/12/22 11:56:59 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.
21/12/22 11:56:59 WARN Utils: Service 'SparkUI' could not bind on port 4043. Attempting port 4044.


root
 |-- _id: string (nullable = true)
 |-- created_at: timestamp (nullable = true)
 |-- city_id: integer (nullable = true)
 |-- lat: double (nullable = true)
 |-- lon: double (nullable = true)
 |-- country: string (nullable = true)
 |-- temp: double (nullable = true)
 |-- max_temp: double (nullable = true)
 |-- min_temp: double (nullable = true)
 |-- feels_like: double (nullable = true)
 |-- humidity: integer (nullable = true)

None


[Stage 0:>                                                          (0 + 1) / 1]

+--------------------+-------------------+-------+-------+-------+-------+------+--------+--------+----------+--------+
|                 _id|         created_at|city_id|    lat|    lon|country|  temp|max_temp|min_temp|feels_like|humidity|
+--------------------+-------------------+-------+-------+-------+-------+------+--------+--------+----------+--------+
|{"$oid":"61bf0e36...|2021-12-19 11:49:26|2950159|52.5244|13.4105|     DE|279.39|  280.79|  278.76|    275.21|      91|
|{"$oid":"61bf0e3b...|2021-12-19 11:49:31|2988507|48.8534| 2.3488|     FR|277.89|  278.62|  277.19|    276.44|      97|
|{"$oid":"61bf0e40...|2021-12-19 11:49:36|3128760|41.3888|  2.159|     ES|284.19|  286.78|  281.78|    283.36|      77|
|{"$oid":"61bf0e45...|2021-12-19 11:49:41|2759794| 52.374| 4.8897|     NL| 280.2|   281.4|  279.19|    278.34|      94|
|{"$oid":"61bf0e4a...|2021-12-19 11:49:46|3094802|50.0833|19.9167|     PL| 278.4|  279.01|  276.56|     278.4|      89|
|{"$oid":"61bf0e50...|2021-12-19 11:49:5

                                                                                

### Cleaning data

In [42]:
def extract(col):
    """Collect the quoted, digit-led tokens found in ``col``.

    A token is a double quote, one or more digits, one or more word
    characters, and a closing double quote. The surrounding quotes are part
    of each returned match.

    Args:
        col: The string to scan; may be ``None`` or empty.

    Returns:
        A list with every match in order of appearance (empty when nothing
        matches), or ``None`` when ``col`` is ``None`` or an empty string.
    """
    if not col:
        return None
    return re.findall(r'"\d+\w+"', col)

# Spark UDF around extract(); declared to return an array of strings.
# extract is passed directly: wrapping it in a lambda added nothing.
extract_udf = F.udf(extract, ArrayType(StringType()))

In [45]:
# Derive `id` from the raw `_id` text: keep the first token found by
# extract_udf (the value still carries its surrounding double quotes), drop the
# original `_id` column, and put `id` first in the column order.
df_pyspark = (
    df_pyspark_schema
    .withColumn('id', extract_udf(df_pyspark_schema['_id']).getItem(0))
    .drop('_id')
    .select(
        'id', 'created_at', 'city_id', 'lat', 'lon', 'country',
        'temp', 'max_temp', 'min_temp', 'feels_like', 'humidity',
    )
)

In [46]:
# Preview the cleaned frame; long values such as `id` are cut short in the rendered table.
df_pyspark.show()

+--------------------+-------------------+-------+-------+-------+-------+------+--------+--------+----------+--------+
|                  id|         created_at|city_id|    lat|    lon|country|  temp|max_temp|min_temp|feels_like|humidity|
+--------------------+-------------------+-------+-------+-------+-------+------+--------+--------+----------+--------+
|"61bf0e36c0963c89...|2021-12-19 11:49:26|2950159|52.5244|13.4105|     DE|279.39|  280.79|  278.76|    275.21|      91|
|"61bf0e3bc0963c89...|2021-12-19 11:49:31|2988507|48.8534| 2.3488|     FR|277.89|  278.62|  277.19|    276.44|      97|
|"61bf0e40c0963c89...|2021-12-19 11:49:36|3128760|41.3888|  2.159|     ES|284.19|  286.78|  281.78|    283.36|      77|
|"61bf0e45c0963c89...|2021-12-19 11:49:41|2759794| 52.374| 4.8897|     NL| 280.2|   281.4|  279.19|    278.34|      94|
|"61bf0e4ac0963c89...|2021-12-19 11:49:46|3094802|50.0833|19.9167|     PL| 278.4|  279.01|  276.56|     278.4|      89|
|"61bf0e50c0963c89...|2021-12-19 11:49:5

In [76]:
# Add a `city` column looked up from the `city_id` column.
cities_map = {
        "2950159": "Berlin",
        "2988507": "Paris",
        "3128760": "Barcelona",
        "2759794": "Amsterdam",
        "3094802": "Krakow",
        "2761369": "Vienna",
        "2643743": "London"
    }

# create_map expects alternating key, value literals, so the dict items are
# flattened into one sequence.
mapping_expr = F.create_map([F.lit(x) for x in chain(*cities_map.items())])

# `city_id` is an integer column while the map keys are strings; cast
# explicitly so the lookup does not depend on implicit type coercion.
df_cities = df_pyspark.withColumn('city', mapping_expr.getItem(F.col("city_id").cast("string")))
df_cities.show()

+--------------------+-------------------+-------+-------+-------+-------+------+--------+--------+----------+--------+---------+
|                  id|         created_at|city_id|    lat|    lon|country|  temp|max_temp|min_temp|feels_like|humidity|     city|
+--------------------+-------------------+-------+-------+-------+-------+------+--------+--------+----------+--------+---------+
|"61bf0e36c0963c89...|2021-12-19 11:49:26|2950159|52.5244|13.4105|     DE|279.39|  280.79|  278.76|    275.21|      91|   Berlin|
|"61bf0e3bc0963c89...|2021-12-19 11:49:31|2988507|48.8534| 2.3488|     FR|277.89|  278.62|  277.19|    276.44|      97|    Paris|
|"61bf0e40c0963c89...|2021-12-19 11:49:36|3128760|41.3888|  2.159|     ES|284.19|  286.78|  281.78|    283.36|      77|Barcelona|
|"61bf0e45c0963c89...|2021-12-19 11:49:41|2759794| 52.374| 4.8897|     NL| 280.2|   281.4|  279.19|    278.34|      94|Amsterdam|
|"61bf0e4ac0963c89...|2021-12-19 11:49:46|3094802|50.0833|19.9167|     PL| 278.4|  279.01|

In [78]:
# Replace the two-letter code in `country` with the country name
# (the existing column is overwritten; no extra column is added).
country_map = {
   "NL": "Netherlands",
   "PL": "Poland",
   "AT": "Austria",
   # GB is the code for the United Kingdom as a whole, not England only.
   "GB": "United Kingdom",
   "DE": "Germany",
   "ES": "Spain",
   "FR": "France"
}
# create_map expects alternating key, value literals, so the dict items are
# flattened into one sequence.
mapping_expr = F.create_map([F.lit(x) for x in chain(*country_map.items())])
df_countries = df_cities.withColumn('country', mapping_expr.getItem(F.col("country")))
df_countries.show()

+--------------------+-------------------+-------+-------+-------+-----------+------+--------+--------+----------+--------+---------+
|                  id|         created_at|city_id|    lat|    lon|    country|  temp|max_temp|min_temp|feels_like|humidity|     city|
+--------------------+-------------------+-------+-------+-------+-----------+------+--------+--------+----------+--------+---------+
|"61bf0e36c0963c89...|2021-12-19 11:49:26|2950159|52.5244|13.4105|    Germany|279.39|  280.79|  278.76|    275.21|      91|   Berlin|
|"61bf0e3bc0963c89...|2021-12-19 11:49:31|2988507|48.8534| 2.3488|     France|277.89|  278.62|  277.19|    276.44|      97|    Paris|
|"61bf0e40c0963c89...|2021-12-19 11:49:36|3128760|41.3888|  2.159|      Spain|284.19|  286.78|  281.78|    283.36|      77|Barcelona|
|"61bf0e45c0963c89...|2021-12-19 11:49:41|2759794| 52.374| 4.8897|Netherlands| 280.2|   281.4|  279.19|    278.34|      94|Amsterdam|
|"61bf0e4ac0963c89...|2021-12-19 11:49:46|3094802|50.0833|19.9

In [None]:
# convert temp/max_temp/min_temp/feels_like from Kelvin to Fahrenheit (F) and Celsius (C)

# drop Kelvin column

# reorder columns