In [19]:
from pyspark.sql import SparkSession, Row
from pyspark.sql import functions as func
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType


In [3]:
# A SparkSession is the entry point to the DataFrame API.
# getOrCreate() hands back the already-running session when there is one,
# otherwise it starts a new one under the application name given here.
spark = (
    SparkSession.builder
    .appName("MinTemperatures")
    .getOrCreate()
)
# Bare last expression: lets the notebook render the session summary.
spark


In [24]:
# Explicit schema for the CSV loaded in the next cell.
# Columns are declared in file order and every one is nullable.
schema = (
    StructType()
    .add("id", StringType(), True)
    .add("date", IntegerType(), True)
    .add("min_max", StringType(), True)
    .add("temp", FloatType(), True)
)


In [26]:
# Read the CSV using the explicit schema defined above, then print the
# resulting column names, types and nullability as a sanity check.
df = spark.read.csv("temperatures_1800.csv", schema=schema)
df.printSchema()


root
 |-- id: string (nullable = true)
 |-- date: integer (nullable = true)
 |-- min_max: string (nullable = true)
 |-- temp: float (nullable = true)



In [27]:
# withColumn returns a new DataFrame with the extra column attached; here
# temp_c is the raw temp value divided by 10.
# NOTE(review): presumably temp is stored in tenths of a degree Celsius —
# confirm against the dataset documentation.
df = df.withColumn("temp_c", df["temp"] / 10)
# Preview the first five rows only.
df.show(n=5)


+-----------+--------+-------+------+------+
|         id|    date|min_max|  temp|temp_c|
+-----------+--------+-------+------+------+
|ITE00100554|18000101|   TMAX| -75.0|  -7.5|
|ITE00100554|18000101|   TMIN|-148.0| -14.8|
|GM000010962|18000101|   PRCP|   0.0|   0.0|
|EZE00100082|18000101|   TMAX| -86.0|  -8.6|
|EZE00100082|18000101|   TMIN|-135.0| -13.5|
+-----------+--------+-------+------+------+
only showing top 5 rows



In [28]:
# Keep only the rows flagged TMIN, group them by id, and display the
# smallest temp_c seen for each id.
(
    df.filter(func.col("min_max") == "TMIN")
    .groupBy("id")
    .min("temp_c")
    .show()
)


+-----------+-----------+
|         id|min(temp_c)|
+-----------+-----------+
|ITE00100554|      -14.8|
|EZE00100082|      -13.5|
+-----------+-----------+



In [29]:
# NOTE(review): `a` is not read by any cell visible here — this looks like a
# leftover scratch assignment; confirm nothing later depends on it and delete.
a = 0