In [1]:
# Install the dependencies
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-3.0.1/spark-3.0.1-bin-hadoop3.2.tgz
!tar xf spark-3.0.1-bin-hadoop3.2.tgz
!pip install -q findspark

In [2]:
# Point PySpark at the Java and Spark installations used in the Colab environment
import os

os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    "SPARK_HOME": "/content/spark-3.0.1-bin-hadoop3.2",
})

In [3]:
# Run the local session to test the installation
import findspark
# Called without an argument, findspark.init() takes the Spark location from the
# SPARK_HOME environment variable (set to an absolute path in an earlier cell).
# Passing the bare directory name 'spark-3.0.1-bin-hadoop3.2' instead would only
# resolve while the working directory is /content, and would replace SPARK_HOME
# with that relative value.
findspark.init()
from pyspark.sql import SparkSession
# 'local[*]' runs Spark locally using all available cores; getOrCreate() reuses an
# already-running session, so re-running this cell is safe.
spark = SparkSession.builder.master('local[*]').getOrCreate()

In [4]:
# Schema for our sales data, written as a DDL string ("<column> <TYPE>", comma-separated)
schema_sales = ", ".join([
    "date STRING",
    "region STRING",
    "manager STRING",
    "product STRING",
    "amount INT",
])

In [6]:
# Google Drive must be connected first (Files section), otherwise the path below
# does not exist. The CSV has a header row and uses ';' as the field separator.
data = spark.read.csv(
    "/content/drive/MyDrive/db/sales.csv",
    schema=schema_sales,
    header=True,
    sep=";",
)

In [7]:
# Change column type from string to date (for example, '10.01.2020' -> 2020-01-10)
from datetime import datetime
from pyspark.sql.functions import col, udf
# Wildcard import kept in case other cells rely on it; only DateType is used here.
from pyspark.sql.types import *


def _parse_sales_date(raw):
    """Parse a 'DD.MM.YYYY' string into a datetime.date.

    Returns None when the input is None, so null cells stay null instead of
    making strptime raise TypeError and failing the whole job.
    """
    if raw is None:
        return None
    # .date() drops the (always midnight) time part so the value matches DateType.
    return datetime.strptime(raw, "%d.%m.%Y").date()


# NOTE(review): the built-in to_date(col, 'dd.MM.yyyy') would probably avoid the
# Python UDF overhead - confirm its handling of malformed values before switching.
udf_date = udf(_parse_sales_date, DateType())
df = data.withColumn('date', udf_date(col('date')))

In [8]:
# Preview the first 10 rows of the DataFrame with the converted date column
df.show(n=10)

+----------+------+-------+-------+------+
|      date|region|manager|product|amount|
+----------+------+-------+-------+------+
|2020-01-01| north|   Mark|  metal|  3899|
|2020-01-02| south|  David|   wood|  2283|
|2020-01-03|  west|   John|  metal|  4812|
|2020-01-04|  east|William|   wood|  5452|
|2020-01-05| north|   Mark|  metal|  9855|
|2020-01-06| south|  David|   wood|  5040|
|2020-01-07|  west|   John|  metal|  4801|
|2020-01-08|  east|William|   wood|  5752|
|2020-01-09| north|   Mark|  metal|  8721|
|2020-01-10| south|  David|   wood|  8020|
+----------+------+-------+-------+------+
only showing top 10 rows



In [9]:
# Print each column's name, type and nullability to confirm 'date' is now a date column
df.printSchema()

root
 |-- date: date (nullable = true)
 |-- region: string (nullable = true)
 |-- manager: string (nullable = true)
 |-- product: string (nullable = true)
 |-- amount: integer (nullable = true)



In [10]:
# Register df as the temporary view "tbl" so the spark.sql queries below can select from it
df.createOrReplaceTempView(name="tbl")

In [19]:
# SQL query: all records where region = north and product = metal,
# sorted by amount from largest to smallest; the first 10 rows are displayed.
(
    spark.sql("""SELECT *
FROM tbl 
WHERE (product = 'metal' AND region = 'north')
ORDER BY amount DESC""")
    .show(n=10)
)

+----------+------+-------+-------+------+
|      date|region|manager|product|amount|
+----------+------+-------+-------+------+
|2020-01-05| north|   Mark|  metal|  9855|
|2020-02-06| north|   Mark|  metal|  9382|
|2020-02-26| north|   Mark|  metal|  8763|
|2020-01-09| north|   Mark|  metal|  8721|
|2020-02-02| north|   Mark|  metal|  8589|
|2020-01-17| north|   Mark|  metal|  6467|
|2020-02-22| north|   Mark|  metal|  6352|
|2020-01-25| north|   Mark|  metal|  5163|
|2020-01-21| north|   Mark|  metal|  4060|
|2020-01-01| north|   Mark|  metal|  3899|
+----------+------+-------+-------+------+
only showing top 10 rows



In [25]:
# SQL query with CASE: label each date with its month name.
# Only January and February are mapped; any other month falls through to 'no'.
(
    spark.sql("""SELECT date, 
CASE 
WHEN Month(date)=1 THEN 'January'
WHEN Month(date)=2 THEN 'February'
ELSE 'no'
END as Month
FROM tbl""")
    .show(n=10)
)

+----------+-------+
|      date|  Month|
+----------+-------+
|2020-01-01|January|
|2020-01-02|January|
|2020-01-03|January|
|2020-01-04|January|
|2020-01-05|January|
|2020-01-06|January|
|2020-01-07|January|
|2020-01-08|January|
|2020-01-09|January|
|2020-01-10|January|
+----------+-------+
only showing top 10 rows

