In [None]:
# Install the dependencies: a Java runtime, the Spark distribution and findspark.
# Headless OpenJDK 8; -qq plus the redirect to /dev/null keep the cell output quiet.
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# Download the Spark 3.0.1 build for Hadoop 3.2 from the Apache archive (-q: quiet).
!wget -q https://archive.apache.org/dist/spark/spark-3.0.1/spark-3.0.1-bin-hadoop3.2.tgz
# Unpack the archive into the current working directory.
!tar xf spark-3.0.1-bin-hadoop3.2.tgz
# findspark is the helper package imported later to initialise PySpark.
!pip install -q findspark

In [None]:
# Point PySpark at the Java and Spark installations of the notebook (Colab) environment
import os

os.environ.update(
    {
        "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
        "SPARK_HOME": "/content/spark-3.0.1-bin-hadoop3.2",
    }
)

In [None]:
# Start a local Spark session to test the installation
import findspark

# Called without an argument, findspark locates Spark through the SPARK_HOME
# environment variable, which this notebook sets to an absolute path.
# Passing the bare directory name instead only resolves while the working
# directory is the folder the archive was unpacked into, and it overwrites
# SPARK_HOME with that relative path.
findspark.init()

from pyspark.sql import SparkSession

# 'local[*]' runs Spark in-process on all available cores.
spark = SparkSession.builder.master('local[*]').getOrCreate()

In [None]:
# Define the schema for our data as a DDL string: one "name TYPE" entry per CSV column
schema_sales = (
    "date STRING, "
    "region STRING, "
    "manager STRING, "
    "product STRING, "
    "amount INT"
)

In [None]:
# Prerequisite: Google Drive must already be connected (Files section of the
# sidebar), otherwise the path below does not exist.
# The file is ';'-separated and its first line is a header row.
data = (
    spark.read
    .schema(schema_sales)
    .option("header", True)
    .option("sep", ";")
    .csv("/content/drive/MyDrive/db/sales.csv")
)

In [None]:
# Preview the first 10 rows of the raw data (date is still a string here)
data.show(10)

+----------+------+-------+-------+------+
|      date|region|manager|product|amount|
+----------+------+-------+-------+------+
|01.01.2020| north|   Mark|  metal|  3899|
|02.01.2020| south|  David|   wood|  2283|
|03.01.2020|  west|   John|  metal|  4812|
|04.01.2020|  east|William|   wood|  5452|
|05.01.2020| north|   Mark|  metal|  9855|
|06.01.2020| south|  David|   wood|  5040|
|07.01.2020|  west|   John|  metal|  4801|
|08.01.2020|  east|William|   wood|  5752|
|09.01.2020| north|   Mark|  metal|  8721|
|10.01.2020| south|  David|   wood|  8020|
+----------+------+-------+-------+------+
only showing top 10 rows



In [None]:
# Print the column names and types of the raw DataFrame
data.printSchema()

root
 |-- date: string (nullable = true)
 |-- region: string (nullable = true)
 |-- manager: string (nullable = true)
 |-- product: string (nullable = true)
 |-- amount: integer (nullable = true)



In [None]:
# Change the `date` column type from string ("dd.MM.yyyy") to date.
#
# Spark's built-in to_date is used instead of a Python UDF around
# datetime.strptime: it parses inside the JVM (no per-row Python round trip)
# and a null date stays null, whereas strptime(None, ...) raises TypeError
# and fails the whole job.
#
# datetime, udf and the types wildcard are no longer needed by this cell; they
# are kept only in case later cells rely on these names being imported here.
from datetime import datetime
from pyspark.sql.functions import col, to_date, udf
from pyspark.sql.types import *

# The pattern uses Spark datetime syntax: dd = day, MM = month, yyyy = year.
df = data.withColumn('date', to_date(col('date'), 'dd.MM.yyyy'))

In [None]:
# Preview the first 10 rows after converting the date column
df.show(10)

+----------+------+-------+-------+------+
|      date|region|manager|product|amount|
+----------+------+-------+-------+------+
|2020-01-01| north|   Mark|  metal|  3899|
|2020-01-02| south|  David|   wood|  2283|
|2020-01-03|  west|   John|  metal|  4812|
|2020-01-04|  east|William|   wood|  5452|
|2020-01-05| north|   Mark|  metal|  9855|
|2020-01-06| south|  David|   wood|  5040|
|2020-01-07|  west|   John|  metal|  4801|
|2020-01-08|  east|William|   wood|  5752|
|2020-01-09| north|   Mark|  metal|  8721|
|2020-01-10| south|  David|   wood|  8020|
+----------+------+-------+-------+------+
only showing top 10 rows



In [None]:
# Print the schema again to confirm that `date` now has the date type
df.printSchema()

root
 |-- date: date (nullable = true)
 |-- region: string (nullable = true)
 |-- manager: string (nullable = true)
 |-- product: string (nullable = true)
 |-- amount: integer (nullable = true)



In [None]:
# List the column names of the DataFrame (displayed as the cell's output value)
df.columns

['date', 'region', 'manager', 'product', 'amount']

In [None]:
# Function countDistinct: number of different managers in the data.
# Only the needed functions are imported: a wildcard import of
# pyspark.sql.functions replaces Python builtins such as sum, min, max, abs
# and round with Spark column functions for the rest of the notebook.
from pyspark.sql.functions import col, countDistinct

(
    df.select("manager")
    .where(col("manager").isNotNull())
    # The alias spelling is the output column name and is kept unchanged.
    .agg(countDistinct("manager").alias("DistintManager"))
    .show()
)

+--------------+
|DistintManager|
+--------------+
|             4|
+--------------+



In [None]:
# Show each non-null manager name exactly once
(
    df.filter(col("manager").isNotNull())
    .select("manager")
    .distinct()
    .show()
)

+-------+
|manager|
+-------+
|   Mark|
|   John|
|  David|
|William|
+-------+



In [None]:
# Selection: first 5 rows (all columns) where the manager is Mark.
# The filter is applied to df directly, so every column is returned without
# depending on `_`, IPython's "last displayed output" variable, whose value
# changes with the order in which cells were executed.
df.where(col("manager") == "Mark").show(5)

+----------+------+-------+-------+------+
|      date|region|manager|product|amount|
+----------+------+-------+-------+------+
|2020-01-01| north|   Mark|  metal|  3899|
|2020-01-05| north|   Mark|  metal|  9855|
|2020-01-09| north|   Mark|  metal|  8721|
|2020-01-13| north|   Mark|  metal|  3283|
|2020-01-17| north|   Mark|  metal|  6467|
+----------+------+-------+-------+------+
only showing top 5 rows



In [None]:
# Number of records per manager (null managers excluded), largest count first
(
    df.filter(col("manager").isNotNull())
    .select("manager")
    .groupBy("manager")
    .count()
    .sort(col("count").desc())
    .show()
)

+-------+-----+
|manager|count|
+-------+-----+
|   Mark|   15|
|   John|   15|
|  David|   15|
|William|   15|
+-------+-----+

