<a href="https://colab.research.google.com/github/dhiyashafa/BigData/blob/main/pyspark-demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Langkah Praktikum**

In [1]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

In [3]:
!wget -q https://dlcdn.apache.org/spark/spark-3.2.1/spark-3.2.1-bin-hadoop3.2.tgz
!tar xf spark-3.2.1-bin-hadoop3.2.tgz

In [4]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.2.1-bin-hadoop3.2"

In [5]:
!pip install -q findspark

In [6]:
import findspark
findspark.init()

In [7]:
from pyspark.sql import SparkSession
spark = SparkSession.builder\
 .master("local")\
 .appName("Colab")\
 .config('spark.ui.port', '4050')\
 .getOrCreate()

In [9]:
!wget --continue https://raw.githubusercontent.com/dhanifudin/pyspark-demo/main/sample_books.json -O /tmp/sample_books.json

--2022-06-02 07:30:55--  https://raw.githubusercontent.com/dhanifudin/pyspark-demo/main/sample_books.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1565 (1.5K) [text/plain]
Saving to: ‘/tmp/sample_books.json’


2022-06-02 07:30:55 (22.9 MB/s) - ‘/tmp/sample_books.json’ saved [1565/1565]



In [10]:
df = spark.read.json("/tmp/sample_books.json")

In [11]:
df.printSchema()

root
 |-- author: string (nullable = true)
 |-- edition: string (nullable = true)
 |-- price: double (nullable = true)
 |-- title: string (nullable = true)
 |-- year_written: long (nullable = true)



In [12]:
df.show(4,False)

+---------------+--------------+-----+----------------+------------+
|author         |edition       |price|title           |year_written|
+---------------+--------------+-----+----------------+------------+
|Austen, Jane   |Penguin       |18.2 |Northanger Abbey|1814        |
|Tolstoy, Leo   |Penguin       |12.7 |War and Peace   |1865        |
|Tolstoy, Leo   |Penguin       |13.5 |Anna Karenina   |1875        |
|Woolf, Virginia|Harcourt Brace|25.0 |Mrs. Dalloway   |1925        |
+---------------+--------------+-----+----------------+------------+
only showing top 4 rows



In [13]:
df.count()

13

In [14]:
df.select("title", "price", "year_written").show(5)

+----------------+-----+------------+
|           title|price|year_written|
+----------------+-----+------------+
|Northanger Abbey| 18.2|        1814|
|   War and Peace| 12.7|        1865|
|   Anna Karenina| 13.5|        1875|
|   Mrs. Dalloway| 25.0|        1925|
|       The Hours|12.35|        1999|
+----------------+-----+------------+
only showing top 5 rows



In [15]:
df_filtered = df.filter("year_written > 1950 AND price > 10 AND title IS NOT NULL")
df_filtered.select("title", "price", "year_written").show(50, False)

+-----------------------------+-----+------------+
|title                        |price|year_written|
+-----------------------------+-----+------------+
|The Hours                    |12.35|1999        |
|Harry Potter                 |19.95|2000        |
|One Hundred Years of Solitude|14.0 |1967        |
+-----------------------------+-----+------------+



In [17]:
from pyspark.sql.functions import *
maxValue = df_filtered.agg(max("price")).collect()[0][0]
print("maxValue: ",maxValue)
df_filtered.select("title","price").filter(df.price == maxValue).show(20, False)

maxValue:  19.95
+------------+-----+
|title       |price|
+------------+-----+
|Harry Potter|19.95|
+------------+-----+



**Tugas**

In [18]:
# No. 1 Tampilkan data buku dengan harga paling murah!

df.filter("price = "+str(df.agg(min("price")).collect()[0][0])).show()

+----------------+------------+-----+-----------+------------+
|          author|     edition|price|      title|year_written|
+----------------+------------+-----+-----------+------------+
|Dickens, Charles|Random House| 5.75|Bleak House|        1870|
+----------------+------------+-----+-----------+------------+



In [19]:
# No. 2 Tampilkan jumlah terbit buku dikategorikan setiap tahun ditulis!

df.groupBy('year_written').count().sort(df.year_written).show()

+------------+-----+
|year_written|count|
+------------+-----+
|        1603|    1|
|        1814|    1|
|        1862|    1|
|        1865|    2|
|        1870|    1|
|        1875|    1|
|        1922|    1|
|        1925|    1|
|        1937|    1|
|        1967|    1|
|        1999|    1|
|        2000|    1|
+------------+-----+



In [20]:
# No. 3 Tampilkan data buku termahal tiap tahun penulisannya!

df.groupBy('year_written').max('price').sort(asc("year_written")).show()

+------------+----------+
|year_written|max(price)|
+------------+----------+
|        1603|      7.95|
|        1814|      18.2|
|        1862|      7.75|
|        1865|      12.7|
|        1870|      5.75|
|        1875|      13.5|
|        1922|      29.0|
|        1925|      25.0|
|        1937|     27.45|
|        1967|      14.0|
|        1999|     12.35|
|        2000|     19.95|
+------------+----------+



In [21]:
# No. 4 Tampilkan data buku termurah tiap tahun penulisannya!

df.groupBy('year_written').min('price').sort(asc("year_written")).show()

+------------+----------+
|year_written|min(price)|
+------------+----------+
|        1603|      7.95|
|        1814|      18.2|
|        1862|      7.75|
|        1865|      5.76|
|        1870|      5.75|
|        1875|      13.5|
|        1922|      29.0|
|        1925|      25.0|
|        1937|     27.45|
|        1967|      14.0|
|        1999|     12.35|
|        2000|     19.95|
+------------+----------+

