In [1]:
# Mount Google Drive into the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# List the top-level entries of the mounted Drive directory.
!ls drive

MyDrive  Shareddrives


In [5]:
# Show the current working directory; the relative paths used by the shell commands below resolve against it.
!pwd

/content


## Installing Java, Spark, and findspark

In [4]:
# Install the headless OpenJDK 8 package; -qq and the stdout redirect keep apt's output out of the notebook.
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

In [6]:
# Download the Spark 3.1.2 (Hadoop 3.2) binary distribution into the working directory.
# archive.apache.org is used because dlcdn.apache.org only serves current releases:
# older versions such as 3.1.2 are removed from it, and -q would hide the resulting 404.
!wget -q https://archive.apache.org/dist/spark/spark-3.1.2/spark-3.1.2-bin-hadoop3.2.tgz

In [7]:
# Confirm the Spark archive was downloaded into the working directory.
!ls

drive  sample_data  spark-3.1.2-bin-hadoop3.2.tgz


In [8]:
# Unpack the Spark distribution into the current directory.
!tar xf spark-3.1.2-bin-hadoop3.2.tgz

In [9]:
# Confirm the extracted Spark directory now sits next to the archive.
!ls

drive  sample_data  spark-3.1.2-bin-hadoop3.2  spark-3.1.2-bin-hadoop3.2.tgz


In [10]:
# Install findspark with the %pip magic so the package is installed into the
# environment of the running kernel rather than whichever pip is first on PATH.
%pip install -q findspark

In [13]:
import os

# Point the JVM and Spark tooling at the JDK installed via apt and at the
# Spark distribution extracted into /content.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
# The original line had a stray quote (""/content/...), which is a syntax error.
os.environ["SPARK_HOME"] = "/content/spark-3.1.2-bin-hadoop3.2"

In [15]:
# Echo SPARK_HOME to verify the value that was set.
os.environ["SPARK_HOME"]

'/content/spark-3.1.2-bin-hadoop3.2'

In [16]:
# Initialise findspark so that pyspark can be imported in this kernel.
import findspark
findspark.init()

In [17]:
# Show the Spark home directory that findspark resolved.
findspark.find()

'/content/spark-3.1.2-bin-hadoop3.2'

## Hands-On with Spark

In [18]:
from pyspark.sql import SparkSession

# Build (or reuse) the notebook's Spark session: local master, application
# name "firstSpark", and the Spark UI bound to port 4050.
spark = (
    SparkSession.builder
    .master("local")
    .appName("firstSpark")
    .config('spark.ui.port', '4050')
    .getOrCreate()
)

In [20]:
# Fetch the sample books dataset and store it as /tmp/sample_books.json (-O sets the output path).
!wget --continue https://raw.githubusercontent.com/GarvitArya/pyspark-demo/main/sample_books.json -O /tmp/sample_books.json

--2021-08-27 02:18:30--  https://raw.githubusercontent.com/GarvitArya/pyspark-demo/main/sample_books.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1565 (1.5K) [text/plain]
Saving to: ‘/tmp/sample_books.json’


2021-08-27 02:18:30 (19.5 MB/s) - ‘/tmp/sample_books.json’ saved [1565/1565]



In [24]:
# Check that the downloaded file is present in /tmp.
!ls /tmp | grep book

sample_books.json


In [25]:
# Load the JSON file into a DataFrame; no schema is passed, so the column types are inferred by Spark.
df = spark.read.json("/tmp/sample_books.json")

In [28]:
# Preview the first two rows.
df.show(2)

+------------+-------+-----+----------------+------------+
|      author|edition|price|           title|year_written|
+------------+-------+-----+----------------+------------+
|Austen, Jane|Penguin| 18.2|Northanger Abbey|        1814|
|Tolstoy, Leo|Penguin| 12.7|   War and Peace|        1865|
+------------+-------+-----+----------------+------------+
only showing top 2 rows



In [29]:
# Print each column's name, type, and nullability.
df.printSchema()

root
 |-- author: string (nullable = true)
 |-- edition: string (nullable = true)
 |-- price: double (nullable = true)
 |-- title: string (nullable = true)
 |-- year_written: long (nullable = true)



In [31]:
# Count the number of rows in the DataFrame.
df.count()

13

In [32]:
# Project three columns and display the first five rows.
df.select("title", "price", "year_written").show(5)

+----------------+-----+------------+
|           title|price|year_written|
+----------------+-----+------------+
|Northanger Abbey| 18.2|        1814|
|   War and Peace| 12.7|        1865|
|   Anna Karenina| 13.5|        1875|
|   Mrs. Dalloway| 25.0|        1925|
|       The Hours|12.35|        1999|
+----------------+-----+------------+
only showing top 5 rows



In [33]:
# Keep books written after 1950 that cost more than 10 and have a title,
# then print up to 50 matching rows without truncating long cell values.
df_filtered = df.where(
    "year_written > 1950 AND price > 10 AND title IS NOT NULL"
)
(
    df_filtered
    .select("title", "price", "year_written")
    .show(n=50, truncate=False)
)

+-----------------------------+-----+------------+
|title                        |price|year_written|
+-----------------------------+-----+------------+
|The Hours                    |12.35|1999        |
|Harry Potter                 |19.95|2000        |
|One Hundred Years of Solitude|14.0 |1967        |
+-----------------------------+-----+------------+



## Spark SQL

In [34]:
# Register the DataFrame as the temporary view 'books' so it can be queried through spark.sql.
df.createOrReplaceTempView('books')

In [36]:
# Select every column of the 'books' view and display the result.
spark.sql("select * from books").show()

+--------------------+-----------------+-----+--------------------+------------+
|              author|          edition|price|               title|year_written|
+--------------------+-----------------+-----+--------------------+------------+
|        Austen, Jane|          Penguin| 18.2|    Northanger Abbey|        1814|
|        Tolstoy, Leo|          Penguin| 12.7|       War and Peace|        1865|
|        Tolstoy, Leo|          Penguin| 13.5|       Anna Karenina|        1875|
|     Woolf, Virginia|   Harcourt Brace| 25.0|       Mrs. Dalloway|        1925|
|Cunnningham, Michael|   Harcourt Brace|12.35|           The Hours|        1999|
|         Twain, Mark|          Penguin| 5.76|    Huckleberry Finn|        1865|
|    Dickens, Charles|     Random House| 5.75|         Bleak House|        1870|
|         Twain, Mark|     Random House| 7.75|          Tom Sawyer|        1862|
|     Woolf, Virginia|          Penguin| 29.0| A Room of One's Own|        1922|
|       Rowling, J.K.|   Har

In [38]:
# List author, edition, and price for the books whose edition is 'Penguin'.
spark.sql("select author, edition, price from books where edition='Penguin'").show()

+---------------+-------+-----+
|         author|edition|price|
+---------------+-------+-----+
|   Austen, Jane|Penguin| 18.2|
|   Tolstoy, Leo|Penguin| 12.7|
|   Tolstoy, Leo|Penguin| 13.5|
|    Twain, Mark|Penguin| 5.76|
|Woolf, Virginia|Penguin| 29.0|
|  Tolkien, J.R.|Penguin|27.45|
+---------------+-------+-----+



In [39]:
# Count books per edition; the un-aliased aggregate is displayed under the column name count(1).
spark.sql("select edition, count(*) from books group by edition").show()

+-----------------+--------+
|          edition|count(1)|
+-----------------+--------+
| Signet  Classics|       1|
|Harper  Perennial|       1|
|     Random House|       2|
|   Harcourt Brace|       3|
|          Penguin|       6|
+-----------------+--------+

