## Install java & spark

In [1]:
!apt-get install openjdk-17-jdk-headless -qq > /dev/null
!wget https://dlcdn.apache.org/spark/spark-3.5.4/spark-3.5.4-bin-hadoop3.tgz -O spark-3.5.4-bin-hadoop3.tgz
!tar xf spark-3.5.4-bin-hadoop3.tgz

--2025-01-26 15:53:41--  https://dlcdn.apache.org/spark/spark-3.5.4/spark-3.5.4-bin-hadoop3.tgz
Resolving dlcdn.apache.org (dlcdn.apache.org)... 151.101.2.132, 2a04:4e42::644
Connecting to dlcdn.apache.org (dlcdn.apache.org)|151.101.2.132|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 400879762 (382M) [application/x-gzip]
Saving to: ‘spark-3.5.4-bin-hadoop3.tgz’


2025-01-26 15:53:58 (205 MB/s) - ‘spark-3.5.4-bin-hadoop3.tgz’ saved [400879762/400879762]



## configure spark env

In [2]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-17-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.5.4-bin-hadoop3"

In [3]:
pip install -q pyspark

In [5]:
from pyspark.sql import SparkSession

spark = (SparkSession.builder
        .master("local")
        .appName("My Spark Application")
        .getOrCreate())
sc = spark.sparkContext

In [6]:
spark

## Docs

### Load

https://spark.apache.org/docs/3.5.4/sql-data-sources-csv.html



In [8]:
!mkdir -p var/
!echo "id;name;age" > var/data.csv
!echo "1;Alice;25" >> var/data.csv
!echo "2;Bob;30" >> var/data.csv

In [9]:
df = spark.read.csv("var/data.csv", header=True, sep=";")
df.show()

+---+-----+---+
| id| name|age|
+---+-----+---+
|  1|Alice| 25|
|  2|  Bob| 30|
+---+-----+---+



In [10]:
df.createOrReplaceTempView("people")

In [23]:
spark.sql('''
  select
    id,
    md5(id) id_hash,
    name,
    age,
    (100-age) years_to_100
  from people
''').show(truncate=False)

+---+--------------------------------+-----+---+------------+
|id |id_hash                         |name |age|years_to_100|
+---+--------------------------------+-----+---+------------+
|1  |c4ca4238a0b923820dcc509a6f75849b|Alice|25 |75.0        |
|2  |c81e728d9d4c2f636f067f89cc14862c|Bob  |30 |70.0        |
+---+--------------------------------+-----+---+------------+

