### Установка JVM

In [1]:
import os

In [2]:
%%capture
%%bash
# Install a Java runtime for Spark.
# -y pre-answers apt's confirmation prompts: a %%bash cell has no interactive
# terminal, so without it apt-get cannot be confirmed and aborts (and, because
# of %%capture, the failure would not even be visible).
sudo apt-get update
sudo apt-get upgrade -y
sudo apt-get install -y default-jre

### Константы и переменные ноутбука

In [3]:
# Root directory of the generated sbt project (used from Python cells).
PROJECT_FOLDER = "/content/TestProject"

# Export the locations as environment variables as well, so that the shells
# spawned by %%bash cells can refer to them as ${SPARK_HOME} / ${PROJECT_HOME}.
os.environ.update(
    {
        "SPARK_HOME": "/content/spark",
        "PROJECT_HOME": PROJECT_FOLDER,
    }
)

### Установка Apache Spark

In [4]:
%%bash
# Download the Spark 3.4.1 (Hadoop 3, Scala 2.13) distribution and unpack it
# into ${SPARK_HOME}.
# Stop at the first failing command so that a broken download does not cascade
# into confusing tar/mv errors.
set -euo pipefail

SPARK_DIST="spark-3.4.1-bin-hadoop3-scala2.13"

# Keep the cell re-runnable: if ${SPARK_HOME} already existed, mv would place
# the freshly extracted directory inside it instead of replacing it.
if [ -x "${SPARK_HOME}/bin/spark-submit" ]; then
    echo "Spark is already installed in ${SPARK_HOME}; skipping download."
    exit 0
fi

wget -q "https://archive.apache.org/dist/spark/spark-3.4.1/${SPARK_DIST}.tgz"
tar -xzf "./${SPARK_DIST}.tgz"
mv "./${SPARK_DIST}" "${SPARK_HOME}"
rm "./${SPARK_DIST}.tgz"

### Проверка версии Apache Spark и Scala

In [5]:
%%bash
# Print the version banner (Spark, Scala and JVM versions) of the installed distribution.
"${SPARK_HOME}"/bin/spark-shell --version

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /___/ .__/\_,_/_/ /_/\_\   version 3.4.1
      /_/
                        
Using Scala version 2.13.8, OpenJDK 64-Bit Server VM, 11.0.20.1
Branch HEAD
Compiled by user centos on 2023-06-19T22:21:01Z
Revision 6b1ff22dde1ead51cbf370be6e48a802daae58b6
Url https://github.com/apache/spark
Type --help for more information.


### Создание структуры папок/файлов проекта

In [6]:
%%bash
# Create the sbt project tree (src/{main,test}/{java,resources,scala}) plus the
# dataset, project and target folders.
# -p is used for every directory so the cell can be re-run: a plain mkdir fails
# as soon as the directories already exist.
mkdir -p "${PROJECT_HOME}"/src/{main,test}/{java,resources,scala} \
         "${PROJECT_HOME}"/{dataset,project,target}

In [7]:
%%bash
# Create empty placeholders for the build definition and the application source.
touch "${PROJECT_HOME}/build.sbt" \
      "${PROJECT_HOME}/src/main/scala/SimpleApp.scala"

### Заполнение файлов build.sbt и SimpleApp.scala

In [8]:
%%bash
# Write the sbt build definition. The heredoc delimiter is quoted because the
# file content contains nothing the shell should expand.
cat > "${PROJECT_HOME}/build.sbt" <<'SBT_EOF'
name := "Simple Project"

version := "1.0"

scalaVersion := "2.13.8"

val sparkVersion = "3.4.1"

libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-core" % sparkVersion,
  "org.apache.spark" %% "spark-sql" % sparkVersion
)
SBT_EOF

In [9]:
%%bash
# Generate src/main/scala/SimpleApp.scala.
# The heredoc delimiter is intentionally NOT quoted: the shell substitutes
# ${PROJECT_HOME} into the Scala source while writing the file. Keep any other
# dollar signs, backticks and backslashes out of the Scala code below.
# NOTE(review): no cell shown in this notebook creates dataset/data.csv; it has
# to be placed into ${PROJECT_HOME}/dataset before the job is submitted.
cat >${PROJECT_HOME}/src/main/scala/SimpleApp.scala <<EOL
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.types.{StructField, StructType}
import org.apache.spark.sql.types.{StringType,  IntegerType, DateType}
import org.apache.spark.sql.Column
import org.apache.spark.sql.functions.{udf, col, lit, round, when}
import org.apache.spark.sql.functions.{sum, count, avg, min, max}

// An explicit main() is used instead of "extends App": the Spark documentation
// warns that subclasses of scala.App may not work correctly.
object SimpleApp {

  def main(args: Array[String]): Unit = {

    // Dataset to analyse (absolute path baked in when this file is generated).
    val pathFile = "${PROJECT_HOME}/dataset/data.csv"

    val spark = SparkSession.builder
      .master("local[*]")
      .appName("Simple Application")
      .getOrCreate()

    // try/finally guarantees that the session is stopped even if the job fails.
    try {
      // Explicit schema of the CSV file; every column is nullable.
      val schemaDfSalary = new StructType()
        .add("employee", StringType, true)
        .add("age", IntegerType, true)
        .add("department", StringType, true)
        .add("state", StringType, true)
        .add("salary", IntegerType, true)
        .add("bonus", IntegerType, true)

      val dfSalary = spark.read.format("csv")
        .option("header", "true")
        .options(Map("delimiter"->","))
        .schema(schemaDfSalary)
        .load(pathFile).cache()

      // Static lookup table: state code -> cluster.
      val dataCluster = Seq(("GA","cluster1"),
                            ("AZ","cluster1"),
                            ("FL","cluster1"),
                            ("CA","cluster2"),
                            ("NY","cluster2"),
                            ("TX","cluster2"))

      val columnsCluster = Seq("state_code","cluster")

      val dfCluster = spark.createDataFrame(dataCluster).toDF(columnsCluster:_*)

      // Left join keeps every salary row; rows whose state is missing from the
      // lookup table end up with a null cluster.
      val dfSalaryJoin = dfSalary.join(dfCluster,
                                       dfSalary("state") ===  dfCluster("state_code"),
                                       "left")

      val dfSalarySelect = dfSalaryJoin.select(col("cluster"),
                                               col("department"),
                                               col("salary"),
                                               col("bonus"))

      // total_salary = salary + bonus, where bonus is a percentage of salary.
      // The final cast to Integer drops the fractional part.
      val dfSalaryTotal = dfSalarySelect.withColumn("total_salary",
        round(dfSalarySelect("salary") + dfSalarySelect("salary")/100*dfSalarySelect("bonus"),2).cast("Integer"))

      val dfSalaryGroup = dfSalaryTotal.groupBy("cluster","department")
                                       .agg(sum("total_salary").as("sum_salary"),
                                            avg("total_salary").as("avg_salary"),
                                            min("total_salary").as("min_salary"),
                                            max("total_salary").as("max_salary"))

      // Keep only Finance groups whose summed total salary is at least 100000.
      val dfSalaryGroupFilter = dfSalaryGroup
        .where(dfSalaryGroup("department")==="Finance" && dfSalaryGroup("sum_salary") >= 100000)

      println("*****")
      // show() prints the table itself and returns Unit; wrapping it in println
      // would additionally print "()".
      dfSalaryGroupFilter.show(false)
      println("*****")
    } finally {
      spark.stop()
    }
  }
}

EOL

### Установка SBT

In [10]:
%%capture
%%bash
# Register the sbt apt repositories and their signing key, then install sbt.
echo "deb https://repo.scala-sbt.org/scalasbt/debian all main" | sudo tee /etc/apt/sources.list.d/sbt.list
echo "deb https://repo.scala-sbt.org/scalasbt/debian /" | sudo tee /etc/apt/sources.list.d/sbt_old.list
curl -sL "https://keyserver.ubuntu.com/pks/lookup?op=get&search=0x2EE0EA64E40A89B84B2DF73499E82A75642AC823" | sudo apt-key add
sudo apt-get update
# -y: the cell is non-interactive, so apt's confirmation prompt could never be
# answered and the install would abort (silently, because of %%capture).
sudo apt-get install -y sbt

### Смена рабочей папки

In [11]:
# Make the project root the kernel's working directory, so that commands in
# later cells resolve relative paths against the project.
os.chdir(PROJECT_FOLDER)

### Сборка

In [None]:
%%bash
# Build the application jar with sbt.
# Change into the project root explicitly so the build does not depend on the
# kernel's current working directory having been set by an earlier cell.
set -e
cd "${PROJECT_HOME}"
sbt package

### Вывод результата

In [None]:
%%bash
# Submit the packaged application to a local Spark master.
# "local[*]" is quoted so the shell can never treat it as a glob pattern and
# replace it with a matching file name; the paths are quoted for the same
# defensive reason.
"${SPARK_HOME}/bin/spark-submit" \
    --class "SimpleApp" \
    --master "local[*]" \
    "${PROJECT_HOME}/target/scala-2.13/simple-project_2.13-1.0.jar"