In [1]:
# Install the dependencies
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-3.0.1/spark-3.0.1-bin-hadoop3.2.tgz
!tar xf spark-3.0.1-bin-hadoop3.2.tgz
!pip install -q findspark

In [71]:
# Set the environment variables for running PySpark in the collaboration environmentimport os
import os
import pandas as pd
import numpy as np
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.0.1-bin-hadoop3.2"

In [106]:
# Run the local session to test the installation
import findspark
findspark.init('spark-3.0.1-bin-hadoop3.2')
from pyspark.sql import SparkSession
from pyspark.ml.feature import SQLTransformer
spark = SparkSession.builder.master('local[*]').getOrCreate()

In [29]:
df = (spark.read
          .format("csv")
          .option('header', 'true')
          .load("drive/MyDrive/db/iris_frame2.csv"))

In [30]:
df.describe().toPandas()

Unnamed: 0,summary,sepal_length,sepal_width,petal_length,petal_width,target
0,count,150.0,150.0,150.0,150.0,150.0
1,mean,5.843333333333335,3.057333333333334,3.7580000000000022,1.199333333333334,1.0
2,stddev,0.8280661279778637,0.4358662849366979,1.7652982332594662,0.7622376689603467,0.8192319205190406
3,min,4.3,2.0,1.0,0.1,0.0
4,max,7.9,4.4,6.9,2.5,2.0


In [31]:
df.select('target').distinct().toPandas()

Unnamed: 0,target
0,0
1,1
2,2


In [32]:
df.where('target="0"').drop('target').describe().toPandas()

Unnamed: 0,summary,sepal_length,sepal_width,petal_length,petal_width
0,count,50.0,50.0,50.0,50.0
1,mean,5.005999999999999,3.428000000000001,1.4620000000000002,0.2459999999999999
2,stddev,0.3524896872134513,0.3790643690962886,0.1736639964801841,0.1053855893800456
3,min,4.3,2.3,1.0,0.1
4,max,5.8,4.4,1.9,0.6


In [33]:
df.registerTempTable('df')

In [42]:
df_sql1 = spark.sql('''SELECT * FROM df LIMIT 5''').toPandas()
df_sql1

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [103]:
df_sql2= spark.sql('''
SELECT
  target,
  min(sepal_length),avg(sepal_length),max(sepal_length),
  min(sepal_width),avg(sepal_width),max(sepal_width),
  min(petal_length),avg(petal_length),max(petal_length),
  min(petal_width),avg(petal_width),max(petal_width)
FROM df
GROUP BY target
''').toPandas()
df_sql2

Unnamed: 0,target,min(sepal_length),avg(CAST(sepal_length AS DOUBLE)),max(sepal_length),min(sepal_width),avg(CAST(sepal_width AS DOUBLE)),max(sepal_width),min(petal_length),avg(CAST(petal_length AS DOUBLE)),max(petal_length),min(petal_width),avg(CAST(petal_width AS DOUBLE)),max(petal_width)
0,0,4.3,5.006,5.8,2.3,3.428,4.4,1.0,1.462,1.9,0.1,0.246,0.6
1,1,4.9,5.936,7.0,2.0,2.77,3.4,3.0,4.26,5.1,1.0,1.326,1.8
2,2,4.9,6.588,7.9,2.2,2.974,3.8,4.5,5.552,6.9,1.4,2.026,2.5


In [104]:
column_list = list(df_sql2.columns)[1:]
df_sql2 = pd.melt(df_sql2, id_vars=['target'], value_vars=column_list)
df_sql2.head(5)

Unnamed: 0,target,variable,value
0,0,min(sepal_length),4.3
1,1,min(sepal_length),4.9
2,2,min(sepal_length),4.9
3,0,avg(CAST(sepal_length AS DOUBLE)),5.006
4,1,avg(CAST(sepal_length AS DOUBLE)),5.936


In [105]:
df_sql2 = pd.pivot_table(df_sql2, values='value',index=['variable'],columns=['target'], aggfunc=np.sum)
df_sql2.head()

target,0,1,2
variable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
avg(CAST(petal_length AS DOUBLE)),1.462,4.26,5.552
avg(CAST(petal_width AS DOUBLE)),0.246,1.326,2.026
avg(CAST(sepal_length AS DOUBLE)),5.006,5.936,6.588
avg(CAST(sepal_width AS DOUBLE)),3.428,2.77,2.974
max(petal_length),1.9,5.1,6.9


In [107]:
df_sql3= '''
SELECT
  target,
  min(sepal_length),avg(sepal_length),max(sepal_length),
  min(sepal_width),avg(sepal_width),max(sepal_width),
  min(petal_length),avg(petal_length),max(petal_length),
  min(petal_width),avg(petal_width),max(petal_width)
FROM df
GROUP BY target
'''
sql_transformer = SQLTransformer(statement=df_sql3)
sql_transformer.transform(df).toPandas()

Unnamed: 0,target,min(sepal_length),avg(CAST(sepal_length AS DOUBLE)),max(sepal_length),min(sepal_width),avg(CAST(sepal_width AS DOUBLE)),max(sepal_width),min(petal_length),avg(CAST(petal_length AS DOUBLE)),max(petal_length),min(petal_width),avg(CAST(petal_width AS DOUBLE)),max(petal_width)
0,0,4.3,5.006,5.8,2.3,3.428,4.4,1.0,1.462,1.9,0.1,0.246,0.6
1,1,4.9,5.936,7.0,2.0,2.77,3.4,3.0,4.26,5.1,1.0,1.326,1.8
2,2,4.9,6.588,7.9,2.2,2.974,3.8,4.5,5.552,6.9,1.4,2.026,2.5
