# Complex operations with DataFrame in Spark SQL

### Connecting

In [None]:
from pyspark.sql import SparkSession

# Build (or reuse, via getOrCreate) a local session with Hive support enabled.
spark_session = (
    SparkSession.builder
    .appName("myApp")
    .master("local")
    .enableHiveSupport()
    .getOrCreate()
)

# Bare expression so the notebook renders the session summary.
spark_session

### Read data from .parquet files

In [None]:
# Load the Parquet data stored at /data/sample264 into a DataFrame.
df = spark_session.read.parquet("/data/sample264")

# Print the column names and types, then preview the first three rows.
df.printSchema()
df.show(n=3)

### Projection

In [None]:
# Select two columns, renaming `trackId` to `track` with an alias
df.select(
    df.userId,
    df.trackId.alias("track"),
).limit(3).toPandas()

### Filtering

In [None]:
# Filter in pandas style: index the DataFrame with a boolean column expression
(
    df[df.userId > 50000]
    .limit(3)
    .toPandas()
)

### Functions 

In [None]:
import pyspark.sql.functions as sparkf

# `sparkf.length` - length of a column value, exposed here as `timestamp_length`
df.select(
    df.userId,
    sparkf.length(df.timestamp).alias("timestamp_length"),
).limit(3).toPandas()

In [None]:
# `sparkf.concat` - concatenate two values (use `sparkf.lit(<str>)` to pass <str> as a string literal rather than a column name)

In [None]:
# `sparkf.split` - split a string into an array
# `sparkf.explode` - produce one row per element of an array column
(
    df.select(df.userId, sparkf.split(df.userId, "").alias("splittedUserId"))
    .select(df.userId, sparkf.explode("splittedUserId"))
    .limit(3)
    .toPandas()
)

In [None]:
# Conditional expression: `sparkf.when(<condition>, <value>).otherwise(<default>)`
# yields "111" where userId matches the LIKE pattern "13065" and "000" elsewhere
df.select(
    df.userId,
    sparkf.when(df.userId.like("13065"), "111").otherwise("000"),
).limit(3).toPandas()

### Aggregates

In [None]:
# Simple group by: count `trackId` values for each `userId`
(
    df.groupBy(df.userId)
    .agg(sparkf.count(df.trackId))
    .limit(3)
    .toPandas()
)

### Joins

In [None]:
# df.join(<df to join>, on=<column>, how=inner|outer|left|right|left_semi|left_anti)
# (Spark has no right_semi / right_anti join types; swap the DataFrames and use left_semi / left_anti instead)
# df.crossJoin(<df to join>)

### UDF

In [None]:
from pyspark.sql.types import IntegerType

# Wrap Python's built-in `len` as a Spark UDF that returns an integer column.
# NOTE(review): `len` is applied as-is; a NULL input presumably reaches it as
# None and would raise TypeError - confirm inputs are non-null before use.
len_udf = sparkf.udf(f=len, returnType=IntegerType())

### Window function

* does not change the number of rows
* adds an additional column

In [None]:
# from pyspark.sql.window import Window

# Window.partitionBy(<column>) - groups rows
#        .orderBy(<column>)