# Spark DataFrame
## -GroupBy and Aggregate Functions-

## Imports

In [1]:
import findspark

findspark.init('C:/spark')

from pyspark.sql import SparkSession

# --

In [None]:
# Build the SparkSession for this notebook (or reuse one that already
# exists), registered under the application name 'aggs'.
spark = (
    SparkSession.builder
    .appName('aggs')
    .getOrCreate()
)

In [None]:
# Load the sales CSV: the first row supplies the column names and Spark
# infers each column's type from the data.
df = spark.read.csv(
    '../data/sales_info.csv',
    header=True,
    inferSchema=True,
)

In [None]:
# Print the loaded rows as a text table to sanity-check the CSV import.
df.show()

In [None]:
# Print the column names and data types of the DataFrame.
df.printSchema()

In [None]:
# Register the DataFrame as a temporary view named 'sales' so it can be
# referenced by name from SQL queries run through spark.sql().
df.createOrReplaceTempView('sales')

### Group By

In [None]:
# groupBy on its own only returns a grouped-data object; an aggregation
# (mean, agg, ...) has to be applied to it to get a DataFrame back.
df.groupBy("Company")

In [None]:
# Python (DataFrame API): group the rows by company, average within each
# group, and print the result.
(
    df.groupBy("Company")
    .mean()
    .show()
)

In [None]:
# SQL: the same per-company average, queried from the 'sales' view.
company_avg_query = "SELECT Company, avg(Sales) FROM sales GROUP BY Company"
results = spark.sql(company_avg_query)
results.show()

### Aggregate

In [None]:
# Aggregate over the whole DataFrame (no grouping): sum of the Sales column.
# The dict maps a column name to the name of the aggregate function.
overall_sales_sum = df.agg({'Sales': 'sum'})
overall_sales_sum.show()

In [None]:
# Keep the grouped object around, then aggregate it: largest Sales value
# within each company.
group_data = df.groupBy("Company")
max_sales_per_company = group_data.agg({'Sales': 'max'})
max_sales_per_company.show()

### Functions

In [None]:
from pyspark.sql.functions import countDistinct, avg, stddev

In [None]:
# Count how many distinct values appear in the Sales column.
distinct_sales_count = countDistinct('Sales')
df.select(distinct_sales_count).show()

In [None]:
# Average of Sales, with the output column renamed to 'Average Sales'.
average_sales_column = avg('Sales').alias('Average Sales')
df.select(average_sales_column).show()

In [None]:
from pyspark.sql.functions import format_number

In [None]:
# Standard deviation of Sales, kept as a one-column DataFrame named 'STD'
# so it can be formatted in a later step.
std_column = stddev('Sales').alias('STD')
sales_std = df.select(std_column)

In [None]:
# Format the 'STD' value to 2 decimal places and keep the column name 'STD'.
formatted_std_column = format_number('STD', 2).alias('STD')
sales_std.select(formatted_std_column).show()

### Order By

In [None]:
# Ascending: passing just the column name sorts from smallest to largest.
sales_ascending = df.orderBy('Sales')
sales_ascending.show()

In [None]:
# Descending: build the sort key from the column object with .desc(),
# then order the DataFrame by it.
sales_descending_key = df['Sales'].desc()
df.orderBy(sales_descending_key).show()