# Setup dependencies
This notebook uses PySpark (located through findspark) to create, query, and transform a Spark DataFrame.
<details>
    <summary>pip install...</summary>

```bash
# Install a Python package
pip install package-name
# Or install a specific version of a Python package
pip install package-name==version
```
</details>


In [1]:
# Install PySpark version 3.1.2 silently
#!pip install pyspark==3.1.2 -q
# Install findSpark silently
!pip install findspark -q

In [2]:
import warnings


def warn(*args, **kwargs):
    """No-op stand-in for ``warnings.warn``: accepts any arguments and does nothing."""
    return None


# Silence warnings two ways: swap the module-level ``warnings.warn`` for the
# no-op above, and install a catch-all "ignore" filter for anything that still
# goes through the warnings filter machinery.
warnings.warn = warn
warnings.simplefilter("ignore")

# Create Spark Session

In [3]:
import findspark

# Locate the Spark installation; this runs before the pyspark imports below.
findspark.init()

from pyspark.sql import SparkSession
from pyspark import SparkContext
from datetime import datetime

# Build (or reuse) the SparkSession used by the rest of the notebook:
#   - application name "SparkSQL"
#   - default filesystem set to the local one (file:///)
#   - event logging enabled, written to /spark/tmp
# NOTE(review): /spark/tmp is a hard-coded absolute path; presumably it has to
# exist and be writable before the session starts - confirm.
spark = (
    SparkSession.builder
    .appName("SparkSQL")
    .config("spark.hadoop.fs.defaultFS", "file:///")
    .config("spark.eventLog.enabled", "true")
    .config("spark.eventLog.dir", "/spark/tmp")
    .getOrCreate()
)

# Creating a dataset
Create a small DataFrame in memory from a Python list of tuples (no file is read here).

In [4]:
# Sample rows as (student, height in inches, weight in pounds).
# The list includes repeated rows and one row whose student name is empty.
data = [
    ("student1", 64, 90),
    ("student2", 59, 100),
    ("student3", 69, 95),
    ("", 70, 110),
    ("student5", 60, 80),
    ("student3", 69, 95),
    ("student6", 62, 85),
    ("student7", 65, 80),
    ("student7", 65, 80),
]

# Turn the rows into a Spark DataFrame with named columns, then print it.
df = spark.createDataFrame(
    data,
    ["student", "height_inches", "weight_pounds"],
)
df.show()


+--------+-------------+-------------+
| student|height_inches|weight_pounds|
+--------+-------------+-------------+
|student1|           64|           90|
|student2|           59|          100|
|student3|           69|           95|
|        |           70|          110|
|student5|           60|           80|
|student3|           69|           95|
|student6|           62|           85|
|student7|           65|           80|
|student7|           65|           80|
+--------+-------------+-------------+



In [5]:
# Report how many partitions the DataFrame's underlying RDD currently has.
df.rdd.getNumPartitions()

8

In [6]:
# Repartition the DataFrame into exactly one partition and rebind df to the result.
df = df.repartition(1)

# Reading Parquet Dataset

In [None]:
# Print rows of the `retail` table/view using Spark SQL.
# NOTE(review): none of the visible cells loads a Parquet file or registers a
# table/view named `retail` (e.g. via createOrReplaceTempView); presumably that
# step is missing or lives elsewhere - confirm, otherwise this query cannot
# resolve the name on a fresh kernel.
spark.sql("SELECT * FROM retail").show()

## Task 5 - Analyze the dataset

In [None]:
# List the distinct values of the promo_type_1 column in `retail`.
spark.sql("SELECT distinct promo_type_1 FROM retail").show()

In [None]:
# Sum revenue per product_id; the aggregate is exposed under the alias total_price.
spark.sql("select product_id, sum(revenue) total_price from retail group by product_id").show()

In [None]:
from pyspark.sql.functions import expr

# Inches -> centimeters: add a height_centimeters column computed as
# height_inches * 2.54, then print the updated DataFrame.
df = df.withColumn(
    "height_centimeters",
    expr("height_inches * 2.54"),
)
df.show()

In [None]:
# Pounds -> kilograms: add a weight_kg column computed as
# weight_pounds * 0.453592, then print the updated DataFrame.
df = df.withColumn(
    "weight_kg",
    expr("weight_pounds * 0.453592"),
)
df.show()

In [None]:
# Remove the two imperial-unit source columns and print what remains.
df = df.drop(
    "height_inches",
    "weight_pounds",
)
df.show()

In [None]:
# Shorten the column name height_centimeters to height_cm and print the result.
df = df.withColumnRenamed(
    "height_centimeters",
    "height_cm",
)
df.show()

# Stop Spark Session

In [None]:
# Shut down the Spark session now that the notebook's work is finished.
spark.stop()