# PySpark Practice

## Load libraries

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

# Obtain the SparkSession used by every cell below; getOrCreate() returns an
# existing session when one is available and builds a new one otherwise.
spark = (
    SparkSession.builder
    .getOrCreate()
)

## Create dataframe

In [None]:
# Load original.csv with the CSV reader, using the first line of the file as
# the header that supplies the column names. No schema is supplied or
# inferred here, which is why later cells cast columns explicitly.
mydata = (
    spark.read
    .format("csv")
    .option("header", "true")
    .load("original.csv")
)


In [None]:
# Print a preview of the freshly loaded DataFrame to check that it was read correctly.
mydata.show()

## Create a new city column where null entries are replaced with "unknown"

In [None]:
# Build a "clean_city" column: the placeholder "unknown" wherever City is
# null, and the original City value everywhere else.
city_is_missing = mydata["City"].isNull()
mydata2 = mydata.withColumn(
    "clean_city",
    when(city_is_missing, "unknown").otherwise(mydata["City"]),
)

In [None]:
# Print a preview to confirm the new clean_city column was added.
mydata2.show()

## Filter out all rows where the job title is null

In [None]:
# Keep only the rows that have a job title (where() is an alias of filter()).
has_job_title = mydata2["JobTitle"].isNotNull()
mydata2 = mydata2.where(has_job_title)

In [None]:
# Print a preview to confirm rows without a job title are gone.
mydata2.show()

## Change salary column from strings to float

In [None]:
# Salary is text with a leading dollar sign. substr() positions are 1-based,
# so starting at character position 2 skips the dollar sign; the length of
# 100 is just a generous upper bound that takes the rest of the string.
salary_without_symbol = mydata2["Salary"].substr(2, 100)
mydata2 = mydata2.withColumn("clean_salary", salary_without_symbol.cast("float"))

## Replace null salaries with the mean value

In [None]:
# groupBy() with no keys aggregates over the whole DataFrame, so the result
# has a single row whose only field is the average of clean_salary.
# NOTE(review): the name `mean` shadows the `mean` function brought in by the
# wildcard import from pyspark.sql.functions; later cells rely on this name.
average_row = mydata2.groupBy().avg("clean_salary").take(1)[0]
mean = average_row[0]
print(mean)

In [None]:
from pyspark.sql.functions import lit

# Where clean_salary is null, insert the mean as a literal value; otherwise
# keep the existing clean_salary. The result goes into a new column.
# This is safe to run whether or not any nulls are actually present, which
# matters if the data is going to be refreshed regularly.
salary_is_missing = mydata2["clean_salary"].isNull()
filled_salary = when(salary_is_missing, lit(mean)).otherwise(mydata2["clean_salary"])
mydata2 = mydata2.withColumn("new_salary", filled_salary)

In [None]:
# Print a preview to confirm new_salary has no nulls left.
mydata2.show()

## Remove nulls for Latitude

In [None]:
import numpy as np

# Work on a separate single-column DataFrame that holds only Latitude.
latitudes = mydata2.select(mydata2["Latitude"])

In [None]:
# Drop the rows with no latitude so they do not take part in the median.
latitude_present = latitudes["Latitude"].isNotNull()
latitudes = latitudes.where(latitude_present)

In [None]:
# Print a preview of the non-null latitude values.
latitudes.show()

## Convert Latitude from string to float

In [None]:
# Replace the string column with its float version, named "latitude2"; the
# resulting DataFrame contains that single column only.
latitudes = latitudes.select(latitudes["Latitude"].cast("float").alias("latitude2"))

## Replace Latitude nulls with the median

In [None]:
# collect() returns Row objects, so unwrap the single field of each row to get
# plain numbers. The string-to-float cast yields null for text that is not a
# number, and the null filter in this notebook runs before that cast, so any
# None values are skipped here to keep np.median from failing on them.
latitude_values = [row[0] for row in latitudes.collect() if row[0] is not None]

# Store a built-in float rather than a NumPy scalar so it can be passed
# straight to lit(). With no usable values the median is left as None, which
# lit() turns into a null instead of filling the column with NaN.
median = float(np.median(latitude_values)) if latitude_values else None

In [None]:
# Fill missing latitudes with the median. Latitude is still a string column in
# mydata2, so it is cast to float here: mixing the string column with the
# numeric median would leave the type of "lat" to Spark's implicit coercion
# (typically string) and lose the float conversion. Testing the cast value for
# null also fills entries whose text could not be parsed as a number.
latitude_as_float = mydata2["Latitude"].cast("float")
mydata2 = mydata2.withColumn(
    "lat",
    when(latitude_as_float.isNull(), lit(median)).otherwise(latitude_as_float),
)

In [None]:
# Print a preview to check the new lat column.
mydata2.show()