### Setup

In [1]:
!apt-get update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://archive.apache.org/dist/spark/spark-2.3.1/spark-2.3.1-bin-hadoop2.7.tgz
!tar xf spark-2.3.1-bin-hadoop2.7.tgz
!pip install -q findspark

import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.3.1-bin-hadoop2.7"

!ls

import findspark
findspark.init()

import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate() 
spark

Hit:1 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease
Ign:2 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Hit:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Hit:4 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Hit:5 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease
Get:6 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
Hit:7 http://archive.ubuntu.com/ubuntu bionic InRelease
Hit:8 http://ppa.launchpad.net/cran/libgit2/ubuntu bionic InRelease
Get:10 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]
Hit:11 http://ppa.launchpad.net/deadsnakes/ppa/ubuntu bionic InRelease
Get:12 http://archive.ubuntu.com/ubuntu bionic-backports InRelease [74.6 kB]
Hit:13 http://ppa.launchpad.net/graphics-drivers/ppa/ubuntu bionic InRelease
Fetched 252 kB in 3s (86.6 kB/s)
Reading package l

### Loading Data 

In [2]:
df = spark.read.format('csv').option('header', 'true').load('/content/drive/MyDrive/original.csv')
df.show()

+---+----------+----------+------+---------------+--------------------+---------+----------+-----------+
| id|first_name| last_name|gender|           City|            JobTitle|   Salary|  Latitude|  Longitude|
+---+----------+----------+------+---------------+--------------------+---------+----------+-----------+
|  1|   Melinde| Shilburne|Female|      Nowa Ruda| Assistant Professor|$57438.18|50.5774075| 16.4967184|
|  2|  Kimberly|Von Welden|Female|         Bulgan|       Programmer II|$62846.60|48.8231572|103.5218199|
|  3|    Alvera|  Di Boldi|Female|           null|                null|$57576.52|39.9947462|116.3397725|
|  4|   Shannon| O'Griffin|  Male|  Divnomorskoye|Budget/Accounting...|$61489.23|44.5047212| 38.1300171|
|  5|  Sherwood|   Macieja|  Male|      Mytishchi|            VP Sales|$63863.09|      null| 37.6489954|
|  6|     Maris|      Folk|Female|Kinsealy-Drinan|      Civil Engineer|$30101.16|53.4266145| -6.1644997|
|  7|     Masha|    Divers|Female|         Dachun|     

In [3]:
df = spark.read.csv('/content/drive/MyDrive/original.csv', header=True)
df.show()

+---+----------+----------+------+---------------+--------------------+---------+----------+-----------+
| id|first_name| last_name|gender|           City|            JobTitle|   Salary|  Latitude|  Longitude|
+---+----------+----------+------+---------------+--------------------+---------+----------+-----------+
|  1|   Melinde| Shilburne|Female|      Nowa Ruda| Assistant Professor|$57438.18|50.5774075| 16.4967184|
|  2|  Kimberly|Von Welden|Female|         Bulgan|       Programmer II|$62846.60|48.8231572|103.5218199|
|  3|    Alvera|  Di Boldi|Female|           null|                null|$57576.52|39.9947462|116.3397725|
|  4|   Shannon| O'Griffin|  Male|  Divnomorskoye|Budget/Accounting...|$61489.23|44.5047212| 38.1300171|
|  5|  Sherwood|   Macieja|  Male|      Mytishchi|            VP Sales|$63863.09|      null| 37.6489954|
|  6|     Maris|      Folk|Female|Kinsealy-Drinan|      Civil Engineer|$30101.16|53.4266145| -6.1644997|
|  7|     Masha|    Divers|Female|         Dachun|     

In [4]:
df.dtypes

[('id', 'string'),
 ('first_name', 'string'),
 ('last_name', 'string'),
 ('gender', 'string'),
 ('City', 'string'),
 ('JobTitle', 'string'),
 ('Salary', 'string'),
 ('Latitude', 'string'),
 ('Longitude', 'string')]

In [5]:
from pyspark.sql.types import *
schema = StructType([
  StructField('id', IntegerType()),
  StructField('first_name', StringType()),
  StructField('last_name', StringType()),
  StructField('gender', StringType()),
  StructField('city',  StringType()),
  StructField('job_title', StringType()),
  StructField('Salary', StringType()),
  StructField('latitude', FloatType()),
  StructField('longitude', FloatType())
])

df = spark.read.csv('/content/drive/MyDrive/original.csv', header=True,schema = schema)
df.show()

+---+----------+----------+------+---------------+--------------------+---------+----------+----------+
| id|first_name| last_name|gender|           city|           job_title|   Salary|  latitude| longitude|
+---+----------+----------+------+---------------+--------------------+---------+----------+----------+
|  1|   Melinde| Shilburne|Female|      Nowa Ruda| Assistant Professor|$57438.18| 50.577408| 16.496717|
|  2|  Kimberly|Von Welden|Female|         Bulgan|       Programmer II|$62846.60|  48.82316| 103.52182|
|  3|    Alvera|  Di Boldi|Female|           null|                null|$57576.52| 39.994747|116.339775|
|  4|   Shannon| O'Griffin|  Male|  Divnomorskoye|Budget/Accounting...|$61489.23| 44.504723| 38.130016|
|  5|  Sherwood|   Macieja|  Male|      Mytishchi|            VP Sales|$63863.09|      null| 37.648994|
|  6|     Maris|      Folk|Female|Kinsealy-Drinan|      Civil Engineer|$30101.16| 53.426613|-6.1644998|
|  7|     Masha|    Divers|Female|         Dachun|              

### Inspecting Data

In [6]:
df.dtypes

[('id', 'int'),
 ('first_name', 'string'),
 ('last_name', 'string'),
 ('gender', 'string'),
 ('city', 'string'),
 ('job_title', 'string'),
 ('Salary', 'string'),
 ('latitude', 'float'),
 ('longitude', 'float')]

In [7]:
df.head(6)

[Row(id=1, first_name='Melinde', last_name='Shilburne', gender='Female', city='Nowa Ruda', job_title='Assistant Professor', Salary='$57438.18', latitude=50.57740783691406, longitude=16.49671745300293),
 Row(id=2, first_name='Kimberly', last_name='Von Welden', gender='Female', city='Bulgan', job_title='Programmer II', Salary='$62846.60', latitude=48.823158264160156, longitude=103.52182006835938),
 Row(id=3, first_name='Alvera', last_name='Di Boldi', gender='Female', city=None, job_title=None, Salary='$57576.52', latitude=39.994747161865234, longitude=116.33977508544922),
 Row(id=4, first_name='Shannon', last_name="O'Griffin", gender='Male', city='Divnomorskoye', job_title='Budget/Accounting Analyst II', Salary='$61489.23', latitude=44.504722595214844, longitude=38.1300163269043),
 Row(id=5, first_name='Sherwood', last_name='Macieja', gender='Male', city='Mytishchi', job_title='VP Sales', Salary='$63863.09', latitude=None, longitude=37.64899444580078),
 Row(id=6, first_name='Maris', last

In [8]:
df.first()

Row(id=1, first_name='Melinde', last_name='Shilburne', gender='Female', city='Nowa Ruda', job_title='Assistant Professor', Salary='$57438.18', latitude=50.57740783691406, longitude=16.49671745300293)

In [9]:
df.describe().show()

+-------+-----------------+----------+---------+------+-------------------+-------------------+---------+------------------+-----------------+
|summary|               id|first_name|last_name|gender|               city|          job_title|   Salary|          latitude|        longitude|
+-------+-----------------+----------+---------+------+-------------------+-------------------+---------+------------------+-----------------+
|  count|             1000|      1000|     1000|  1000|                999|                998|     1000|               999|             1000|
|   mean|            500.5|      null|     null|  null|               null|               null|     null| 25.43151724702484|43.33756460386515|
| stddev|288.8194360957494|      null|     null|  null|               null|               null|     null|24.579082550156635| 69.4206453674681|
|    min|                1|   Abagail|    Abbay|Female|             Abéché|Account Coordinator|$10101.92|         -54.28115|       -123.04196|

In [10]:
df.columns

['id',
 'first_name',
 'last_name',
 'gender',
 'city',
 'job_title',
 'Salary',
 'latitude',
 'longitude']

In [11]:
df.count()

1000

In [12]:
df.distinct().count()

1000

###  Handling Null and Duplicate Values

In [13]:
df_dropped = df.na.drop()
df_dropped.show()

+---+----------+----------+------+---------------+--------------------+---------+----------+----------+
| id|first_name| last_name|gender|           city|           job_title|   Salary|  latitude| longitude|
+---+----------+----------+------+---------------+--------------------+---------+----------+----------+
|  1|   Melinde| Shilburne|Female|      Nowa Ruda| Assistant Professor|$57438.18| 50.577408| 16.496717|
|  2|  Kimberly|Von Welden|Female|         Bulgan|       Programmer II|$62846.60|  48.82316| 103.52182|
|  4|   Shannon| O'Griffin|  Male|  Divnomorskoye|Budget/Accounting...|$61489.23| 44.504723| 38.130016|
|  6|     Maris|      Folk|Female|Kinsealy-Drinan|      Civil Engineer|$30101.16| 53.426613|-6.1644998|
|  8|   Goddart|     Flear|  Male|      Trélissac|Desktop Support T...|$46116.36| 45.190517| 0.7423124|
|  9|      Roth|O'Cannavan|  Male|         Heitan|VP Product Manage...|$73697.10| 32.027935| 106.65711|
| 10|      Bran|   Trahear|  Male|       Arbeláez|Mechanical Sys

In [14]:
df_null_jobs = df.filter(df.job_title.isNotNull())
df_null_jobs.show()

+---+----------+----------+------+---------------+--------------------+---------+----------+----------+
| id|first_name| last_name|gender|           city|           job_title|   Salary|  latitude| longitude|
+---+----------+----------+------+---------------+--------------------+---------+----------+----------+
|  1|   Melinde| Shilburne|Female|      Nowa Ruda| Assistant Professor|$57438.18| 50.577408| 16.496717|
|  2|  Kimberly|Von Welden|Female|         Bulgan|       Programmer II|$62846.60|  48.82316| 103.52182|
|  4|   Shannon| O'Griffin|  Male|  Divnomorskoye|Budget/Accounting...|$61489.23| 44.504723| 38.130016|
|  5|  Sherwood|   Macieja|  Male|      Mytishchi|            VP Sales|$63863.09|      null| 37.648994|
|  6|     Maris|      Folk|Female|Kinsealy-Drinan|      Civil Engineer|$30101.16| 53.426613|-6.1644998|
|  8|   Goddart|     Flear|  Male|      Trélissac|Desktop Support T...|$46116.36| 45.190517| 0.7423124|
|  9|      Roth|O'Cannavan|  Male|         Heitan|VP Product Man

In [15]:
from pyspark.sql.functions import *
df_handled = df.withColumn('clean_city', when(df.city.isNull(), 'Unknow').otherwise(df.city))
df_handled.show()

+---+----------+----------+------+---------------+--------------------+---------+----------+----------+---------------+
| id|first_name| last_name|gender|           city|           job_title|   Salary|  latitude| longitude|     clean_city|
+---+----------+----------+------+---------------+--------------------+---------+----------+----------+---------------+
|  1|   Melinde| Shilburne|Female|      Nowa Ruda| Assistant Professor|$57438.18| 50.577408| 16.496717|      Nowa Ruda|
|  2|  Kimberly|Von Welden|Female|         Bulgan|       Programmer II|$62846.60|  48.82316| 103.52182|         Bulgan|
|  3|    Alvera|  Di Boldi|Female|           null|                null|$57576.52| 39.994747|116.339775|         Unknow|
|  4|   Shannon| O'Griffin|  Male|  Divnomorskoye|Budget/Accounting...|$61489.23| 44.504723| 38.130016|  Divnomorskoye|
|  5|  Sherwood|   Macieja|  Male|      Mytishchi|            VP Sales|$63863.09|      null| 37.648994|      Mytishchi|
|  6|     Maris|      Folk|Female|Kinsea

In [16]:
df_no_duplicates = df.dropDuplicates()
df_no_duplicates.show()

+---+----------+-------------+------+------------------+--------------------+---------+----------+----------+
| id|first_name|    last_name|gender|              city|           job_title|   Salary|  latitude| longitude|
+---+----------+-------------+------+------------------+--------------------+---------+----------+----------+
| 16|    Norbie|       Gwyllt|  Male|            Xijiao|              Editor|$32492.73| 43.494576|  5.897802|
|133|     Manya|      Westall|Female|           Macarse|  Help Desk Operator|$68709.02|  15.42495| 120.77476|
|208|  Maurizio|   Raddenbury|  Male|            Jiyang|   Assistant Manager|$41273.88|  36.97854| 117.17352|
|383|   Romonda|      Kellert|Female|         Cimongkor|     Health Coach II|$90053.02|-6.3624935|106.021194|
|555|    Blondy|         Tsar|Female|       Tambulatana|   Financial Analyst|$26980.55|   -9.5128|  120.1979|
|569|     Valma|      Bratton|Female|         Kurayoshi|    Web Developer II|$32665.89| 35.449905| 133.76134|
|901|     

### Selecting & Filtering

In [17]:
df_select = df.select('first_name', 'last_name')
df_select.show()

+----------+----------+
|first_name| last_name|
+----------+----------+
|   Melinde| Shilburne|
|  Kimberly|Von Welden|
|    Alvera|  Di Boldi|
|   Shannon| O'Griffin|
|  Sherwood|   Macieja|
|     Maris|      Folk|
|     Masha|    Divers|
|   Goddart|     Flear|
|      Roth|O'Cannavan|
|      Bran|   Trahear|
|    Kylynn|   Lockart|
|       Rey|    Meharg|
|      Kerr|    Braden|
|    Mickie| Whanstall|
|    Kaspar|     Pally|
|    Norbie|    Gwyllt|
|    Claude|    Briant|
|     Thain|    Habbon|
|  Tiffanie|  Pattison|
|    Ettore|  Gerriets|
+----------+----------+
only showing top 20 rows



In [18]:
df_renamed = df.withColumnRenamed('first_name', 'fn')
df_renamed.show()

+---+--------+----------+------+---------------+--------------------+---------+----------+----------+
| id|      fn| last_name|gender|           city|           job_title|   Salary|  latitude| longitude|
+---+--------+----------+------+---------------+--------------------+---------+----------+----------+
|  1| Melinde| Shilburne|Female|      Nowa Ruda| Assistant Professor|$57438.18| 50.577408| 16.496717|
|  2|Kimberly|Von Welden|Female|         Bulgan|       Programmer II|$62846.60|  48.82316| 103.52182|
|  3|  Alvera|  Di Boldi|Female|           null|                null|$57576.52| 39.994747|116.339775|
|  4| Shannon| O'Griffin|  Male|  Divnomorskoye|Budget/Accounting...|$61489.23| 44.504723| 38.130016|
|  5|Sherwood|   Macieja|  Male|      Mytishchi|            VP Sales|$63863.09|      null| 37.648994|
|  6|   Maris|      Folk|Female|Kinsealy-Drinan|      Civil Engineer|$30101.16| 53.426613|-6.1644998|
|  7|   Masha|    Divers|Female|         Dachun|                null|$25090.87| 24

In [19]:
df_filter = df.filter((df.first_name == 'Alvera'))
df_filter.show()

+---+----------+---------+------+----+---------+---------+---------+----------+
| id|first_name|last_name|gender|city|job_title|   Salary| latitude| longitude|
+---+----------+---------+------+----+---------+---------+---------+----------+
|  3|    Alvera| Di Boldi|Female|null|     null|$57576.52|39.994747|116.339775|
+---+----------+---------+------+----+---------+---------+---------+----------+



In [20]:
df_filter = df.filter((df.first_name.like('%ndr%')))
df_filter.show()

+---+----------+-----------+------+--------------+--------------------+---------+---------+----------+
| id|first_name|  last_name|gender|          city|           job_title|   Salary| latitude| longitude|
+---+----------+-----------+------+--------------+--------------------+---------+---------+----------+
| 26|   Leandra|     Anfrey|Female|       Isfahan|VP Product Manage...|$30201.32| 32.65463| 51.667984|
|112|    Andros|    De Metz|  Male|       Cuogang|   Director of Sales|$11832.43|50.058876| 23.973646|
|132|    Andrus|      Shinn|  Male|       Siguiri|       Senior Editor|$43934.36|11.414811|  -9.17883|
|429|Alessandro|    Shearer|  Male|Banjar Kelodan|            VP Sales|$51979.53| -8.12334|115.338936|
|458|    Sandra|     Hegley|Female|         Maraã|       Social Worker|$57631.19|-2.301947|-65.000854|
|465|Cassaundra|    Chismon|Female|     København|   Chemical Engineer|$37744.72|55.678204| 12.572069|
|564|     Andra|Ambrozewicz|Female|      Zhangcun|   Chemical Engineer|$4

In [21]:
df_filter = df.filter((df.first_name.endswith('din')))
df_filter.show()

+---+----------+-------------+------+-----------+---------+---------+----------+---------+
| id|first_name|    last_name|gender|       city|job_title|   Salary|  latitude|longitude|
+---+----------+-------------+------+-----------+---------+---------+----------+---------+
|901|     Aldin|Matuszkiewicz|  Male|East London| Operator|$41468.83|-32.954933|27.931913|
+---+----------+-------------+------+-----------+---------+---------+----------+---------+



In [22]:
df_filter = df.filter((df.first_name.startswith('Alv')))
df_filter.show()

+---+----------+---------+------+----------+--------------------+---------+---------+----------+
| id|first_name|last_name|gender|      city|           job_title|   Salary| latitude| longitude|
+---+----------+---------+------+----------+--------------------+---------+---------+----------+
|  3|    Alvera| Di Boldi|Female|      null|                null|$57576.52|39.994747|116.339775|
| 81|     Alvin|    Doman|  Male|      Niny|Research Assistant I|$53258.86|44.486843| 43.940807|
|775|   Alverta| MacNulty|Female|Megalópoli| Geological Engineer|$17299.62|37.401245| 22.136488|
+---+----------+---------+------+----------+--------------------+---------+---------+----------+



In [23]:
df_filter = df.filter(df.id.between(1,5))
df_filter.show()

+---+----------+----------+------+-------------+--------------------+---------+---------+----------+
| id|first_name| last_name|gender|         city|           job_title|   Salary| latitude| longitude|
+---+----------+----------+------+-------------+--------------------+---------+---------+----------+
|  1|   Melinde| Shilburne|Female|    Nowa Ruda| Assistant Professor|$57438.18|50.577408| 16.496717|
|  2|  Kimberly|Von Welden|Female|       Bulgan|       Programmer II|$62846.60| 48.82316| 103.52182|
|  3|    Alvera|  Di Boldi|Female|         null|                null|$57576.52|39.994747|116.339775|
|  4|   Shannon| O'Griffin|  Male|Divnomorskoye|Budget/Accounting...|$61489.23|44.504723| 38.130016|
|  5|  Sherwood|   Macieja|  Male|    Mytishchi|            VP Sales|$63863.09|     null| 37.648994|
+---+----------+----------+------+-------------+--------------------+---------+---------+----------+



In [24]:
df_filter = df.filter(df.first_name.isin('Aldin', 'Valma'))
df_filter.show()

+---+----------+-------------+------+-----------+----------------+---------+----------+---------+
| id|first_name|    last_name|gender|       city|       job_title|   Salary|  latitude|longitude|
+---+----------+-------------+------+-----------+----------------+---------+----------+---------+
|569|     Valma|      Bratton|Female|  Kurayoshi|Web Developer II|$32665.89| 35.449905|133.76134|
|901|     Aldin|Matuszkiewicz|  Male|East London|        Operator|$41468.83|-32.954933|27.931913|
+---+----------+-------------+------+-----------+----------------+---------+----------+---------+



In [25]:
df_substring = df.select(df.first_name, df.first_name.substr(1,5).alias('name'))
df_substring.show()

+----------+-----+
|first_name| name|
+----------+-----+
|   Melinde|Melin|
|  Kimberly|Kimbe|
|    Alvera|Alver|
|   Shannon|Shann|
|  Sherwood|Sherw|
|     Maris|Maris|
|     Masha|Masha|
|   Goddart|Godda|
|      Roth| Roth|
|      Bran| Bran|
|    Kylynn|Kylyn|
|       Rey|  Rey|
|      Kerr| Kerr|
|    Mickie|Micki|
|    Kaspar|Kaspa|
|    Norbie|Norbi|
|    Claude|Claud|
|     Thain|Thain|
|  Tiffanie|Tiffa|
|    Ettore|Ettor|
+----------+-----+
only showing top 20 rows



### Multiple Filters

In [26]:
# df_filter = df.filter((df.first_name.isin('Aldin','Valma')) & (df.city.like('%ondon')))
df_filter = df.filter((df.first_name.isin('Aldin','Valma')) | (df.city.like('%Caxias')))
df_filter.show()

+---+----------+-------------+------+-----------+----------------+---------+----------+---------+
| id|first_name|    last_name|gender|       city|       job_title|   Salary|  latitude|longitude|
+---+----------+-------------+------+-----------+----------------+---------+----------+---------+
| 37|     Nicko|        Frays|  Male|     Caxias|  Health Coach I|$99786.40|-4.8654137|  -43.362|
|569|     Valma|      Bratton|Female|  Kurayoshi|Web Developer II|$32665.89| 35.449905|133.76134|
|901|     Aldin|Matuszkiewicz|  Male|East London|        Operator|$41468.83|-32.954933|27.931913|
+---+----------+-------------+------+-----------+----------------+---------+----------+---------+



In [27]:
# df_filter = df.filter((df.id > 10) & (df.id < 18))
df_filter = df.filter((df.id > 10) & (df.id <= 18))
df_filter.show()

+---+----------+---------+------+--------------+--------------------+---------+---------+----------+
| id|first_name|last_name|gender|          city|           job_title|   Salary| latitude| longitude|
+---+----------+---------+------+--------------+--------------------+---------+---------+----------+
| 11|    Kylynn|  Lockart|Female|      El Cardo|Nuclear Power Eng...|$13604.63|    -5.85| -79.88333|
| 12|       Rey|   Meharg|Female|   Wangqingtuo|Systems Administr...|$73423.70| 39.17238| 116.93161|
| 13|      Kerr|   Braden|  Male|     Sułkowice|Compensation Analyst|$33432.99| 49.81518| 19.377174|
| 14|    Mickie|Whanstall|  Male|   Springfield|Assistant Media P...|$50838.53| 42.10148|-72.576675|
| 15|    Kaspar|    Pally|  Male|        Chrást|  Analyst Programmer|$40163.03| 49.79233| 13.491532|
| 16|    Norbie|   Gwyllt|  Male|        Xijiao|              Editor|$32492.73|43.494576|  5.897802|
| 17|    Claude|   Briant|Female|     Mieścisko|Research Assistan...|$51862.48|52.744167| 1

### SQL on Dataframes

In [28]:
df.registerTempTable('original')
query1 = spark.sql("""
  SELECT *
  FROM original
""")
query1.show()

+---+----------+----------+------+---------------+--------------------+---------+----------+----------+
| id|first_name| last_name|gender|           city|           job_title|   Salary|  latitude| longitude|
+---+----------+----------+------+---------------+--------------------+---------+----------+----------+
|  1|   Melinde| Shilburne|Female|      Nowa Ruda| Assistant Professor|$57438.18| 50.577408| 16.496717|
|  2|  Kimberly|Von Welden|Female|         Bulgan|       Programmer II|$62846.60|  48.82316| 103.52182|
|  3|    Alvera|  Di Boldi|Female|           null|                null|$57576.52| 39.994747|116.339775|
|  4|   Shannon| O'Griffin|  Male|  Divnomorskoye|Budget/Accounting...|$61489.23| 44.504723| 38.130016|
|  5|  Sherwood|   Macieja|  Male|      Mytishchi|            VP Sales|$63863.09|      null| 37.648994|
|  6|     Maris|      Folk|Female|Kinsealy-Drinan|      Civil Engineer|$30101.16| 53.426613|-6.1644998|
|  7|     Masha|    Divers|Female|         Dachun|              

In [29]:
query2 = spark.sql("""
  SELECT 
    id
    , CONCAT(first_name," ",last_name) as full_name
    --, city -- commented line
  FROM original
  WHERE gender = 'Female'
""")
query2.show()

+---+-------------------+
| id|          full_name|
+---+-------------------+
|  1|  Melinde Shilburne|
|  2|Kimberly Von Welden|
|  3|    Alvera Di Boldi|
|  6|         Maris Folk|
|  7|       Masha Divers|
| 11|     Kylynn Lockart|
| 12|         Rey Meharg|
| 17|      Claude Briant|
| 19|  Tiffanie Pattison|
| 23|    Lurleen Janczak|
| 24|      Nichol Holtum|
| 25|       Shaun Bridle|
| 26|     Leandra Anfrey|
| 28|    Jaquelyn Hazard|
| 29|  Prudence Honacker|
| 30|       Cherey Liger|
| 31|          Neda Krop|
| 34|    Barbi Fattorini|
| 38|   Lonnie Townshend|
| 39|    Valida Salzberg|
+---+-------------------+
only showing top 20 rows



### Adding Calculated Columns

In [30]:
from pyspark.sql.functions import *
df_clean_salary = df.withColumn('clean_salary', df.Salary.substr(2,100).cast('float'))
df_clean_salary.show()

+---+----------+----------+------+---------------+--------------------+---------+----------+----------+------------+
| id|first_name| last_name|gender|           city|           job_title|   Salary|  latitude| longitude|clean_salary|
+---+----------+----------+------+---------------+--------------------+---------+----------+----------+------------+
|  1|   Melinde| Shilburne|Female|      Nowa Ruda| Assistant Professor|$57438.18| 50.577408| 16.496717|    57438.18|
|  2|  Kimberly|Von Welden|Female|         Bulgan|       Programmer II|$62846.60|  48.82316| 103.52182|     62846.6|
|  3|    Alvera|  Di Boldi|Female|           null|                null|$57576.52| 39.994747|116.339775|    57576.52|
|  4|   Shannon| O'Griffin|  Male|  Divnomorskoye|Budget/Accounting...|$61489.23| 44.504723| 38.130016|    61489.23|
|  5|  Sherwood|   Macieja|  Male|      Mytishchi|            VP Sales|$63863.09|      null| 37.648994|    63863.09|
|  6|     Maris|      Folk|Female|Kinsealy-Drinan|      Civil En

In [32]:
df_monthly_salary = df_clean_salary.withColumn('monthly_salary', df_clean_salary.clean_salary/12)
df_monthly_salary.show()

+---+----------+----------+------+---------------+--------------------+---------+----------+----------+------------+------------------+
| id|first_name| last_name|gender|           city|           job_title|   Salary|  latitude| longitude|clean_salary|    monthly_salary|
+---+----------+----------+------+---------------+--------------------+---------+----------+----------+------------+------------------+
|  1|   Melinde| Shilburne|Female|      Nowa Ruda| Assistant Professor|$57438.18| 50.577408| 16.496717|    57438.18| 4786.514973958333|
|  2|  Kimberly|Von Welden|Female|         Bulgan|       Programmer II|$62846.60|  48.82316| 103.52182|     62846.6|    5237.216796875|
|  3|    Alvera|  Di Boldi|Female|           null|                null|$57576.52| 39.994747|116.339775|    57576.52| 4798.043294270833|
|  4|   Shannon| O'Griffin|  Male|  Divnomorskoye|Budget/Accounting...|$61489.23| 44.504723| 38.130016|    61489.23|   5124.1025390625|
|  5|  Sherwood|   Macieja|  Male|      Mytishch

In [33]:
# df_female = df.withColumn('Female?', when(df.gender == 'Female', 'yes'). otherwise('no'))
df_female = df.withColumn('Female?', when(df.gender == 'Female', 1). otherwise(0))
df_female.show()

+---+----------+----------+------+---------------+--------------------+---------+----------+----------+-------+
| id|first_name| last_name|gender|           city|           job_title|   Salary|  latitude| longitude|Female?|
+---+----------+----------+------+---------------+--------------------+---------+----------+----------+-------+
|  1|   Melinde| Shilburne|Female|      Nowa Ruda| Assistant Professor|$57438.18| 50.577408| 16.496717|      1|
|  2|  Kimberly|Von Welden|Female|         Bulgan|       Programmer II|$62846.60|  48.82316| 103.52182|      1|
|  3|    Alvera|  Di Boldi|Female|           null|                null|$57576.52| 39.994747|116.339775|      1|
|  4|   Shannon| O'Griffin|  Male|  Divnomorskoye|Budget/Accounting...|$61489.23| 44.504723| 38.130016|      0|
|  5|  Sherwood|   Macieja|  Male|      Mytishchi|            VP Sales|$63863.09|      null| 37.648994|      0|
|  6|     Maris|      Folk|Female|Kinsealy-Drinan|      Civil Engineer|$30101.16| 53.426613|-6.1644998| 

### Group_By and Aggregation

In [37]:
import pyspark.sql.functions as sqlfunc
df1 = df_clean_salary.groupBy('gender').agg(sqlfunc.sum('clean_salary'))
df1.show()

+------+--------------------+
|gender|   sum(clean_salary)|
+------+--------------------+
|Female|2.7364519950195312E7|
|  Male|2.8123435678710938E7|
+------+--------------------+



In [38]:
df1 = df_clean_salary.groupBy('gender').agg(sqlfunc.sum('clean_salary').alias('Total'),
                                            sqlfunc.avg('clean_salary').alias('Average'),
                                            sqlfunc.min('clean_salary').alias('Min'),
                                            sqlfunc.max('clean_salary').alias('Max')
)
df1.show()

+------+--------------------+-----------------+--------+--------+
|gender|               Total|          Average|     Min|     Max|
+------+--------------------+-----------------+--------+--------+
|Female|2.7364519950195312E7|55618.94298820185|10616.44|99948.28|
|  Male|2.8123435678710938E7|55361.09385573019|10101.92|99942.92|
+------+--------------------+-----------------+--------+--------+



In [41]:
df1 = df_clean_salary.groupBy('gender', 'job_title').agg(sqlfunc.sum('clean_salary').alias('Total'),
                                            sqlfunc.avg('clean_salary').alias('Average'),
                                            sqlfunc.min('clean_salary').alias('Min'),
                                            sqlfunc.max('clean_salary').alias('Max')
)
df1.show()

+------+--------------------+-----------------+------------------+--------+--------+
|gender|           job_title|            Total|           Average|     Min|     Max|
+------+--------------------+-----------------+------------------+--------+--------+
|Female|    Statistician III|    44224.8984375|     44224.8984375| 44224.9| 44224.9|
|  Male|     Cost Accountant| 322273.427734375|40284.178466796875|15849.42|81698.25|
|Female|         Engineer IV|   134404.5703125|    67202.28515625| 57365.1|77039.47|
|Female| Clinical Specialist|189083.8876953125|47270.971923828125| 12468.7|81003.76|
|Female|    Dental Hygienist| 155758.638671875|25959.773111979168|10808.16| 44627.3|
|Female|Research Assistan...|   70371.83984375|   35185.919921875|21039.36|49332.48|
|Female|  Nurse Practicioner|292473.1064453125|  58494.6212890625|12908.38|91322.55|
|  Male| Geological Engineer|   296738.3828125|     59347.6765625|23825.54|94839.56|
|  Male|            VP Sales|  405120.08984375| 67520.01497395833

### Writing dataframe to file

In [42]:
# df1.write.json('df1.json')
# df1.write.parquet('df1.parquet')
# df1.write.csv('df1.csv')