## PySpark tutorial
PySpark docs [link](https://spark.apache.org/docs/latest/api/python/getting_started/index.html)
<br>
Good intro to Spark [link](https://www.youtube.com/watch?v=9U4ED7KQwlE)
<br>

In [1]:
from pyspark.sql import SparkSession
# build the Spark session used by every cell below (getOrCreate returns the session for this app name)
spark = (
    SparkSession.builder
    .appName('Dataframe')
    .getOrCreate()
)

In [2]:
# load the CSV file into a DataFrame: the first line supplies the column names and
# inferSchema lets Spark work out each column's data type from the data
df_pyspark = spark.read.csv(
    'contacts2.csv',
    header=True,
    inferSchema=True,
)

In [3]:
# display the DataFrame as a text table; long strings are shortened with "..." in the printout
df_pyspark.show()

+--------------+---------+---------+----+----------+--------------+--------------------+------+
|          name|     city|    phone| age|experience|    speciality|              e-mail|salary|
+--------------+---------+---------+----+----------+--------------+--------------------+------+
|  Susan Calvin|   London| 56152358|  28|        10|   engineering|SusanCalvin@email...| 15000|
| Bently Powell|Kathmandu| 96523995|  42|        22|    operations|   BentlyP@email.com| 90000|
|Gregory Powell|     null|895712365|  66|        35|   engineering|   GregP14@email.com| 50000|
|  Mike Donovan|Bangalore|886549702|  67|        34|administration|  MDonovan@email.com| 80000|
|          null|Sao Paulo|     null|  30|      null|   engineering|                null|  null|
|          null|    Tokyo| 98028909|null|      null|    operations|                null|  null|
+--------------+---------+---------+----+----------+--------------+--------------------+------+



In [4]:
# print the schema as a tree: one line per column with its data type and nullability
df_pyspark.printSchema()

root
 |-- name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- phone: integer (nullable = true)
 |-- age: integer (nullable = true)
 |-- experience: integer (nullable = true)
 |-- speciality: string (nullable = true)
 |-- e-mail: string (nullable = true)
 |-- salary: integer (nullable = true)



In [5]:
# column data types as a list of (column name, type name) tuples
df_pyspark.dtypes

[('name', 'string'),
 ('city', 'string'),
 ('phone', 'int'),
 ('age', 'int'),
 ('experience', 'int'),
 ('speciality', 'string'),
 ('e-mail', 'string'),
 ('salary', 'int')]

In [6]:
# number of rows in the DataFrame
df_pyspark.count()

6

### Working with columns and rows

In [7]:
# list of column names
df_pyspark.columns

['name',
 'city',
 'phone',
 'age',
 'experience',
 'speciality',
 'e-mail',
 'salary']

In [8]:
# select a single column by name
df_pyspark.select('name').show()

+--------------+
|          name|
+--------------+
|  Susan Calvin|
| Bently Powell|
|Gregory Powell|
|  Mike Donovan|
|          null|
|          null|
+--------------+



In [9]:
# select two or more columns by passing several column names
df_pyspark.select('name', 'e-mail').show()

+--------------+--------------------+
|          name|              e-mail|
+--------------+--------------------+
|  Susan Calvin|SusanCalvin@email...|
| Bently Powell|   BentlyP@email.com|
|Gregory Powell|   GregP14@email.com|
|  Mike Donovan|  MDonovan@email.com|
|          null|                null|
|          null|                null|
+--------------+--------------------+



In [10]:
# summary statistics (count, mean, stddev, min, max) for the selected numeric columns;
# count only includes non-null values, so it can be lower than the number of rows
df_pyspark.select('age', 'experience', 'salary').describe().show()

+-------+-----------------+------------------+------------------+
|summary|              age|        experience|            salary|
+-------+-----------------+------------------+------------------+
|  count|                5|                 4|                 4|
|   mean|             46.6|             25.25|           58750.0|
| stddev|18.94201678808252|11.757976016304847|33757.715167548486|
|    min|               28|                10|             15000|
|    max|               67|                35|             90000|
+-------+-----------------+------------------+------------------+



In [11]:
# select columns by position: slice the list of column names (indices 1 and 2 -> city, phone)
df_pyspark.select(df_pyspark.columns[1:3]).show()

+---------+---------+
|     city|    phone|
+---------+---------+
|   London| 56152358|
|Kathmandu| 96523995|
|     null|895712365|
|Bangalore|886549702|
|Sao Paulo|     null|
|    Tokyo| 98028909|
+---------+---------+



In [12]:
# retrieve the data: collect() returns every row of the DataFrame as a Python list of Row objects
df_pyspark.collect()

[Row(name='Susan Calvin', city='London', phone=56152358, age=28, experience=10, speciality='engineering', e-mail='SusanCalvin@email.com', salary=15000),
 Row(name='Bently Powell', city='Kathmandu', phone=96523995, age=42, experience=22, speciality='operations', e-mail='BentlyP@email.com', salary=90000),
 Row(name='Gregory Powell', city=None, phone=895712365, age=66, experience=35, speciality='engineering', e-mail='GregP14@email.com', salary=50000),
 Row(name='Mike Donovan', city='Bangalore', phone=886549702, age=67, experience=34, speciality='administration', e-mail='MDonovan@email.com', salary=80000),
 Row(name=None, city='Sao Paulo', phone=None, age=30, experience=None, speciality='engineering', e-mail=None, salary=None),
 Row(name=None, city='Tokyo', phone=98028909, age=None, experience=None, speciality='operations', e-mail=None, salary=None)]

In [13]:
# select the 3rd row: index 2 of the collected list (indexing is 0-based)
df_pyspark.collect()[2]

Row(name='Gregory Powell', city=None, phone=895712365, age=66, experience=35, speciality='engineering', e-mail='GregP14@email.com', salary=50000)

In [14]:
# select the last row: negative indexing on the collected list
df_pyspark.collect()[-1]

Row(name=None, city='Tokyo', phone=98028909, age=None, experience=None, speciality='operations', e-mail=None, salary=None)

In [15]:
# show only the first two rows
df_pyspark.show(2)

+-------------+---------+--------+---+----------+-----------+--------------------+------+
|         name|     city|   phone|age|experience| speciality|              e-mail|salary|
+-------------+---------+--------+---+----------+-----------+--------------------+------+
| Susan Calvin|   London|56152358| 28|        10|engineering|SusanCalvin@email...| 15000|
|Bently Powell|Kathmandu|96523995| 42|        22| operations|   BentlyP@email.com| 90000|
+-------------+---------+--------+---+----------+-----------+--------------------+------+
only showing top 2 rows



In [16]:
# first two rows, returned as a list of Row objects rather than printed as a table
df_pyspark.head(2)

[Row(name='Susan Calvin', city='London', phone=56152358, age=28, experience=10, speciality='engineering', e-mail='SusanCalvin@email.com', salary=15000),
 Row(name='Bently Powell', city='Kathmandu', phone=96523995, age=42, experience=22, speciality='operations', e-mail='BentlyP@email.com', salary=90000)]

In [17]:
# last two rows, returned as a list of Row objects
df_pyspark.tail(2)

[Row(name=None, city='Sao Paulo', phone=None, age=30, experience=None, speciality='engineering', e-mail=None, salary=None),
 Row(name=None, city='Tokyo', phone=98028909, age=None, experience=None, speciality='operations', e-mail=None, salary=None)]

In [18]:
# select a single entry as [row][column]: first row, second column (city)
df_pyspark.collect()[0][1]

'London'

### Renaming columns

In [19]:
# rename a column; the renamed result is only displayed here, df_pyspark keeps the original name
df_pyspark.withColumnRenamed('name', 'new_name').show()

+--------------+---------+---------+----+----------+--------------+--------------------+------+
|      new_name|     city|    phone| age|experience|    speciality|              e-mail|salary|
+--------------+---------+---------+----+----------+--------------+--------------------+------+
|  Susan Calvin|   London| 56152358|  28|        10|   engineering|SusanCalvin@email...| 15000|
| Bently Powell|Kathmandu| 96523995|  42|        22|    operations|   BentlyP@email.com| 90000|
|Gregory Powell|     null|895712365|  66|        35|   engineering|   GregP14@email.com| 50000|
|  Mike Donovan|Bangalore|886549702|  67|        34|administration|  MDonovan@email.com| 80000|
|          null|Sao Paulo|     null|  30|      null|   engineering|                null|  null|
|          null|    Tokyo| 98028909|null|      null|    operations|                null|  null|
+--------------+---------+---------+----+----------+--------------+--------------------+------+



### Adding/removing Columns and rows

In [20]:
# add a column computed from an existing one (a null experience stays null)
(df_pyspark
    .withColumn('exp_after_5_years', df_pyspark['experience'] + 5)
    .show())

+--------------+---------+---------+----+----------+--------------+--------------------+------+-----------------+
|          name|     city|    phone| age|experience|    speciality|              e-mail|salary|exp_after_5_years|
+--------------+---------+---------+----+----------+--------------+--------------------+------+-----------------+
|  Susan Calvin|   London| 56152358|  28|        10|   engineering|SusanCalvin@email...| 15000|               15|
| Bently Powell|Kathmandu| 96523995|  42|        22|    operations|   BentlyP@email.com| 90000|               27|
|Gregory Powell|     null|895712365|  66|        35|   engineering|   GregP14@email.com| 50000|               40|
|  Mike Donovan|Bangalore|886549702|  67|        34|administration|  MDonovan@email.com| 80000|               39|
|          null|Sao Paulo|     null|  30|      null|   engineering|                null|  null|             null|
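|          null|    Tokyo| 98028909|null|      null|    operations|                null|  null|             null|
+--------------+---------+---------+----+----------+--------------+--------------------+------+-----------------+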

In [21]:
# drop a column by name; the result is only displayed here, df_pyspark is not modified
df_pyspark.drop('e-mail').show()

+--------------+---------+---------+----+----------+--------------+------+
|          name|     city|    phone| age|experience|    speciality|salary|
+--------------+---------+---------+----+----------+--------------+------+
|  Susan Calvin|   London| 56152358|  28|        10|   engineering| 15000|
| Bently Powell|Kathmandu| 96523995|  42|        22|    operations| 90000|
|Gregory Powell|     null|895712365|  66|        35|   engineering| 50000|
|  Mike Donovan|Bangalore|886549702|  67|        34|administration| 80000|
|          null|Sao Paulo|     null|  30|      null|   engineering|  null|
|          null|    Tokyo| 98028909|null|      null|    operations|  null|
+--------------+---------+---------+----+----------+--------------+------+



### Missing values

In [22]:
# drop every row that contains at least one null value
df_pyspark.na.drop().show()

+-------------+---------+---------+---+----------+--------------+--------------------+------+
|         name|     city|    phone|age|experience|    speciality|              e-mail|salary|
+-------------+---------+---------+---+----------+--------------+--------------------+------+
| Susan Calvin|   London| 56152358| 28|        10|   engineering|SusanCalvin@email...| 15000|
|Bently Powell|Kathmandu| 96523995| 42|        22|    operations|   BentlyP@email.com| 90000|
| Mike Donovan|Bangalore|886549702| 67|        34|administration|  MDonovan@email.com| 80000|
+-------------+---------+---------+---+----------+--------------+--------------------+------+



In [23]:
# https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrame.dropna.html
# how: 'any' or 'all'. With 'any', a row is dropped if it contains any null; with 'all', only if all its values are null.
# No row in this dataset is entirely null, so nothing is dropped here.
df_pyspark.na.drop(how='all').show()

+--------------+---------+---------+----+----------+--------------+--------------------+------+
|          name|     city|    phone| age|experience|    speciality|              e-mail|salary|
+--------------+---------+---------+----+----------+--------------+--------------------+------+
|  Susan Calvin|   London| 56152358|  28|        10|   engineering|SusanCalvin@email...| 15000|
| Bently Powell|Kathmandu| 96523995|  42|        22|    operations|   BentlyP@email.com| 90000|
|Gregory Powell|     null|895712365|  66|        35|   engineering|   GregP14@email.com| 50000|
|  Mike Donovan|Bangalore|886549702|  67|        34|administration|  MDonovan@email.com| 80000|
|          null|Sao Paulo|     null|  30|      null|   engineering|                null|  null|
|          null|    Tokyo| 98028909|null|      null|    operations|                null|  null|
+--------------+---------+---------+----+----------+--------------+--------------------+------+



In [24]:
# https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrame.dropna.html
# thresh: default None. If specified, drop rows that have fewer than thresh non-null values; this overrides the how parameter.
# Every row in this dataset has at least 3 non-null values, so nothing is dropped here.
df_pyspark.na.drop(thresh=3).show()

+--------------+---------+---------+----+----------+--------------+--------------------+------+
|          name|     city|    phone| age|experience|    speciality|              e-mail|salary|
+--------------+---------+---------+----+----------+--------------+--------------------+------+
|  Susan Calvin|   London| 56152358|  28|        10|   engineering|SusanCalvin@email...| 15000|
| Bently Powell|Kathmandu| 96523995|  42|        22|    operations|   BentlyP@email.com| 90000|
|Gregory Powell|     null|895712365|  66|        35|   engineering|   GregP14@email.com| 50000|
|  Mike Donovan|Bangalore|886549702|  67|        34|administration|  MDonovan@email.com| 80000|
|          null|Sao Paulo|     null|  30|      null|   engineering|                null|  null|
|          null|    Tokyo| 98028909|null|      null|    operations|                null|  null|
+--------------+---------+---------+----+----------+--------------+--------------------+------+



In [25]:
# https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrame.dropna.html
# subset: optional list of column names to consider; here only rows with a null city are dropped.
df_pyspark.na.drop(subset=['city']).show()

+-------------+---------+---------+----+----------+--------------+--------------------+------+
|         name|     city|    phone| age|experience|    speciality|              e-mail|salary|
+-------------+---------+---------+----+----------+--------------+--------------------+------+
| Susan Calvin|   London| 56152358|  28|        10|   engineering|SusanCalvin@email...| 15000|
|Bently Powell|Kathmandu| 96523995|  42|        22|    operations|   BentlyP@email.com| 90000|
| Mike Donovan|Bangalore|886549702|  67|        34|administration|  MDonovan@email.com| 80000|
|         null|Sao Paulo|     null|  30|      null|   engineering|                null|  null|
|         null|    Tokyo| 98028909|null|      null|    operations|                null|  null|
+-------------+---------+---------+----+----------+--------------+--------------------+------+



In [26]:
# fill missing values with a string: only string columns are filled, numeric nulls are left as they are
df_pyspark.na.fill('Missing values').show()

+--------------+--------------+---------+----+----------+--------------+--------------------+------+
|          name|          city|    phone| age|experience|    speciality|              e-mail|salary|
+--------------+--------------+---------+----+----------+--------------+--------------------+------+
|  Susan Calvin|        London| 56152358|  28|        10|   engineering|SusanCalvin@email...| 15000|
| Bently Powell|     Kathmandu| 96523995|  42|        22|    operations|   BentlyP@email.com| 90000|
|Gregory Powell|Missing values|895712365|  66|        35|   engineering|   GregP14@email.com| 50000|
|  Mike Donovan|     Bangalore|886549702|  67|        34|administration|  MDonovan@email.com| 80000|
|Missing values|     Sao Paulo|     null|  30|      null|   engineering|      Missing values|  null|
|Missing values|         Tokyo| 98028909|null|      null|    operations|      Missing values|  null|
+--------------+--------------+---------+----+----------+--------------+--------------------+------+

In [27]:
# fill missing values with an integer: only numeric columns are filled, string nulls are left as they are
df_pyspark.na.fill(999).show()

+--------------+---------+---------+---+----------+--------------+--------------------+------+
|          name|     city|    phone|age|experience|    speciality|              e-mail|salary|
+--------------+---------+---------+---+----------+--------------+--------------------+------+
|  Susan Calvin|   London| 56152358| 28|        10|   engineering|SusanCalvin@email...| 15000|
| Bently Powell|Kathmandu| 96523995| 42|        22|    operations|   BentlyP@email.com| 90000|
|Gregory Powell|     null|895712365| 66|        35|   engineering|   GregP14@email.com| 50000|
|  Mike Donovan|Bangalore|886549702| 67|        34|administration|  MDonovan@email.com| 80000|
|          null|Sao Paulo|      999| 30|       999|   engineering|                null|   999|
|          null|    Tokyo| 98028909|999|       999|    operations|                null|   999|
+--------------+---------+---------+---+----------+--------------+--------------------+------+



In [28]:
# fill missing values only in the listed columns; nulls in other columns are left untouched
df_pyspark.na.fill('Missing values', ['city','e-mail']).show()

+--------------+--------------+---------+----+----------+--------------+--------------------+------+
|          name|          city|    phone| age|experience|    speciality|              e-mail|salary|
+--------------+--------------+---------+----+----------+--------------+--------------------+------+
|  Susan Calvin|        London| 56152358|  28|        10|   engineering|SusanCalvin@email...| 15000|
| Bently Powell|     Kathmandu| 96523995|  42|        22|    operations|   BentlyP@email.com| 90000|
|Gregory Powell|Missing values|895712365|  66|        35|   engineering|   GregP14@email.com| 50000|
|  Mike Donovan|     Bangalore|886549702|  67|        34|administration|  MDonovan@email.com| 80000|
|          null|     Sao Paulo|     null|  30|      null|   engineering|      Missing values|  null|
|          null|         Tokyo| 98028909|null|      null|    operations|      Missing values|  null|
+--------------+--------------+---------+----+----------+--------------+--------------------+------+

### Filtering
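* and: `&`
* or: `|`
* equal: `==`
* not: `~`

When combining conditions with `&` or `|`, wrap each comparison in parentheses, e.g. `(df['a'] > 1) & (df['b'] < 5)`.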

In [29]:
# filter rows with a SQL-style condition string; rows with a null salary do not match
df_pyspark.filter("salary<=50000").show()

+--------------+------+---------+---+----------+-----------+--------------------+------+
|          name|  city|    phone|age|experience| speciality|              e-mail|salary|
+--------------+------+---------+---+----------+-----------+--------------------+------+
|  Susan Calvin|London| 56152358| 28|        10|engineering|SusanCalvin@email...| 15000|
|Gregory Powell|  null|895712365| 66|        35|engineering|   GregP14@email.com| 50000|
+--------------+------+---------+---+----------+-----------+--------------------+------+



In [30]:
# same result, different syntax: a Column expression instead of a condition string
df_pyspark.filter(df_pyspark['salary']<=50000).show()

+--------------+------+---------+---+----------+-----------+--------------------+------+
|          name|  city|    phone|age|experience| speciality|              e-mail|salary|
+--------------+------+---------+---+----------+-----------+--------------------+------+
|  Susan Calvin|London| 56152358| 28|        10|engineering|SusanCalvin@email...| 15000|
|Gregory Powell|  null|895712365| 66|        35|engineering|   GregP14@email.com| 50000|
+--------------+------+---------+---+----------+-----------+--------------------+------+



In [31]:
# invert the condition with ~; rows with a null salary are still left out of the result
df_pyspark.filter(~(df_pyspark['salary']<=50000)).show()

+-------------+---------+---------+---+----------+--------------+------------------+------+
|         name|     city|    phone|age|experience|    speciality|            e-mail|salary|
+-------------+---------+---------+---+----------+--------------+------------------+------+
|Bently Powell|Kathmandu| 96523995| 42|        22|    operations| BentlyP@email.com| 90000|
| Mike Donovan|Bangalore|886549702| 67|        34|administration|MDonovan@email.com| 80000|
+-------------+---------+---------+---+----------+--------------+------------------+------+



In [32]:
# filter the rows first, then keep only the name and experience columns
df_pyspark.filter("salary<=50000").select(['name','experience']).show()

+--------------+----------+
|          name|experience|
+--------------+----------+
|  Susan Calvin|        10|
|Gregory Powell|        35|
+--------------+----------+



In [33]:
# combine two conditions with &; each comparison is wrapped in its own parentheses
(df_pyspark
    .filter((df_pyspark['salary'] <= 80000) & (df_pyspark['salary'] >= 20000))
    .show())

+--------------+---------+---------+---+----------+--------------+------------------+------+
|          name|     city|    phone|age|experience|    speciality|            e-mail|salary|
+--------------+---------+---------+---+----------+--------------+------------------+------+
|Gregory Powell|     null|895712365| 66|        35|   engineering| GregP14@email.com| 50000|
|  Mike Donovan|Bangalore|886549702| 67|        34|administration|MDonovan@email.com| 80000|
+--------------+---------+---------+---+----------+--------------+------------------+------+



In [34]:
# select the rows matching an equality test, using attribute-style column access (df_pyspark.age)
df_pyspark.filter(df_pyspark.age==28).show()

+------------+------+--------+---+----------+-----------+--------------------+------+
|        name|  city|   phone|age|experience| speciality|              e-mail|salary|
+------------+------+--------+---+----------+-----------+--------------------+------+
|Susan Calvin|London|56152358| 28|        10|engineering|SusanCalvin@email...| 15000|
+------------+------+--------+---+----------+-----------+--------------------+------+



### Groupby 

In [35]:
# group by speciality and apply sum(); called without arguments it sums every numeric column,
# including phone, whose sum is not meaningful
df_pyspark.groupBy('speciality').sum().show()

+--------------+----------+--------+---------------+-----------+
|    speciality|sum(phone)|sum(age)|sum(experience)|sum(salary)|
+--------------+----------+--------+---------------+-----------+
|administration| 886549702|      67|             34|      80000|
|    operations| 194552904|      42|             22|      90000|
|   engineering| 951864723|     124|             45|      65000|
+--------------+----------+--------+---------------+-----------+



In [36]:
# group by speciality and count the rows in each group
df_pyspark.groupBy('speciality').count().show()

+--------------+-----+
|    speciality|count|
+--------------+-----+
|administration|    1|
|    operations|    2|
|   engineering|    3|
+--------------+-----+



In [37]:
# group by speciality and take the max() of every numeric column
df_pyspark.groupBy('speciality').max().show()

+--------------+----------+--------+---------------+-----------+
|    speciality|max(phone)|max(age)|max(experience)|max(salary)|
+--------------+----------+--------+---------------+-----------+
|administration| 886549702|      67|             34|      80000|
|    operations|  98028909|      42|             22|      90000|
|   engineering| 895712365|      66|             35|      50000|
+--------------+----------+--------+---------------+-----------+



In [38]:
# group by speciality and apply avg() to every numeric column; null values are left out of the average
df_pyspark.groupBy('speciality').avg().show()

+--------------+-------------+------------------+---------------+-----------+
|    speciality|   avg(phone)|          avg(age)|avg(experience)|avg(salary)|
+--------------+-------------+------------------+---------------+-----------+
|administration| 8.86549702E8|              67.0|           34.0|    80000.0|
|    operations|  9.7276452E7|              42.0|           22.0|    90000.0|
|   engineering|4.759323615E8|41.333333333333336|           22.5|    32500.0|
+--------------+-------------+------------------+---------------+-----------+



### Aggregate

In [39]:
# aggregate over the whole DataFrame: the dict maps a column name to the aggregate function to apply
df_pyspark.agg({'salary':'sum'}).show()

+-----------+
|sum(salary)|
+-----------+
|     235000|
+-----------+



### Misc

In [40]:
# convert to a pandas DataFrame; integer columns that contain nulls come back as floats
df_pyspark.toPandas()

Unnamed: 0,name,city,phone,age,experience,speciality,e-mail,salary
0,Susan Calvin,London,56152358.0,28.0,10.0,engineering,SusanCalvin@email.com,15000.0
1,Bently Powell,Kathmandu,96523995.0,42.0,22.0,operations,BentlyP@email.com,90000.0
2,Gregory Powell,,895712365.0,66.0,35.0,engineering,GregP14@email.com,50000.0
3,Mike Donovan,Bangalore,886549702.0,67.0,34.0,administration,MDonovan@email.com,80000.0
4,,Sao Paulo,,30.0,,engineering,,
5,,Tokyo,98028909.0,,,operations,,


In [41]:
# sort by age in ascending order; the row with a null age is listed first
df_pyspark.sort(df_pyspark["age"]).show()

+--------------+---------+---------+----+----------+--------------+--------------------+------+
|          name|     city|    phone| age|experience|    speciality|              e-mail|salary|
+--------------+---------+---------+----+----------+--------------+--------------------+------+
|          null|    Tokyo| 98028909|null|      null|    operations|                null|  null|
|  Susan Calvin|   London| 56152358|  28|        10|   engineering|SusanCalvin@email...| 15000|
|          null|Sao Paulo|     null|  30|      null|   engineering|                null|  null|
| Bently Powell|Kathmandu| 96523995|  42|        22|    operations|   BentlyP@email.com| 90000|
|Gregory Powell|     null|895712365|  66|        35|   engineering|   GregP14@email.com| 50000|
|  Mike Donovan|Bangalore|886549702|  67|        34|administration|  MDonovan@email.com| 80000|
+--------------+---------+---------+----+----------+--------------+--------------------+------+



In [42]:
# sorting
# NOTE(review): this cell repeats the previous sort cell unchanged; consider removing it
# or using it to demonstrate a descending sort instead.
df_pyspark.sort(df_pyspark["age"]).show()

+--------------+---------+---------+----+----------+--------------+--------------------+------+
|          name|     city|    phone| age|experience|    speciality|              e-mail|salary|
+--------------+---------+---------+----+----------+--------------+--------------------+------+
|          null|    Tokyo| 98028909|null|      null|    operations|                null|  null|
|  Susan Calvin|   London| 56152358|  28|        10|   engineering|SusanCalvin@email...| 15000|
|          null|Sao Paulo|     null|  30|      null|   engineering|                null|  null|
| Bently Powell|Kathmandu| 96523995|  42|        22|    operations|   BentlyP@email.com| 90000|
|Gregory Powell|     null|895712365|  66|        35|   engineering|   GregP14@email.com| 50000|
|  Mike Donovan|Bangalore|886549702|  67|        34|administration|  MDonovan@email.com| 80000|
+--------------+---------+---------+----+----------+--------------+--------------------+------+



In [43]:
# filtering + sorting: keep only the engineering rows, then order them by age
(df_pyspark
    .filter(df_pyspark["speciality"]=='engineering')
    .sort(df_pyspark["age"])
    .show())

+--------------+---------+---------+---+----------+-----------+--------------------+------+
|          name|     city|    phone|age|experience| speciality|              e-mail|salary|
+--------------+---------+---------+---+----------+-----------+--------------------+------+
|  Susan Calvin|   London| 56152358| 28|        10|engineering|SusanCalvin@email...| 15000|
|          null|Sao Paulo|     null| 30|      null|engineering|                null|  null|
|Gregory Powell|     null|895712365| 66|        35|engineering|   GregP14@email.com| 50000|
+--------------+---------+---------+---+----------+-----------+--------------------+------+

