In [208]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql.functions import desc, asc, col, avg, sum, max, min, mean, count, round

# Create the SparkSession for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("local_spark")
    .getOrCreate()
)

In [161]:
# Bare last expression: displays the SparkSession object held in `spark`.
spark

1)	Read data from CSV

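`header=True` uses the first row of the file as column names instead of reading it as data; `inferSchema=True` lets Spark infer each column's type from the data.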

In [162]:
# Load students.csv: first line is the header, column types are inferred from the data.
sparkdf = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('students.csv')
)

2)	Print 5 rows (note: in PySpark, `head(n)` returns a list of `Row` objects, not a DataFrame; use `show(n)` for a tabular printout)

As a list of `Row` objects

In [163]:
# Fetch the first five rows to the driver; the result is a list of Row objects.
sparkdf.head(n=5)

[Row(id=1, name='Emily Hardie', class='Four', mark=75, gender='female'),
 Row(id=2, name='John Star', class='Three', mark=85, gender='male'),
 Row(id=3, name='Arnold Walker ', class='Three', mark=55, gender='male'),
 Row(id=4, name='Reana Talu', class='Four', mark=60, gender='female'),
 Row(id=5, name='Sidona Williams', class='Four', mark=60, gender='female')]

Table

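`.show(truncate=True)` only prints the rows as a text table (strings longer than 20 characters are cut off); it does not modify or overwrite the DataFrame.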

In [164]:
# Print the first five rows as a text table.
sparkdf.show(n=5)

+---+---------------+-----+----+------+
| id|           name|class|mark|gender|
+---+---------------+-----+----+------+
|  1|   Emily Hardie| Four|  75|female|
|  2|      John Star|Three|  85|  male|
|  3| Arnold Walker |Three|  55|  male|
|  4|     Reana Talu| Four|  60|female|
|  5|Sidona Williams| Four|  60|female|
+---+---------------+-----+----+------+
only showing top 5 rows



In [165]:
# Load students.csv through the generic reader API (format + options + load).
# 'sep' must be the delimiter string itself. The previous value, the boolean
# True, is stringified by PySpark and does not denote a comma, so the rows
# would not be split on commas.
sparkdf2 = (
    spark.read.format('csv')
    .option('inferSchema', True)
    .option('header', True)
    .option('sep', ',')
    .load('students.csv')
)

3)	Show schema of the spark data frame.

In [166]:
# Print sparkdf's column names, types and nullability as a tree.
sparkdf.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- class: string (nullable = true)
 |-- mark: integer (nullable = true)
 |-- gender: string (nullable = true)



4)	Update schema after reading from CSV. (StructField)

In [167]:
# Explicit schema for students.csv, built one field at a time.
# Arguments to add(): column name, data type, nullable flag.
# NOTE(review): the captured printSchema() output reports id as
# nullable = true even though it is declared non-nullable here.
schema = (
    StructType()
    .add("id", IntegerType(), False)
    .add("name", StringType(), True)
    .add("class", StringType(), True)
    .add("mark", IntegerType(), True)
    .add("gender", StringType(), True)
)

# Re-read the file with the explicit schema instead of inferring types.
df = spark.read.options(header=True).schema(schema).csv('students.csv')
df.printSchema()
df.show()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- class: string (nullable = true)
 |-- mark: integer (nullable = true)
 |-- gender: string (nullable = true)

+---+----------------+-----+----+------+
| id|            name|class|mark|gender|
+---+----------------+-----+----+------+
|  1|    Emily Hardie| Four|  75|female|
|  2|       John Star|Three|  85|  male|
|  3|  Arnold Walker |Three|  55|  male|
|  4|      Reana Talu| Four|  60|female|
|  5| Sidona Williams| Four|  60|female|
|  6|       Alex John| Four|  55|  male|
|  7|    Robert John |Three|  78|  male|
|  8|       Lee Malva| Four|  85|  male|
|  9|    Wookie Davey|  Two|  78|  male|
| 10|      Diane Rose|  Two|  55|female|
| 11|    Holly Daives|  Two|  89|female|
| 12|        Eva Cup |Three|  94|female|
| 13| Victoria Mathew| Four|  88|female|
| 14|       Iris Zhao|  Two|  88|female|
| 15|       Scott Row| Four|  88|  male|
| 16|     Daniel Page| Four|  88|  male|
| 17|  James Williams|Three|  54

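Update the schema without reading the file again (`withColumn` returns a new DataFrame; `df` itself stays unchanged unless the result is assigned)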

In [200]:
# Cast the mark column to integer under the name 'MARK' and print the
# resulting schema. The new DataFrame is not assigned, so df is unchanged.
(
    df
    .withColumn('MARK', df['mark'].cast('int'))
    .printSchema()
)

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- class: string (nullable = true)
 |-- MARK: integer (nullable = true)
 |-- gender: string (nullable = true)



5)	Give schema as option while reading from CSV

In [168]:
# Read students.csv, supplying the schema and header flag as reader options.
df = (
    spark.read.format('csv')
    .schema(schema)
    .option('header', True)
    .load('students.csv')
)

6)	Show columns and show summary statistics of numeric columns

No quartiles

In [201]:
# Summary statistics (count, mean, stddev, min, max) for the columns of df.
df.describe().show()

+-------+------------------+---------+-----+-----------------+------+
|summary|                id|     name|class|             mark|gender|
+-------+------------------+---------+-----+-----------------+------+
|  count|                35|       36|   36|               35|    36|
|   mean|              18.0|     null| null|75.51428571428572|  null|
| stddev|10.246950765959598|     null| null|13.95448784772974|  null|
|    min|                 1|Alex John| Four|               48|female|
|    max|                35|     name|class|               96|  male|
+-------+------------------+---------+-----+-----------------+------+



With quartiles

In [202]:
# Like describe(), but the table also includes the 25%, 50% and 75% percentiles.
df.summary().show()

+-------+------------------+---------+-----+-----------------+------+
|summary|                id|     name|class|             mark|gender|
+-------+------------------+---------+-----+-----------------+------+
|  count|                35|       36|   36|               35|    36|
|   mean|              18.0|     null| null|75.51428571428572|  null|
| stddev|10.246950765959598|     null| null|13.95448784772974|  null|
|    min|                 1|Alex John| Four|               48|female|
|    25%|                 9|     null| null|               60|  null|
|    50%|                18|     null| null|               79|  null|
|    75%|                27|     null| null|               88|  null|
|    max|                35|     name|class|               96|  male|
+-------+------------------+---------+-----+-----------------+------+



7)	Read from JSON

In [171]:
# Load people.json into a DataFrame via the generic reader API.
json_spark_df = spark.read.format('json').load('people.json')

8)	Subset the Dataframe for one and then more than one columns. 

In [172]:
# Single-column subset. Named id_df rather than `id` so that the Python
# builtin id() is not shadowed for the rest of the kernel session.
id_df = df.select('id')

# Multi-column subset: pass each column name to select().
id_name_mark = df.select('id', 'name', 'mark')

id_name_mark.show()

+----+----------------+----+
|  id|            name|mark|
+----+----------------+----+
|null|            name|null|
|   1|    Emily Hardie|  75|
|   2|       John Star|  85|
|   3|  Arnold Walker |  55|
|   4|      Reana Talu|  60|
|   5| Sidona Williams|  60|
|   6|       Alex John|  55|
|   7|    Robert John |  78|
|   8|       Lee Malva|  85|
|   9|    Wookie Davey|  78|
|  10|      Diane Rose|  55|
|  11|    Holly Daives|  89|
|  12|        Eva Cup |  94|
|  13| Victoria Mathew|  88|
|  14|       Iris Zhao|  88|
|  15|       Scott Row|  88|
|  16|     Daniel Page|  88|
|  17|  James Williams|  54|
|  18|Martin Johnston |  75|
|  19|     John Smith |  48|
+----+----------------+----+
only showing top 20 rows



9)	Filter Data frame based on condition.\
•	Filter according to gender\
•	Filter according to mark >50\
•	Filter by multiple conditions

In [173]:
# where() is an alias of filter(); conditions are built from Column expressions.
# Rows whose gender is 'male'.
df.where(df['gender'] == 'male').show()
# Rows with a mark above 50.
df.where(df['mark'] > 50).show()
# Both conditions combined; each comparison needs its own parentheses around `&`.
df.where((df['gender'] == 'male') & (df['mark'] > 50)).show()

+---+----------------+-----+----+------+
| id|            name|class|mark|gender|
+---+----------------+-----+----+------+
|  2|       John Star|Three|  85|  male|
|  3|  Arnold Walker |Three|  55|  male|
|  6|       Alex John| Four|  55|  male|
|  7|    Robert John |Three|  78|  male|
|  8|       Lee Malva| Four|  85|  male|
|  9|    Wookie Davey|  Two|  78|  male|
| 15|       Scott Row| Four|  88|  male|
| 16|     Daniel Page| Four|  88|  male|
| 17|  James Williams|Three|  54|  male|
| 18|Martin Johnston | Four|  75|  male|
| 19|     John Smith | Four|  48|  male|
| 23|       Sam Adan |Three|  79|  male|
| 24|   Nova Prescott|  Two|  78|  male|
| 25|  William Taylor| Four|  88|  male|
| 26|   Laurin Wilson|Three|  79|  male|
| 29|         Ben Day| Four|  55|  male|
| 31|      Chris Ball| Four|  88|  male|
| 34|   Garry Richard|Three|  69|  male|
+---+----------------+-----+----+------+

+---+----------------+-----+----+------+
| id|            name|class|mark|gender|
+---+----------

10)	Add new column,\
•	New column name: `corrected_mark`\
•	It has values mark+3

In [174]:
# Show sparkdf with an extra column holding mark + 3; sparkdf itself is not reassigned.
sparkdf.withColumn('corrected_mark', col('mark') + 3).show()

+---+----------------+-----+----+------+--------------+
| id|            name|class|mark|gender|corrected_mark|
+---+----------------+-----+----+------+--------------+
|  1|    Emily Hardie| Four|  75|female|            78|
|  2|       John Star|Three|  85|  male|            88|
|  3|  Arnold Walker |Three|  55|  male|            58|
|  4|      Reana Talu| Four|  60|female|            63|
|  5| Sidona Williams| Four|  60|female|            63|
|  6|       Alex John| Four|  55|  male|            58|
|  7|    Robert John |Three|  78|  male|            81|
|  8|       Lee Malva| Four|  85|  male|            88|
|  9|    Wookie Davey|  Two|  78|  male|            81|
| 10|      Diane Rose|  Two|  55|female|            58|
| 11|    Holly Daives|  Two|  89|female|            92|
| 12|        Eva Cup |Three|  94|female|            97|
| 13| Victoria Mathew| Four|  88|female|            91|
| 14|       Iris Zhao|  Two|  88|female|            91|
| 15|       Scott Row| Four|  88|  male|        

11)	Groupby gender\
•	Calculate the average mark for each gender\
•	Max\
•	Min

In [209]:
# avg / min / max / round below are the pyspark.sql.functions imported at the
# top of the notebook, not the Python builtins of the same name.
by_gender = sparkdf.groupBy('gender')

# One aggregate per table.
by_gender.agg(avg('mark')).show()
by_gender.agg(min('mark')).show()
by_gender.agg(max('mark')).show()

# All three aggregates in a single table, with the average rounded to 2 decimals.
by_gender.agg(
    round(avg('mark'), 2).alias('Avg'),
    max('mark').alias('Max'),
    min('mark').alias('Min'),
).show()

+------+-----------------+
|gender|        avg(mark)|
+------+-----------------+
|female|77.52941176470588|
|  male|73.61111111111111|
+------+-----------------+

+------+---------+
|gender|min(mark)|
+------+---------+
|female|       55|
|  male|       48|
+------+---------+

+------+---------+
|gender|max(mark)|
+------+---------+
|female|       96|
|  male|       88|
+------+---------+

+------+-----+---+---+
|gender|  Avg|Max|Min|
+------+-----+---+---+
|female|77.53| 96| 55|
|  male|73.61| 88| 48|
+------+-----+---+---+



12. Aggregation:\
•	Calculate the average mark of all students.

In [176]:
# Average mark over the whole DataFrame (aggregation without grouping keys).
sparkdf.agg(avg('mark')).show()

+-----------------+
|        avg(mark)|
+-----------------+
|75.51428571428572|
+-----------------+



13)	Order by 
- “class”
- “mark”
- “mark” and descending order.

In [177]:
# sort() is an alias of orderBy().
# Ascending by class.
sparkdf.sort('class').show()
# Ascending by mark.
sparkdf.sort(col('mark').asc()).show()
# Descending by mark.
sparkdf.sort(col('mark').desc()).show()

+---+----------------+-----+----+------+
| id|            name|class|mark|gender|
+---+----------------+-----+----+------+
|  5| Sidona Williams| Four|  60|female|
| 13| Victoria Mathew| Four|  88|female|
| 15|       Scott Row| Four|  88|  male|
|  6|       Alex John| Four|  55|  male|
|  1|    Emily Hardie| Four|  75|female|
| 25|  William Taylor| Four|  88|  male|
| 28|  Emily Thompson| Four|  86|female|
| 29|         Ben Day| Four|  55|  male|
| 31|      Chris Ball| Four|  88|  male|
| 32|        Ela Love| Four|  90|female|
| 33|   Elisa Richard| Four|  96|female|
| 16|     Daniel Page| Four|  88|  male|
| 18|Martin Johnston | Four|  75|  male|
|  4|      Reana Talu| Four|  60|female|
| 19|     John Smith | Four|  48|  male|
|  8|       Lee Malva| Four|  85|  male|
|  2|       John Star|Three|  85|  male|
|  3|  Arnold Walker |Three|  55|  male|
|  7|    Robert John |Three|  78|  male|
| 12|        Eva Cup |Three|  94|female|
+---+----------------+-----+----+------+
only showing top

14)	Access a specific row (hint: `collect()` method) and then convert it to a dictionary.

In [181]:
# Take the first row and convert it to a plain dict.
# limit(1) restricts the collect to a single row; a bare collect() would
# transfer the entire DataFrame to the driver just to index element 0.
first_row = sparkdf.limit(1).collect()[0]
my_dict = first_row.asDict()
print(first_row)
print(my_dict)

Row(id=1, name='Emily Hardie', class='Four', mark=75, gender='female')
{'id': 1, 'name': 'Emily Hardie', 'class': 'Four', 'mark': 75, 'gender': 'female'}


15. Create a view from the dataframe and filter it by using SQL syntax.

In [179]:
# Expose sparkdf to Spark SQL under the name temp_view.
sparkdf.createOrReplaceTempView('temp_view')

# Run the filter as a SQL query against the view and print the result.
male_rows = spark.sql(
    '''
    SELECT *
    FROM temp_view
    WHERE gender = "male"
    '''
)
male_rows.show()

+---+----------------+-----+----+------+
| id|            name|class|mark|gender|
+---+----------------+-----+----+------+
|  2|       John Star|Three|  85|  male|
|  3|  Arnold Walker |Three|  55|  male|
|  6|       Alex John| Four|  55|  male|
|  7|    Robert John |Three|  78|  male|
|  8|       Lee Malva| Four|  85|  male|
|  9|    Wookie Davey|  Two|  78|  male|
| 15|       Scott Row| Four|  88|  male|
| 16|     Daniel Page| Four|  88|  male|
| 17|  James Williams|Three|  54|  male|
| 18|Martin Johnston | Four|  75|  male|
| 19|     John Smith | Four|  48|  male|
| 23|       Sam Adan |Three|  79|  male|
| 24|   Nova Prescott|  Two|  78|  male|
| 25|  William Taylor| Four|  88|  male|
| 26|   Laurin Wilson|Three|  79|  male|
| 29|         Ben Day| Four|  55|  male|
| 31|      Chris Ball| Four|  88|  male|
| 34|   Garry Richard|Three|  69|  male|
+---+----------------+-----+----+------+

