In [1]:
%%capture
from pyspark.sql import SparkSession
from pyspark.sql.types import *

# Obtain the session handle used by every later cell: getOrCreate() hands back
# an existing session when one is available instead of always building a new one.
spark = (
    SparkSession.builder
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/01/20 05:06:33 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/01/20 05:06:34 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
23/01/20 05:06:34 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


## Read operation

Suppose we have a CSV file. We can load it with `spark.read.csv()` as follows:

In [2]:
# Load sample.csv, treating its first line as the column names.
# NOTE(review): no schema or inferSchema is supplied, so the columns are
# presumably all read as strings — confirm if numeric types matter downstream.
csvFile = spark.read.option("header", True).csv("sample.csv")
csvFile.show()

+-----+---+
| name|age|
+-----+---+
| john| 33|
| jack| 26|
|derry| 28|
| mary| 64|
+-----+---+



### Read from plain text and process the file

This is a plain text file; we're going to process it:

## Select

PySpark's `select()` accepts its column arguments in more than one form:

`select(columns)`

Each column can be given as one of these types:
- `string`
- `pyspark.sql.column.Column`

In [3]:
# NOTE(review): this wildcard import supplies col() used below; it presumably
# pulls in many other names as well — consider importing only what is needed.
from pyspark.sql.functions import *

# The same two columns selected three different ways; each call prints the same table.
csvFile.select("name", "age").show()  # columns given as plain strings
csvFile.select(csvFile.name, csvFile.age).show()  # columns given as DataFrame attributes
csvFile.select(col("name"), col("age")).show()  # columns built with col()

+-----+---+
| name|age|
+-----+---+
| john| 33|
| jack| 26|
|derry| 28|
| mary| 64|
+-----+---+

+-----+---+
| name|age|
+-----+---+
| john| 33|
| jack| 26|
|derry| 28|
| mary| 64|
+-----+---+

+-----+---+
| name|age|
+-----+---+
| john| 33|
| jack| 26|
|derry| 28|
| mary| 64|
+-----+---+



The last two forms are actually equivalent: `csvFile.name` and `col("name")` are objects of the same type.

In [4]:
# Attribute access on the DataFrame and col() yield objects of the very same class.
type(csvFile.name) is type(col("name"))

True

In [5]:
# Arithmetic on a column produces a new column expression that select() can
# project alongside ordinary columns; the result column is labelled "(age + 20)".
csvFile.select(csvFile.name, csvFile.age + 20).show()

+-----+----------+
| name|(age + 20)|
+-----+----------+
| john|      53.0|
| jack|      46.0|
|derry|      48.0|
| mary|      84.0|
+-----+----------+



In [6]:
# Keep only the rows whose age exceeds 30 (where() is an alias of filter()).
csvFile.where(csvFile.age > 30).show()

+----+---+
|name|age|
+----+---+
|john| 33|
|mary| 64|
+----+---+



## Create a view and access by `spark.sql()`

> The same engine executes both SQL queries and DataFrame operations.

In [7]:
# Register the DataFrame under the name "people" so it can be queried via spark.sql().
# createOrReplaceTempView keeps this cell re-runnable: createTempView raises an
# error when a temp view with the same name already exists in the session.
csvFile.createOrReplaceTempView("people")

In [8]:
# Run a SQL query against the "people" view and print the rows it returns.
spark.sql("select * from people").show()

+-----+---+
| name|age|
+-----+---+
| john| 33|
| jack| 26|
|derry| 28|
| mary| 64|
+-----+---+



### Nested data structure

In [9]:
# Each record is ((firstname, middlename, lastname), state, gender); the inner
# tuple is loaded into the nested "name" struct described by the schema below.
sample_data1 = [
    (("Rame", None, "Gupta"), "Rajasthan", "M"),
    (("Anita", "Garg", ""), "Delhi", "F"),
    (("Pooja", "", "Aggarwal"), "Delhi", "F"),
    (("Saurabh", "Anne", "Jones"), "Jammu", "M"),
    (("Shahrukh", "Khan", "Brown"), "Maharashtra", "M"),
    (("Salman", "Gupta", "Williams"), "Delhi", "M"),
]

# Schema built incrementally: "name" is itself a struct of three nullable
# string fields, followed by two flat nullable string columns.
sample_schema = (
    StructType()
    .add(
        "name",
        StructType()
        .add("firstname", StringType(), True)
        .add("middlename", StringType(), True)
        .add("lastname", StringType(), True),
    )
    .add("state", StringType(), True)
    .add("gender", StringType(), True)
)

nested_df = spark.createDataFrame(sample_data1, schema=sample_schema)
nested_df.show()

                                                                                

+--------------------+-----------+------+
|                name|      state|gender|
+--------------------+-----------+------+
| {Rame, null, Gupta}|  Rajasthan|     M|
|     {Anita, Garg, }|      Delhi|     F|
| {Pooja, , Aggarwal}|      Delhi|     F|
|{Saurabh, Anne, J...|      Jammu|     M|
|{Shahrukh, Khan, ...|Maharashtra|     M|
|{Salman, Gupta, W...|      Delhi|     M|
+--------------------+-----------+------+



In [10]:
# Dot notation reaches a single field inside the "name" struct column.
nested_df.select("name.firstname").show()
# "name.*" expands every field of the struct into its own column.
nested_df.select("name.*").show()

+---------+
|firstname|
+---------+
|     Rame|
|    Anita|
|    Pooja|
|  Saurabh|
| Shahrukh|
|   Salman|
+---------+

+---------+----------+--------+
|firstname|middlename|lastname|
+---------+----------+--------+
|     Rame|      null|   Gupta|
|    Anita|      Garg|        |
|    Pooja|          |Aggarwal|
|  Saurabh|      Anne|   Jones|
| Shahrukh|      Khan|   Brown|
|   Salman|     Gupta|Williams|
+---------+----------+--------+

