# Lesson Two: Introduction to PySpark
### By Samuel Ko

In [40]:
# Goal: explore PySpark SQL-style statements and other DataFrame functions
# By Samuel Ko

In [41]:
import pyspark

In [42]:
# SparkSession is the entry point to Spark: it is used to create DataFrames,
# execute SQL commands over tables, cache tables, and read parquet files
from pyspark.sql import SparkSession

In [43]:
# Build the Spark session used by every later cell
# (getOrCreate() hands back an already-running session if there is one)
spark = (
    SparkSession.builder
    .appName('Dataframe')
    .getOrCreate()
)

In [44]:
# Bare expression as the last line of the cell: displays the session object
spark

In [45]:
# Load test2.csv into a DataFrame: the first row supplies the column names,
# and the column types are inferred from the data
df_pyspark = (
    spark.read
    .option('header', 'true')
    .csv('test2.csv', inferSchema=True)
)

# With inferSchema=True:
# Spark automatically guesses the data type of each field. To do so it makes
# an extra pass over the file (all rows by default; the samplingRatio option
# can reduce this). With inferSchema=False (the default), every column is
# read as a string, so if we want typed columns
# ****We must specify a schema explicitly****

In [46]:
# Print the schema as a tree: each column's name, data type and nullability
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Year_Experience: integer (nullable = true)



In [47]:
# Check the Python type of the PySpark DataFrame object
# (a DataFrame is a data structure)
type(df_pyspark)

pyspark.sql.dataframe.DataFrame

### As we can see, a PySpark DataFrame (`pyspark.sql.dataframe.DataFrame`) is a different type from a pandas DataFrame

In [48]:
# List the column names of the DataFrame
df_pyspark.columns

['Name', 'Age', 'Year_Experience']

In [49]:
# select() returns a new DataFrame containing only the requested column(s).
# Evaluated on its own it shows just the DataFrame summary, not the rows;
# call .show() on the result to display the data.
df_pyspark.select('Name')

DataFrame[Name: string]

In [50]:
# Pick two columns and print their rows as a text table
(
    df_pyspark
    .select('Name', 'Year_Experience')
    .show()
)

+--------------+---------------+
|          Name|Year_Experience|
+--------------+---------------+
|Kennith Kawaki|              7|
|   Sudans Wong|              6|
|   Polly Combo|              4|
|Desmond Cheung|             22|
|     Chariotte|              8|
|        Sophia|              3|
|          John|              3|
|  George Woody|              7|
|    Pansy Rose|              9|
|          John|              1|
|         Peter|             18|
|         David|             15|
|        Usamha|              4|
|          Zain|              7|
+--------------+---------------+



In [51]:
# Indexing by column name yields a Column object, not the column's data
df_pyspark['Name']

Column<'Name'>

In [52]:
# List every column as a (column name, data type) pair
df_pyspark.dtypes

[('Name', 'string'), ('Age', 'int'), ('Year_Experience', 'int')]

In [53]:
# Summary statistics for the DataFrame's columns:
# count, mean, stddev, min and max (one row per statistic)
(
    df_pyspark
    .describe()
    .show()
)

+-------+---------+------------------+-----------------+
|summary|     Name|               Age|  Year_Experience|
+-------+---------+------------------+-----------------+
|  count|       14|                14|               14|
|   mean|     null|32.642857142857146|8.142857142857142|
| stddev|     null| 8.793416618367164|6.099900917948685|
|    min|Chariotte|                17|                1|
|    max|     Zain|                51|               22|
+-------+---------+------------------+-----------------+



In [54]:
# Append a derived column: Year_Experience plus five years
df_pyspark = df_pyspark.withColumn(
    'Experience_After_5_Yrs',
    df_pyspark['Year_Experience'] + 5,
)

In [55]:
# Display the DataFrame, which now includes the Experience_After_5_Yrs column
df_pyspark.show()

+--------------+---+---------------+----------------------+
|          Name|Age|Year_Experience|Experience_After_5_Yrs|
+--------------+---+---------------+----------------------+
|Kennith Kawaki| 33|              7|                    12|
|   Sudans Wong| 31|              6|                    11|
|   Polly Combo| 27|              4|                     9|
|Desmond Cheung| 51|             22|                    27|
|     Chariotte| 35|              8|                    13|
|        Sophia| 25|              3|                     8|
|          John| 25|              3|                     8|
|  George Woody| 31|              7|                    12|
|    Pansy Rose| 33|              9|                    14|
|          John| 17|              1|                     6|
|         Peter| 45|             18|                    23|
|         David| 42|             15|                    20|
|        Usamha| 28|              4|                     9|
|          Zain| 34|              7|                    12|
+--------------+---+---------------+----------------------+

In [56]:
# Remove the derived column again; drop() returns a new DataFrame without it
df_pyspark = df_pyspark.drop('Experience_After_5_Yrs')

In [57]:
# Display the DataFrame again to confirm the column was dropped
df_pyspark.show()

+--------------+---+---------------+
|          Name|Age|Year_Experience|
+--------------+---+---------------+
|Kennith Kawaki| 33|              7|
|   Sudans Wong| 31|              6|
|   Polly Combo| 27|              4|
|Desmond Cheung| 51|             22|
|     Chariotte| 35|              8|
|        Sophia| 25|              3|
|          John| 25|              3|
|  George Woody| 31|              7|
|    Pansy Rose| 33|              9|
|          John| 17|              1|
|         Peter| 45|             18|
|         David| 42|             15|
|        Usamha| 28|              4|
|          Zain| 34|              7|
+--------------+---+---------------+



In [58]:
# Rename the column "Year_Experience" to "Experience".
# withColumnRenamed() returns a new DataFrame (df_pyspark itself is unchanged).
# .show() only prints the table and returns None, so the renamed DataFrame is
# stored in df_temp first and displayed in a separate statement; chaining
# .show() onto the assignment would leave df_temp set to None.
df_temp = df_pyspark.withColumnRenamed('Year_Experience', 'Experience')
df_temp.show()

+--------------+---+----------+
|          Name|Age|Experience|
+--------------+---+----------+
|Kennith Kawaki| 33|         7|
|   Sudans Wong| 31|         6|
|   Polly Combo| 27|         4|
|Desmond Cheung| 51|        22|
|     Chariotte| 35|         8|
|        Sophia| 25|         3|
|          John| 25|         3|
|  George Woody| 31|         7|
|    Pansy Rose| 33|         9|
|          John| 17|         1|
|         Peter| 45|        18|
|         David| 42|        15|
|        Usamha| 28|         4|
|          Zain| 34|         7|
+--------------+---+----------+

