# Introduction to Basic PySpark Programs

Lab Exercises:
1 Write a PySpark program to square a set of integers.
2 Write a PySpark program to find the maximum of a given set of numbers.
3 Write a PySpark program to find the average of N numbers.
4 Demonstrate how to read a CSV file into a PySpark DataFrame.
5 Use PySpark commands to display the first few rows and schema of a DataFrame.
6 Calculate basic summary statistics for a specific column in the DataFrame.

PySpark is the Python API for Apache Spark. Spark is an open-source cluster-computing system used for big-data workloads, and it is designed for fast computation.


In [16]:
import pyspark
import os
import sys
from pyspark import SparkContext, SparkConf
# Set both PySpark interpreter environment variables to the interpreter that is
# running this notebook (sys.executable).
# NOTE(review): presumably this avoids a driver/worker Python version mismatch -- confirm.
os.environ['PYSPARK_PYTHON']=sys.executable
os.environ['PYSPARK_DRIVER_PYTHON']=sys.executable
from pyspark.sql import SparkSession 

In [18]:
# Build (or reuse) the Spark session named 'square', requesting 16g for the
# spark.driver.memory setting.
spark = (
    SparkSession.builder
    .config("spark.driver.memory", "16g")
    .appName('square')
    .getOrCreate()
)



In [29]:
import pandas as pd 
from pyspark.sql import functions as F
from pyspark.sql.types import *
# Two-row demo frame; no explicit types are given, so Spark infers them from
# the Python tuples.
df = spark.createDataFrame(
    data=[("joe", 34), ("luisa", 22)],
    schema=["first_name", "age"],
)

# Print the inferred schema, then the rows themselves.
df.printSchema()
df.show()

root
 |-- first_name: string (nullable = true)
 |-- age: long (nullable = true)

+----------+---+
|first_name|age|
+----------+---+
|       joe| 34|
|     luisa| 22|
+----------+---+



In [43]:
# Exercise 1: square a set of integers (the `age` column).
# Build the column expression once and reuse it in both displays below.
age_squared = df['age'] * df['age']

# Unaliased expression: Spark labels the resulting column "(age * age)".
df.select(age_squared).show()

# Replace `age` with its square and present it under the name "Square".
# `df` is deliberately not rebound: assigning the result back to `df` makes the
# cell non-idempotent, because every re-run squares the already-squared values.
df.withColumn("age", age_squared).withColumnRenamed("age", "Square").show()

+-----------+
|(age * age)|
+-----------+
|       1156|
|        484|
+-----------+

+----------+------+
|first_name|Square|
+----------+------+
|       joe|  1156|
|     luisa|   484|
+----------+------+



In [49]:
# Demo frame with one integer column, one float column and one column of
# integer lists; it is reused by the UDF, max and average cells.
df = spark.createDataFrame(
    data=[
        (1, -1.0, [1, 2]),
        (2, 0.5, [3, 4, 5]),
        (3, 2.7, [6, 7, 8, 9]),
    ],
    schema=["integers", "floats", "integer_arrays"],
)

# Print the inferred schema, then the rows themselves.
df.printSchema()
df.show()

root
 |-- integers: long (nullable = true)
 |-- floats: double (nullable = true)
 |-- integer_arrays: array (nullable = true)
 |    |-- element: long (containsNull = true)

+--------+------+--------------+
|integers|floats|integer_arrays|
+--------+------+--------------+
|       1|  -1.0|        [1, 2]|
|       2|   0.5|     [3, 4, 5]|
|       3|   2.7|  [6, 7, 8, 9]|
+--------+------+--------------+



In [54]:
from pyspark.sql.functions import udf
def square(x):
    """Return ``x`` raised to the power of two.

    Plain Python helper; it is wrapped in Spark UDFs further down.
    """
    return pow(x, 2)

In [55]:
# Wrap `square` as a Spark UDF whose declared return type is IntegerType.
# No extra import is needed here: `from pyspark.sql import udf` would rebind
# `udf` to the pyspark.sql.udf *module*, shadowing the udf function.
square_udf_int = F.udf(square, IntegerType())

# Demonstration of a return-type mismatch: on the `floats` column the Python
# function returns a float, which does not match the declared IntegerType, so
# Spark yields null for `float_squared` instead of raising.
# The column is referenced as 'integers', spelled exactly as in the schema,
# rather than relying on case-insensitive name resolution.
(
    df.select(
        'integers',
        'floats',
        square_udf_int('integers').alias('int_squared'),
        square_udf_int('floats').alias('float_squared'),
    ).show()
)

+--------+------+-----------+-------------+
|integers|floats|int_squared|float_squared|
+--------+------+-----------+-------------+
|       1|  -1.0|          1|         null|
|       2|   0.5|          4|         null|
|       3|   2.7|          9|         null|
+--------+------+-----------+-------------+



In [71]:
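# Kept for reference only: calling the plain Python function on a column *name*
# cannot work, because square('integers') evaluates 'integers' ** 2 on a str and
# raises TypeError. The function has to be wrapped in a UDF, as in the other cells.
# df.select('integers',square('integers').alias('int_squared')).show()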

In [60]:
# Same squaring helper, this time registered with FloatType as the declared
# return type.
square_udf_float = F.udf(lambda z: square(z), FloatType())

# Mirror image of the IntegerType demo: the float column is squared, while the
# integer column comes back as null in `int_squared`.
squared_as_float = df.select(
    'integers',
    'floats',
    square_udf_float('integers').alias('int_squared'),
    square_udf_float('floats').alias('float_squared'),
)
squared_as_float.show()

+--------+------+-----------+-------------+
|integers|floats|int_squared|float_squared|
+--------+------+-----------+-------------+
|       1|  -1.0|       null|          1.0|
|       2|   0.5|       null|         0.25|
|       3|   2.7|       null|         7.29|
+--------+------+-----------+-------------+



In [62]:
from pyspark.sql.types import ArrayType  
def square_list(x):
    """Square every element of a list, returning the squares as floats.

    Args:
        x: A list of numbers, or None. Individual elements may also be None.

    Returns:
        A new list with ``float(val) ** 2`` for each element, or None when
        ``x`` itself is None. None elements are passed through as None.
    """
    # The array column is nullable and may contain null elements; propagate
    # nulls instead of failing on iteration or on float(None).
    if x is None:
        return None
    return [None if val is None else float(val) ** 2 for val in x]
# Register square_list as a UDF that returns an array of floats. The function
# is passed directly; wrapping it in a lambda adds nothing.
square_list_udf = F.udf(square_list, ArrayType(FloatType()))

# Alias the result explicitly so the output column gets a readable name rather
# than one derived from the wrapped callable (e.g. "<lambda>(integer_arrays)").
df.select(
    'integer_arrays',
    square_list_udf('integer_arrays').alias('squared_arrays'),
).show()

+--------------+------------------------+
|integer_arrays|<lambda>(integer_arrays)|
+--------------+------------------------+
|        [1, 2]|              [1.0, 4.0]|
|     [3, 4, 5]|       [9.0, 16.0, 25.0]|
|  [6, 7, 8, 9]|    [36.0, 49.0, 64.0...|
+--------------+------------------------+



In [67]:
# Exercise 2: maximum of a given set of numbers.
# Aggregate with the DataFrame API instead of converting to an RDD and
# comparing whole Row objects; on an empty frame this gives None rather than
# raising.
max_integer = df.agg(F.max('integers')).first()[0]
print(max_integer)

3


In [70]:
# Exercise 3: average of N numbers.
# show() prints the table itself and returns None, so wrapping it in print()
# only adds a stray "None" line to the output.
df.agg(F.avg(df['integers'])).show()

+-------------+
|avg(integers)|
+-------------+
|          2.0|
+-------------+

None


In [74]:
# Exercises 4, 5 and 6: read a CSV file into a DataFrame.
# getOrCreate() hands back the session that is already running when there is
# one, so this line mainly makes the cell usable on its own.
spark = SparkSession.builder.appName('DATA ANALYSIS').getOrCreate()

# header=True: take column names from the first line of the file.
# inferSchema=True: let Spark infer the column types from the data.
# Real booleans are used instead of the strings "True".
df = spark.read.csv('data.csv', header=True, inferSchema=True)

In [78]:
# Exercise 5: display the schema and the first few rows.
df.printSchema()
# show(5) prints only the first five rows. A bare df.head() in the middle of a
# cell displays nothing, because its return value is discarded.
df.show(5)

# Exercise 6: summary statistics for the whole frame, then for the single
# `age` column.
df.summary().show()
df.select('age').summary().show()

root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)

+----+---+
|name|age|
+----+---+
|  ab| 23|
|  cd| 45|
|  ef| 42|
|  gh| 13|
|  ij| 38|
+----+---+

+-------+----+------------------+
|summary|name|               age|
+-------+----+------------------+
|  count|   5|                 5|
|   mean|null|              32.2|
| stddev|null|13.663820841916802|
|    min|  ab|                13|
|    25%|null|                23|
|    50%|null|                38|
|    75%|null|                42|
|    max|  ij|                45|
+-------+----+------------------+

+-------+------------------+
|summary|               age|
+-------+------------------+
|  count|                 5|
|   mean|              32.2|
| stddev|13.663820841916802|
|    min|                13|
|    25%|                23|
|    50%|                38|
|    75%|                42|
|    max|                45|
+-------+------------------+

