Import the libraries

In [2]:
# Import the libraries
import pandas as pd
from pyspark.sql import SparkSession
from sklearn.datasets import load_iris

Load the dataset

In [3]:
# Read the sample CSV into a pandas DataFrame.
# NOTE: despite the `iris_` prefix on the variable, the file loaded here is
# sample_null.csv, not the iris dataset.
iris_pandas = pd.read_csv(
    '../data/sample_null.csv',
)
print(type(iris_pandas))  # confirm we got a pandas DataFrame back
iris_pandas.head()

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,Name,age,Experience,Salary
0,Krish,31.0,10.0,30000.0
1,Sudhanshu,30.0,8.0,25000.0
2,Sunny,29.0,4.0,20000.0
3,Paul,24.0,3.0,20000.0
4,Harsha,21.0,1.0,15000.0


In [4]:
# Create (or reuse) a Spark session and read the same CSV as a Spark DataFrame.
# NOTE: despite the `iris_` prefix on the variable, the file loaded here is
# sample_null.csv, not the iris dataset.
spark = (
    SparkSession.builder
    .appName('Practice')
    .getOrCreate()
)
iris_spark = spark.read.csv(
    '../data/sample_null.csv',
    header=True,       # take column names from the first row
    inferSchema=True,  # let Spark infer column types from the data
)
print(type(iris_spark))  # confirm we got a Spark DataFrame back
iris_spark.show(5)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/03/19 08:22:09 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


<class 'pyspark.sql.dataframe.DataFrame'>
+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
+---------+---+----------+------+
only showing top 5 rows



Get the schema of the DataFrame

In [5]:
# Print each column's name, inferred type and nullability as a tree.
iris_spark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [6]:
# The same type information as a list of (column name, type string) pairs.
iris_spark.dtypes

[('Name', 'string'), ('age', 'int'), ('Experience', 'int'), ('Salary', 'int')]

In [7]:
# Summary statistics (count, mean, stddev, min, max) for every column.
# `count` only counts non-null entries, so it differs between columns
# when some of them contain missing values.
summary_stats = iris_spark.describe()
summary_stats.show()

25/03/19 08:22:13 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


+-------+------+------------------+------------------+-----------------+
|summary|  Name|               age|        Experience|           Salary|
+-------+------+------------------+------------------+-----------------+
|  count|     7|                 8|                 7|                8|
|   mean|  NULL|              28.5| 5.428571428571429|          25750.0|
| stddev|  NULL|5.3718844791323335|3.8234863173611093|9361.776388210581|
|    min|Harsha|                21|                 1|            15000|
|    max| Sunny|                36|                10|            40000|
+-------+------+------------------+------------------+-----------------+



Slicing by column

In [8]:
# Keep only the two named columns. Spark resolves column names
# case-insensitively by default, so 'Age' matches the `age` column; the
# displayed header uses the spelling given here.
iris_spark.select('Name', 'Age').show(5)

+---------+---+
|     Name|Age|
+---------+---+
|    Krish| 31|
|Sudhanshu| 30|
|    Sunny| 29|
|     Paul| 24|
|   Harsha| 21|
+---------+---+
only showing top 5 rows



Add new column

In [9]:
# withColumn() returns a new DataFrame with an extra column computed from a
# Column expression; iris_spark itself is left untouched.
# The `**` operator on a Column produces a double, hence values like 961.0.
squared_age = iris_spark['Age'] ** 2
iris_spark_filtered = iris_spark.withColumn('age_squared', squared_age)
iris_spark_filtered.show(5)

+---------+---+----------+------+-----------+
|     Name|age|Experience|Salary|age_squared|
+---------+---+----------+------+-----------+
|    Krish| 31|        10| 30000|      961.0|
|Sudhanshu| 30|         8| 25000|      900.0|
|    Sunny| 29|         4| 20000|      841.0|
|     Paul| 24|         3| 20000|      576.0|
|   Harsha| 21|         1| 15000|      441.0|
+---------+---+----------+------+-----------+
only showing top 5 rows



Drop existing column

In [10]:
# drop() returns a new DataFrame without the given column. The result is only
# displayed here, so iris_spark_filtered still has `age_squared` afterwards.
without_age_squared = iris_spark_filtered.drop('age_squared')
without_age_squared.show(5)

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
+---------+---+----------+------+
only showing top 5 rows



Rename column

In [11]:
# withColumnRenamed(existing, new) returns a new DataFrame with the column
# renamed. The result is only displayed, not assigned back.
renamed = iris_spark_filtered.withColumnRenamed('age_squared', 'age_powered')
renamed.show(5)

+---------+---+----------+------+-----------+
|     Name|age|Experience|Salary|age_powered|
+---------+---+----------+------+-----------+
|    Krish| 31|        10| 30000|      961.0|
|Sudhanshu| 30|         8| 25000|      900.0|
|    Sunny| 29|         4| 20000|      841.0|
|     Paul| 24|         3| 20000|      576.0|
|   Harsha| 21|         1| 15000|      441.0|
+---------+---+----------+------+-----------+
only showing top 5 rows



Drop missing values

In [12]:
# Show the whole DataFrame so the rows containing NULLs are visible before
# any of them are dropped.
iris_spark.show()

+---------+----+----------+------+
|     Name| age|Experience|Salary|
+---------+----+----------+------+
|    Krish|  31|        10| 30000|
|Sudhanshu|  30|         8| 25000|
|    Sunny|  29|         4| 20000|
|     Paul|  24|         3| 20000|
|   Harsha|  21|         1| 15000|
|  Shubham|  23|         2| 18000|
|   Mahesh|NULL|      NULL| 40000|
|     NULL|  34|        10| 38000|
|     NULL|  36|      NULL|  NULL|
+---------+----+----------+------+



In [16]:
# `thresh=N` keeps only the rows that have at least N non-null values;
# rows with fewer than N non-null values are dropped.
# When `thresh` is given it overrides `how`, so `how` is not passed here:
# combining `how='any'` with `thresh` would wrongly suggest that every row
# containing a null is removed.
iris_spark.na.drop(thresh=2).show()

+---------+----+----------+------+
|     Name| age|Experience|Salary|
+---------+----+----------+------+
|    Krish|  31|        10| 30000|
|Sudhanshu|  30|         8| 25000|
|    Sunny|  29|         4| 20000|
|     Paul|  24|         3| 20000|
|   Harsha|  21|         1| 15000|
|  Shubham|  23|         2| 18000|
|   Mahesh|NULL|      NULL| 40000|
|     NULL|  34|        10| 38000|
+---------+----+----------+------+

