In [1]:
# Import SparkSession

from pyspark.sql import SparkSession
import findspark
findspark.init()
from pyspark.sql.functions import *

In [2]:
# Create SparkSession 
spark = SparkSession.builder \
    .appName('SparkExample') \
        .getOrCreate()

### RDD Parallelize

In [3]:
# Create RDD from parallelize    
dataList = [("Java", 20000), ("Python", 100000), ("Scala", 3000)]
rdd=spark.sparkContext.parallelize(dataList)
print(rdd.collect())

[('Java', 20000), ('Python', 100000), ('Scala', 3000)]


### Create dataframe with Schema

In [4]:
from pyspark.sql import Row
from pyspark.sql.types import *

rdd = spark.sparkContext.parallelize([
    Row(name='Allie', age=2),
    Row(name='Sara', age=33),
    Row(name='Grace', age=31)])

schema = schema = StructType([
   StructField("name", StringType(), True),
   StructField("age", IntegerType(), False)])

df = spark.createDataFrame(rdd, schema)

df.show(truncate=False)

+-----+---+
|name |age|
+-----+---+
|Allie|2  |
|Sara |33 |
|Grace|31 |
+-----+---+



### Create dataframe with data and columns

In [5]:
data2 = [('James','','Smith','1991-04-01','M',3000),
  ('Michael','Rose','','2000-05-19','M',4000),
  ('Robert','','Williams','1978-09-05','M',4000),
  ('Maria','Anne','Jones','1967-12-01','F',4000),
  ('Jen','Mary','Brown','1980-02-17','F',-1)
]

columns = ["firstname","middlename","lastname","dob","gender","salary"]
d2 = spark.createDataFrame(data=data2, schema = columns)
d2.show()

+---------+----------+--------+----------+------+------+
|firstname|middlename|lastname|       dob|gender|salary|
+---------+----------+--------+----------+------+------+
|    James|          |   Smith|1991-04-01|     M|  3000|
|  Michael|      Rose|        |2000-05-19|     M|  4000|
|   Robert|          |Williams|1978-09-05|     M|  4000|
|    Maria|      Anne|   Jones|1967-12-01|     F|  4000|
|      Jen|      Mary|   Brown|1980-02-17|     F|    -1|
+---------+----------+--------+----------+------+------+



In [6]:
dfFromData2 = spark.createDataFrame(data2).toDF(*columns)
dfFromData2.show()

+---------+----------+--------+----------+------+------+
|firstname|middlename|lastname|       dob|gender|salary|
+---------+----------+--------+----------+------+------+
|    James|          |   Smith|1991-04-01|     M|  3000|
|  Michael|      Rose|        |2000-05-19|     M|  4000|
|   Robert|          |Williams|1978-09-05|     M|  4000|
|    Maria|      Anne|   Jones|1967-12-01|     F|  4000|
|      Jen|      Mary|   Brown|1980-02-17|     F|    -1|
+---------+----------+--------+----------+------+------+



### Rename column dataframe

In [7]:
d2.withColumnRenamed('middlename', 'midname').printSchema()

root
 |-- firstname: string (nullable = true)
 |-- midname: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: long (nullable = true)



### change value column

In [8]:

d2.withColumn("salary",col("salary")*100).show()

+---------+----------+--------+----------+------+------+
|firstname|middlename|lastname|       dob|gender|salary|
+---------+----------+--------+----------+------+------+
|    James|          |   Smith|1991-04-01|     M|300000|
|  Michael|      Rose|        |2000-05-19|     M|400000|
|   Robert|          |Williams|1978-09-05|     M|400000|
|    Maria|      Anne|   Jones|1967-12-01|     F|400000|
|      Jen|      Mary|   Brown|1980-02-17|     F|  -100|
+---------+----------+--------+----------+------+------+



In [9]:
d3 = d2.withColumn("salary", when(d2.salary == -100,2000) \
    .otherwise(d2.salary + 1000)
    )
d3.show()

+---------+----------+--------+----------+------+------+
|firstname|middlename|lastname|       dob|gender|salary|
+---------+----------+--------+----------+------+------+
|    James|          |   Smith|1991-04-01|     M|  4000|
|  Michael|      Rose|        |2000-05-19|     M|  5000|
|   Robert|          |Williams|1978-09-05|     M|  5000|
|    Maria|      Anne|   Jones|1967-12-01|     F|  5000|
|      Jen|      Mary|   Brown|1980-02-17|     F|   999|
+---------+----------+--------+----------+------+------+



In [10]:
# GroupByColumn
d2.groupBy('gender').count().show()

+------+-----+
|gender|count|
+------+-----+
|     M|    3|
|     F|    2|
+------+-----+



In [11]:
d3.describe()

DataFrame[summary: string, firstname: string, middlename: string, lastname: string, dob: string, gender: string, salary: string]

### Adding Column in the DataFrame

In [12]:
d3 = d3.withColumn('Salary in percent', col('salary')*0.01)

In [13]:
d3 =d3.drop(col('Salary in percent'))

### change empty cells to null cells

In [14]:
d3 = d3.withColumn('middlename', \
    when(col('middlename')== '', None) \
        .otherwise(col('middlename')) \
    )

In [15]:
d3 = d3.withColumn('lastname', \
    when(col('lastname')== '', None) \
        .otherwise(col('lastname')) \
    )

In [20]:
d3.na.drop(how='all').show()

+---------+----------+--------+----------+------+------+
|firstname|middlename|lastname|       dob|gender|salary|
+---------+----------+--------+----------+------+------+
|    James|      null|   Smith|1991-04-01|     M|  4000|
|  Michael|      Rose|    null|2000-05-19|     M|  5000|
|   Robert|      null|Williams|1978-09-05|     M|  5000|
|    Maria|      Anne|   Jones|1967-12-01|     F|  5000|
|      Jen|      Mary|   Brown|1980-02-17|     F|   999|
+---------+----------+--------+----------+------+------+



In [26]:
d3.na.drop(how='any', thresh=3).show()

+---------+----------+--------+----------+------+------+
|firstname|middlename|lastname|       dob|gender|salary|
+---------+----------+--------+----------+------+------+
|    James|      null|   Smith|1991-04-01|     M|  4000|
|  Michael|      Rose|    null|2000-05-19|     M|  5000|
|   Robert|      null|Williams|1978-09-05|     M|  5000|
|    Maria|      Anne|   Jones|1967-12-01|     F|  5000|
|      Jen|      Mary|   Brown|1980-02-17|     F|   999|
+---------+----------+--------+----------+------+------+



In [27]:
d3.na.drop(how='any', subset=['middlename']).show()

+---------+----------+--------+----------+------+------+
|firstname|middlename|lastname|       dob|gender|salary|
+---------+----------+--------+----------+------+------+
|  Michael|      Rose|    null|2000-05-19|     M|  5000|
|    Maria|      Anne|   Jones|1967-12-01|     F|  5000|
|      Jen|      Mary|   Brown|1980-02-17|     F|   999|
+---------+----------+--------+----------+------+------+



In [28]:
d3.na.fill('Missing values', 'lastname').show()

+---------+----------+--------------+----------+------+------+
|firstname|middlename|      lastname|       dob|gender|salary|
+---------+----------+--------------+----------+------+------+
|    James|      null|         Smith|1991-04-01|     M|  4000|
|  Michael|      Rose|Missing values|2000-05-19|     M|  5000|
|   Robert|      null|      Williams|1978-09-05|     M|  5000|
|    Maria|      Anne|         Jones|1967-12-01|     F|  5000|
|      Jen|      Mary|         Brown|1980-02-17|     F|   999|
+---------+----------+--------------+----------+------+------+



In [36]:
d4 = spark.read.csv('test2.csv', header=True, inferSchema=True)
d4.show()

+---------+----+----------+------+
|     Name| age|Experience|Salary|
+---------+----+----------+------+
|    Krish|  31|        10| 30000|
|Sudhanshu|  30|         8| 25000|
|    Sunny|  29|         4| 20000|
|     Paul|  24|         3| 20000|
|   Harsha|  21|         1| 15000|
|  Shubham|  23|         2| 18000|
|   Mahesh|null|      null| 40000|
|     null|  34|        10| 38000|
|     null|  36|      null|  null|
+---------+----+----------+------+



In [37]:
d4.printSchema()

root
 |-- Name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [38]:
from pyspark.ml.feature import Imputer

imputer = Imputer(
    inputCols=['age', 'Experience', 'Salary'], 
    outputCols=["{}_imputed".format(c) for c in ['age', 'Experience', 'Salary']]
    ).setStrategy("median")

In [39]:
# Add imputation cols to df
imputer.fit(d4).transform(d4).show()

+---------+----+----------+------+-----------+------------------+--------------+
|     Name| age|Experience|Salary|age_imputed|Experience_imputed|Salary_imputed|
+---------+----+----------+------+-----------+------------------+--------------+
|    Krish|  31|        10| 30000|         31|                10|         30000|
|Sudhanshu|  30|         8| 25000|         30|                 8|         25000|
|    Sunny|  29|         4| 20000|         29|                 4|         20000|
|     Paul|  24|         3| 20000|         24|                 3|         20000|
|   Harsha|  21|         1| 15000|         21|                 1|         15000|
|  Shubham|  23|         2| 18000|         23|                 2|         18000|
|   Mahesh|null|      null| 40000|         29|                 4|         40000|
|     null|  34|        10| 38000|         34|                10|         38000|
|     null|  36|      null|  null|         36|                 4|         20000|
+---------+----+----------+-