In [1]:
import pyspark
from pyspark.ml.feature import Imputer
from pyspark.sql import SparkSession

In [2]:
# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('Dataframe')
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/06/15 16:14:19 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Evaluate the session object so the notebook can display its representation.
spark

In [29]:
# Load the semicolon-delimited CSV: the first line supplies the column names
# and inferSchema asks Spark to infer each column's type.
# header=True may instead be passed straight to csv() next to inferSchema.
df = (
    spark.read
    .option("delimiter", ";")
    .option("header", "true")
    .csv('test1.csv', inferSchema=True)
)

In [30]:
# Print the loaded rows as a text table.
df.show()

+---------+----+-------+
|     Name| Age| Salary|
+---------+----+-------+
|    Krish|  31| 200000|
|Sudhanshu|  30| 500000|
|    Sunny|  29|   NULL|
|      Ama|  37|  40000|
|       Mi|  25| 150000|
|     NULL|NULL|6000000|
+---------+----+-------+



In [31]:
# Confirm which DataFrame class the reader returned.
type(df)

pyspark.sql.classic.dataframe.DataFrame

In [32]:
# Check the schema: column names, inferred types and nullability.
df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [33]:
# List the column names.
df.columns

['Name', 'Age', 'Salary']

In [34]:
# Project the frame down to the Name and Age columns and print the result.
df.select('Name', 'Age').show()

+---------+----+
|     Name| Age|
+---------+----+
|    Krish|  31|
|Sudhanshu|  30|
|    Sunny|  29|
|      Ama|  37|
|       Mi|  25|
|     NULL|NULL|
+---------+----+



In [35]:
# Summary statistics (count, mean, stddev, min, max) per column.
# count only includes non-null values, hence 5 for a 6-row frame.
df.describe().show()

+-------+-----+----------------+-----------------+
|summary| Name|             Age|           Salary|
+-------+-----+----------------+-----------------+
|  count|    5|               5|                5|
|   mean| NULL|            30.4|        1378000.0|
| stddev| NULL|4.33589667773576|2589386.027613496|
|    min|  Ama|              25|            40000|
|    max|Sunny|              37|          6000000|
+-------+-----+----------------+-----------------+



In [36]:
# Derive Experience as Age minus 24; a null Age yields a null Experience.
df = df.withColumn(
    'Experience',
    df.Age - 24,
)

In [37]:
# Print the frame again, now including the derived Experience column.
df.show()

+---------+----+-------+----------+
|     Name| Age| Salary|Experience|
+---------+----+-------+----------+
|    Krish|  31| 200000|         7|
|Sudhanshu|  30| 500000|         6|
|    Sunny|  29|   NULL|         5|
|      Ama|  37|  40000|        13|
|       Mi|  25| 150000|         1|
|     NULL|NULL|6000000|      NULL|
+---------+----+-------+----------+



In [38]:
# Called without arguments, drop() removes every row that has a null in any column.
df.na.drop().show()

+---------+---+------+----------+
|     Name|Age|Salary|Experience|
+---------+---+------+----------+
|    Krish| 31|200000|         7|
|Sudhanshu| 30|500000|         6|
|      Ama| 37| 40000|        13|
|       Mi| 25|150000|         1|
+---------+---+------+----------+



In [39]:
# how="all" drops a row only when every column is null.
# No row here is entirely null, so all six rows remain.
df.na.drop(how="all").show()

+---------+----+-------+----------+
|     Name| Age| Salary|Experience|
+---------+----+-------+----------+
|    Krish|  31| 200000|         7|
|Sudhanshu|  30| 500000|         6|
|    Sunny|  29|   NULL|         5|
|      Ama|  37|  40000|        13|
|       Mi|  25| 150000|         1|
|     NULL|NULL|6000000|      NULL|
+---------+----+-------+----------+



In [40]:
# how='any' drops a row as soon as one of its columns is null.
df.na.drop(how='any').show()

+---------+---+------+----------+
|     Name|Age|Salary|Experience|
+---------+---+------+----------+
|    Krish| 31|200000|         7|
|Sudhanshu| 30|500000|         6|
|      Ama| 37| 40000|        13|
|       Mi| 25|150000|         1|
+---------+---+------+----------+



In [None]:
# Keep only rows that have at least `thresh` non-null values; rows with fewer are dropped.
# With thresh=2 the row whose only non-null value is Salary is removed.
df.na.drop(thresh=2).show()

+---------+---+------+----------+
|     Name|Age|Salary|Experience|
+---------+---+------+----------+
|    Krish| 31|200000|         7|
|Sudhanshu| 30|500000|         6|
|    Sunny| 29|  NULL|         5|
|      Ama| 37| 40000|        13|
|       Mi| 25|150000|         1|
+---------+---+------+----------+



In [44]:
# Mean-impute the numeric columns, writing each result to a parallel
# "<column>_imputed" column so the original values stay visible.
# Imputer is imported in the notebook's top import cell.
# The column list is defined once so inputs and outputs cannot drift apart.
numeric_cols = ['Age', 'Salary', 'Experience']

imputer = Imputer(
    strategy='mean',
    inputCols=numeric_cols,
    outputCols=[f'{name}_imputed' for name in numeric_cols],
)

In [45]:
# Fit on df to learn each column's mean, then append the *_imputed columns.
# Imputed values appear as whole numbers in the output (mean Age 30.4 shows as 30).
fitted_imputer = imputer.fit(df)
fitted_imputer.transform(df).show()

+---------+----+-------+----------+-----------+--------------+------------------+
|     Name| Age| Salary|Experience|Age_imputed|Salary_imputed|Experience_imputed|
+---------+----+-------+----------+-----------+--------------+------------------+
|    Krish|  31| 200000|         7|         31|        200000|                 7|
|Sudhanshu|  30| 500000|         6|         30|        500000|                 6|
|    Sunny|  29|   NULL|         5|         29|       1378000|                 5|
|      Ama|  37|  40000|        13|         37|         40000|                13|
|       Mi|  25| 150000|         1|         25|        150000|                 1|
|     NULL|NULL|6000000|      NULL|         30|       6000000|                 6|
+---------+----+-------+----------+-----------+--------------+------------------+



In [47]:
# Filter with a SQL expression string; rows whose Salary is null are not returned.
df.filter('Salary<=200000').show()

+-----+---+------+----------+
| Name|Age|Salary|Experience|
+-----+---+------+----------+
|Krish| 31|200000|         7|
|  Ama| 37| 40000|        13|
|   Mi| 25|150000|         1|
+-----+---+------+----------+



In [48]:
# The same filter written as a Column expression instead of a SQL string.
df.filter(df['Salary']<=200000).show()

+-----+---+------+----------+
| Name|Age|Salary|Experience|
+-----+---+------+----------+
|Krish| 31|200000|         7|
|  Ama| 37| 40000|        13|
|   Mi| 25|150000|         1|
+-----+---+------+----------+



In [49]:
# Filter on Salary first, then keep only Name and Age for display.
(
    df
    .filter(df['Salary'] <= 200000)
    .select(['Name', 'Age'])
    .show()
)

+-----+---+
| Name|Age|
+-----+---+
|Krish| 31|
|  Ama| 37|
|   Mi| 25|
+-----+---+



In [51]:
# Keep rows whose Salary lies in the inclusive range 150000..200000.
df.filter(df['Salary'].between(150000, 200000)).show()

+-----+---+------+----------+
| Name|Age|Salary|Experience|
+-----+---+------+----------+
|Krish| 31|200000|         7|
|   Mi| 25|150000|         1|
+-----+---+------+----------+



In [52]:
# Group by Name and sum the numeric columns (Age, Salary, Experience).
# Rows with a null Name form their own group, and a group with no non-null values sums to NULL.
df.groupBy('Name').sum().show()

+---------+--------+-----------+---------------+
|     Name|sum(Age)|sum(Salary)|sum(Experience)|
+---------+--------+-----------+---------------+
|     NULL|    NULL|    6000000|           NULL|
|Sudhanshu|      30|     500000|              6|
|    Sunny|      29|       NULL|              5|
|    Krish|      31|     200000|              7|
|      Ama|      37|      40000|             13|
|       Mi|      25|     150000|              1|
+---------+--------+-----------+---------------+

