In [2]:
import pyspark

In [4]:
from pyspark.sql import SparkSession
# Start (or reuse) the Spark session that every later cell reads through.
spark = (
    SparkSession.builder
    .appName('DataFrame')
    .getOrCreate()
)

In [5]:
# Echo the session object so the notebook displays its repr.
spark

In [7]:
# Read the CSV using its first row as column names; no type inference is requested here.
df_pyspark = (
    spark.read
    .option('header', 'true')
    .csv('test1.csv')
)

In [8]:
# Print column names and types; with only the header option set, both columns are read as string.
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: string (nullable = true)



In [11]:
# Without the inferSchema option every column is read as a string.
# Turning it on asks Spark to infer each column's type from the data.
df_pyspark = (
    spark.read
    .option('header', 'true')
    .option('inferSchema', 'true')
    .csv('test1.csv')
)

In [12]:
# Re-check the schema: Age is now reported as integer, Name stays string.
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)



In [45]:
# Equivalent single-call form: pass the reader options as keyword arguments to csv().
df_pyspark = spark.read.csv(
    'test1.csv',
    header=True,
    inferSchema=True,
)

In [38]:
# The keyword-argument read reports the same schema: Name string, Age integer.
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)



In [17]:
# Retrieve the column names of the DataFrame as a Python list of strings.
df_pyspark.columns

['Name', 'Age']

In [19]:
# Select a single column: select() gives back a DataFrame holding only 'Name',
# and show() prints its rows.
df_pyspark.select('Name').show()

+--------+
|    Name|
+--------+
|   Krish|
|Sudhansh|
|    Suny|
+--------+



In [20]:
# Select several columns at once; the names can be passed as separate arguments.
df_pyspark.select('Name', 'Age').show()

+--------+---+
|    Name|Age|
+--------+---+
|   Krish| 31|
|Sudhansh| 30|
|    Suny| 29|
+--------+---+



In [39]:
# Show the summary statistics of the dataframe.
# For the string column Name, min and max follow lexicographic (alphabetical) order:
# 'Krish' sorts before 'Sudhansh' and 'Suny', so it is the min regardless of row position.
# mean and stddev are null for Name because they are only computed for numeric columns.
df_pyspark.describe().show()

+-------+-----+----+
|summary| Name| Age|
+-------+-----+----+
|  count|    3|   3|
|   mean| null|30.0|
| stddev| null| 1.0|
|    min|Krish|  29|
|    max| Suny|  31|
+-------+-----+----+



In [46]:
# Add a derived column. withColumn does not modify the DataFrame in place; it returns
# a new one, which is assigned back to df_pyspark here.
# df_pyspark['Age'] is a Column expression rather than a DataFrame, so it has no show();
# it is meant to be used inside DataFrame methods such as withColumn or select.
df_pyspark = df_pyspark.withColumn(
    'Age after 2 years',
    df_pyspark['Age'] + 2,
)

In [48]:
# Display the DataFrame; the 'Age after 2 years' column holds Age + 2 for each row.
df_pyspark.show()

+--------+---+-----------------+
|    Name|Age|Age after 2 years|
+--------+---+-----------------+
|   Krish| 31|               33|
|Sudhansh| 30|               32|
|    Suny| 29|               31|
+--------+---+-----------------+



In [49]:
# Remove the derived column again. drop() also returns a new DataFrame instead of
# changing the existing one, hence the reassignment.
df_pyspark = df_pyspark.drop('Age after 2 years')

In [50]:
# Display the DataFrame again; only Name and Age remain after the drop.
df_pyspark.show()

+--------+---+
|    Name|Age|
+--------+---+
|   Krish| 31|
|Sudhansh| 30|
|    Suny| 29|
+--------+---+



In [52]:
# Rename a column.
# withColumnRenamed is not an in-place operation: it returns a new DataFrame. The result
# is only shown here and not assigned, so df_pyspark itself still has the column 'Name'.
df_pyspark.withColumnRenamed('Name', 'Full Name').show()

+---------+---+
|Full Name|Age|
+---------+---+
|    Krish| 31|
| Sudhansh| 30|
|     Suny| 29|
+---------+---+

