### Chapter 2: PySpark DataFrames - 1
In this Chapter:
1. csv options -- header, inferSchema
2. DataFrame.printSchema()
3. DataFrame.head()
4. DataFrame.describe() & describe().show()
5. DataFrame.dtypes (a property, accessed without parentheses)
6. Selecting columns -- DataFrame.select()
7. Adding a new column -- DataFrame.withColumn('colName', columnExpression)
8. Dropping a column -- DataFrame.drop()
9. Renaming a column -- DataFrame.withColumnRenamed()

In [None]:
#Importing SparkSession from sql(pyspark)
from pyspark.sql import SparkSession

In [4]:
# Start (or reuse) the SparkSession, named 'Dataframe', that the later cells read data through
spark = (
    SparkSession.builder
    .appName('Dataframe')
    .getOrCreate()
)

In [9]:
# Evaluate the session object as the cell's last expression so the notebook displays it
spark

In [10]:
# Read a CSV file; option('header','true') makes Spark use the file's first line as the column names
# Note: despite the variable name, .csv() returns a DataFrame, not a reader object
csvReader=spark.read.option('header','true').csv('../test1.csv')

In [11]:
# Print the rows as a text table
csvReader.show()

+-----+---+
| Name|Age|
+-----+---+
|Sahil| 24|
|Singh| 22|
|  Eka| 19|
+-----+---+



In [12]:
# But what exactly is csvReader? What is its type?
csvReader
# Evaluating it shows that csvReader is a DataFrame, along with each column's name and type

DataFrame[Name: string, Age: string]

In [13]:
# Check the schema
csvReader.printSchema()
# Without inferSchema, .csv() reads every column as a string (Age included)

root
 |-- Name: string (nullable = true)
 |-- Age: string (nullable = true)



In [14]:
# Change the default behaviour of .csv(): inferSchema=True asks Spark to detect column types from the data
df_test1 = (
    spark.read
    .option('header', 'true')
    .csv('../test1.csv', inferSchema=True)
)

In [16]:
df_test1.printSchema()
# With inferSchema enabled, Age is now read as an integer instead of a string

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)



In [17]:
# More concise: pass header and inferSchema as keyword arguments to csv()
df_test1 = spark.read.csv(
    '../test1.csv',
    header=True,
    inferSchema=True,
)

# Show the rows, then the inferred schema
df_test1.show()
df_test1.printSchema()

+-----+---+
| Name|Age|
+-----+---+
|Sahil| 24|
|Singh| 22|
|  Eka| 19|
+-----+---+

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)



In [18]:
# Confirm the Python class of df_test1 (a pyspark.sql DataFrame)
type(df_test1)

pyspark.sql.dataframe.DataFrame

In [5]:
# Load a second CSV (columns: Name, Age, Exp) with a header row and inferred types
# NOTE(review): test1.csv was read from '../' but this path is relative to the working directory -- confirm the file location
df_test2 = spark.read.csv('test2.csv',header=True,inferSchema=True)
type(df_test2)

pyspark.sql.dataframe.DataFrame

In [20]:
# head() returns the first row as a Row object (it does not print anything itself)
df_test2.head()

Row(Name='Sahil', Age=24, Exp=3)

In [20]:
# head(n) returns a list containing the first n Row objects
df_test2.head(2)

[Row(Name='Sahil', Age=24, Exp=3), Row(Name='Singh', Age=22, Exp=1)]

In [22]:
# With no argument, head() returns a single Row rather than a list
type(df_test2.head())

pyspark.sql.types.Row

In [23]:
# dtypes is an attribute (no parentheses): a list of (column name, data type) pairs
df_test2.dtypes

[('Name', 'string'), ('Age', 'int'), ('Exp', 'int')]

In [26]:
# DataFrame.describe()
# Computes basic statistics for numeric and string columns:
# count, mean, stddev, min, and max.
# If no columns are given, statistics are computed for all numeric and string columns.
# The result is itself a DataFrame in which every statistic is stored as a string.
df2_describe = df_test2.describe()
df2_describe

DataFrame[summary: string, Name: string, Age: string, Exp: string]

In [27]:
# show() prints the summary statistics as a text table
df2_describe.show()

+-------+-----+------------------+------------------+
|summary| Name|               Age|               Exp|
+-------+-----+------------------+------------------+
|  count|    3|                 3|                 3|
|   mean| null|21.666666666666668|1.3333333333333333|
| stddev| null|2.5166114784235836|1.5275252316519468|
|    min|  Eka|                19|                 0|
|    max|Singh|                24|                 3|
+-------+-----+------------------+------------------+



In [6]:
# Select specific columns
# DataFrame.select(columnName) or select([column1, column2, ...]) returns a new DataFrame
df2_column=df_test2.select('Name')

In [11]:
# Evaluating the result shows it is a DataFrame holding only the Name column
df2_column

DataFrame[Name: string]

In [12]:
# Print the selected column's values
df2_column.show()

+-----+
| Name|
+-----+
|Sahil|
|Singh|
|  Eka|
+-----+



In [13]:
# Select several columns at once by passing a list of column names
df2_column2 = df_test2.select(['Name','Exp'])

In [14]:
# Print the two selected columns
df2_column2.show()

+-----+---+
| Name|Exp|
+-----+---+
|Sahil|  3|
|Singh|  1|
|  Eka|  0|
+-----+---+



In [32]:
# Adding a column to a DataFrame
# DataFrame.withColumn('new column name', column_expression)
# The second argument must be a Column expression (here df_test2['Exp'] + 2), not a plain Python value.
# withColumn returns a new DataFrame; df_test2 itself is left unchanged.
df2_addCol = df_test2.withColumn('Exp+2yrs', df_test2['Exp'] + 2)
df2_addCol

DataFrame[Name: string, Age: int, Exp: int, Exp+2yrs: int]

In [30]:
# Print the DataFrame with the new Exp+2yrs column
df2_addCol.show()

+-----+---+---+--------+
| Name|Age|Exp|Exp+2yrs|
+-----+---+---+--------+
|Sahil| 24|  3|       5|
|Singh| 22|  1|       3|
|  Eka| 19|  0|       2|
+-----+---+---+--------+



In [33]:
# Indexing a DataFrame by column name gives a Column object, not the column's values
df_test2['Exp']

Column<'Exp'>

In [35]:
# df_test2 still has only its original three columns; withColumn did not modify it
df_test2.show()

+-----+---+---+
| Name|Age|Exp|
+-----+---+---+
|Sahil| 24|  3|
|Singh| 22|  1|
|  Eka| 19|  0|
+-----+---+---+



In [36]:
# Show the derived DataFrame again, for comparison with df_test2
df2_addCol.show()

+-----+---+---+--------+
| Name|Age|Exp|Exp+2yrs|
+-----+---+---+--------+
|Sahil| 24|  3|       5|
|Singh| 22|  1|       3|
|  Eka| 19|  0|       2|
+-----+---+---+--------+



In [39]:
# Rename a column: withColumnRenamed(existing name, new name) returns a new DataFrame
df2_renameCol=df2_addCol.withColumnRenamed('Exp+2yrs','NewExp')

In [41]:
# Print the result: the column Exp+2yrs now appears as NewExp
df2_renameCol.show()

+-----+---+---+------+
| Name|Age|Exp|NewExp|
+-----+---+---+------+
|Sahil| 24|  3|     5|
|Singh| 22|  1|     3|
|  Eka| 19|  0|     2|
+-----+---+---+------+

