 ### In this document I will cover:
 - PySpark DataFrame
 - Reading the Dataset
- Checking the data types of the columns (schema)
 - Selecting Columns and Indexing
- Using the describe option, similar to pandas
 - Adding Columns
 - Dropping Columns
 - Renaming Columns

In [30]:
### Set up
from pyspark.sql import SparkSession

In [31]:
# Create (or reuse) the SparkSession that the read cells below use as `spark`.
# The app name is a label for this application; fixed the 'DataFrane' typo.
spark = SparkSession.builder.appName('DataFrame').getOrCreate()

In [32]:
# Bare expression: the notebook displays the SparkSession object.
spark

In [33]:
### Read the dataset
## Option 1
# The header flag is set through the reader's option(); inferSchema=True is
# passed to csv() so Spark attempts to infer each column's type from its values.
df_pyspark = (
    spark.read
    .option('header', 'True')
    .csv('DATA/test1.csv', inferSchema=True)
)

In [35]:
## Option 2
# Header and schema inference are both given as csv() keyword arguments.
df_pyspark = spark.read.csv(
    'DATA/test1.csv',
    header=True,
    inferSchema=True,
)
# show() with no argument prints the first 20 rows.
df_pyspark.show()

+-------+---+----------+
|   name|age|experience|
+-------+---+----------+
| Amanda| 70|        20|
|Matthew| 28|         5|
|   John| 40|        21|
|  David| 18|        16|
| Ashley| 36|        13|
|Matthew| 35|         5|
|Jessica| 70|         9|
|Jessica| 49|         6|
|  Jacob| 71|        24|
| Ashley| 66|         6|
| Amanda| 72|         2|
|Matthew| 78|        13|
| Ashley| 75|        15|
|Michael| 59|         4|
|  Sarah| 56|        11|
|Michael| 71|        17|
|Matthew| 50|         6|
|  Emily| 61|        13|
| Ashley| 33|        25|
| Ashley| 25|         8|
+-------+---+----------+
only showing top 20 rows



In [37]:
### Checking the data types of the columns
# printSchema() prints each column's name, type and nullability as a tree.
df_pyspark.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- experience: integer (nullable = true)



In [38]:
# The column names as a plain Python list.
df_pyspark.columns

['name', 'age', 'experience']

In [39]:
# head(3) returns the first three rows as a list of Row objects.
df_pyspark.head(3)

[Row(name='Amanda', age=70, experience=20),
 Row(name='Matthew', age=28, experience=5),
 Row(name='John', age=40, experience=21)]

In [47]:
# show(3) prints the first three rows as a formatted table.
df_pyspark.show(3)

+-------+---+----------+
|   name|age|experience|
+-------+---+----------+
| Amanda| 70|        20|
|Matthew| 28|         5|
|   John| 40|        21|
+-------+---+----------+
only showing top 3 rows



In [52]:
### Selecting Columns and Indexing
# select() returns a DataFrame, even when only one column is requested.
type(df_pyspark.select('name'))

pyspark.sql.dataframe.DataFrame

In [48]:
# Pass a list of names to select several columns at once.
df_pyspark.select(['name','experience']).show(3)

+-------+----------+
|   name|experience|
+-------+----------+
| Amanda|        20|
|Matthew|         5|
|   John|        21|
+-------+----------+
only showing top 3 rows



In [46]:
# Indexing by column name yields a Column object, not a DataFrame.
df_pyspark['name']

Column<'name'>

In [49]:
### Check Describe option similar to Pandas
# First look at dtypes: a list of (column name, type) pairs.
df_pyspark.dtypes

[('name', 'string'), ('age', 'int'), ('experience', 'int')]

In [51]:
# describe() reports count, mean, stddev, min and max for every column;
# mean and stddev come out as null for the string column `name`.
df_pyspark.describe().show()



+-------+-------+------------------+-----------------+
|summary|   name|               age|       experience|
+-------+-------+------------------+-----------------+
|  count|1000000|           1000000|          1000000|
|   mean|   null|         49.004238|        13.506422|
| stddev|   null|18.193161985496097|6.922562147050963|
|    min| Amanda|                18|                2|
|    max|  Sarah|                80|               25|
+-------+-------+------------------+-----------------+



                                                                                

In [57]:
### Adding Cols in DataFrame
# withColumn() returns a new DataFrame, so the result is assigned back.
# NOTE(review): 'expereince' is misspelled; the drop cell uses the same
# spelling, so the two literals must be corrected together.
df_pyspark = df_pyspark.withColumn(
    'expereince After 2 year',
    df_pyspark['experience'] + 2,
)

In [58]:
df_pyspark.show(3)

+-------+---+----------+-----------------------+
|   name|age|experience|expereince After 2 year|
+-------+---+----------+-----------------------+
| Amanda| 70|        20|                     22|
|Matthew| 28|         5|                      7|
|   John| 40|        21|                     23|
+-------+---+----------+-----------------------+
only showing top 3 rows



In [59]:
### Dropping the columns
# drop() returns a DataFrame without the named column; reassign to keep the result.
# The name must match the (misspelled) column added earlier exactly.
df_pyspark=df_pyspark.drop('expereince After 2 year')

In [60]:
# Confirm the added column is gone.
df_pyspark.show(3)

+-------+---+----------+
|   name|age|experience|
+-------+---+----------+
| Amanda| 70|        20|
|Matthew| 28|         5|
|   John| 40|        21|
+-------+---+----------+
only showing top 3 rows



In [62]:
### Rename the columns
# The renamed DataFrame is only displayed here, not assigned, so df_pyspark
# itself still has the column `name` afterwards.
df_pyspark.withColumnRenamed('name', 'new name').show(3)

+--------+---+----------+
|new name|age|experience|
+--------+---+----------+
|  Amanda| 70|        20|
| Matthew| 28|         5|
|    John| 40|        21|
+--------+---+----------+
only showing top 3 rows

