# Formatting DataFrames

## Date

In [10]:
from pyspark.sql import Row
from pyspark.sql.functions import col,unix_timestamp

In [19]:
data=[Row(Date="10/10/2020 11.01.01",name='Sanjay'),
      Row(Date="12/11/2020 06.45.15",name='Prakash'),
      Row(Date="05/09/2020 03.42.55",name='Gnana'),]
df=spark.createDataFrame(data)
df.show()

+-------------------+-------+
|               Date|   name|
+-------------------+-------+
|10/10/2020 11.01.01| Sanjay|
|12/11/2020 06.45.15|Prakash|
|05/09/2020 03.42.55|  Gnana|
+-------------------+-------+



In [20]:
df.printSchema()

root
 |-- Date: string (nullable = true)
 |-- name: string (nullable = true)



In [21]:
spark.sql("set spark.sql.legacy.timeParserPolicy=LEGACY")
df1=df.withColumn('Date',unix_timestamp(col('Date'),'dd/MM/yyyy HH.mm.ss').cast('timestamp'))
df1.printSchema()

root
 |-- Date: timestamp (nullable = true)
 |-- name: string (nullable = true)



In [22]:
df1.show()

+-------------------+-------+
|               Date|   name|
+-------------------+-------+
|2020-10-10 11:01:01| Sanjay|
|2020-11-12 06:45:15|Prakash|
|2020-09-05 03:42:55|  Gnana|
+-------------------+-------+



## Extracting Month

In [25]:
df2=df1.withColumn("Month",col("Date")[0:7])
df2.show()

+-------------------+-------+-------+
|               Date|   name|  Month|
+-------------------+-------+-------+
|2020-10-10 11:01:01| Sanjay|2020-10|
|2020-11-12 06:45:15|Prakash|2020-11|
|2020-09-05 03:42:55|  Gnana|2020-09|
+-------------------+-------+-------+



## Saving as csv

In [26]:
df2.write.option('sep','|').option('header',True).csv('file:///home/sanjay/SparkTutorial/data/csv')

## Reading multiple CSV files(Pandas)

In [49]:
import pandas as pd
import glob

In [50]:
all_files=glob.glob('./data/csv/*.csv')
all_files

['./data/csv/part-00002-e233715e-5336-47ea-87b8-d19c56ce8b08-c000.csv',
 './data/csv/part-00000-e233715e-5336-47ea-87b8-d19c56ce8b08-c000.csv',
 './data/csv/part-00003-e233715e-5336-47ea-87b8-d19c56ce8b08-c000.csv',
 './data/csv/part-00001-e233715e-5336-47ea-87b8-d19c56ce8b08-c000.csv']

In [55]:
all_df=(pd.read_csv(f,delimiter='|') for f in all_files)
concanted_df=pd.concat(all_df,ignore_index=True)

In [56]:
concanted_df.head()

Unnamed: 0,Date,name,Month
0,2020-11-12T06:45:15.000+05:30,Prakash,2020-11
1,2020-09-05T03:42:55.000+05:30,Gnana,2020-09
2,2020-10-10T11:01:01.000+05:30,Sanjay,2020-10
