#READ AND WRITE OPERATIONS IN PYSPARK

DATASET LINK: https://drive.google.com/drive/folders/1Tw7V9eBtUxy0xQMW38z3-bzWI_ewzLm6?usp=drive_link


In [0]:
%sql
-- Create the catalog, schema and volume this notebook reads from.
-- IF NOT EXISTS makes each statement safe to re-run.
CREATE CATALOG IF NOT EXISTS customer_catalog;
CREATE SCHEMA IF NOT EXISTS customer_catalog.customer_schema;
CREATE VOLUME IF NOT EXISTS customer_catalog.customer_schema.customer_volume;



In [0]:
# Create the customer_data directory inside the volume; the CSV reads in the
# following cells all load from paths under this directory.
customer_data_dir = "/Volumes/customer_catalog/customer_schema/customer_volume/customer_data"
dbutils.fs.mkdirs(customer_data_dir)

In [0]:
from pyspark.sql.session import SparkSession

# Reuse the session that is already running, or start a new one if none exists.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)


In [0]:
#1. Header concept: with header=true the first line of the file supplies the
#   column names instead of being read as a data row.
# NOTE(review): cell #3 reads this same `custs` file without a header option and
# names the columns via toDF, which suggests `custs` has no header row. If so,
# this read turns the first record into column names -- confirm that is the
# intended demonstration.
custs_path = "/Volumes/customer_catalog/customer_schema/customer_volume/customer_data/custs"
df = spark.read.option("header", "true").csv(custs_path)

In [0]:
#2. Printing schema: print the column names and data types of the DataFrame
#   loaded in the previous cell.
df.printSchema()

In [0]:
#3. Inferring schema: let Spark derive each column's type from the data.
#   No header option is passed on this read, so the columns are named
#   explicitly with toDF afterwards.
custs_path = "/Volumes/customer_catalog/customer_schema/customer_volume/customer_data/custs"
column_names = ["id", "fname", "lname", "age", "profession"]
df = (
    spark.read
    .option("inferSchema", "true")
    .csv(custs_path)
    .toDF(*column_names)
)
df.printSchema()
df.show(5)

In [0]:
#4. Using different options: several reader settings supplied through a
#   single options() call before reading the CSV.
csv_options = {"header": "true", "inferSchema": "true", "sep": ","}
custs_header_path = "/Volumes/customer_catalog/customer_schema/customer_volume/customer_data/custs_header"
df = spark.read.options(**csv_options).csv(custs_header_path)
df.printSchema()
df.show(5)

#Generic way to read and load data into a DataFrame

In [0]:
# Generic reader form: format() names the data source ("csv" here) and load()
# takes the path, instead of calling the source-specific csv() method.
custs_header_path = "/Volumes/customer_catalog/customer_schema/customer_volume/customer_data/custs_header"
df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .option("sep", ",")
    .load(custs_header_path)
)
df.printSchema()
df.show(5)

####Reading data from multiple files

In [0]:
# The trailing * is a wildcard: every path under customer_data whose name
# starts with "custs" is read into one DataFrame.
# NOTE(review): the pattern also matches the plain `custs` file, which cell #3
# reads without a header; with header=true its first line would be treated as
# a header too -- confirm against the actual files.
custs_glob = "/Volumes/customer_catalog/customer_schema/customer_volume/customer_data/custs*"
df = spark.read.options(header="true", inferSchema="true", sep=",").csv(custs_glob)
df.printSchema()
df.show(5)

####Provide schema with SQL string or programmatically

[PySpark SQL Datatypes](https://spark.apache.org/docs/latest/sql-ref-datatypes.html) <br>
[Data Types](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/data_types.html)

In [0]:
# Customising our schema: column names and types are declared up front as a
# SQL (DDL-style) string and handed to the reader, so nothing is inferred.
schema_sql = """
  cust_id INT,
  custf_name STRING,
  custl_name STRING,
  age INT,
  profession STRING
""" # multi line string
custs_glob = "/Volumes/customer_catalog/customer_schema/customer_volume/customer_data/custs*"
df = spark.read.csv(custs_glob, schema=schema_sql, header="true")
df.printSchema()
df.show(5)


In [0]:
# Faster than inferSchema, and avoids the default assumption that every
# column is a string.

from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType

# Build the schema field by field. The third argument to add() is `nullable`:
# True means the column is allowed to hold null values.
schema = (
    StructType()
    .add("cust_id", IntegerType(), True)
    .add("custf_name", StringType(), True)
    .add("custl_name", StringType(), True)
    .add("age", IntegerType(), True)
    .add("profession", StringType(), True)
)
custs_glob = "/Volumes/customer_catalog/customer_schema/customer_volume/customer_data/custs*"
df = spark.read.csv(custs_glob, schema=schema, header="true")
df.printSchema()
df.show(5)
