### Creating a PySpark Session

In [0]:
import pyspark
from pyspark.sql import SparkSession

# Build the session in two steps: configure the builder with the application
# name, then obtain the session from it via getOrCreate().
session_builder = SparkSession.builder.appName("pyspark practice")
spark = session_builder.getOrCreate()

# Render the session object in the notebook output.
display(spark)

In [0]:
%fs
ls /mnt/tf-abfss/data/ds/food_inspection_dinesh

path,name,size
dbfs:/mnt/tf-abfss/data/ds/food_inspection_dinesh/Food_Inspections.csv,Food_Inspections.csv,250745613
dbfs:/mnt/tf-abfss/data/ds/food_inspection_dinesh/tesla_stocks.csv,tesla_stocks.csv,186958


### Reading a csv file

In [0]:
# Path of the Tesla stock CSV on the mounted storage.
csv_file = '/mnt/tf-abfss/data/ds/food_inspection_dinesh/tesla_stocks.csv'
# Read the file with the reader's default options (no header option is passed here).
df = spark.read.csv(csv_file)

In [0]:
# Display the DataFrame that was read without header=True: the columns get
# generic names (_c0, _c1, ...) and the file's header line shows up as a data row.
display(df)

_c0,_c1,_c2,_c3,_c4,_c5,_c6
Date,Open,High,Low,Close,Adj Close,Volume
2010-06-29,3.8,5.0,3.508,4.778,4.778,93831500
2010-06-30,5.158,6.084,4.66,4.766,4.766,85935500
2010-07-01,5.0,5.184,4.054,4.392,4.392,41094000
2010-07-02,4.6,4.62,3.742,3.84,3.84,25699000
2010-07-06,4.0,4.0,3.166,3.222,3.222,34334500
2010-07-07,3.28,3.326,2.996,3.16,3.16,34608500
2010-07-08,3.228,3.504,3.114,3.492,3.492,38557000
2010-07-09,3.516,3.58,3.31,3.48,3.48,20253000
2010-07-12,3.59,3.614,3.4,3.41,3.41,11012500


### Creating a dataframe with `header=True`

In [0]:
# Re-read the same CSV, this time using the first line of the file as column names.
# NOTE(review): inferSchema is not set, so column types are whatever the CSV
# reader defaults to — confirm they are as intended before numeric comparisons.
tesla_csv_path = '/mnt/tf-abfss/data/ds/food_inspection_dinesh/tesla_stocks.csv'
data = spark.read.csv(tesla_csv_path, sep=',', header=True)

# Print the schema of the loaded DataFrame.
data.printSchema()

In [0]:
# Display the DataFrame read with header=True; the column names now come from the file's first line.
display(data)

Date,Open,High,Low,Close,Adj Close,Volume
2010-06-29,3.8,5.0,3.508,4.778,4.778,93831500
2010-06-30,5.158,6.084,4.66,4.766,4.766,85935500
2010-07-01,5.0,5.184,4.054,4.392,4.392,41094000
2010-07-02,4.6,4.62,3.742,3.84,3.84,25699000
2010-07-06,4.0,4.0,3.166,3.222,3.222,34334500
2010-07-07,3.28,3.326,2.996,3.16,3.16,34608500
2010-07-08,3.228,3.504,3.114,3.492,3.492,38557000
2010-07-09,3.516,3.58,3.31,3.48,3.48,20253000
2010-07-12,3.59,3.614,3.4,3.41,3.41,11012500
2010-07-13,3.478,3.728,3.38,3.628,3.628,13400500


In [0]:
# Check the data type of each column (a list of (column name, type) pairs).
data.dtypes

In [0]:
# Get the first 3 rows of the Spark DataFrame (returned to the driver as Row objects).
data.head(3)

In [0]:
# Print the first 2 rows as a text table.
data.show(2)

In [0]:
# Get only the first row of the DataFrame.
data.first()

In [0]:
# Compute summary statistics for the columns and print them as a table.
data.describe().show()

In [0]:
# List the column names of the DataFrame.
data.columns

In [0]:
# Count the number of rows in the DataFrame.
data.count()

### Creating a new column as stock_Date with the reference of Date column

In [0]:
# Add a 'stock_Date' column that is a copy of the existing 'Date' column.
date_column = data['Date']
data = data.withColumn('stock_Date', date_column)

# Preview the first 5 rows to confirm the new column is present.
data.show(5)

### Renaming the column

In [0]:
# Rename the 'stock_Date' column to 'Stock_date'; the returned DataFrame is re-bound to `data`.
data = data.withColumnRenamed('stock_Date', 'Stock_date')

# Preview the first 5 rows to confirm the new column name.
data.show(5)

### Dropping the extra created column from the spark dataframe

In [0]:
# Drop the added 'Stock_date' column so that only the original columns remain.
data = data.drop('Stock_date')

# Preview the first 5 rows to confirm the column is gone.
data.show(5)

### Handling the missing values (dropping rows)

In [0]:
# Remove rows with missing values.
# na.drop() returns a new DataFrame rather than modifying `data` in place, so
# the result has to be assigned; otherwise the rows containing nulls would
# still be present in `data` and in the preview below.
data = data.na.drop()

# Preview the first 5 rows of the cleaned DataFrame.
data.show(5)

### Selecting a single column and multiple columns

In [0]:
## Select a single column and preview its first 5 rows
high_only = data.select('High')
high_only.show(5)

## Select several columns at once by passing a list of names
price_columns = ['Open', 'High', 'Low']
data.select(price_columns).show(5)

### Filtering rows by a column condition

In [0]:
from pyspark.sql.functions import col, lit

# Keep only the rows dated within July 2010.
# The date literals are compared against the 'Date' column (not the 'Open'
# price column), and both bounds are combined with & so a row must satisfy the
# lower AND the upper bound; with | every non-null row would pass the filter.
# The dates in this file are written as yyyy-MM-dd, so the comparison orders
# them chronologically even when the column is held as text.
in_july_2010 = (col('Date') >= lit('2010-07-01')) & (col('Date') <= lit('2010-07-31'))

data.filter(in_july_2010).show(5)

### filtering the data by using the between condition

In [0]:
## Fetch the rows where the High value is between 4.0 and 4.5 (both bounds included)

data.filter(data.High.between(4.0, 4.5)).show(5)

### Deriving a conditional column by using the when condition

In [0]:
from pyspark.sql import functions as f

# Flag each row with 1 when High is at least 4.0 and 0 otherwise.
# The derived column gets an explicit alias so the output header is a readable
# name instead of the generated CASE WHEN expression text, and the selected
# columns use the same casing as the file header ('Open', 'Close') so the
# lookup does not rely on case-insensitive column resolution.
data.select(
    'Open',
    'Close',
    f.when(data.High >= 4.0, 1).otherwise(0).alias('high_ge_4'),
).show(5)

### Matching the data by using the rlike (regular expression) condition

In [0]:
# Boolean column: True when the Open value starts with the character '3'.
opens_with_3 = data.Open.rlike('^[3]').alias('open with 3')

# Show up to 5 distinct (Open, flag) pairs.
data.select('Open', opens_with_3).distinct().show(5)