### Creating a PySpark session

In [2]:
import pyspark
from pyspark.sql import SparkSession

# Reuse the Spark session that is already running, or start a new one
# registered under the application name given below.
spark = (
    SparkSession.builder
    .appName("pyspark practice")
    .getOrCreate()
)

# Render the session object so the cell output confirms it is available.
display(spark)

### Creating a Spark dataframe

In [4]:
# Three sample records; None stands for a missing state and/or gender so the
# resulting DataFrame contains nulls.
Spark_data = [
    ("James", None, "M"),
    ("Anna", "NY", "F"),
    ("Julia", None, None),
]
columns = ["name", "state", "gender"]

# Only the column names are supplied as the schema.
df = spark.createDataFrame(Spark_data, schema=columns)
display(df)

name,state,gender
James,,M
Anna,NY,F
Julia,,


In [5]:
# List the files in the mounted data directory (path, name and size of each).
# NOTE(review): relies on the Databricks-provided `dbutils` object, which the
# %fs magic is a shorthand for - confirm this notebook only runs on Databricks.
display(dbutils.fs.ls("/mnt/tf-abfss/data/ds/food_inspection_dinesh"))

path,name,size
dbfs:/mnt/tf-abfss/data/ds/food_inspection_dinesh/Food_Inspections.csv,Food_Inspections.csv,250745613
dbfs:/mnt/tf-abfss/data/ds/food_inspection_dinesh/tesla_stocks.csv,tesla_stocks.csv,186958


### Reading a csv file

In [7]:
# Location of the Tesla stock price CSV on the mounted storage.
csv_file = '/mnt/tf-abfss/data/ds/food_inspection_dinesh/tesla_stocks.csv'

# Plain read with the reader's default options: no header option is set here,
# so the columns get the generated names _c0, _c1, ...
df = spark.read.csv(path=csv_file)

In [8]:
# Preview the first 3 rows of the DataFrame that was read WITHOUT header=True:
# the file's header line (Date, Open, High, ...) comes back as an ordinary
# data row under the generated column names _c0.._c6.
display(df.head(3))

_c0,_c1,_c2,_c3,_c4,_c5,_c6
Date,Open,High,Low,Close,Adj Close,Volume
2010-06-29,3.8,5.0,3.508,4.778,4.778,93831500
2010-06-30,5.158,6.084,4.66,4.766,4.766,85935500


### Creating a DataFrame with `header=True`

In [10]:
# Read the same CSV again, this time using its first line as the column names
# and asking Spark to infer each column's type from the values.
data = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .option("sep", ",")
    .csv('/mnt/tf-abfss/data/ds/food_inspection_dinesh/tesla_stocks.csv')
)

# Print the resulting column names and types.
data.printSchema()

In [11]:
# Print the first 3 rows as a text table.
# show() prints the table itself and returns None, so wrapping it in display()
# only handed None to display(); the call is made directly instead.
# (Use display(data.limit(3)) if the rich table rendering is wanted.)
data.show(3)

In [12]:
# checking the data types: one (column name, type name) pair per column
data.dtypes

In [13]:
# getting the first 2 rows of data from spark dataframe
# (head(n) returns them as a list of Row objects instead of printing a table)
data.head(2)

In [14]:
# Print the first 2 rows as a formatted text table.
data.show(2)

In [15]:
# Fetch only the first row, returned as a single Row object.
data.first()

In [16]:
# Summary statistics (count, mean, stddev, min, max) per column, printed as a table.
data.describe().show()

In [17]:
# Column names of the DataFrame as a Python list.
data.columns

In [18]:
# Total number of rows in the DataFrame.
data.count()

### Creating a new column as stock_Date with the reference of Date column

In [20]:
# withColumn returns a new DataFrame, so the result is assigned back to `data`.
# The new stock_Date column is a copy of the existing Date column.
data = data.withColumn('stock_Date', data.Date)

# Preview two rows to confirm the new column is present.
data.show(2)

### Renaming the column

In [22]:
# Rename stock_Date to Stock_date; the renamed DataFrame is assigned back to
# `data` because withColumnRenamed returns a new DataFrame.
data = data.withColumnRenamed('stock_Date', 'Stock_date')

# Preview two rows to confirm the new column name.
data.show(2)

### Dropping the extra created column from the Spark DataFrame

In [24]:
# Drop the Stock_date column that was added earlier, so that only the columns
# originally read from the CSV remain.
data = data.drop('Stock_date')

# Preview three rows to confirm the column is gone.
data.show(3)

In [25]:
# how="any": discard a row as soon as at least one of its values is null.
data.dropna(how="any").show(2)

In [26]:
# thresh=3: keep only the rows that hold at least 3 non-null values.
# `how` is left out on purpose: when `thresh` is supplied it overrides `how`,
# so the previous how="any" argument had no effect and only misled the reader.
data.na.drop(thresh=3).show(2)

In [27]:
from pyspark.sql.functions import col, lit

### checking the null values

In [29]:
# Show every row whose Open value is missing.
data.where(col("Open").isNull()).show()

In [30]:
# Count the rows in which Open and High are both missing at the same time.
both_missing = data["Open"].isNull() & data["High"].isNull()
data.where(both_missing).count()

### Filter Rows with IS NOT NULL or isNotNull

In [32]:
# Count the rows that do have an Open value.
data.where(col("Open").isNotNull()).count()

### Handling the missing values (dropping rows)

In [34]:
# Remove Rows with Missing Values
#
# DataFrames are immutable: na.drop() returns a NEW DataFrame and leaves `data`
# untouched, so its result has to be captured to be of any use. Previously the
# result was thrown away and the preview showed the unfiltered `data`.
# The cleaned frame gets its own name so `data` keeps every row for later cells.
data_no_nulls = data.na.drop()
data_no_nulls.show(3)

### Selecting a single column and multiple columns

In [36]:
# Single column: project only High and preview two rows.
data.select(data['High']).show(2)

# Multiple columns: the names are passed as separate arguments.
data.select('Open', 'High', 'Low').show(2)

### filtering the column

In [38]:
from pyspark.sql.functions import col, lit

# Keep the rows whose trading date falls within July 2010.
# The bounds are dates, so they are compared against the Date column (the
# numeric Open price was used before, which cannot match a date literal), and
# both bounds must hold together, hence `&` - with `|` every non-null value
# would satisfy at least one side of the range.
data.filter(
    (col('Date') >= lit('2010-07-01')) & (col('Date') <= lit('2010-07-31'))
).show(2)

In [39]:
# Condition written as a SQL expression string: rows with Open of at least 3.0.
data.where("Open >=3.0").show(3)

In [40]:
# Rows with Volume up to 41094000, reduced to the Open and Close columns.
data.where("Volume <= 41094000").select('Open', 'Close').show(3)

### filtering the data by using the between condition

In [42]:
## fetch the rows where the High value is between 4.0 and 4.5
## (the previous comment spoke of an adjusted value between 100.0 and 500.0,
## which is not what this filter does)
## NOTE(review): if the 100.0-500.0 range on the adjusted close was the real
## intent, the column and the bounds below need changing - confirm.

data.filter(data.High.between(4.0, 4.5)).show(2)

### Flagging rows by using the when / otherwise condition

In [44]:
from pyspark.sql import functions as f
# Show Open and Close next to a 0/1 flag: 1 when High is at least 4.0, else 0.
# The column names use the schema's exact casing so the lookup does not depend
# on Spark's case-insensitive name resolution, and the derived column gets an
# explicit alias instead of the generated "CASE WHEN ..." name.
data.select(
    'Open',
    'Close',
    f.when(data.High >= 4.0, 1).otherwise(0).alias('high_ge_4'),
).show(2)

### Matching values by using the rlike (regex) condition

In [46]:
# Pair each Open value with a boolean telling whether it matches the regex
# '^[3]' (i.e. begins with the character 3), then show two of the distinct
# (Open, flag) combinations.
# NOTE(review): Open looks numeric in this file, so the regex is presumably
# applied to its string form - confirm.
starts_with_three = data['Open'].rlike('^[3]').alias('open with 3')
data.select('Open', starts_with_three).distinct().show(2)

### Pyspark GroupBy

In [48]:
# Group the rows by their Volume value, average the numeric columns within
# each group and preview three of the groups.
# NOTE(review): Volume looks close to unique per row, so most groups probably
# hold a single row - confirm whether a coarser grouping key was intended.
data.groupBy('Volume').avg().show(3)