## Create Empty DataFrame

In [0]:
# Build a DataFrame with no rows; the schema string names the columns and
# their types.
schema = "id INT, name STRING"
emtDF = spark.createDataFrame(data=[], schema=schema)

# Print the (empty) contents, then the schema.
emtDF.show()
emtDF.printSchema()

## Convert RDD to DataFrame

In [0]:
# Column names to apply to the (id, name) tuples below.
columns = ["id", "name"]

# Distribute two tuples as an RDD, then convert the RDD to a DataFrame.
rdd = spark.sparkContext.parallelize([(1, "John"), (2, "Mary")])
df_from_rdd = spark.createDataFrame(rdd, schema=columns)

df_from_rdd.show()
df_from_rdd.printSchema()

## Convert DataFrame to pandas

In [0]:
# Convert the Spark DataFrame into a pandas DataFrame.
pandasDF = df_from_rdd.toPandas()

# Bare trailing expression: the notebook renders the leading rows.
pandasDF.head()

## StructType & StructField

In [0]:
from pyspark.sql.types import IntegerType, StringType, StructField, StructType

# Rows to load: (id, name) pairs.
data = [(1, "John"), (2, "Mary")]

# Explicit schema: an integer "id" and a string "name", both nullable.
schema = StructType(
    [
        StructField("id", IntegerType(), nullable=True),
        StructField("name", StringType(), nullable=True),
    ]
)

df_with_schema = spark.createDataFrame(data, schema=schema)
df_with_schema.show()
df_with_schema.printSchema()

## Column Class

In [0]:
from pyspark.sql import functions as F

# Add "upper_column" holding the upper-cased "name" value. The result is bound
# back to df_from_rdd, so code that runs afterwards sees the extra column.
df_from_rdd = df_from_rdd.withColumn("upper_column", F.upper(F.col("name")))
df_from_rdd.show()

## Select

In [0]:
# Project only the "id" column and print it.
df_from_rdd.select(df_from_rdd["id"]).show()

## Collect

In [0]:
# Materialise the DataFrame as a Python list of rows.
rows = df_from_rdd.collect()
print(rows)

# The first row, read three ways: by attribute, by column name, by position.
first_row = rows[0]
print(first_row.id)
print(first_row["upper_column"])
print(first_row[1])

## withColumn(): Add a New Column

In [0]:
# Derive "id_squared" by multiplying the "id" column with itself.
id_col = df_from_rdd["id"]
df_with_new_col = df_from_rdd.withColumn("id_squared", id_col * id_col)
df_with_new_col.show()

## withColumnRenamed()

In [0]:
# Rename the "name" column to "full_name"; the other columns are untouched.
df_with_renamed = df_from_rdd.withColumnRenamed(existing="name", new="full_name")
df_with_renamed.show()

## where() and filter()

In [0]:
# Predicate shared by both calls below: keep rows whose id is greater than 1.
id_above_one = df_from_rdd["id"] > 1

# where()
df_filtered = df_from_rdd.where(id_above_one)
df_filtered.show()

# Alternatively, filter() with the same predicate:
df_filtered2 = df_from_rdd.filter(id_above_one)
df_filtered2.show()

## drop() and dropDuplicates()

In [0]:
# Two independent derivations from df_from_rdd:
#   - df_dropped: the same rows without the "name" column
#   - df_no_duplicates: the same columns with duplicate rows removed
df_dropped = df_from_rdd.drop("name")
df_no_duplicates = df_from_rdd.dropDuplicates()

# Print them in that order.
df_dropped.show()
df_no_duplicates.show()

## orderBy() or sort()

In [0]:
# Descending sort key on "id", shared by both calls below.
id_descending = df_from_rdd["id"].desc()

# orderBy()
df_order_by = df_from_rdd.orderBy(id_descending)
df_order_by.show()

# sort() is equivalent to orderBy()
df_sort = df_from_rdd.sort(id_descending)
df_sort.show()

## groupBy()

In [0]:
# Group rows by "id", then count the rows in each group.
rows_by_id = df_from_rdd.groupBy("id")
df_group_by = rows_by_id.count()
df_group_by.show()

## join()

In [0]:
# Second DataFrame to join against: (id, subject) pairs.
subject_rows = [(1, "Math"), (2, "Science"), (3, "English")]
df2 = spark.createDataFrame(subject_rows, ["id", "subject"])

# Join on the shared "id" column; "inner" is the join type used when `how`
# is omitted, spelled out here for clarity.
df_join = df_from_rdd.join(df2, on="id", how="inner")
df_join.show()