## Convert PySpark DataFrame to Pandas

In [0]:
# Restart the Python process. This removes Python state, but some libraries
# might not work without calling this command.
dbutils.library.restartPython()

#### Load libraries

In [0]:
from pyspark.sql import SparkSession, Row
from pyspark.sql.types import IntegerType, DateType, StringType, StructType, StructField

#### Create Spark session

In [0]:
# Obtain the Spark session for this notebook, creating one if none exists yet.
spark = (
    SparkSession.builder
    .appName('Convert dataframe from pyspark to pandas')
    .getOrCreate()
)

#### Prepare Data

In [0]:
# Column names, listed in the same order as the fields of each record below.
columns = ['firstname', 'middlename', 'lastname', 'zip', 'gender', 'salary']

# Sample records: (firstname, middlename, lastname, zip, gender, salary).
# Missing text values are represented by empty strings.
data = [
    ('John', '', 'Smith', '36636', 'M', 2500),
    ('Jane', '', 'Doe', '42114', 'F', 500),
    ('Richard', 'Laurence', 'Marquette', '97086', 'M', 1500),
    ('Israel', '', 'Israeli', '', 'M', 3000),
    ('Edward', 'III', '', 'SL4', 'M', 5000),
]

# Only column names are supplied as the schema; no explicit types are given.
pysparkDF = spark.createDataFrame(data, columns)

# Show the resulting schema, then every row without truncating long values.
pysparkDF.printSchema()
pysparkDF.show(truncate=False)

#### Convert PySpark DataFrame to Pandas DataFrame

In [0]:
# Convert the PySpark DataFrame into a pandas DataFrame.
pandasDF = pysparkDF.toPandas()

# End the cell with the frame itself so the notebook renders it as a rich
# table rather than the plain-text output produced by print().
pandasDF

#### Convert PySpark Nested Struct DataFrame to Pandas

In [0]:
# Sample records where the three name parts are grouped into a nested tuple:
# ((firstname, middlename, lastname), zip, gender, salary).
dataNested = [
  (('John', '', 'Smith'), '36636', 'M', 2500),
  (('Jane', '', 'Doe'), '42114', 'F', 500),
  (('Richard', 'Laurence', 'Marquette'), '97086', 'M', 1500),
  (('Israel', '', 'Israeli'), '', 'M', 3000),
  (('Edward', 'III', ''), 'SL4', 'M', 5000)
]

# Explicit schema: 'name' is a nested struct holding the three name parts;
# the remaining fields are top-level columns.
schemaNested = StructType([
   StructField('name', StructType([
     StructField('firstname', StringType(), True),
     StructField('middlename', StringType(), True),
     StructField('lastname', StringType(), True)
     ])),
   StructField('zip', StringType(), True),
   StructField('gender', StringType(), True),
   # The salary values above are Python ints, so the field is declared as an
   # integer type to match the data instead of a string type.
   StructField('salary', IntegerType(), True)
])

# Build the DataFrame with the explicit nested schema and display that schema.
pysparkNestedDF = spark.createDataFrame(data=dataNested, schema=schemaNested)
pysparkNestedDF.printSchema()

In [0]:
# Convert the nested-struct PySpark DataFrame into a pandas DataFrame.
pandasNestedDF = pysparkNestedDF.toPandas()

# End the cell with the frame itself so the notebook renders it as a rich
# table rather than the plain-text output produced by print().
pandasNestedDF

#### The end of the notebook