In [1]:
from pyspark.sql import SparkSession

# Obtain a SparkSession on the 'local' master (getOrCreate reuses a
# session if one already exists) and keep a handle on its SparkContext.
spark = SparkSession.builder.master('local').appName(
    '02_add-nested-structure'
).getOrCreate()
sc = spark.sparkContext

# Use dynamic partition overwrite so that an overwrite only replaces the
# partition being processed instead of the whole dataset.
spark.conf.set('spark.sql.sources.partitionOverwriteMode', 'dynamic')

In [None]:
from mimesis import Person, Address 
from pyspark.sql.types import StructField, StructType, StringType, IntegerType, DateType

def gen_data_add_nested_struct(data_path, partition_date, num_rows):
    """Generate fake person rows with a nested address and write one date partition.

    Creates ``num_rows`` rows of mimesis-generated data (identifier, first
    name, last name, occupation, age and a nested ``address`` struct), tags
    every row with ``partition_date`` in the ``date`` column, and writes the
    result as Parquet under ``data_path`` partitioned by ``date`` using
    ``overwrite`` mode. A short summary (partition path, row count, schema)
    is printed afterwards.

    Uses the module-level ``spark`` session. Which existing partitions are
    replaced by the overwrite depends on that session's
    ``spark.sql.sources.partitionOverwriteMode`` setting.

    Args:
        data_path: Root path of the Parquet dataset to write to.
        partition_date: Value stored in the ``date`` column of every row
            (the column is declared as ``DateType``); also used in the
            printed partition path.
        num_rows: Number of rows to generate; must be at least 1.

    Raises:
        ValueError: If ``num_rows`` is less than 1.
    """
    # Fail early with a clear message; previously a non-positive value left
    # the DataFrame variable unbound and crashed at the write step.
    if num_rows < 1:
        raise ValueError(
            'num_rows must be at least 1, got {num_rows}'.format(num_rows=num_rows)
        )

    person = Person('en')
    address = Address('en')

    # Schema of the nested address struct
    schema_address = StructType(
        [
            StructField('address', StringType(), True),
            StructField('city', StringType(), True),
            StructField('country', StringType(), True),
            StructField('state', StringType(), True),
            StructField('postal_code', StringType(), True),
        ]
    )

    # Schema of the full row; 'address' is the nested struct defined above
    schema_df = StructType(
        [
            StructField('identifier', StringType(), True),
            StructField('first_name', StringType(), True),
            StructField('last_name', StringType(), True),
            StructField('occupation', StringType(), True),
            StructField('age', IntegerType(), True),
            StructField('address', schema_address, True),
            StructField('date', DateType(), True),
        ]
    )

    # Generate every row up front and build a single DataFrame, instead of
    # creating one single-row DataFrame per row and chaining unions (which
    # makes the query plan grow with num_rows and needed a bare ``except``
    # to bootstrap the first iteration).
    rows = [
        [
            person.identifier(),
            person.first_name(),
            person.last_name(),
            person.occupation(),
            person.age(),
            [
                address.address(),
                address.city(),
                address.country(),
                address.state(),
                address.postal_code(),
            ],
            partition_date,
        ]
        for _ in range(num_rows)
    ]
    df = spark.createDataFrame(rows, schema_df)

    # coalesce(1) so the partition is written from a single DataFrame partition
    df.coalesce(1).write.partitionBy('date').mode('overwrite').parquet(data_path)

    print('Partition created: {data_path}/date={date}'.format(data_path=data_path,date=partition_date))
    print('# Rows:',df.count())
    print('Schema:')
    df.printSchema()
    print('\n')