In [4]:
from pyspark.sql import SparkSession

# Local Spark session, plus a handle on its underlying SparkContext
spark = SparkSession.builder.master('local').appName('01_create-simple-schema').getOrCreate()
sc = spark.sparkContext

# Use 'dynamic' partition-overwrite mode so that an overwrite only replaces
# the partition being processed instead of everything under the output path
spark.conf.set('spark.sql.sources.partitionOverwriteMode', 'dynamic')

In [None]:
from mimesis import Person
from pyspark.sql.types import StructField, StructType, StringType, IntegerType, DateType

def gen_data_simple_schema(data_path, partition_date, num_rows):
    """Generate fake person records and write them as a date-partitioned parquet.

    Every generated row carries ``partition_date`` in its ``date`` column, so
    the whole batch lands in a single ``date=<partition_date>`` partition under
    ``data_path``. The write uses ``mode('overwrite')``; whether only that
    partition or the whole path is replaced depends on the session's
    ``spark.sql.sources.partitionOverwriteMode`` setting.

    Relies on the module-level ``spark`` session.

    Args:
        data_path: Output location handed to the parquet writer.
        partition_date: Value stored in the ``date`` column (declared as
            ``DateType`` in the schema) and used as the partition value.
        num_rows: Number of fake rows to generate; must be at least 1.

    Raises:
        ValueError: If ``num_rows`` is less than 1.
    """
    # Fail fast with a clear message; without rows there is nothing to write.
    if num_rows < 1:
        raise ValueError(
            'num_rows must be at least 1, got {!r}'.format(num_rows)
        )

    person = Person('en')

    # Create a simple schema
    schema_df = StructType(
        [
            StructField('identifier', StringType(), True),
            StructField('first_name', StringType(), True),
            StructField('last_name', StringType(), True),
            StructField('occupation', StringType(), True),
            StructField('age', IntegerType(), True),
            StructField('date', DateType(), True),
        ]
    )

    # Generate every row in plain Python first, then build the DataFrame in a
    # single call. Chaining one union per row makes the query plan grow with
    # num_rows and costs one Spark call per row.
    rows = [
        (
            person.identifier(),
            person.first_name(),
            person.last_name(),
            person.occupation(),
            person.age(),
            partition_date,
        )
        for _ in range(num_rows)
    ]
    df = spark.createDataFrame(rows, schema_df)

    # coalesce(1) so the partition is written from a single task
    df.coalesce(1).write.partitionBy('date').mode('overwrite').parquet(data_path)

    print('Partition created: {data_path}/date={date}'.format(data_path=data_path,date=partition_date))
    print('# Rows:',df.count())
    print('Schema:')
    df.printSchema()
    print('\n')