In [1]:
from pyspark.sql import SparkSession

# Spark session & context
spark = SparkSession.builder.master('local').appName('06_remove-column').getOrCreate()
sc = spark.sparkContext

# Use 'dynamic' partition overwrite mode so that an overwrite only replaces
# the partitions being written, not the whole dataset
spark.conf.set('spark.sql.sources.partitionOverwriteMode', 'dynamic')

In [None]:
from mimesis import Person, Address
from pyspark.sql.types import StructField, StructType, StringType, IntegerType, DateType, FloatType

def gen_data_remove_column(data_path, partition_date, num_rows):
    """Generate fake person rows and write them as one ``date`` partition.

    The rows follow the "column removed" variant of the schema: ``lat`` and
    ``long`` are absent from ``address.address_details.street``,
    ``country_code`` is absent from ``address`` and ``title_name`` is absent
    from the top level.

    The data is written as parquet under ``data_path`` with
    ``partitionBy('date')`` and ``mode('overwrite')``; which existing
    partitions are replaced depends on the session's
    ``spark.sql.sources.partitionOverwriteMode`` setting. A short summary
    (partition path, row count, schema) is printed afterwards.

    Uses the module-level ``spark`` session.

    Args:
        data_path: Root path of the parquet dataset to write to.
        partition_date: Value stored in the ``date`` column (declared as
            ``DateType``) of every generated row; presumably a
            ``datetime.date`` -- confirm against callers.
        num_rows: Number of rows to generate; must be at least 1.

    Raises:
        ValueError: If ``num_rows`` is smaller than 1. Nothing is written in
            that case.
    """
    # Fail early with a clear message instead of reaching the write with no
    # DataFrame to write.
    if num_rows < 1:
        raise ValueError(
            'num_rows must be at least 1, got {!r}'.format(num_rows)
        )

    person = Person('en')
    address = Address('en')

    # 'lat' and 'long' (FloatType) were removed from this struct.
    schema_street = StructType(
        [
            StructField('street_name', StringType(), True),
        ]
    )

    schema_address_details = StructType(
        [
            StructField('street', schema_street, True),
            StructField('number', IntegerType(), True),
        ]
    )

    # 'country_code' (StringType) was removed from this struct.
    schema_address = StructType(
        [
            StructField('address_details', schema_address_details, True),
            StructField('city', StringType(), True),
            StructField('country', StringType(), True),
            StructField('state', StringType(), True),
            StructField('postal_code', IntegerType(), True),
        ]
    )

    # 'title_name' (StringType) was removed from the top-level schema.
    schema_df = StructType(
        [
            StructField('identifier', StringType(), True),
            StructField('first_name', StringType(), True),
            StructField('last_name', StringType(), True),
            StructField('occupation', StringType(), True),
            StructField('age', IntegerType(), True),
            StructField('address', schema_address, True),
            StructField('date', DateType(), True),
        ]
    )

    def fake_row():
        # Nested lists mirror the nested structs of schema_df; values are
        # produced in schema order.
        return [
            person.identifier(),
            person.first_name(),
            person.last_name(),
            person.occupation(),
            person.age(),
            [
                [
                    [
                        address.street_name(),
                    ],
                    int(address.street_number()),
                ],
                address.city(),
                address.country(),
                address.state(),
                int(address.postal_code()),
            ],
            partition_date,
        ]

    # Build every row first and create a single DataFrame, rather than one
    # DataFrame per row chained together with union().
    rows = [fake_row() for _ in range(num_rows)]
    df = spark.createDataFrame(rows, schema_df)

    # coalesce(1) so the partition is written from a single task.
    df.coalesce(1).write.partitionBy('date').mode('overwrite').parquet(data_path)

    print('Partition created: {data_path}/date={date}'.format(data_path=data_path,date=partition_date))
    print('# Rows:',df.count())
    print('Schema:')
    df.printSchema()
    print('\n')