In [1]:
import json
from pyspark.sql import SparkSession
from pyspark.sql.types import *

In [2]:
# Obtain the active SparkSession, or create one with the application name
# "Extract Schema" if none exists yet.
spark = (
    SparkSession.builder
    .appName("Extract Schema")
    .getOrCreate()
)

24/03/31 12:00:25 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [3]:
# Restore the `params` variable from IPython's %store storage into this kernel.
# Later cells read the keys 'credit_card_file', 'spark_schema_file' and
# 'avro_schema_file' from it.
%store -r params

In [4]:
# Read the credit-card CSV through a file:// URI, using the first row as the
# header and letting Spark infer the column types.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv(f"file://{params['credit_card_file']}")
)

                                                                                

In [5]:
def get_avro_schema(spark_schema, schema_type:str, name:str):
    """Build an Avro schema dictionary from a Spark schema.

    Every top-level Spark field becomes one Avro field whose type is a
    nullable union. Spark types that have no entry in the mapping below
    (for example dates, decimals or nested structs) fall back to a nullable
    string.

    Args:
        spark_schema: Object exposing ``.fields``; each field must provide
            ``.name`` and a ``.dataType`` implementing ``simpleString()``
            (as ``pyspark.sql.types.DataType`` does).
        schema_type: Value stored under the Avro ``"type"`` key
            (e.g. ``"record"``).
        name: Value stored under the Avro ``"name"`` key.

    Returns:
        dict with the keys ``"type"``, ``"name"`` and ``"fields"``, where
        ``"fields"`` is a list of ``{"name": ..., "type": ...}`` dicts in the
        same order as ``spark_schema.fields``.
    """
    # Keys are Spark simpleString() names, values are Avro types.
    # simpleString() is used instead of str()/repr() because the printed form
    # of Spark types differs between PySpark releases
    # ('ArrayType(StringType,true)' vs 'ArrayType(StringType(), True)'), which
    # made the array entries unreachable when keyed on str().
    avro_mapping = {
        'string': ["string", "null"],
        'bigint': ["long", "null"],      # LongType
        'int': ["int", "null"],
        'boolean': ["boolean", "null"],
        'float': ["float", "null"],
        'double': ["double", "null"],
        'timestamp': ["long", "null"],
        'array<string>': [{"type": "array", "items": ["string", "null"]}, "null"],
        'array<int>': [{"type": "array", "items": ["int", "null"]}, "null"],
    }
    # Fallback for every Spark type without an explicit mapping.
    default_type = ["string", "null"]

    fields = [
        {
            "name": field.name,
            "type": avro_mapping.get(field.dataType.simpleString(), default_type),
        }
        for field in spark_schema.fields
    ]

    return {
        "type": schema_type,
        "name": name,
        "fields": fields,
    }

In [6]:
# Write both schema files. Each `with` block closes its file on exit, so no
# explicit close() call is needed afterwards.

# Spark schema: the JSON form of the DataFrame's StructType.
with open(params["spark_schema_file"], "w") as f:
    json.dump(df.schema.jsonValue(), f)

# Avro schema: a "record" named "creditcard" derived from the same Spark schema.
with open(params["avro_schema_file"], "w") as f:
    json.dump(get_avro_schema(df.schema, "record", "creditcard"), f)

In [8]:
# Verification: load the saved Spark schema JSON, rebuild a StructType from it
# and print the result.
with open(params["spark_schema_file"]) as schema_file:
    schema_json = json.load(schema_file)

new_schema = StructType.fromJson(schema_json)
print(new_schema)

StructType([StructField('id', IntegerType(), True), StructField('trans_date_trans_time', TimestampType(), True), StructField('cc_num', LongType(), True), StructField('merchant', StringType(), True), StructField('category', StringType(), True), StructField('amt', DoubleType(), True), StructField('first', StringType(), True), StructField('last', StringType(), True), StructField('gender', StringType(), True), StructField('street', StringType(), True), StructField('city', StringType(), True), StructField('state', StringType(), True), StructField('zip', IntegerType(), True), StructField('lat', DoubleType(), True), StructField('long', DoubleType(), True), StructField('city_pop', IntegerType(), True), StructField('job', StringType(), True), StructField('dob', DateType(), True), StructField('trans_num', StringType(), True), StructField('unix_time', IntegerType(), True), StructField('merch_lat', DoubleType(), True), StructField('merch_long', DoubleType(), True), StructField('is_fraud', IntegerT