In [None]:
import os

import findspark

# Python interpreter used by the Spark driver and by the Python workers.
os.environ['PYSPARK_PYTHON'] = '/usr/bin/python3'
os.environ['PYSPARK_DRIVER_PYTHON'] = '/usr/bin/python3'

# findspark.init() adds the local Spark installation's pyspark to sys.path,
# so it must run before the first `import pyspark`; importing pyspark first
# would fail on exactly the machines findspark is meant to help.
findspark.init()

from pyspark.sql import SparkSession  # noqa: E402  (must follow findspark.init())

spark = SparkSession.builder.appName("CSV to Parquet").getOrCreate()
# Debug-output setting: allow up to 1000 fields when Spark renders wide
# schemas/plans as strings instead of truncating them.
spark.conf.set("spark.sql.debug.maxToStringFields", 1000)

In [None]:
def getRawCsvDataFiles():
    """Return the HDFS paths of every entry under /parkingviolations/rawdata/.

    Runs ``hdfs dfs -ls -d`` on the raw-data glob and keeps the 8th
    whitespace-separated field (the path column) of each listing line.
    Lines with fewer than eight fields (blank lines, summary lines) are
    ignored, and anything the command writes to stderr is kept out of the
    result.

    Returns:
        list[str]: one path per listed entry, in listing order.

    Raises:
        RuntimeError: if the ``hdfs`` command exits with a non-zero status;
            the message carries the command's stderr output.
    """
    import subprocess

    # No shell is involved: the glob is handed to HDFS unexpanded (HDFS
    # resolves it itself), and stderr is captured separately so warnings and
    # error messages can never be mistaken for file names.
    listing = subprocess.run(
        ["hdfs", "dfs", "-ls", "-d", "/parkingviolations/rawdata/*"],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        universal_newlines=True,
    )

    if listing.returncode != 0:
        raise RuntimeError(
            "hdfs dfs -ls failed with exit code %d: %s"
            % (listing.returncode, (listing.stderr or "").strip())
        )

    csv_files = []
    for line in listing.stdout.splitlines():
        fields = line.split()
        # Field 8 of a listing line is the path; shorter lines carry no path.
        if len(fields) >= 8:
            csv_files.append(fields[7])

    return csv_files

# Now for the tricky part: defining an explicit schema for the raw CSV files

In [None]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType, DateType
def getSchema():
    """Return the explicit schema used when reading the raw CSV files.

    Every field is declared with the default nullability. Identifier, code
    and count columns are numeric, ``issue_date`` is a date, and everything
    else (including ``violation_time``, which is cleaned up after loading)
    is read as a string.

    Returns:
        StructType: the column names and types, in file column order.
    """
    # Imported locally: the file-level import list does not include LongType.
    from pyspark.sql.types import LongType

    return StructType([
        # 64-bit on purpose: 10-digit summons numbers can exceed the 32-bit
        # IntegerType maximum (2,147,483,647) and would then fail to parse.
        StructField("summons_number", LongType()),
        StructField("plate_id", StringType()),
        StructField("registration_state", StringType()),
        StructField("plate_type", StringType()),
        StructField("issue_date", DateType()),
        StructField("violation_code", IntegerType()),
        StructField("vehicle_body_type", StringType()),
        StructField("vehicle_make", StringType()),
        StructField("issuing_agency", StringType()),
        StructField("street_code1", IntegerType()),
        StructField("street_code2", IntegerType()),
        StructField("street_code3", IntegerType()),
        StructField("vehicle_expiration_date", IntegerType()),
        StructField("violation_location", StringType()),
        StructField("violation_precinct", IntegerType()),
        StructField("issuer_precinct", IntegerType()),
        StructField("issuer_code", IntegerType()),
        StructField("issuer_command", StringType()),
        StructField("issuer_squad", StringType()),
        StructField("violation_time", StringType()),
        StructField("time_first_observed", StringType()),
        StructField("violation_county", StringType()),
        StructField("violation_in_front_of_or_opposite", StringType()),
        StructField("house_number", StringType()),
        StructField("street_name", StringType()),
        StructField("intersecting_street", StringType()),
        StructField("date_first_observed", IntegerType()),
        StructField("law_section", IntegerType()),
        StructField("sub_division", StringType()),
        StructField("violation_legal_code", StringType()),
        StructField("days_parking_in_effect", StringType()),
        StructField("from_hours_in_effect", StringType()),
        StructField("to_hours_in_effect", StringType()),
        StructField("vehicle_color", StringType()),
        StructField("unregistered_vehicle", StringType()),
        StructField("vehicle_year", IntegerType()),
        StructField("meter_number", StringType()),
        StructField("feet_from_curb", IntegerType()),
        StructField("violation_post_code", StringType()),
        # NOTE(review): a user-supplied CSV schema is matched to file columns
        # by position. If the raw files do contain a violation_description
        # column here, leaving this field out shifts the three fields below
        # by one column -- confirm against the raw files.
#StructField("violation_description", StringType()),
        StructField("no_standing_or_stopping_violation", StringType()),
        StructField("hydrant_violation", StringType()),
        StructField("double_parking_violation", StringType())
    ])

## Extract the violation-code mapping from Excel with pandas

In [None]:
import pandas as pd

def get_violation_codes_df():
    """Load the violation-code mapping workbook as a Spark DataFrame.

    The workbook is read with pandas (first spreadsheet row skipped), its
    four columns are renamed to snake_case, and the result is converted
    with the notebook-level ``spark`` session.

    Returns:
        Spark DataFrame with the columns ``violation_code``,
        ``violation_description``, ``manhattan_96th_st_below`` and
        ``all_other_areas``.
    """
    mapping_columns = [
        'violation_code',
        'violation_description',
        'manhattan_96th_st_below',
        'all_other_areas',
    ]

    codes_pdf = pd.read_excel("../1_Hands-on/Codes-Mapping.xlsx", skiprows=1)
    # Positional rename: the assignment fails if the sheet does not have
    # exactly four columns.
    codes_pdf.columns = mapping_columns

    return spark.createDataFrame(codes_pdf)

## Transform function

In [None]:
from pyspark.sql.functions import when, col, to_timestamp, expr, regexp_replace

def transform_csv_to_df(csv_file):
    """Read one raw CSV file and parse its ``violation_time`` column.

    Args:
        csv_file: path of the CSV file to load; a header row is expected.

    Returns:
        DataFrame with the columns of ``getSchema()``. ``violation_time`` is
        converted from its raw text form (e.g. ``"0752A"``, ``"752P"``,
        ``"07:52 AM"``) to a timestamp; it is null where the raw value is
        missing or is not a valid 12-hour clock time.
    """
    # Column names come from the explicit schema, so they are already
    # lower-case snake_case and need no renaming.
    # NOTE(review): issue_date is a DateType field and no dateFormat option
    # is passed, so Spark's default date pattern applies -- confirm the raw
    # files use it, otherwise issue_date will not parse.
    df = spark.read.csv(csv_file, header=True, schema=getSchema())

    # Spark evaluates these patterns with Java regex semantics, so group
    # references in the replacement are written $1, $2, ... (not \1).

    # Keep only digits and the A/P marker:
    # "07:52 AM" and "0752A" both become "0752A".
    compact = regexp_replace(col("violation_time"), r'[^\dAP]', "")

    # Left-pad a single-digit hour: "752A" -> "0752A".
    compact = regexp_replace(compact, r'^(\d)(\d{2}[AP])$', "0$1$2")

    # The 12-hour pattern used below has no hour 00; presumably "0015A" in
    # the raw data means 12:15 AM, so map hour 00 to 12.
    compact = regexp_replace(compact, r'^00(\d{2}[AP])$', "12$1")

    # Only well-formed hhmm + A/P values are parsed; everything else becomes
    # null rather than being handed to the timestamp parser.
    is_valid_time = compact.rlike(r'^(0[1-9]|1[0-2])[0-5]\d[AP]$')

    # "0752A" -> "07:52AM", the shape expected by the "hh:mma" pattern.
    formatted = regexp_replace(compact, r'^(\d{2})(\d{2})([AP])$', "$1:$2$3M")

    # when() without otherwise() yields null for rows failing the check.
    return df.withColumn(
        "violation_time",
        when(is_valid_time, to_timestamp(formatted, "hh:mma")),
    )

# Now we combine all dataframes into one

In [None]:
# One transformed DataFrame per raw CSV path returned by the HDFS listing.
my_dfs = [transform_csv_to_df(csvFile) for csvFile in getRawCsvDataFiles()]

In [None]:
from functools import reduce
from pyspark.sql import DataFrame

# Code-mapping lookup table (one row per violation code in the workbook).
meta_df = get_violation_codes_df()

# Stack the per-file frames pairwise into one, then left-join the lookup so
# every violation row is kept and gains the mapping columns where its
# violation_code has a match.
df = reduce(lambda stacked, nxt: stacked.unionAll(nxt), my_dfs).join(
    meta_df,
    on="violation_code",
    how="left",
)

## Write results

In [None]:
# mode("overwrite") replaces any existing output at the target path, so no
# separate `hdfs dfs -rm -r` shell step is needed (that step also printed an
# error on a first run, when there was nothing to delete).
# repartition(44) redistributes the rows into 44 partitions before writing.
df.repartition(44).write.mode("overwrite").parquet("/parkingviolations/raw_all.parquet")

## Stop Spark

In [None]:
# Stop the SparkSession created in the first cell; the notebook's Spark work
# is finished once the Parquet output has been written.
spark.stop()