In [None]:
from pyspark.sql import SparkSession

In [None]:
import os
import findspark

# Use the same Python interpreter for the Spark workers (PYSPARK_PYTHON) and
# the driver (PYSPARK_DRIVER_PYTHON).
# NOTE(review): the `from pyspark.sql import SparkSession` cell above runs
# before findspark.init(); confirm pyspark is importable without findspark.
for interpreter_var in ('PYSPARK_PYTHON', 'PYSPARK_DRIVER_PYTHON'):
    os.environ[interpreter_var] = '/usr/bin/python3'

# Locate the Spark installation and add pyspark to sys.path.
findspark.init()

In [None]:
# Reuse an already-running session if there is one, otherwise start a new one.
spark = (
    SparkSession.builder
    .appName("lets do some querying")
    .getOrCreate()
)

# Raise the cap on how many fields Spark includes when it renders plans and
# schemas as strings (debug output).
spark.conf.set("spark.sql.debug.maxToStringFields", 1000)

In [None]:
# Load the raw parking-violations dataset. Every analysis cell below starts
# from `df`, so it is marked for caching; cache() is lazy, meaning the data is
# only materialised by the first action that touches it.
df = spark.read.parquet("/parkingviolations/raw_all.parquet").cache()

In [None]:
# Print the column names and types of the loaded dataset for reference.
df.printSchema()

## Question 1: What is the largest amount paid in violations, and by which plate ID?

In [None]:
# Drop placeholder plate values that do not identify a vehicle.
filtered_df = df.where(~df.plate_id.isin("BLANKPLATE", "N/A"))

# Number of tickets per (plate, vehicle make) pair.
violations_count = (
    filtered_df
    .groupBy("plate_id", "vehicle_make")
    .count()
)

# The ten pairs with the most tickets, highest first.
top_violated_plates = (
    violations_count
    .sort(violations_count["count"].desc())
    .limit(10)
)

top_violated_plates.show()

In [None]:
# Import Spark's `sum` under an alias: a bare `from ... import sum` would
# replace Python's builtin sum() for every later cell in this kernel session.
from pyspark.sql.functions import col, count, format_number, sum as _sum

# Per violation type: summed `all_other_areas` value and number of tickets.
grouped_df = df.groupBy("violation_code", "violation_description").agg(
    _sum("all_other_areas").alias("total_fine"),
    count("*").alias("violation_count"),
)

# Sort on the numeric total first, and only then turn it into a
# thousands-separated string for display (a formatted string would not sort
# numerically).
sorted_df = grouped_df.orderBy(col("total_fine").desc()) \
    .withColumn("total_fine", format_number(col("total_fine"), 0))

sorted_df.show(truncate=False)

## Question 2: At what hour of the day is it most likely to get a ticket?

In [None]:
from pyspark.sql.functions import hour, col

# Extract the hour of day from the issue timestamp.
df_with_hour = df.withColumn("issue_hour", hour(col("issue_datetime")))

# Number of tickets issued in each hour.
hourly_ticket_counts = df_with_hour.groupBy("issue_hour").count()

# Hour with the highest number of tickets issued.
# Rows whose timestamp produced no hour (null) are skipped here because null
# is not an hour of the day; ties are broken on the hour itself so repeated
# runs give the same answer.
most_likely_hour = (
    hourly_ticket_counts
    .filter(col("issue_hour").isNotNull())
    .orderBy(col("count").desc(), col("issue_hour").asc())
    .first()
)

# first() returns None when there is no row to return, so guard before
# subscripting.
if most_likely_hour is None:
    most_likely_hour_of_day = None
    ticket_count_at_peak_hour = 0
else:
    most_likely_hour_of_day = most_likely_hour["issue_hour"]
    ticket_count_at_peak_hour = most_likely_hour["count"]

# Total number of tickets, used as the denominator for the percentages.
total_tickets = df_with_hour.count()

# Share of all tickets that falls into each hour, in percent.
hourly_ticket_counts = hourly_ticket_counts.withColumn(
    "likeliness_percentage",
    (col("count") / total_tickets) * 100
)

# Show results
if most_likely_hour is None:
    print("No tickets with a usable issue_datetime were found.")
else:
    print(f"Most likely hour to get a ticket is: {most_likely_hour_of_day}")
    print(f"The number of tickets issued during that hour: {ticket_count_at_peak_hour}")
hourly_ticket_counts.orderBy(hourly_ticket_counts.likeliness_percentage.desc()) \
    .show(24, truncate=False)


## Question 3: What percentage of the total violation "income" comes from the top 10% of violators (the 10% of plates with the most tickets)?

In [None]:
from pyspark.sql.functions import col, count, desc, sum as _sum

# Per plate: number of tickets and summed fine amount.
# Placeholder plate values that do not identify a vehicle are excluded.
# Aggregating the fine here means the full dataset does not have to be joined
# back later just to sum it.
violator_ticket_counts = (
    df.filter((df.plate_id != "BLANKPLATE") & (df.plate_id != "N/A"))
    .groupBy("plate_id")
    .agg(
        count("*").alias("count"),
        _sum("fine_amount").alias("plate_total_fine"),
    )
)

# Number of plates that make up the top 10% (rounded down).
total_violators = violator_ticket_counts.count()
top_10_percent_threshold = int(total_violators * 0.10)

# The top 10% of plates by ticket count. The secondary sort on plate_id makes
# the cutoff deterministic when several plates share the same count.
top_10_percent_violators = (
    violator_ticket_counts
    .orderBy(desc("count"), col("plate_id").asc())
    .limit(top_10_percent_threshold)
)

# Ticket rows belonging to the top 10% violators. This is lazy and is not
# evaluated in this cell; it is kept for ad-hoc inspection.
top_10_percent_df = df.join(
    top_10_percent_violators.select("plate_id", "count"),
    on="plate_id",
    how="inner",
)

# Total fine amount for the top 10% violators (None if that set is empty).
total_fine_top_10_percent = top_10_percent_violators \
    .agg(_sum("plate_total_fine").alias("total_fine")).collect()[0]["total_fine"]

# Total fine amount over the whole dataset, placeholder plates included.
total_fine_all = df.agg(_sum("fine_amount").alias("total_fine")).collect()[0]["total_fine"]

# Percentage of all fines attributable to the top 10%.
# sum() over no rows (or only nulls) yields None, so guard both operands
# instead of failing with a TypeError / ZeroDivisionError.
if not total_fine_all:
    percentage_fine_top_10_percent = None
    print("No fine amounts found; the percentage cannot be computed.")
else:
    percentage_fine_top_10_percent = ((total_fine_top_10_percent or 0) / total_fine_all) * 100
    print(percentage_fine_top_10_percent)

## Question 4: Top 10 most issued violations and how much money they bring in

In [None]:
from pyspark.sql.functions import col, count, sum as _sum

# For every violation type compute how often it was issued and how much fine
# money it adds up to, then keep the 10 most frequently issued ones.
# The ranking is by number of tickets ("most issued"), not by revenue;
# total_fine is only the tie-breaker and the value reported per violation.
violation_counts = (
    df.groupBy("violation_code", "violation_description")
    .agg(
        count("*").alias("violation_count"),
        _sum("fine_amount").alias("total_fine"),
    )
    .orderBy(col("violation_count").desc(), col("total_fine").desc())
    .limit(10)  # Select the top 10 violations
)


In [None]:
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

# Extracting data for plotting
# Collect the rows once. Running a separate Spark job per column would
# recompute the ordered/limited result each time, which is slower and does not
# guarantee that the three lists line up row by row.
plot_rows = violation_counts.select(
    "violation_code", "violation_description", "total_fine"
).collect()
codes = [row["violation_code"] for row in plot_rows]
descriptions = [row["violation_description"] for row in plot_rows]
total_fines = [row["total_fine"] for row in plot_rows]

# Function to format large numbers
def format_large_numbers(x, pos):
    """Format an axis tick value with a K/M/B suffix.

    Args:
        x: The tick value to format.
        pos: The tick position; required by the matplotlib ``FuncFormatter``
            callback signature and not used here.

    Returns:
        The value scaled to thousands (K), millions (M) or billions (B),
        with thousands separators and at most one decimal place; a trailing
        ".0" is dropped. Values below 1,000 in magnitude get no suffix.
    """
    magnitude = abs(x)  # choose the suffix by size so negatives scale too
    for threshold, suffix in ((1e9, 'B'), (1e6, 'M'), (1e3, 'K')):
        if magnitude >= threshold:
            scaled = x / threshold
            break
    else:
        scaled, suffix = x, ''

    # Keep one decimal so ticks such as 2.5M are not rounded to "2M".
    text = '{:,.1f}'.format(scaled)
    if text.endswith('.0'):
        text = text[:-2]
    return text + suffix

# Plotting
# Bars are placed at numeric positions and labelled afterwards. Passing the
# descriptions directly as categories would merge bars that share a
# description and fails when a description is missing.
bar_positions = list(range(len(descriptions)))
bar_labels = [d if d is not None else "(no description)" for d in descriptions]
bar_values = [float(v) if v is not None else 0.0 for v in total_fines]

fig, ax = plt.subplots(figsize=(10, 8))
ax.barh(bar_positions, bar_values, color='skyblue')
ax.set_yticks(bar_positions)
ax.set_yticklabels(bar_labels)
ax.set_xlabel('Total Fine Amount')
ax.set_ylabel('Violation Description')
ax.set_title('Top 10 Most Issued Violations and Their Total Fine Amounts')
ax.invert_yaxis()  # Put the first row of the result at the top of the chart
ax.xaxis.set_major_formatter(ticker.FuncFormatter(format_large_numbers))  # Abbreviate large x-axis values
plt.show()

In [None]:
# Shut down the Spark session; cells above cannot be re-run after this
# without creating the session again.
spark.stop()