# EDA and Cleaning

## Data Loading

In [0]:
# RUN
# import libraries needed
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, expr, dayofweek, date_format, sum
import pandas as pd
from pyspark.sql import functions as F
from pyspark.sql.functions import weekofyear
from pyspark.sql.window import Window
from pyspark.sql.types import TimestampType


In [0]:
# RUN
# Start (or reuse) the Spark session that every later cell relies on
spark = (
    SparkSession.builder
    .appName("FinalProject")
    .getOrCreate()
)

In [0]:
# RUN
# data : https://learn.microsoft.com/en-us/azure/open-datasets/dataset-taxi-yellow?tabs=pyspark#azure-databricks

# Azure storage access info
blob_account_name = "azureopendatastorage"
blob_container_name = "nyctlc"
blob_relative_path = "yellow"
# The Microsoft sample linked above uses an empty token (r"") for this open
# dataset. The previous value "r" was a literal one-character token.
blob_sas_token = r""

# Allow SPARK to read from Blob remotely
wasbs_path = 'wasbs://%s@%s.blob.core.windows.net/%s' % (blob_container_name, blob_account_name, blob_relative_path)
spark.conf.set(
  'fs.azure.sas.%s.%s.blob.core.windows.net' % (blob_container_name, blob_account_name),
  blob_sas_token)
print('Remote blob path: ' + wasbs_path)

# SPARK read parquet; this is lazy, so no data is loaded yet
df = spark.read.parquet(wasbs_path)

## Data Cleaning

In [0]:
# The full dataset is big, so the project works on a truncated frame:
# 2017 pickups only, without the columns that are not needed.
df_2017 = (
    df
    .where(col("puYear") == 2017)
    .drop(
        "vendorID", "rateCodeId", "storeAndFwdFlag",
        "paymentType", "extra", "mtaTax", "improvementSurcharge",
        "tipAmount", "tollsAmount", "fareAmount",
        "startLon", "startLat", "endLon", "endLat",
    )
)

- New York City includes different boroughs: Bronx, Brooklyn, EWR, Queens, Staten Island, and Manhattan. We want to focus only on Manhattan because it is the busiest borough.
- The original dataset from the vendor has longitude and latitude information; however, the data stored in Azure does not. Hence, I manually filter the Manhattan zones.

## 1. NYC Borough Location Analysis

In [0]:
# Zone lookup (LocationID, Borough, Zone) read from the hive metastore table
taxi_zone = spark.sql(
    "SELECT LocationID, Borough, Zone FROM `hive_metastore`.`default`.`taxi_zone_lookup`;"
)

In [0]:
# Attach Borough and Zone to every trip through its pickup location id.
# The left outer join keeps trips whose id has no row in the lookup.
joined_df = df_2017.join(
    taxi_zone,
    on=df_2017.puLocationId == taxi_zone.LocationID,
    how='left_outer',
)

# LocationID only duplicates puLocationId after the join, so remove it
joined_df = joined_df.drop(joined_df.LocationID)

In [0]:
import matplotlib.pyplot as plt
from pyspark.sql import functions as F

# Pickups per borough, largest first
borough_counts = joined_df.groupBy('Borough').count()
borough_counts_ordered = borough_counts.orderBy(F.desc('count'))

# Only one row per borough reaches the driver
ordered_data = borough_counts_ordered.collect()
borough_names = [row.Borough for row in ordered_data]
counts = [row['count'] for row in ordered_data]

# Total pickups: add up the collected counts locally instead of launching a
# second Spark aggregation. An explicit loop is used because the builtin
# `sum` is shadowed by the pyspark `sum` imported in the first cell.
total_count = 0
for borough_total in counts:
    total_count += borough_total

# One colour per bar, in the order the boroughs are drawn
fig, ax = plt.subplots(figsize=(10, 6))
borough_colors = plt.cm.Paired(range(len(borough_names)))
bars = ax.bar(borough_names, counts, color=borough_colors)

# Write each count just above its bar (va='bottom' anchors the text on the bar top)
for bar in bars:
    yval = bar.get_height()
    ax.text(bar.get_x() + bar.get_width() / 2.0, yval, f'{int(yval):,}', ha='center', va='bottom')

ax.set_xlabel('Borough')
ax.set_ylabel('Count')
ax.set_title('Distribution of Pickups by Borough')

# The legend is used to show the overall total
ax.legend([f'Total Pickups: {int(total_count):,}'], loc='upper right')

# Hide all four spines
for side in ('right', 'top', 'left', 'bottom'):
    ax.spines[side].set_visible(False)

fig.tight_layout()
plt.show()

Analysis result: Keep only the Manhattan data

In [0]:
import matplotlib.pyplot as plt
from pyspark.sql import functions as F
import numpy as np

# Pickups per borough, largest first
borough_counts = joined_df.groupBy('Borough').count()
borough_counts_ordered = borough_counts.orderBy(F.desc('count'))

# Only one row per borough reaches the driver
ordered_data = borough_counts_ordered.collect()
borough_names = [row.Borough for row in ordered_data]
counts = [row['count'] for row in ordered_data]

# Total pickups: add up the collected counts locally instead of launching a
# second Spark aggregation. An explicit loop is used because the builtin
# `sum` is shadowed by the pyspark `sum` imported in the first cell.
total_count = 0
for borough_total in counts:
    total_count += borough_total

# Colours taken from the middle of the viridis map
borough_colors = plt.cm.viridis(np.linspace(0.3, 0.7, len(borough_names)))

fig, ax = plt.subplots(figsize=(12, 8))
bars = ax.bar(borough_names, counts, color=borough_colors)

ax.set_xlabel('Borough', fontsize=14)
ax.set_ylabel('Count', fontsize=14)
ax.set_title('Distribution of Pickups by Borough', fontsize=18, fontweight='bold')

# Horizontal gridlines for readability
ax.grid(axis='y', linestyle='--', alpha=0.7)

# Write each count just above its bar
for bar in bars:
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width() / 2.0, height, f'{int(height):,}', ha='center', va='bottom', fontsize=10)

# Rotated borough names, right-aligned so they end under their bar
plt.setp(ax.get_xticklabels(), rotation=45, ha='right', fontsize=12)
ax.tick_params(axis='y', labelsize=12)

# The legend is used to show the overall total
ax.legend([f'Total: {total_count:,}'], loc='upper right', fontsize=12)

# Hide three spines and keep a light bottom spine for grounding
for side in ('right', 'top', 'left'):
    ax.spines[side].set_visible(False)
ax.spines['bottom'].set_color('#DDDDDD')

# Plot background and a thin frame around the whole figure
ax.set_facecolor('#f8f8f8')
fig.set_edgecolor('#f0f0f0')
fig.set_linewidth(1.5)

fig.tight_layout()
plt.show()


In [0]:
# Keep only trips whose pickup zone is in Manhattan
Man_df = joined_df.where(col("Borough") == "Manhattan")

## 2. Manhattan Demand Analysis

In [0]:
# Number of pickups per Manhattan zone, stored in a column named 'counts'
pickup_counts = (
    Man_df
    .groupBy('Zone')
    .count()
    .withColumnRenamed('count', 'counts')
)
display(pickup_counts)

In [0]:
# Load the taxi_zones table from the Hive metastore (default schema)
zones_geom = spark.read.table("default.taxi_zones")

In [0]:
from pyspark.sql.functions import col

# Inner join of the zone table with the per-zone pickup counts.
# NOTE(review): the next cell rebuilds zone_df with a left outer join,
# so this result is overwritten before it is used.
zone_df = zones_geom.join(
    pickup_counts,
    on=zones_geom["zone"] == pickup_counts["Zone"],
    how="inner",
)

In [0]:
# Keep every counted zone and attach the zone-table columns where names match
zone_df = pickup_counts.join(
    zones_geom,
    on=pickup_counts.Zone == zones_geom.zone,
    how='left_outer',
)

# Remove the borough column that came in with the join
zone_df = zone_df.drop(zone_df["borough"])
display(zone_df)

In [0]:
# Convert to Pandas dataframe
# (collects the joined per-zone rows of zone_df to the driver)
zone_pandas = zone_df.toPandas()

In [0]:
import geopandas as gpd
from shapely import wkt

# Parse the WKT text in 'the_geom' into shapely geometry objects
zone_pandas['geometry'] = [wkt.loads(geom_text) for geom_text in zone_pandas['the_geom']]

# Wrap the frame as a GeoDataFrame, declaring the coordinates as EPSG:4326
gdf_zones = gpd.GeoDataFrame(
    zone_pandas,
    geometry='geometry',
    crs="EPSG:4326",
)

In [0]:
import matplotlib.pyplot as plt
import geopandas as gpd
from mpl_toolkits.axes_grid1 import make_axes_locatable

# Zones to highlight: ten for the bar chart, five for the labels on the map
top_zones_10 = gdf_zones.nlargest(10, 'counts')
top_zones_5 = gdf_zones.nlargest(5, 'counts')

# One colour scale shared by the map, the bars and the colour bar, so that
# the same colour always stands for the same pickup count.
counts_min = gdf_zones['counts'].min()
counts_max = gdf_zones['counts'].max()
count_norm = plt.Normalize(vmin=counts_min, vmax=counts_max)

# Map on the left (3/4 of the width), bar chart on the right
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 10), gridspec_kw={'width_ratios': [3, 1]})

gdf_zones.plot(ax=ax1, column='counts', cmap='viridis', vmin=counts_min, vmax=counts_max)
ax1.set_title('Demand by Zone', fontdict={'fontsize': 16}, loc='center')
ax1.set_axis_off()

# Label only the top 5 zones on the map, at the centroid of each zone
for idx, row in top_zones_5.iterrows():
    geom = row['geometry']
    if geom:  # skips missing or empty geometries
        centroid = geom.centroid
        ax1.annotate(text=row['Zone'], xy=(centroid.x, centroid.y),
                     horizontalalignment='center', fontsize=10, color='black')

# Horizontal bars for the top 10 zones, coloured with the shared scale
colors = plt.cm.viridis(count_norm(top_zones_10['counts'].to_numpy()))
ax2.barh(top_zones_10['Zone'], top_zones_10['counts'], color=colors)
ax2.set_title('Top 10 Zones', fontdict={'fontsize': 16})
ax2.set_xlabel('Total Pickup Counts')
ax2.invert_yaxis()  # highest count at the top
for side in ('right', 'top', 'left', 'bottom'):
    ax2.spines[side].set_visible(False)

# Overall title for the figure
fig.suptitle('Manhattan Taxi Pickup Analysis', fontsize=24, x = 0.64, y=1.05)

# Colour bar for the shared scale. The mappable is not attached to any
# axes, so the axes to take space from is passed explicitly (ax2 is the
# axes the implicit current-axes lookup used before).
sm = plt.cm.ScalarMappable(cmap='viridis', norm=count_norm)
sm.set_array([])
cbar = fig.colorbar(sm, ax=ax2, orientation='horizontal', fraction=0.05, pad=0.05)
cbar.set_label('Pickup Counts')

# Display the figure
plt.tight_layout()
plt.show()


## 3. Pick Up Time Category Analysis

### 3.1 Time of the Day Analysis

In [0]:
# Step 1: Add an 'hour' column holding the hour of day of each pickup
from pyspark.sql.functions import hour, col, when

pickup_hour = hour(col('tpepPickupDateTime'))
Man_df_new = Man_df.withColumn('hour', pickup_hour)

In [0]:
# Step 2: Categorize the hours into time of day categories
# Every bucket is half-open, [start, end), so the boundary hours 11, 15 and
# 20 belong to the later bucket. Column.between() is inclusive on both ends
# and would assign them to the earlier bucket instead.
time_categories = [
    (col('hour') < 5, 'Midnight'),
    ((col('hour') >= 5) & (col('hour') < 11), 'Morning'),
    ((col('hour') >= 11) & (col('hour') < 15), 'Noon'),
    ((col('hour') >= 15) & (col('hour') < 20), 'Evening'),
    (col('hour') >= 20, 'Night'),
]

# Chain the (condition, label) pairs into one CASE WHEN expression;
# a row that matches no condition (null hour) gets a null category.
time_of_day_expr = when(*time_categories[0])
for condition, label in time_categories[1:]:
    time_of_day_expr = time_of_day_expr.when(condition, label)

Man_df_new = Man_df_new.withColumn('time_of_day', time_of_day_expr)



In [0]:
# Step 3: Count the trips in each time category, ordered by category name
trips_per_category = Man_df_new.groupBy('time_of_day').count()
time_of_day_counts = trips_per_category.sort('time_of_day')

In [0]:
# Step 4: Calculate the proportions
# Share of all Manhattan trips that falls into each time category
total_trips = Man_df_new.count()
share_of_total = col('count') / total_trips
time_of_day_props = time_of_day_counts.withColumn('proportion', share_of_total)

# Bring the few summary rows to the driver for plotting
time_of_day_data = time_of_day_props.collect()

# Split the rows into parallel lists (counts are kept for chart labels)
categories, proportions, counts = [], [], []
for record in time_of_day_data:
    categories.append(record['time_of_day'])
    proportions.append(record['proportion'])
    counts.append(record['count'])

# Same shares expressed as percentages
percentages = [share * 100 for share in proportions]


In [0]:
# Display label (with hour range) for every time category
time_ranges = {
    'Midnight': 'Midnight (0-5)',
    'Morning': 'Morning (5-11)',
    'Noon': 'Noon (11-15)',
    'Evening': 'Evening (15-20)',
    'Night': 'Night (20-24)'
}
categories_with_time = [time_ranges[name] for name in categories]

# Pull the largest wedge slightly out of the ring
largest_segment_index = proportions.index(max(proportions))
explode_values = [0] * len(categories)
explode_values[largest_segment_index] = 0.07

# Pie chart first; the white circle below turns it into a donut
fig, ax = plt.subplots(figsize=(8, 8))
wedges, texts, autotexts = ax.pie(
    percentages,
    labels=categories_with_time,
    autopct='%.2f%%',
    startangle=140,
    colors=plt.cm.tab20.colors,
    pctdistance=0.85,
    explode=explode_values,
)

centre_circle = plt.Circle((0, 0), 0.60, fc='white')
ax.add_artist(centre_circle)

# Equal aspect ratio so the pie is drawn as a circle
ax.axis('equal')

# Percentage labels: bold white text, vertically centred
plt.setp(autotexts, size=12, weight="bold", color="white", va="center")

# Re-apply size and colour to the label of the largest wedge
plt.setp(autotexts[largest_segment_index], size=12,  color='white')

# Total number of trips in the middle of the donut
total_trips_str = f'{total_trips:,}'  # thousands separator
ax.text(0, 0, total_trips_str, ha='center', va='center', fontsize=14, weight='bold')

ax.set_title('Distribution of Trips by Time',fontsize=18, fontweight='bold')
plt.show()

### 3.2 Day of the Week Analysis

In [0]:
from pyspark.sql.functions import date_format

# Step 1: derive two columns from the pickup timestamp
#   date        - the calendar date of the pickup
#   day_of_week - the pickup formatted with pattern 'E' (day-of-week text)
Man_df_new = (
    Man_df_new
    .withColumn('date', F.to_date('tpepPickupDateTime'))
    .withColumn('day_of_week', date_format('tpepPickupDateTime', 'E'))
)

In [0]:
# Step 2: Total and average number of trips per day of the week
trips_by_weekday = Man_df_new.groupBy('day_of_week')

# How many distinct calendar dates fall on each weekday
day_counts = trips_by_weekday.agg(F.countDistinct('date').alias('day_count'))

# Total trips per weekday, joined with the date counts to get the average
day_of_week_df = (
    trips_by_weekday
    .count()
    .withColumnRenamed('count', 'total_trips')
    .join(day_counts, 'day_of_week')
    .withColumn('avg_trips', F.col('total_trips') / F.col('day_count'))
)


In [0]:
# Bring the per-weekday summary rows to the driver
day_of_week_data = day_of_week_df.collect()

# Put the rows in calendar order, Sunday first
sorted_days = ['Sun', 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat']
sorted_data = sorted(
    day_of_week_data,
    key=lambda record: sorted_days.index(record['day_of_week']),
)

# Split the ordered rows into parallel lists for plotting
day_names, total_counts, avg_counts, proportions = [], [], [], []
for record in sorted_data:
    day_names.append(record['day_of_week'])
    total_counts.append(record['total_trips'])
    avg_counts.append(record['avg_trips'])
    proportions.append(record['total_trips'] / total_trips)

In [0]:
# One colour per weekday, shared by the bar chart and the donut
day_colors = plt.cm.Paired(range(len(sorted_days)))

# Step 3: bar chart (left) and donut chart (right) in one figure
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

# --- Left: average trips per weekday ---------------------------------------
bars = ax1.bar(day_names, avg_counts, color=day_colors)
ax1.set_title('Weekly Average')
ax1.set_xlabel('Day of week')
ax1.set_ylabel('Average number of trips')

# Write the rounded average just above each bar
for bar, label in zip(bars, avg_counts):
    height = bar.get_height()
    ax1.text(bar.get_x() + bar.get_width() / 2, height, f'{label:.0f}', ha='center', va='bottom')

# Hide all four spines of the bar chart
for side in ('right', 'top', 'left', 'bottom'):
    ax1.spines[side].set_visible(False)

# --- Right: share of total trips per weekday --------------------------------
# Pull the largest wedge slightly out of the ring
largest_segment_index = proportions.index(max(proportions))
explode_values = [0.07 if i == largest_segment_index else 0 for i in range(len(day_names))]

wedges, texts, autotexts = ax2.pie(proportions,
                                   labels=day_names,
                                   autopct='%1.1f%%',
                                   startangle=140,
                                   pctdistance=0.85,
                                   colors=day_colors,
                                   explode=explode_values)

# The donut hole and the centre text are added to ax2 explicitly instead of
# relying on whichever axes happens to be current.
centre_circle = plt.Circle((0, 0), 0.50, fc='white')
ax2.add_artist(centre_circle)
ax2.set_title('Proportion of Total Trips')

# Percentage labels: bold white text
plt.setp(autotexts, size=12, weight="bold", color = 'white')

# Total number of trips in the middle of the donut
total_trips_str = f'{total_trips:,}'
ax2.text(0, 0, f'Total\n{total_trips_str}', ha='center', va='center', fontsize=12, weight='bold')

# Overall title for both subplots
fig.suptitle('Weekly Demand Analysis', fontsize=18, fontweight='bold', y=1.05)

fig.tight_layout()
plt.show()

## 4. Passenger Analysis

In [0]:
from pyspark.sql.functions import count

# Number of trips for each reported passenger count
passenger_count_df = (
    Man_df_new
    .groupBy("passengerCount")
    .count()
    .withColumnRenamed("count", "tripCount")
)

# Bring the summary to the driver as a list of rows
passenger_count_data = passenger_count_df.collect()


In [0]:
# Show the collected (passengerCount, tripCount) rows
display(passenger_count_data)

In [0]:
# Split the collected rows into two parallel lists
passenger_counts, trip_counts = [], []
for record in passenger_count_data:
    passenger_counts.append(record['passengerCount'])
    trip_counts.append(record['tripCount'])

In [0]:
import matplotlib.pyplot as plt

# passenger_count_data holds rows with 'passengerCount' and 'tripCount'
passenger_counts = []
trip_counts = []
for record in passenger_count_data:
    passenger_counts.append(int(record['passengerCount']))
    trip_counts.append(record['tripCount'])

# Order the pairs numerically by passenger count
combined_data = sorted(zip(passenger_counts, trip_counts), key=lambda pair: pair[0])

# Back to two sequences; passenger counts become strings so the x-axis is categorical
sorted_passenger_counts, sorted_trip_counts = zip(*((str(pc), tc) for pc, tc in combined_data))

# Green for 1-6 style values, red for '0' and anything above 6
colors = ['lightgreen' if pc != '0' and int(pc) <= 6 else 'red' for pc in sorted_passenger_counts]

fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(sorted_passenger_counts, sorted_trip_counts, color=colors, edgecolor='black')

# Labels, title and a log scale so the rare values stay visible
ax.set_xlabel('Passenger Count')
ax.set_ylabel('Trip Count')
ax.set_title('Passenger Count Per Taxi', fontsize=16, fontweight='bold')
ax.set_yscale('log')

# Hide three spines and keep a light bottom spine for grounding
for side in ('right', 'top', 'left'):
    ax.spines[side].set_visible(False)
ax.spines['bottom'].set_color('#DDDDDD')

plt.show()


Analysis Result: Remove outliers - 0, 7, 8, 9, 192

In [0]:
# Drop the trips whose passenger count is one of the outlier values
is_outlier_passenger_count = col("passengerCount").isin(0, 7, 8, 9, 192)
Man_df_new = Man_df_new.filter(~is_outlier_passenger_count)

In [0]:
# Number of trips left in Man_df_new at this point
Man_df_new.count()

## 5. Trip Distance and Duration Analysis


- Goal 1: Keep only trips with a duration greater than 1 minute and less than 720 minutes (12 hours)
- Goal 2: Keep only trips with a distance greater than 0 miles and less than the 99.9th percentile

In [0]:
# Spark functions are reached through the module alias F so that the
# Python builtin `round` is not replaced by pyspark's `round` for the
# rest of the notebook.

# Trip duration in minutes: drop-off minus pick-up time, rounded to 2 decimals
pickup_seconds = F.unix_timestamp(F.col('tpepPickupDateTime'))
dropoff_seconds = F.unix_timestamp(F.col('tpepDropoffDateTime'))
Man_df_new = Man_df_new.withColumn(
    'tripDuration (min)',
    F.round((dropoff_seconds - pickup_seconds) / 60, 2))

# Average speed in mph, rounded to a whole number.
# Zero-duration trips get a null speed explicitly: with ANSI mode off the
# division already yields null, with ANSI mode on it would raise instead.
duration_hours = F.col('tripDuration (min)') / 60
Man_df_new = Man_df_new.withColumn(
    'tripSpeed (mph)',
    F.when(
        F.col('tripDuration (min)') != 0,
        F.round(F.col('tripDistance') / duration_hours).cast('integer')))

In [0]:
# Aggregates are reached through the module alias F. Importing `min` and
# `max` by name would replace the Python builtins that the chart cells
# call on plain lists (max(proportions)), breaking them on a re-run.


def _summary_stats(frame, column, suffix):
    """Return a one-row DataFrame with mean, stddev, min and max of `column`.

    The result columns are named mean_<suffix>, stddev_<suffix>,
    min_<suffix> and max_<suffix>.
    """
    value = F.col(column)
    return frame.select(
        F.mean(value).alias(f"mean_{suffix}"),
        F.stddev(value).alias(f"stddev_{suffix}"),
        F.min(value).alias(f"min_{suffix}"),
        F.max(value).alias(f"max_{suffix}"),
    )


# Basic statistics for distance, duration and speed
stats_distance = _summary_stats(Man_df_new, "tripDistance", "distance")
stats_duration = _summary_stats(Man_df_new, "tripDuration (min)", "duration")
stats_speed = _summary_stats(Man_df_new, "tripSpeed (mph)", "speed")

# Show the stats
stats_distance.show()
stats_duration.show()
stats_speed.show()

### 5.1 Duration Outlier 

In [0]:
from pyspark.sql.window import Window
from pyspark.sql.functions import percent_rank

# Percentile rank of every trip by duration.
# NOTE(review): the percentile_rank column is not read by approxQuantile below.
window = Window.orderBy(Man_df_new['tripDuration (min)'])
df_with_percentile = Man_df_new.withColumn("percentile_rank", percent_rank().over(window))

# approxQuantile's relativeError is a rank error expressed as a fraction of
# the rows. It has to be well below the 1% step between the requested
# percentiles, otherwise neighbouring percentiles cannot be told apart.
rel_error = 0.001

# tripDuration at each percentile from 0 to 100
percentiles = [i / 100 for i in range(101)]  # 0, 0.01, 0.02, ..., 1
percentile_values = df_with_percentile.stat.approxQuantile("tripDuration (min)", percentiles, rel_error)

# Print the percentiles and their corresponding tripDuration
for percentile, value in zip(percentiles, percentile_values):
    print(f"{percentile * 100:.1f} percentile value is {value}")

# Closer look at the 90th to 100th percentile: these are the last 11
# entries of the table above, so no second Spark job is needed.
detailed_percentiles = percentiles[90:]
detailed_percentile_values = percentile_values[90:]

for percentile, value in zip(detailed_percentiles, detailed_percentile_values):
    print(f"{percentile * 100:.1f} percentile value is {value}")


In [0]:
from pyspark.sql.functions import expr
# Percentile rank of every trip by duration.
# NOTE(review): the percentile_rank column is not read by approxQuantile below.
window = Window.orderBy(Man_df_new['tripDuration (min)'])
df_with_percentile = Man_df_new.withColumn("percentile_rank", percent_rank().over(window))

# Detailed percentiles for tripDuration: 98.1%, 98.2%, ..., 99.0%
detailed_percentiles = [(980 + i) / 1000 for i in range(1, 11)]

# The steps are 0.1% apart, so the rank error allowed to approxQuantile
# must be well below 0.001 for the rows to differ meaningfully.
rel_error = 0.0001
detailed_percentile_values = df_with_percentile.approxQuantile("tripDuration (min)", detailed_percentiles, rel_error)

# Print detailed percentiles and their corresponding tripDuration
for percentile, value in zip(detailed_percentiles, detailed_percentile_values):
    print(f"{percentile * 100:.1f} percentile value is {value}")

### 5.2 Distance Outlier 

In [0]:
from pyspark.sql.window import Window
from pyspark.sql.functions import percent_rank

# Percentile rank of every trip by distance.
# NOTE(review): the percentile_rank column is not read by approxQuantile below.
window = Window.orderBy(Man_df_new['tripDistance'])
df_with_percentile = Man_df_new.withColumn("percentile_rank", percent_rank().over(window))

# approxQuantile's relativeError is a rank error expressed as a fraction of
# the rows. It has to be well below the 1% step between the requested
# percentiles, otherwise neighbouring percentiles cannot be told apart.
rel_error = 0.001

# tripDistance at each percentile from 0 to 100
percentiles = [i / 100 for i in range(101)]  # 0, 0.01, 0.02, ..., 1
percentile_values = df_with_percentile.stat.approxQuantile("tripDistance", percentiles, rel_error)

# Print the percentiles and their corresponding tripDistance
for percentile, value in zip(percentiles, percentile_values):
    print(f"{percentile * 100:.1f} percentile value is {value}")

In [0]:
from pyspark.sql.functions import expr

# Detailed percentiles for tripDistance: 98.1%, 98.2%, ..., 99.0%
# NOTE(review): the section goal mentions the 99.9th percentile, which this
# range does not reach - confirm whether the range should extend to 0.999.
detailed_percentiles = [(980 + i) / 1000 for i in range(1, 11)]

# The steps are 0.1% apart, so the rank error allowed to approxQuantile
# must be well below 0.001 for the rows to differ meaningfully.
rel_error = 0.0001

# Read the column straight from Man_df_new so this cell does not depend on
# a df_with_percentile left over from whichever cell ran last.
detailed_percentile_values = Man_df_new.approxQuantile("tripDistance", detailed_percentiles, rel_error)

# Print detailed percentiles and their corresponding tripDistance
for percentile, value in zip(detailed_percentiles, detailed_percentile_values):
    print(f"{percentile * 100:.1f} percentile value is {value}")


### 5.3 Speed Outlier

In [0]:
from pyspark.sql.window import Window
from pyspark.sql.functions import percent_rank

# Percentile rank of every trip by speed.
# NOTE(review): the percentile_rank column is not read by approxQuantile below.
window = Window.orderBy(Man_df_new['tripSpeed (mph)'])
df_with_percentile = Man_df_new.withColumn("percentile_rank", percent_rank().over(window))

# approxQuantile's relativeError is a rank error expressed as a fraction of
# the rows. It has to be well below the 1% step between the requested
# percentiles, otherwise neighbouring percentiles cannot be told apart.
rel_error = 0.001

# tripSpeed at each percentile from 0 to 100
percentiles = [i / 100 for i in range(101)]  # 0, 0.01, 0.02, ..., 1
percentile_values = df_with_percentile.stat.approxQuantile("tripSpeed (mph)", percentiles, rel_error)

# Print the percentiles and their corresponding tripSpeed
for percentile, value in zip(percentiles, percentile_values):
    print(f"{percentile * 100:.1f} percentile value is {value}")

In [0]:
from pyspark.sql.functions import expr

# Detailed percentiles for tripSpeed: 98.1%, 98.2%, ..., 99.0%
detailed_percentiles = [(980 + i) / 1000 for i in range(1, 11)]

# The steps are 0.1% apart, so the rank error allowed to approxQuantile
# must be well below 0.001 for the rows to differ meaningfully.
rel_error = 0.0001

# Read the column straight from Man_df_new so this cell does not depend on
# a df_with_percentile left over from whichever cell ran last.
detailed_percentile_values = Man_df_new.approxQuantile("tripSpeed (mph)", detailed_percentiles, rel_error)

# Print detailed percentiles and their corresponding tripSpeed
for percentile, value in zip(detailed_percentiles, detailed_percentile_values):
    print(f"{percentile * 100:.1f} percentile value is {value}")

### 5.4 Trip Outlier Count 

In [0]:
import matplotlib.pyplot as plt

# The three trip metrics needed for the outlier plots
query = """
SELECT `tripDistance`, `tripDuration (min)`, `tripSpeed (mph)`
FROM Man_df_new
"""

# Register the DataFrame under the view name the query refers to
Man_df_new.createOrReplaceTempView("Man_df_new")

# Run the query and bring the result to the driver as a pandas DataFrame
trip_metrics = spark.sql(query)
plot_data = trip_metrics.toPandas()


In [0]:
# Three boxplots side by side: distance, duration and speed
fig, axs = plt.subplots(1, 3, figsize=(18, 6))

# Speed is restricted to strictly positive values before plotting;
# distance and duration are plotted as they are.
all_speeds = plot_data['tripSpeed (mph)']
filtered_speeds = all_speeds[all_speeds > 0]

panels = [
    (plot_data['tripDistance'], 'Trip Distance (miles)'),
    (plot_data['tripDuration (min)'], 'Trip Duration (min)'),
    (filtered_speeds, 'Trip Speed (mph)'),
]

# Every panel gets the same axis labels and a logarithmic y-axis
for ax, (values, title) in zip(axs, panels):
    ax.boxplot(values)
    ax.set_title(title)
    ax.set_xlabel('Dataset')
    ax.set_ylabel('Value')
    ax.set_yscale('log')

plt.tight_layout()
plt.show()

In [0]:
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

# Enable Arrow-based columnar data transfers; this makes toPandas() faster for large datasets
spark.conf.set("spark.sql.execution.arrow.enabled", "true")

# Draw a 1% sample without replacement. The seed is fixed so that re-running
# the notebook on unchanged input plots the same sample.
SAMPLE_SEED = 42
sampled_data = Man_df_new.sample(withReplacement=False, fraction=0.01, seed=SAMPLE_SEED)

# Bring the sample to the driver as a pandas DataFrame
sampled_pd = sampled_data.toPandas()

# Coordinates for the 3D scatter plots drawn in the following cells
# (each of those cells creates its own figure, so none is created here)
x = sampled_pd['tripDistance']
y = sampled_pd['tripDuration (min)']
z = sampled_pd['tripSpeed (mph)']

In [0]:
import numpy as np
# 3D scatter of the sampled trips: distance (x), duration (y), speed (z)
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')

# Colour the points by speed, normalised to the sample's min..max
colors = sampled_pd['tripSpeed (mph)']
norm = plt.Normalize(colors.min(), colors.max())

sc = ax.scatter(x, y, z, c=colors, cmap='viridis', norm=norm, edgecolors='w', s=50)

# Colour bar that maps the point colours back to speed values
cbar = fig.colorbar(sc, shrink=0.5, aspect=20, pad=0.1)
cbar.set_label('Trip Speed (mph)')

ax.set_xlabel('Trip Distance (miles)', fontsize=12, labelpad=10)
ax.set_ylabel('Trip Duration (min)', fontsize=12, labelpad=10)
ax.set_zlabel('Trip Speed (mph)', fontsize=12, labelpad=10)

# White background panes. The axes are reached through ax.xaxis / ax.yaxis /
# ax.zaxis because the w_xaxis / w_yaxis / w_zaxis aliases were deprecated
# and then removed from Matplotlib.
for axis in (ax.xaxis, ax.yaxis, ax.zaxis):
    axis.set_pane_color((1.0, 1.0, 1.0, 1.0))

ax.set_title('Sampled Trip Characteristics', fontsize=16)

plt.show()

In [0]:
import numpy as np
# Same 3D scatter of the sampled trips, with the default pane colours
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')

# Point colour follows speed, scaled between the sample's lowest and highest value
colors = sampled_pd['tripSpeed (mph)']
speed_low, speed_high = colors.min(), colors.max()
norm = plt.Normalize(speed_low, speed_high)

sc = ax.scatter(
    x, y, z,
    c=colors,
    cmap='viridis',
    norm=norm,
    edgecolors='w',
    s=50,
)

# Colour bar for the speed scale
cbar = fig.colorbar(sc, shrink=0.5, aspect=20, pad=0.1)
cbar.set_label('Trip Speed (mph)')

# Axis labels share one text style
label_style = dict(fontsize=12, labelpad=10)
ax.set_xlabel('Trip Distance (miles)', **label_style)
ax.set_ylabel('Trip Duration (min)', **label_style)
ax.set_zlabel('Trip Speed (mph)', **label_style)

ax.set_title('Sampled Trip Characteristics', fontsize=16)

plt.show()

In [0]:

# Column handles reused by the filters below
duration_col = col("tripDuration (min)")
distance_col = col("tripDistance")
speed_col = col("tripSpeed (mph)")

# Duration: shorter than 1 minute or longer than 12 hours (720 minutes)
outliers_duration = Man_df_new.filter((duration_col < 1) | (duration_col > 720))
outliers_count = outliers_duration.count()
print(f"Number of duration outliers (trips over 12 hours or less than 1 min): {outliers_count}")

# Duration, stricter upper bound: shorter than 1 minute or longer than 45 minutes
outliers_duration_2 = Man_df_new.filter((duration_col < 1) | (duration_col > 45))
outliers_count = outliers_duration_2.count()
print(f"Number of duration outliers (less than 1 min or over 45 min): {outliers_count}")

#############################################################################################

# Distance: below 0 miles or above 12 miles
# NOTE(review): the section goal keeps distances greater than 0, but
# zero-mile trips are not counted as outliers here - confirm the bound.
outliers_distance = Man_df_new.filter((distance_col < 0) | (distance_col > 12))
outliers_count = outliers_distance.count()
print(f"Number of distance outliers (less than 0 miles or over 12 miles): {outliers_count}")

#############################################################################################

# Speed: below 1 mph or above 30 mph
outliers_speed = Man_df_new.filter((speed_col < 1) | (speed_col > 30))
outliers_count = outliers_speed.count()
print(f"Number of speed outliers (less than 1 mph and greater than 30 mph): {outliers_count}")


## 6. Total Fare Analysis

In [0]:
# Aggregates are reached through the module alias F. Importing `min` and
# `max` by name would replace the Python builtins that the chart cells
# call on plain lists, breaking them on a re-run.
total_amount = F.col("totalAmount")

# Mean, standard deviation, minimum and maximum of the total fare
stats_Amount = Man_df_new.select(
    F.mean(total_amount).alias("mean_Amount"),
    F.stddev(total_amount).alias("stddev_Amount"),
    F.min(total_amount).alias("min_Amount"),
    F.max(total_amount).alias("max_Amount"),
    )

stats_Amount.show()

In [0]:
from pyspark.sql.window import Window
from pyspark.sql.functions import percent_rank

# Percentile rank of every trip by total amount.
# NOTE(review): the percentile_rank column is not read by approxQuantile below.
window = Window.orderBy(Man_df_new['totalAmount'])
df_with_percentile = Man_df_new.withColumn("percentile_rank", percent_rank().over(window))

# approxQuantile's relativeError is a rank error expressed as a fraction of
# the rows. It has to be well below the 1% step between the requested
# percentiles, otherwise neighbouring percentiles cannot be told apart.
rel_error = 0.001

# totalAmount at each percentile from 0 to 100
percentiles = [i / 100 for i in range(101)]  # 0, 0.01, 0.02, ..., 1
percentile_values = df_with_percentile.stat.approxQuantile("totalAmount", percentiles, rel_error)

# Print the percentiles and their corresponding totalAmount
for percentile, value in zip(percentiles, percentile_values):
    print(f"{percentile * 100:.1f} percentile value is {value}")

In [0]:
from pyspark.sql.functions import expr

# Detailed percentiles for totalAmount: 98.1%, 98.2%, ..., 99.0%
detailed_percentiles = [(980 + i) / 1000 for i in range(1, 11)]

# The steps are 0.1% apart, so the rank error allowed to approxQuantile
# must be well below 0.001 for the rows to differ meaningfully.
rel_error = 0.0001

# Read the column straight from Man_df_new so this cell does not depend on
# a df_with_percentile left over from whichever cell ran last.
detailed_percentile_values = Man_df_new.approxQuantile("totalAmount", detailed_percentiles, rel_error)

# Print detailed percentiles and their corresponding totalAmount
for percentile, value in zip(detailed_percentiles, detailed_percentile_values):
    print(f"{percentile * 100:.1f} percentile value is {value}")