In [0]:
import pyspark.sql.functions as F
from pyspark.sql.types import StructType, StructField, TimestampType, DoubleType, StringType
from pyspark.sql.window import Window
import random
from datetime import datetime, timedelta

# Schema of the synthesized trips, declared as (column name, Spark type) pairs.
# Every column is nullable.
_synthesized_field_specs = [
    ("tpep_pickup_datetime", TimestampType),
    ("tpep_dropoff_datetime", TimestampType),
    ("trip_distance", DoubleType),
    ("fare_amount", DoubleType),
    ("pickup_zip", StringType),
    ("dropoff_zip", StringType),
]
synthesized_schema = StructType([
    StructField(field_name, field_type(), True)
    for field_name, field_type in _synthesized_field_specs
])

# Read the original data from the Delta table.
# On failure, print a hint and re-raise so the cell stops with the original
# traceback. Calling exit() here would end the interpreter session instead of
# reporting the error, and would leave `df_original` undefined for later cells.
try:
    df_original = spark.read.format("delta").table("samples.nyctaxi.trips")
except Exception as e:
    print(f"Error reading Delta table: {e}. Please ensure you have access to 'samples.nyctaxi.trips'.")
    raise

# Derive the per-trip features in Spark, then collect them into a pandas
# DataFrame for the pattern analysis below.
pickup_ts = F.col("tpep_pickup_datetime")
dropoff_ts = F.col("tpep_dropoff_datetime")

# Casting a timestamp to long yields epoch seconds, so the difference is the
# trip duration in whole seconds.
trip_seconds = dropoff_ts.cast("long") - pickup_ts.cast("long")

feature_columns = [
    F.hour(pickup_ts).alias("pickup_hour"),
    # Spark's dayofweek numbers days 1 (Sunday) through 7 (Saturday).
    F.dayofweek(pickup_ts).alias("pickup_day_of_week"),
    trip_seconds.alias("trip_duration_seconds"),
    F.col("trip_distance"),
    F.col("fare_amount"),
    F.col("pickup_zip"),
    F.col("dropoff_zip"),
]

df_original_pd = df_original.select(*feature_columns).toPandas()

# Bucket the original trips by (day of week, hour) so the synthesis step can
# look up all trips for a given time slot. Each bucket is stored as a list of
# per-trip dicts in the 'trips_data' column.
def _group_to_records(group):
    """Return the rows of one (day, hour) group as a list of dicts."""
    return group.to_dict('records')


_trips_grouped = df_original_pd.groupby(['pickup_day_of_week', 'pickup_hour'])
data_by_hour = _trips_grouped.apply(_group_to_records).reset_index(name='trips_data')

# Define the first and last calendar day of the new dataset (both inclusive).
start_date = datetime(2016, 1, 1)
end_date = datetime(2016, 12, 31)

# Use a dedicated, seeded generator so a fresh "Run All" reproduces the same
# synthetic dataset without touching the global `random` state.
RANDOM_SEED = 42
rng = random.Random(RANDOM_SEED)

# Index the grouped trips by (day_of_week, hour) once, instead of filtering the
# DataFrame with a boolean mask on every hourly iteration.
trips_by_slot = {
    (int(dow), int(hr)): trips
    for dow, hr, trips in zip(
        data_by_hour['pickup_day_of_week'],
        data_by_hour['pickup_hour'],
        data_by_hour['trips_data'],
    )
}

# Create a new list to hold the synthesized records
new_records = []
current_date = start_date

# Exclusive upper bound: midnight after `end_date`, so that all 24 hours of the
# final day are generated (comparing against `end_date` itself would stop after
# hour 0 of that day).
generation_end = end_date + timedelta(days=1)

while current_date < generation_end:
    # `pickup_day_of_week` comes from Spark's dayofweek (1=Sunday .. 7=Saturday),
    # whereas datetime.weekday() is 0=Monday .. 6=Sunday. Convert to Spark's
    # numbering: isoweekday() is 1=Monday .. 7=Sunday, so `% 7 + 1` maps
    # Sunday -> 1, Monday -> 2, ..., Saturday -> 7.
    day_of_week = current_date.isoweekday() % 7 + 1
    hour = current_date.hour

    relevant_trips = trips_by_slot.get((day_of_week, hour))

    if relevant_trips:
        num_trips_to_generate = rng.randint(50, 150)

        # Sample with replacement, capped at the number of original trips in the slot.
        sampled_trips = rng.choices(relevant_trips, k=min(num_trips_to_generate, len(relevant_trips)))

        for row in sampled_trips:
            # Perturb distance and fare independently with ~10% Gaussian noise.
            new_trip_distance = row['trip_distance'] * (1 + rng.gauss(0, 0.1))
            new_fare_amount = row['fare_amount'] * (1 + rng.gauss(0, 0.1))

            # randint is inclusive on both ends: 0..59 keeps the pickup inside
            # the current hour (60 would spill into the next one).
            new_pickup_datetime = current_date + timedelta(minutes=rng.randint(0, 59))
            # Coerce to a plain Python number so timedelta accepts the value
            # even if the record holds a NumPy scalar.
            new_dropoff_datetime = new_pickup_datetime + timedelta(
                seconds=float(row['trip_duration_seconds'])
            )

            new_records.append({
                'tpep_pickup_datetime': new_pickup_datetime,
                'tpep_dropoff_datetime': new_dropoff_datetime,
                'trip_distance': new_trip_distance,
                'fare_amount': new_fare_amount,
                'pickup_zip': row['pickup_zip'],
                'dropoff_zip': row['dropoff_zip']
            })

    current_date += timedelta(hours=1)

# Create a Spark DataFrame from the synthesized records
df_synthesized_spark = spark.createDataFrame(new_records, schema=synthesized_schema)



In [0]:
# Show the synthesized Spark DataFrame in the notebook output for a visual check.
display(df_synthesized_spark)

In [0]:


%python
# Target location of the CSV export. Defined up front so the write and the
# summary message below refer to the same path (the message previously used an
# undefined `output_path` and raised NameError).
# Note: Spark writes a directory of part files at this path, not a single file.
output_path = "/Volumes/workspace/default/nyctaxi/nyctaxi_synthesized.csv"

df_synthesized_spark.write.csv(
    path=output_path,
    header=True,
    mode="overwrite"
)

record_count = df_synthesized_spark.count()
print(f"Synthesized data with {record_count} records for 12 months has been saved to '{output_path}'.")

In [0]:
%python
# Save the synthesized trips as a Delta table, overwriting any existing contents.
delta_writer = df_synthesized_spark.write.format("delta").mode("overwrite")
delta_writer.saveAsTable("workspace.default.nyctaxi")