In [0]:
import os
import math
import time
import requests
from datetime import datetime

from pyspark.sql.functions import col,count

In [0]:
# --- Tunables -------------------------------------------------------------
# Number of rows placed in each output chunk.
chunk_size = 10_000

# --- Source ---------------------------------------------------------------
# Google Drive file id of the transactions CSV and the URL it is fetched from.
file_id = "1AGXVlDhbMbhoGXDJG0IThnqz86Qy3hqb"
download_url = f"https://drive.google.com/uc?export=download&id={file_id}"

# --- Locations ------------------------------------------------------------
# Where the downloaded CSV is stored on the local filesystem.
local_path = "/dbfs/temp/transactions.csv"
# Directory that receives the chunked CSV output.
s3_output_path = "dbfs:/mnt/s3mount/chunk/"

In [0]:
# Ensure the directory that will hold the downloaded file exists.
# (Uses the *value* of local_path; falls back to "." when the path has no directory part.)
os.makedirs(os.path.dirname(local_path) or ".", exist_ok=True)

# Download only when no local copy exists yet; later runs reuse the file on disk.
if not os.path.exists(local_path):
    # A timeout keeps the cell from hanging forever on a stalled connection.
    r = requests.get(download_url, timeout=60)
    # Fail on an HTTP error instead of saving the error body as the CSV,
    # which the exists-check above would then reuse on every later run.
    r.raise_for_status()
    with open(local_path, "wb") as f:
        f.write(r.content)

# Explicit schema (Spark DDL string) so column types are fixed rather than inferred.
schema= """step INT,
    customer STRING,
    age INT,           
    gender STRING,
    zipcodeOri STRING,
    merchant STRING,
    zipMerchant STRING,
    category STRING,
    amount DOUBLE,
    fraud INT"""

# Read the local file: first line is the header, and single quotes are the quote character.
transactions_df = spark.read.option("header", "true").option("quote", "'").schema(schema).csv(f"file:{local_path}")

In [0]:
# Count the rows once and derive how many chunk_size-row chunks are needed.
total_rows = transactions_df.count()
# Round up so a final, partially filled chunk is still counted.
total_chunks = math.ceil(total_rows / chunk_size)
print(f"Total rows: {total_rows}, total chunks: {total_chunks}")

Total rows: 594643, total chunks: 60


In [0]:
# Attach a sequential row_index to every row so the data can be sliced into
# fixed-size chunks. zipWithIndex pairs each row with its index; after toDF()
# the pair is exposed as columns _1 (the original row) and _2 (the index),
# which are flattened back into the original columns plus row_index.
#
# The result is cached because the chunk-writing loop filters this DataFrame
# once per chunk; without caching every filter would re-read the CSV and
# redo the indexing.
indexed_df = (
    transactions_df.rdd.zipWithIndex()
    .toDF()
    .selectExpr("_1.*", "_2 as row_index")
    .cache()
)
display(indexed_df)

step,customer,age,gender,zipcodeOri,merchant,zipMerchant,category,amount,fraud,row_index
0,C1093826151,4.0,M,28007,M348934600,28007,es_transportation,4.55,0,0
0,C352968107,2.0,M,28007,M348934600,28007,es_transportation,39.68,0,1
0,C2054744914,4.0,F,28007,M1823072687,28007,es_transportation,26.89,0,2
0,C1760612790,3.0,M,28007,M348934600,28007,es_transportation,17.25,0,3
0,C757503768,5.0,M,28007,M348934600,28007,es_transportation,35.72,0,4
0,C1315400589,3.0,F,28007,M348934600,28007,es_transportation,25.81,0,5
0,C765155274,1.0,F,28007,M348934600,28007,es_transportation,9.1,0,6
0,C202531238,4.0,F,28007,M348934600,28007,es_transportation,21.17,0,7
0,C105845174,3.0,M,28007,M348934600,28007,es_transportation,32.4,0,8
0,C39858251,5.0,F,28007,M348934600,28007,es_transportation,35.4,0,9


In [0]:
# Write the data out chunk by chunk. Every chunk is appended to the same output
# directory, so re-running this cell adds the chunks again rather than replacing them.
for chunk_id in range(total_chunks):
    # Half-open row_index window [start, end) covered by this chunk.
    start = chunk_id * chunk_size
    end = start + chunk_size
    # Select the window and drop the helper index column before writing.
    chunk_df = indexed_df.filter((col("row_index") >= start) & (col("row_index") < end)).drop("row_index")
    # coalesce(1) collapses the chunk to one partition, i.e. one CSV part file per chunk.
    chunk_df.coalesce(1).write.mode("append").option("header", "true").csv(s3_output_path)
    print(f"Written chunk {chunk_id+1}/{total_chunks} to {s3_output_path}")
    # Pause one second between chunks (presumably to space out file arrival
    # for a downstream consumer — confirm the intent).
    time.sleep(1)

Written chunk 1/60 to dbfs:/mnt/s3mount/chunk/
Written chunk 2/60 to dbfs:/mnt/s3mount/chunk/
Written chunk 3/60 to dbfs:/mnt/s3mount/chunk/
Written chunk 4/60 to dbfs:/mnt/s3mount/chunk/
Written chunk 5/60 to dbfs:/mnt/s3mount/chunk/
Written chunk 6/60 to dbfs:/mnt/s3mount/chunk/
Written chunk 7/60 to dbfs:/mnt/s3mount/chunk/
Written chunk 8/60 to dbfs:/mnt/s3mount/chunk/
Written chunk 9/60 to dbfs:/mnt/s3mount/chunk/
Written chunk 10/60 to dbfs:/mnt/s3mount/chunk/
Written chunk 11/60 to dbfs:/mnt/s3mount/chunk/
Written chunk 12/60 to dbfs:/mnt/s3mount/chunk/
Written chunk 13/60 to dbfs:/mnt/s3mount/chunk/
Written chunk 14/60 to dbfs:/mnt/s3mount/chunk/
Written chunk 15/60 to dbfs:/mnt/s3mount/chunk/
Written chunk 16/60 to dbfs:/mnt/s3mount/chunk/
Written chunk 17/60 to dbfs:/mnt/s3mount/chunk/
Written chunk 18/60 to dbfs:/mnt/s3mount/chunk/
Written chunk 19/60 to dbfs:/mnt/s3mount/chunk/
Written chunk 20/60 to dbfs:/mnt/s3mount/chunk/
Written chunk 21/60 to dbfs:/mnt/s3mount/chunk/
W