In [41]:
import boto3
import pandas as pd
from faker import Faker
import random
from io import StringIO
from datetime import datetime

In [42]:
# Shared module-level helpers used by the cells below:
#   fake - Faker instance that supplies the generated field values
#   s3   - boto3 S3 client used for the upload; no credentials or region are
#          passed explicitly here
fake = Faker()
s3 = boto3.client('s3')

In [43]:
def generate_cdc_order_data(num_rows=500, seed=None):
    """Build a DataFrame of synthetic CDC-style order records.

    Every row contains UUIDs for the order, customer and product (from the
    module-level ``fake`` instance), an ``order_date`` from
    ``fake.date_this_year()``, a random status and country, a quantity in
    1..5, a price drawn from 10.0..500.0 and rounded to 2 decimals,
    ``total_amount = round(quantity * price, 2)``, and a ``cdc_timestamp``
    read from ``datetime.now()`` when the row is built.

    Args:
        num_rows: Number of rows to generate. Zero or a negative value
            produces an empty frame that still carries every column, so a
            CSV written from it keeps its header.
        seed: Optional seed for reproducible output. When provided it seeds
            both the global ``random`` module and the ``fake`` instance
            (a side effect on shared state). ``order_date`` and
            ``cdc_timestamp`` still depend on the current clock.

    Returns:
        pandas.DataFrame with one row per generated order.
    """
    columns = [
        'order_id', 'customer_id', 'order_date', 'status', 'country',
        'product_id', 'quantity', 'price', 'total_amount', 'cdc_timestamp',
    ]

    if seed is not None:
        random.seed(seed)
        fake.seed_instance(seed)

    data = []
    for _ in range(num_rows):
        order = {
            'order_id': fake.uuid4(),
            'customer_id': fake.uuid4(),
            'order_date': fake.date_this_year(),
            'status': random.choice(['CREATED', 'SHIPPED', 'DELIVERED', 'CANCELLED']),
            'country': random.choice(['USA', 'CANADA', 'MEXICO', 'BRAZIL', 'ARGENTINA', 'CHILE', 'PERU']),
            'product_id': fake.uuid4(),
            'quantity': random.randint(1, 5),
            'price': round(random.uniform(10.0, 500.0), 2),
            'cdc_timestamp': datetime.now(),
        }
        # Derived from the rounded price so the stored columns stay consistent.
        order['total_amount'] = round(order['quantity'] * order['price'], 2)
        data.append(order)

    # Passing the column list fixes the column order and keeps the schema
    # intact even when no rows were generated.
    return pd.DataFrame(data, columns=columns)

In [44]:
def upload_to_s3(bucket_name, file_name, df):
    """Write ``df`` to S3 as a CSV object and report the destination.

    Args:
        bucket_name: Target S3 bucket.
        file_name: Object key to write inside the bucket.
        df: DataFrame to serialize; the index is not included in the CSV.

    Uses the module-level ``s3`` client and prints the resulting S3 URI.
    """
    # With no target path, to_csv hands back the CSV text directly.
    csv_text = df.to_csv(index=False)
    s3.put_object(
        Bucket=bucket_name,
        Key=file_name,
        Body=csv_text,
    )
    print(f"Data uploaded to s3://{bucket_name}/{file_name}")

# Destination of the generated CSV in S3
bucket_name = 'harrytestz'
file_name = 'orders/fake_cdc_order_data_2.csv'

# Generate the synthetic orders, then push them to the bucket
df_cdc_order_data = generate_cdc_order_data(num_rows=4000)
upload_to_s3(bucket_name, file_name, df_cdc_order_data)

Data uploaded to s3://harrytestz/orders/fake_cdc_order_data_2.csv
