## Extract from Postgres

In [32]:
import csv
import os

import boto3
import psycopg2
from tenacity import RetryError, retry, stop_after_attempt, wait_exponential

In [35]:
# Connection settings for the source Postgres database.
database = 'fire_incidents_db'
# The credentials can be overridden through environment variables so that real
# secrets never have to be written into the notebook. The fallbacks keep the
# values this cell previously hardcoded, so existing runs behave the same.
user_name = os.environ.get('FIRE_INCIDENTS_DB_USER', 'root')
pwd = os.environ.get('FIRE_INCIDENTS_DB_PASSWORD', 'root')
host_name = 'fire_incidents_db_container'
port_number = 5432

# Source tables to export.
fire_incidents_tbl_name = 'fire_incidents_tbl'
traffic_tbl_name = 'nyc_traffic_tbl'

# Labels used to build the exported CSV file names.
fire_incidents_data_name = 'nyc_fire_incidents_data'
traffic_data_name = 'nyc_traffic_data'

In [17]:
def export_data_to_csv(database, user_name, pwd, host_name, port_number, tbl_name, data_name):
    """Dump every row of a Postgres table to a CSV file with a header row.

    Connects with psycopg2, runs ``SELECT * FROM <tbl_name>`` and writes the
    column names followed by all rows to
    ``./temp_csv_files/exported_<data_name>.csv``.

    Args:
        database: Name of the database to connect to.
        user_name: Database user.
        pwd: Password for ``user_name``.
        host_name: Database host.
        port_number: Database port.
        tbl_name: Table to export. It is interpolated into the SQL text
            as-is, so it must come from trusted configuration, never from
            user input.
        data_name: Label used to build the output file name.

    Any exception raised while querying or writing propagates to the caller,
    but only after the cursor and the connection have been closed.
    """
    conn = psycopg2.connect(
        dbname=database,
        user=user_name,
        password=pwd,
        host=host_name,
        port=port_number
    )
    # try/finally guarantees the connection and cursor are released even when
    # the query fails or the output file cannot be opened.
    try:
        cursor = conn.cursor()
        try:
            cursor.execute(f"SELECT * FROM {tbl_name}")

            with open(f'./temp_csv_files/exported_{data_name}.csv', 'w', newline='') as file:
                writer = csv.writer(file)
                # The first element of each cursor.description entry is the column name.
                writer.writerow([column[0] for column in cursor.description])  # Write headers
                writer.writerows(cursor.fetchall())  # Write data
        finally:
            cursor.close()
    finally:
        conn.close()

In [38]:
# Export the NYC fire incidents table to its CSV file.
export_data_to_csv(
    database=database,
    user_name=user_name,
    pwd=pwd,
    host_name=host_name,
    port_number=port_number,
    tbl_name=fire_incidents_tbl_name,
    data_name=fire_incidents_data_name,
)

In [39]:
# Export the NYC traffic table to its CSV file.
export_data_to_csv(
    database=database,
    user_name=user_name,
    pwd=pwd,
    host_name=host_name,
    port_number=port_number,
    tbl_name=traffic_tbl_name,
    data_name=traffic_data_name,
)

## Upload CSV File to S3

In [27]:
# Build the S3 client from temporary credentials obtained by assuming the
# project's IAM role through STS.
sts_client = boto3.client('sts')

# Ask STS for a session under the S3 access role.
assumed_role = sts_client.assume_role(
    RoleSessionName="MyS3Session",
    RoleArn="arn:aws:iam::564001313146:role/S3AccessRoleForNYCFireIncidentsProj",
)

# The response carries the access key, secret key and session token that the
# S3 client needs.
credentials = assumed_role['Credentials']
s3_client = boto3.client(
    's3',
    aws_session_token=credentials['SessionToken'],
    aws_secret_access_key=credentials['SecretAccessKey'],
    aws_access_key_id=credentials['AccessKeyId'],
)


In [28]:
# Dataset labels; each one corresponds to ./temp_csv_files/exported_<label>.csv
data_name = [
    'nyc_fire_incidents_data',
    'nyc_traffic_data',
]

In [33]:
@retry(wait=wait_exponential(multiplier=2, min=2, max=16), stop=stop_after_attempt(5))
def _upload_one_to_s3(name):
    """Upload a single exported CSV to the S3 bucket, retrying on failure.

    Up to 5 attempts are made with exponential backoff between them (waits
    are kept between 2 and 16 seconds). When every attempt fails, tenacity
    raises ``RetryError``.
    """
    file_name = f'exported_{name}.csv'
    s3_client.upload_file(f'./temp_csv_files/{file_name}', 'nyc-fire-incidents-s3', file_name)


def upload_to_s3():
    """Upload every exported CSV listed in ``data_name`` to S3.

    Retries are applied per file rather than around the whole loop, so a
    failure on a later file does not re-upload the files that already
    succeeded, and each file gets its own retry budget.

    Raises:
        tenacity.RetryError: If a file still fails after all its attempts.
    """
    for name in data_name:
        _upload_one_to_s3(name)
        print("File uploaded successfully!")

In [34]:
try:
    upload_to_s3()
    print("Uploaded Files to S3!")

except RetryError as e:
    # The upload is retried with tenacity, which raises RetryError once the
    # attempts are exhausted. The previous handler referenced `requests`,
    # which is never imported, so any failure surfaced as a NameError instead.
    # last_attempt holds the outcome of the final try; report its exception.
    print(f"Failed to upload to S3: {e.last_attempt.exception()!r}")

File uploaded successfully!
File uploaded successfully!
Uploaded Files to S3!


## Load Data from S3 to Redshift

In [None]:
-- Template COPY statement: loads a CSV object from S3 into a Redshift table.
-- The table name, the S3 bucket/key and the IAM role ARN are placeholders
-- and must be replaced before this statement is run.
COPY your_redshift_table
FROM 's3://your_bucket_name/exported_data.csv'
IAM_ROLE 'arn:aws:iam::your_account_id:role/your_redshift_role'
CSV
-- Skip the first line of the file (the header row).
IGNOREHEADER 1;

In [None]:
import psycopg2
from psycopg2 import sql

# Define your Redshift credentials
# NOTE(review): every value below is a placeholder and must be replaced
# before running; avoid committing a real password into the notebook.
redshift_host = "your-redshift-cluster-endpoint"
redshift_port = 5439  # Default Redshift port
redshift_dbname = "your_database_name"
redshift_user = "your_username"
redshift_password = "your_password"

# Define your COPY command parameters
copy_command = """
COPY your_redshift_table
FROM 's3://your_bucket_name/exported_data.csv'
IAM_ROLE 'arn:aws:iam::your_account_id:role/your_redshift_role'
CSV
IGNOREHEADER 1;
"""

# Bind both names before the try block: if connect() or cursor() raises, the
# finally clause must still be able to test them without a NameError that
# would hide the real error.
connection = None
cursor = None

try:
    # Establish a connection to Redshift
    connection = psycopg2.connect(
        dbname=redshift_dbname,
        user=redshift_user,
        password=redshift_password,
        host=redshift_host,
        port=redshift_port
    )
    connection.autocommit = True  # Auto-commit for COPY command

    # Create a cursor object to execute the query
    cursor = connection.cursor()

    # Execute the COPY command
    cursor.execute(sql.SQL(copy_command))

    print("COPY command executed successfully!")
except Exception as e:
    print(f"Error: {e}")
finally:
    # Clean up: close only what was actually opened.
    if cursor is not None:
        cursor.close()
    if connection is not None:
        connection.close()
        print("Connection closed.")

## DAG

In [None]:
from airflow import DAG
from airflow.operators.python import PythonOperator
from airflow.providers.postgres.operators.postgres import PostgresOperator
from airflow.providers.amazon.aws.hooks.s3 import S3Hook
from airflow.providers.amazon.aws.hooks.redshift import RedshiftHook
from datetime import datetime

# Define the DAG
default_args = {'start_date': datetime(2025, 3, 22)}
dag = DAG(
    'postgres_to_redshift',
    default_args=default_args,
    schedule_interval='@daily',
    # Every run exports the full tables, so backfilling one run per day since
    # start_date would only repeat the same export and load.
    catchup=False,
)


def _export_all_tables():
    """Export every configured source table to its CSV file.

    PythonOperator calls its callable without arguments by default, while
    export_data_to_csv requires seven; this wrapper supplies them from the
    notebook's configuration and covers both tables.
    """
    for tbl_name, csv_label in (
        (fire_incidents_tbl_name, fire_incidents_data_name),
        (traffic_tbl_name, traffic_data_name),
    ):
        export_data_to_csv(
            database, user_name, pwd, host_name, port_number, tbl_name, csv_label
        )


# Task 1: Export PostgreSQL Data
export_task = PythonOperator(
    task_id='export_postgres_to_csv',
    python_callable=_export_all_tables,
    dag=dag,
)

# Task 2: Upload CSV to S3
upload_task = PythonOperator(
    task_id='upload_csv_to_s3',
    python_callable=upload_to_s3,
    dag=dag,
)

# Task 3: Load to Redshift
# NOTE(review): the table name, S3 path and IAM role in this statement are
# still placeholders and do not match the files uploaded by Task 2 — replace
# them before enabling the DAG.
redshift_task = PostgresOperator(
    task_id='load_to_redshift',
    postgres_conn_id='redshift_default',
    sql="""
        COPY your_redshift_table
        FROM 's3://your_bucket_name/exported_data.csv'
        IAM_ROLE 'arn:aws:iam::your_account_id:role/your_redshift_role'
        CSV
        IGNOREHEADER 1;
    """,
    dag=dag,
)

# Define Task Dependencies
export_task >> upload_task >> redshift_task