# Pandas and PostgreSQL Integration

This notebook demonstrates how to:
1. Connect to PostgreSQL using SQLAlchemy
2. Read data from PostgreSQL into Pandas DataFrames
3. Write Pandas DataFrames to PostgreSQL tables
4. Perform basic SQL queries and transformations

## Setup and Import Dependencies

In [1]:
import os
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
import matplotlib.pyplot as plt

# Import the config module from the scripts directory
import sys
sys.path.append('../scripts')
from config import DB_CONFIG

## Connect to PostgreSQL

In [2]:
# Build the connection URL from DB_CONFIG.
# The user name and password are percent-encoded first: a raw '@', ':', '/',
# '#' or '%' in either value would otherwise be read as URL syntax, and the
# engine would be created with the wrong credentials or host.
# safe='' makes quote() escape '/' as well, and it encodes a space as %20
# rather than '+', which decodes unambiguously.
# NOTE(review): this assumes DB_CONFIG holds the raw (not already
# percent-encoded) credentials -- confirm against scripts/config.py.
from urllib.parse import quote

db_user = quote(str(DB_CONFIG['user']), safe='')
db_password = quote(str(DB_CONFIG['password']), safe='')
db_url = (
    f"postgresql+psycopg2://{db_user}:{db_password}"
    f"@{DB_CONFIG['host']}:{DB_CONFIG['port']}/{DB_CONFIG['database']}"
)

# Create SQLAlchemy engine (db_url embeds the password -- avoid displaying it)
engine = create_engine(db_url)

## Load Sample Data

In [3]:
# Location of the sample CSV, relative to this notebook
sample_data_path = '../data/sample_data.csv'

# Parse the CSV into the DataFrame that is written to PostgreSQL below
df = pd.read_csv(filepath_or_buffer=sample_data_path)

# Preview the leading rows
df.head(n=5)

## Write Data to PostgreSQL

In [4]:
# Destination table; the SQL in the following cells interpolates this name
table_name = 'sample_data'

# Store the DataFrame in PostgreSQL: an existing table of the same name is
# replaced, and the DataFrame index is not written as a column
df.to_sql(name=table_name, con=engine, if_exists='replace', index=False)

print(f"Data written to {table_name} table")

## Read Data from PostgreSQL

In [5]:
# Select every row and column of the table back into a DataFrame
query = f"SELECT * FROM {table_name}"
df_from_db = pd.read_sql(sql=query, con=engine)

# Preview the leading rows
df_from_db.head(n=5)

## Perform SQL Queries

In [6]:
# Example: let the database compute the aggregates -- row count plus the
# average, minimum and maximum of numeric_column -- and fetch the result row
query = f"""
SELECT 
    COUNT(*) as total_rows,
    AVG(numeric_column) as avg_value,
    MIN(numeric_column) as min_value,
    MAX(numeric_column) as max_value
FROM {table_name}
"""
summary_stats = pd.read_sql(sql=query, con=engine)
summary_stats

## Using Parameters in SQL Queries

In [7]:
# Example: Parameterized query
def get_filtered_data(min_value, max_value):
    """Fetch the rows whose ``numeric_column`` lies between two bounds.

    The bounds are passed to ``pd.read_sql`` through ``params`` and referenced
    in the SQL as the named placeholders ``%(min_value)s`` / ``%(max_value)s``
    instead of being formatted into the query text. The table name is read
    from the notebook-level ``table_name`` and the query runs on the
    notebook-level ``engine``.

    Args:
        min_value: Lower bound of the SQL ``BETWEEN`` range.
        max_value: Upper bound of the SQL ``BETWEEN`` range.

    Returns:
        The result of ``pd.read_sql`` for the filtered query.
    """
    sql = f"""
    SELECT *
    FROM {table_name}
    WHERE numeric_column BETWEEN %(min_value)s AND %(max_value)s
    """
    return pd.read_sql(
        sql,
        engine,
        params={'min_value': min_value, 'max_value': max_value},
    )

# Fetch the rows whose numeric_column is between 10 and 50, then preview them
filtered_df = get_filtered_data(min_value=10, max_value=50)
filtered_df.head(n=5)

## Batch Processing

In [8]:
# Example: Read data in chunks
# With `chunksize`, pd.read_sql returns an iterator of DataFrames holding at
# most `chunk_size` rows each instead of one large frame.
chunk_size = 1000
chunks = []

for chunk_df in pd.read_sql(f"SELECT * FROM {table_name}", engine, chunksize=chunk_size):
    # Process each chunk (this demo only collects them)
    chunks.append(chunk_df)

# Combine all chunks.
# ignore_index=True builds one continuous 0..n-1 index; each chunk carries its
# own index starting at 0, so a plain concat would leave duplicate index
# labels in full_df (e.g. full_df.loc[0] would match one row per chunk).
# The guard covers the case where no chunk was yielded, because pd.concat
# raises ValueError on an empty list.
full_df = pd.concat(chunks, ignore_index=True) if chunks else pd.DataFrame()
print(f"Total rows: {len(full_df)}")

## Data Visualization

In [9]:
# Example: Visualize data from PostgreSQL
# Count the rows per category in the database. ORDER BY pins the row order
# (GROUP BY alone does not guarantee one), so the bars are drawn in the same
# order on every run.
query = f"""
SELECT
    category,
    COUNT(*) as count
FROM {table_name}
GROUP BY category
ORDER BY category
"""
category_counts = pd.read_sql(query, engine)

# Create a bar chart on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(category_counts['category'], category_counts['count'])
ax.set_title('Counts by Category')
ax.set_xlabel('Category')
ax.set_ylabel('Count')
# Tilt the category labels so longer names do not overlap
ax.tick_params(axis='x', labelrotation=45)
fig.tight_layout()
plt.show()

## Transaction Management

In [10]:
# Example: Using transactions
# NOTE: this cell modifies the table (deletes rows, doubles values), so
# re-running it changes the data again.
from sqlalchemy import text
from sqlalchemy.orm import sessionmaker

Session = sessionmaker(bind=engine)
session = Session()

try:
    # Execute multiple operations. Both run in the session's transaction, so
    # they are applied together on commit or not at all on rollback.
    # Raw SQL is wrapped in text(): SQLAlchemy 2.0 rejects plain str
    # statements passed to Session.execute().
    session.execute(text(f"DELETE FROM {table_name} WHERE id < 10"))
    session.execute(text(f"UPDATE {table_name} SET numeric_column = numeric_column * 2 WHERE category = 'A'"))

    # Commit the transaction
    session.commit()
    print("Transaction committed successfully")
except Exception as e:
    # Roll back through the session itself. It is always bound at this point,
    # so the handler cannot fail with a NameError that would hide the
    # original error.
    session.rollback()
    print(f"Transaction rolled back: {e}")
finally:
    # Close the session
    session.close()

## Cleanup

In [11]:
# Example: Drop the table
# Left commented out; uncomment the two lines below to remove the demo table.
# NOTE(review): `engine.execute` is a SQLAlchemy 1.x API -- confirm it exists
# in the installed version before uncommenting.
# engine.execute(f"DROP TABLE IF EXISTS {table_name}")
# print(f"Table {table_name} dropped")

# Dispose of the engine now that the notebook is finished with the database
engine.dispose()
print("Connection closed")