In [1]:
import csv
import json
import os

import pandas as pd
import psycopg2
from psycopg2 import sql
from psycopg2.extras import execute_values


In [56]:
def connect_to_postgres(dbname=None):
    """Open a new psycopg2 connection to the PostgreSQL server.

    Connection settings are taken from the environment variables ``PGUSER``,
    ``PGPASSWORD``, ``PGHOST`` and ``PGPORT`` when they are set, so real
    credentials do not have to be written into the notebook. When a variable
    is unset, the notebook's original local-development value is used, so
    existing calls behave exactly as before.

    Args:
        dbname: Name of the database to connect to. ``None`` (or an empty
            string) connects to the ``postgres`` database.

    Returns:
        The connection object returned by ``psycopg2.connect``. The caller is
        responsible for closing it.
    """
    return psycopg2.connect(
        dbname=dbname or "postgres",
        user=os.environ.get("PGUSER", "postgres"),
        # NOTE: the fallback below is a local-development default only; set
        # PGPASSWORD instead of editing it for any shared or real server.
        password=os.environ.get("PGPASSWORD", "123456"),
        host=os.environ.get("PGHOST", "localhost"),
        port=os.environ.get("PGPORT", "5432"),
    )

def create_database_if_not_exists(dbname):
    """Issue CREATE DATABASE for `dbname`, reporting if it is already there.

    A short-lived connection from ``connect_to_postgres()`` (no database name
    given) is switched to autocommit before the statement is executed and is
    always closed before returning. A ``DuplicateDatabase`` error is caught
    and reported with a message; any other error propagates to the caller.
    """
    admin_conn = connect_to_postgres()
    admin_conn.autocommit = True
    try:
        with admin_conn.cursor() as cursor:
            # Identifier quoting keeps the database name out of raw SQL text.
            create_stmt = sql.SQL("CREATE DATABASE {};").format(
                sql.Identifier(dbname)
            )
            cursor.execute(create_stmt)
    except psycopg2.errors.DuplicateDatabase:
        print(f"Database '{dbname}' already exists.")
    else:
        print(f"Database '{dbname}' created successfully.")
    finally:
        admin_conn.close()

def create_tables_if_not_exists(conn):
    """Create the products, customers, regions, salespersons and sales tables.

    Every statement is ``CREATE TABLE IF NOT EXISTS``, so calling this again
    on a database that already has the tables leaves them untouched (existing
    tables are NOT altered to match the definitions below).

    Foreign-key columns are declared ``INT NOT NULL`` rather than ``SERIAL``:
    ``SERIAL`` attaches its own sequence and a ``nextval`` default to the
    column, so an omitted foreign key would silently receive an unrelated
    auto-generated id instead of being rejected. ``NOT NULL`` is kept because
    ``SERIAL`` implied it.

    Args:
        conn: An open database connection exposing ``cursor()``, ``commit()``
            and ``rollback()`` (e.g. a psycopg2 connection).

    Raises:
        Any exception raised while executing the DDL; the transaction is
        rolled back before the exception is re-raised.
    """
    # Ordered so that referenced tables are created before the tables that
    # point at them (regions -> salespersons -> sales).
    ddl_statements = (
        """
        CREATE TABLE IF NOT EXISTS products (
            product_id SERIAL PRIMARY KEY,
            product_name TEXT NOT NULL,
            category TEXT NOT NULL,
            price NUMERIC NOT NULL,
            stock_quantity INT NOT NULL
        );
        """,
        """
        CREATE TABLE IF NOT EXISTS customers (
            customer_id SERIAL PRIMARY KEY,
            name TEXT NOT NULL,
            email TEXT NOT NULL,
            phone TEXT,
            address TEXT,
            join_date DATE NOT NULL
        );
        """,
        """
        CREATE TABLE IF NOT EXISTS regions (
            region_id SERIAL PRIMARY KEY,
            region_name TEXT NOT NULL
        );
        """,
        """
        CREATE TABLE IF NOT EXISTS salespersons (
            salesperson_id SERIAL PRIMARY KEY,
            name TEXT NOT NULL,
            region_id INT NOT NULL REFERENCES regions(region_id),
            hire_date DATE NOT NULL
        );
        """,
        """
        CREATE TABLE IF NOT EXISTS sales (
            sale_id SERIAL PRIMARY KEY,
            customer_id INT NOT NULL REFERENCES customers(customer_id),
            product_id INT NOT NULL REFERENCES products(product_id),
            region_id INT NOT NULL REFERENCES regions(region_id),
            salesperson_id INT NOT NULL REFERENCES salespersons(salesperson_id),
            quantity INT NOT NULL,
            sale_date DATE NOT NULL
        );
        """,
    )

    try:
        with conn.cursor() as cur:
            for ddl in ddl_statements:
                cur.execute(ddl)
        conn.commit()
    except Exception:
        # Leave the connection usable for the caller instead of stuck in an
        # aborted transaction.
        conn.rollback()
        raise
    print("Tables created successfully (if they did not already exist).")

def _to_db_value(value):
    """Map pandas missing-value markers (NaN, NaT, NA, None) to None (SQL NULL)."""
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return None
    return value


def insert_data_to_postgres(conn, table_name, data):
    """Bulk-insert every row of a DataFrame into a PostgreSQL table.

    The DataFrame's column labels are used as the target column names. Table
    and column names are quoted as SQL identifiers instead of being pasted
    into the statement text, so they are matched exactly as written
    (case-sensitive). A dotted ``table_name`` such as ``"public.sales"`` is
    treated as a schema-qualified name.

    Args:
        conn: Open psycopg2 connection; the insert is committed on success.
        table_name: Target table, optionally schema-qualified with a dot.
        data: pandas DataFrame whose rows are inserted. Missing values
            (NaN/NaT/NA) are stored as NULL. An empty frame is a no-op.

    Raises:
        Any exception raised by the insert; the transaction is rolled back
        first so the connection stays usable for further statements.
    """
    if data.empty:
        # No rows (or no columns): there is nothing to send to the server.
        return

    column_names = [str(label) for label in data.columns]
    # itertuples yields plain Python scalars for numeric columns, which
    # psycopg2 can adapt; to_numpy() on an all-numeric frame yields numpy
    # scalars, which it cannot.
    rows = [
        tuple(_to_db_value(cell) for cell in row)
        for row in data.itertuples(index=False, name=None)
    ]
    insert_stmt = sql.SQL("INSERT INTO {table} ({columns}) VALUES %s").format(
        table=sql.Identifier(*table_name.split(".")),
        columns=sql.SQL(", ").join(sql.Identifier(name) for name in column_names),
    )

    try:
        with conn.cursor() as cur:
            execute_values(cur, insert_stmt.as_string(cur), rows)
        conn.commit()
    except Exception:
        # A failed statement aborts the open transaction; roll it back so
        # later inserts on the same connection are not rejected.
        conn.rollback()
        raise

In [57]:
# Name of the database this notebook works against
db_name = "sales_region"

# Create the database on the server; prints a message if it already exists
create_database_if_not_exists(db_name)

# Open the working connection to that database (new or pre-existing).
# Later cells reuse `conn` for the inserts and close it when they finish.
conn = connect_to_postgres(dbname=db_name)

# Create the tables on this connection (CREATE TABLE IF NOT EXISTS)
create_tables_if_not_exists(conn)


Database 'sales_region' already exists.
Tables created successfully (if they did not already exist).


In [58]:
# Load the JSON source files into DataFrames, one per target table.
# Paths are relative to the notebook's working directory.
customers_df = pd.read_json('customers.json')
sales_df = pd.read_json('sales.json')
regions_df = pd.read_json('regions.json')
products_df = pd.read_json('products.json')
sales_persons_df = pd.read_json('salespersons.json')

In [59]:
try:
    # Load tables in dependency order: salespersons references regions, and
    # sales references customers, products, regions and salespersons.
    for _table, _frame in (
        ("products", products_df),
        ("customers", customers_df),
        ("regions", regions_df),
        ("salespersons", sales_persons_df),
        ("sales", sales_df),
    ):
        insert_data_to_postgres(conn, _table, _frame)
    print("Data successfully transferred to PostgreSQL database.")
finally:
    # The connection is closed whether or not the load succeeded, so this
    # cell cannot be re-run without re-opening `conn` first.
    conn.close()

Data successfully transferred to PostgreSQL database.


In [None]:
'''
1. Who are the top 5 customers by total spending?
2. What is the average customer lifetime value (CLTV)?
3. Which region has the most active customers?
4. What are the top 5 products by revenue?
5. Which product categories generate the highest revenue?
6. What is the stock status of products?
7. What is the daily revenue trend?
8. Which regions have the highest sales over time?
9. What is the most popular payment method?
10. Who are the top-performing salespersons by revenue?
11. Which salesperson has the highest average order value?
12. How is salesperson performance distributed across regions?
13. What is the average delivery time for orders?
14. Which regions have the highest stock turnover?
15. What is the reorder frequency of products?
'''

Unnamed: 0,sale_id,customer_id,product_id,salesperson_id,region_id,quantity,sale_date
0,1,9535,16,45,4,3,2023-11-27
1,2,6824,139,8,2,5,2023-10-15
2,3,515,130,26,1,8,2024-02-21
3,4,8487,195,6,2,8,2023-11-13
4,5,4856,183,49,4,9,2023-04-18
...,...,...,...,...,...,...,...
99995,99996,6217,115,18,4,10,2024-09-10
99996,99997,5436,106,31,4,9,2024-02-11
99997,99998,8118,39,33,4,8,2024-03-30
99998,99999,3024,45,46,4,4,2024-06-25
