In [1]:
from pyspark.sql import SparkSession
import psycopg2
import os
os.environ['PYSPARK_SUBMIT_ARGS'] = '--jars /usr/local/spark/jars/postgresql-42.7.3.jar pyspark-shell'

In [2]:
spark = SparkSession.builder \
    .appName("PostgreSQL JDBC Example") \
    .getOrCreate()

In [3]:
df = spark.read \
    .format("csv") \
    .option("header", "true") \
    .option("inferSchema", "true") \
    .load("./data/Online_Retail.csv")

In [4]:
df.show()

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|2010-12-01 08:26:00|     2.55|     17850|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|2010-12-01 08:26:00|     3.39|     17850|United Kingdom|
|   536365|   84406B|CREAM CUPID HEART...|       8|2010-12-01 08:26:00|     2.75|     17850|United Kingdom|
|   536365|   84029G|KNITTED UNION FLA...|       6|2010-12-01 08:26:00|     3.39|     17850|United Kingdom|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6|2010-12-01 08:26:00|     3.39|     17850|United Kingdom|
|   536365|    22752|SET 7 BABUSHKA NE...|       2|2010-12-01 08:26:00|     7.65|     17850|United Kingdom|
|   536365|    21730|GLASS S

In [5]:
df = df.withColumn("Amount", df['Quantity'] * df['UnitPrice'])
df.show()

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+------------------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|            Amount|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+------------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|2010-12-01 08:26:00|     2.55|     17850|United Kingdom|15.299999999999999|
|   536365|    71053| WHITE METAL LANTERN|       6|2010-12-01 08:26:00|     3.39|     17850|United Kingdom|             20.34|
|   536365|   84406B|CREAM CUPID HEART...|       8|2010-12-01 08:26:00|     2.75|     17850|United Kingdom|              22.0|
|   536365|   84029G|KNITTED UNION FLA...|       6|2010-12-01 08:26:00|     3.39|     17850|United Kingdom|             20.34|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6|2010-12-01 08:26:00|     3.39|     17850|United Kingdom|    

In [6]:
conn = psycopg2.connect(
    host="postgres",
    database="my_database",
    user="my_username",
    password="my_password"
)

In [7]:
cur = conn.cursor()

In [8]:
cur.execute("""
   CREATE TABLE IF NOT EXISTS sales (
       InvoiceNo VARCHAR(255),
       StockCode VARCHAR(255),
       Description VARCHAR(255),
       Quantity INTEGER,
       InvoiceDate DATE,
       UnitPrice FLOAT,
       CustomerID INTEGER,
       Country VARCHAR(255),
       Amount FLOAT
   ) 
""")

In [9]:
for row in df.rdd.collect():
    cur.execute("""
        INSERT INTO sales (InvoiceNo, StockCode, Description, Quantity, InvoiceDate, UnitPrice, CustomerID, Country, Amount)
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
    """, (row.InvoiceNo, row.StockCode, row.Description, row.Quantity, row.InvoiceDate, row.UnitPrice, row.CustomerID, row.Country, row.Amount))

In [10]:
conn.commit()

In [11]:
query = "SELECT * FROM sales"

In [12]:
df = spark.read \
    .format("jdbc") \
    .option("url", "jdbc:postgresql://postgres:5432/my_database") \
    .option("dbtable", f"({query}) as tmp") \
    .option("user", "my_username") \
    .option("password", "my_password") \
    .load()

In [13]:
df.show(truncate=False)

+---------+---------+-----------------------------------+--------+-----------+---------+----------+--------------+------------------+
|invoiceno|stockcode|description                        |quantity|invoicedate|unitprice|customerid|country       |amount            |
+---------+---------+-----------------------------------+--------+-----------+---------+----------+--------------+------------------+
|536365   |85123A   |WHITE HANGING HEART T-LIGHT HOLDER |6       |2010-12-01 |2.55     |17850     |United Kingdom|15.299999999999999|
|536365   |71053    |WHITE METAL LANTERN                |6       |2010-12-01 |3.39     |17850     |United Kingdom|20.34             |
|536365   |84406B   |CREAM CUPID HEARTS COAT HANGER     |8       |2010-12-01 |2.75     |17850     |United Kingdom|22.0              |
|536365   |84029G   |KNITTED UNION FLAG HOT WATER BOTTLE|6       |2010-12-01 |3.39     |17850     |United Kingdom|20.34             |
|536365   |84029E   |RED WOOLLY HOTTIE WHITE HEART.     |6    

In [14]:
cur.close()

In [15]:
conn.close()