In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import concat, lit
from configparser import ConfigParser
import psycopg2

In [2]:
def read_config(filename='database.ini', section='postgresql'):
    """Return the options of one INI section as a plain dict.

    Args:
        filename: Path of the INI file to parse.
        section: Name of the section whose options are returned.

    Returns:
        dict mapping each option name in ``section`` to its string value.

    Raises:
        Exception: If ``section`` is not present after parsing ``filename``.
            ``ConfigParser.read`` skips files it cannot open, so a missing
            file surfaces here as a missing section.
    """
    ini = ConfigParser()
    ini.read(filename)

    # Fail early so the happy path below stays flat.
    if not ini.has_section(section):
        raise Exception('Section {0} not found in the {1} file'.format(section, filename))

    return {option: value for option, value in ini.items(section)}

def connect(conf):
    """Connect to the PostgreSQL database server and return a cursor.

    Args:
        conf: Mapping of keyword arguments forwarded to ``psycopg2.connect``.

    Returns:
        A cursor on the new connection, or ``None`` if connecting or creating
        the cursor failed. On failure the error is printed rather than raised,
        so callers must check for ``None``. The connection is not returned
        separately; it remains reachable through the cursor's ``connection``
        attribute.
    """
    conn = None
    try:
        conn = psycopg2.connect(**conf)
        return conn.cursor()
    except Exception as error:  # psycopg2.DatabaseError is an Exception subclass
        print(error)
        # If the connection opened but the cursor could not be created, do not
        # leave the connection dangling.
        if conn is not None:
            conn.close()
        return None

def config(filename):
    """Read the database settings from ``filename`` using ``read_config``'s default section."""
    settings = read_config(filename=filename)
    return settings
        

In [3]:
# Load the connection settings (default 'postgresql' section) from the
# project's INI file; `conf` is reused below to build the JDBC URL/properties.
conf = read_config("./docker/database.ini")
# Open a cursor on that database. connect() prints the error and returns None
# when it fails, so `cur` may be None here.
cur = connect(conf)

In [8]:
# Build (or reuse) a local Spark session. The PostgreSQL JDBC driver jar is put
# on the driver classpath so the JDBC write at the end of this cell can load it.
spark = (
    SparkSession.builder
    .master("local")
    .config('spark.driver.extraClassPath', "/home/jovyan/work/lib/postgresql-42.2.12.jar")
    .appName("CSV to DB")
    .getOrCreate()
)

# load the CSV (the first row is treated as the header)
authors_raw = (
    spark.read
    .format('csv')
    .option("header", "true")
    .load("./data/authors.csv")
)

# concat the names as "<lname>, <fname>" into a new "name" column
df = authors_raw.withColumn(
    "name", concat(authors_raw["lname"], lit(", "), authors_raw["fname"])
)

# Build the JDBC URL from the same settings psycopg2 used. Include the port
# when the config provides one, so Spark and psycopg2 target the same server;
# without a port the driver falls back to its default.
jdbc_host = f"{conf['host']}:{conf['port']}" if conf.get('port') else conf['host']
url = f"jdbc:postgresql://{jdbc_host}/{conf['database']}"
properties = {"driver": "org.postgresql.Driver", **conf}

# write the dataframe to postgres (overwrite mode replaces existing contents
# of public.names)
df.write.jdbc(url=url, table="public.names", mode="overwrite", properties=properties)