In [1]:
from pyspark.sql import SparkSession
import psycopg2
import os
# Hand the PostgreSQL JDBC driver jar to Spark via --jars. This is assigned
# before the SparkSession is built so the setting is already in the environment
# when PySpark starts.
os.environ['PYSPARK_SUBMIT_ARGS'] = '--jars /usr/local/spark/jars/postgresql-42.7.3.jar pyspark-shell'

In [2]:
# One local-mode Spark session, reused by every later cell in the notebook.
spark = (
    SparkSession.builder
    .master("local")
    .appName("linear_regression")
    .getOrCreate()
)

In [3]:
# Load the semicolon-delimited bank CSV: the first row supplies the column
# names and Spark infers each column's type from the data.
df = spark.read.csv(
    "./data/bank-full.csv",
    sep=";",
    header=True,
    inferSchema=True,
)
df.show()

+---+------------+--------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+---+
|age|         job| marital|education|default|balance|housing|loan|contact|day|month|duration|campaign|pdays|previous|poutcome|  y|
+---+------------+--------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+---+
| 58|  management| married| tertiary|     no|   2143|    yes|  no|unknown|  5|  may|     261|       1|   -1|       0| unknown| no|
| 44|  technician|  single|secondary|     no|     29|    yes|  no|unknown|  5|  may|     151|       1|   -1|       0| unknown| no|
| 33|entrepreneur| married|secondary|     no|      2|    yes| yes|unknown|  5|  may|      76|       1|   -1|       0| unknown| no|
| 47| blue-collar| married|  unknown|     no|   1506|    yes|  no|unknown|  5|  may|      92|       1|   -1|       0| unknown| no|
| 33|     unknown|  single|  unknown|     no|      1|     no|  no|unknown|  5|  may

In [4]:
# Inspect the column types Spark inferred while reading the CSV.
df.dtypes

[('age', 'int'),
 ('job', 'string'),
 ('marital', 'string'),
 ('education', 'string'),
 ('default', 'string'),
 ('balance', 'int'),
 ('housing', 'string'),
 ('loan', 'string'),
 ('contact', 'string'),
 ('day', 'int'),
 ('month', 'string'),
 ('duration', 'int'),
 ('campaign', 'int'),
 ('pdays', 'int'),
 ('previous', 'int'),
 ('poutcome', 'string'),
 ('y', 'string')]

In [5]:
# Open a psycopg2 connection to the notebook's PostgreSQL database.
# The user and password come from POSTGRES_USER / POSTGRES_PASSWORD when those
# environment variables are set, so real secrets never have to be written into
# the notebook; the literals are only local-development fallbacks.
conn = psycopg2.connect(
    host="jupyter-pyspark-postgres",
    user=os.environ.get("POSTGRES_USER", "user"),
    password=os.environ.get("POSTGRES_PASSWORD", "password"),
    database="jupyter_pyspark_db"
)

In [6]:
# Cursor used to run the CREATE TABLE statement in the next cell.
cur = conn.cursor()

In [7]:
# DDL for the target table: one column per CSV field. "default" is quoted
# because it is a reserved word in SQL. IF NOT EXISTS keeps the statement
# safe to run again.
create_bank_table_sql = """
   CREATE TABLE IF NOT EXISTS bank_table (
       age INTEGER,
       job VARCHAR(255),
       marital VARCHAR(255),
       education VARCHAR(255),
       "default" VARCHAR(255),
       balance INTEGER,
       housing VARCHAR(255),
       loan VARCHAR(255),
       contact VARCHAR(255),
       day INTEGER,
       month VARCHAR(255),
       duration INTEGER,
       campaign INTEGER,
       pdays INTEGER,
       previous INTEGER,
       poutcome VARCHAR(255),
       y VARCHAR(255)
   ) 
"""
cur.execute(create_bank_table_sql)

In [8]:
# Commit the CREATE TABLE, then release the cursor and the connection.
# The later cells reach PostgreSQL through Spark's JDBC source instead.
conn.commit()
cur.close()
conn.close()

In [9]:
def insert_data(df, table_name, url=None, user=None, password=None, mode="append"):
    """Write a Spark DataFrame to a PostgreSQL table over JDBC.

    Args:
        df: DataFrame to persist; its ``.write`` JDBC writer is used.
        table_name: Target table, passed as the JDBC ``dbtable`` option.
        url: JDBC connection URL. Defaults to the notebook's database.
        user: Database user. Defaults to the ``POSTGRES_USER`` environment
            variable, falling back to ``"user"``.
        password: Database password. Defaults to the ``POSTGRES_PASSWORD``
            environment variable, falling back to ``"password"``.
        mode: Save mode handed to the writer. The default ``"append"`` adds
            rows, so writing the same frame twice stores it twice.

    Returns:
        None.
    """
    # Resolve defaults at call time so environment changes made after this
    # function is defined are still honoured.
    if url is None:
        url = "jdbc:postgresql://jupyter-pyspark-postgres:5432/jupyter_pyspark_db"
    if user is None:
        user = os.environ.get("POSTGRES_USER", "user")
    if password is None:
        password = os.environ.get("POSTGRES_PASSWORD", "password")

    (
        df.write
        .format("jdbc")
        .option("url", url)
        .option("dbtable", table_name)
        .option("user", user)
        .option("password", password)
        .mode(mode)
        .save()
    )

In [10]:
# Append the CSV rows to bank_table. insert_data writes in "append" mode, so
# running this cell again inserts the same rows a second time.
insert_data(df, "bank_table")

In [11]:
# SQL that the JDBC read below wraps as a subquery for its "dbtable" option.
query = "SELECT * FROM bank_table" 

In [12]:
# Read the table back from PostgreSQL through Spark's JDBC source; the query
# is wrapped as an aliased subquery because "dbtable" expects a table
# expression. The user and password come from POSTGRES_USER /
# POSTGRES_PASSWORD when set, with the original literals as local fallbacks,
# so secrets need not live in the notebook.
df = (
    spark.read
    .format("jdbc")
    .option("url", "jdbc:postgresql://jupyter-pyspark-postgres:5432/jupyter_pyspark_db")
    .option("dbtable", f"({query}) as tmp")
    .option("user", os.environ.get("POSTGRES_USER", "user"))
    .option("password", os.environ.get("POSTGRES_PASSWORD", "password"))
    .load()
)

In [13]:
# Preview the rows loaded back from PostgreSQL to confirm the round trip.
df.show()

+---+------------+--------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+---+
|age|         job| marital|education|default|balance|housing|loan|contact|day|month|duration|campaign|pdays|previous|poutcome|  y|
+---+------------+--------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+---+
| 58|  management| married| tertiary|     no|   2143|    yes|  no|unknown|  5|  may|     261|       1|   -1|       0| unknown| no|
| 44|  technician|  single|secondary|     no|     29|    yes|  no|unknown|  5|  may|     151|       1|   -1|       0| unknown| no|
| 33|entrepreneur| married|secondary|     no|      2|    yes| yes|unknown|  5|  may|      76|       1|   -1|       0| unknown| no|
| 47| blue-collar| married|  unknown|     no|   1506|    yes|  no|unknown|  5|  may|      92|       1|   -1|       0| unknown| no|
| 33|     unknown|  single|  unknown|     no|      1|     no|  no|unknown|  5|  may