In [1]:
import os

from sqlalchemy import create_engine, text

# Connection URL for the cs544 MySQL database. Set the MYSQL_URL environment
# variable to point at a different server or to keep real credentials out of
# the notebook; the fallback is the original local (127.0.0.1) URL.
db_url = os.environ.get("MYSQL_URL", "mysql+mysqlconnector://root:abc@127.0.0.1:3306/cs544")
engine = create_engine(db_url)
# One connection object, reused by the cells below.
conn = engine.connect()

In [None]:
# List the tables that currently exist in the database.
conn.execute(text("show tables;")).fetchall()

In [None]:
# Schema for the users table:
#   id    - primary key, so it uniquely identifies each row
#   name  - required (NOT NULL)
#   phone - optional
create_users = text("""
    CREATE TABLE users (id INT, name TEXT NOT NULL, phone TEXT, PRIMARY KEY(id))
""")
conn.execute(create_users)

In [None]:
# Needed to drop table so we could fix the constraint on name being not null
# conn.execute(text("drop table users;"))

In [6]:
# Insert one row into the users table.
# Format: INSERT INTO table (columns) VALUES (one value per listed column)
# phone is not listed, so it is left NULL for this row.
# String literals use single quotes (standard SQL); MySQL treats double quotes
# as identifier quotes when the ANSI_QUOTES SQL mode is enabled.

conn.execute(text("""
    INSERT INTO users (id, name) VALUES (1, 'tyler')
"""))

# Note: if we run this command twice, we get an error because we can't have duplicate primary keys

In [None]:
# accounts.user_id is a foreign key that points at users.id
create_accounts = text("""
    CREATE TABLE accounts (user_id INT, name text NOT NULL, amount INT NOT NULL, FOREIGN KEY (user_id) references users(id))
""")
conn.execute(create_accounts)

In [None]:
# This gives error, we can't insert a row in accounts that references a non-existent row in users (violates foreign key constraint)
# conn.execute(text("""
#     INSERT INTO accounts (user_id, name, amount) VALUES (2, "A", 10)
# """))

In [None]:
# Open account A (balance 10) for user 1.
# The string literal uses single quotes (standard SQL); double quotes are
# identifier quotes in MySQL when the ANSI_QUOTES SQL mode is enabled.
conn.execute(text("""
    INSERT INTO accounts (user_id, name, amount) VALUES (1, 'A', 10);
"""))

In [None]:
# We can have 2 bank accounts for the same user: open account B (balance 100), also for user 1.
# The string literal uses single quotes (standard SQL); double quotes are
# identifier quotes in MySQL when the ANSI_QUOTES SQL mode is enabled.
conn.execute(text("""
    INSERT INTO accounts (user_id, name, amount) VALUES (1, 'B', 100);
"""))

In [None]:
# Make the recent changes durable: commit the open transaction so the rows
# inserted above are persisted and show up for other connections.
conn.commit()

In [8]:
# Inspect what is in the accounts table so far.
accounts_rows = conn.execute(text("""
    SELECT * FROM accounts;
""")).fetchall()
accounts_rows

[(1, 'A', 10), (1, 'B', 100)]

In [None]:
# This gives an error, as it would violate the foreign key constraint (rows in accounts still reference this user):
# conn.execute(text("""
#     DELETE FROM users WHERE id = 1;
# """))

# Load CSVs to MySQL tables

In [None]:
# Build three lookup tables -- "actions", "loan_types", and "purposes" -- from the HMDA code-sheet CSVs.
import pandas as pd
url = "https://raw.githubusercontent.com/cfpb/api/master/resources/datasets/hmda/code_sheets/"
# table name -> CSV file found under `url`
code_sheets = {
    "actions": "action_taken.csv",
    "loan_types": "loan_type.csv",
    "purposes": "loan_purpose.csv",
}
for table_name, csv_name in code_sheets.items():
    df = pd.read_csv(url + csv_name)
    # df.to_sql([table name], [db connection], index=False -> don't store the row number as a column,
    #           if_exists="replace" -> overwrite the table if it already exists)
    df.to_sql(table_name, conn, index=False, if_exists="replace")
conn.commit()

In [9]:
# Confirm that the new tables now exist.
conn.execute(text("show tables;")).fetchall()

[('accounts',),
 ('actions',),
 ('loan_types',),
 ('loans',),
 ('purposes',),
 ('users',)]

In [None]:
import pyarrow as pa
import pyarrow.csv
import pyarrow.parquet

# PyArrow infers the column types while reading the CSV (this must happen before making a SQL table);
# the result is written as a Parquet file, which is ~10x smaller than the CSV thanks to compression.
pa.parquet.write_table(pa.csv.read_csv("hdma-wi-2021.csv"), "loans.parquet")

# Parquet is column-oriented, so grabbing a subset of columns is much faster than with a CSV.
# The selected columns are loaded into a PyArrow table.
loan_columns = ["lei", "action_taken", "loan_type", "loan_amount", "interest_rate", "loan_purpose", "income"]
t = pa.parquet.read_table("loans.parquet", columns=loan_columns)

# TODO: to_sql with chunking

In [10]:
!ls -lah

total 204M
drwxrwxr-x 4 dsmith7789 dsmith7789 4.0K Oct 15 01:23 .
drwxrwxr-x 6 dsmith7789 dsmith7789 4.0K Oct 14 22:54 ..
drwxrwxr-x 2 dsmith7789 dsmith7789 4.0K Oct 14 22:54 .ipynb_checkpoints
-rw-rw-r-- 1 dsmith7789 dsmith7789  482 Oct 14 06:13 Dockerfile
-rw-rw-r-- 1 dsmith7789 dsmith7789 9.9K Oct 15 01:23 SQL1.ipynb
-rw-r----- 1 dsmith7789 dsmith7789 167M Nov  1  2022 hdma-wi-2021.csv
-rw-rw-r-- 1 dsmith7789 dsmith7789  21M Jan  5  2023 hdma-wi-2021.zip
-rw-rw-r-- 1 dsmith7789 dsmith7789  16M Oct 14 23:52 loans.parquet
drwxr-xr-x 3 root       root       4.0K Oct 14 06:28 nb


In [None]:
t

In [None]:
# Copy the PyArrow table into a SQL table named "loans". There is no direct
# PyArrow to_sql function, so convert to a pandas DataFrame first.
# chunksize=10000 writes the rows in batches; loading everything at once would
# run out of memory (this table will have ~400k rows).
t.to_pandas().to_sql("loans", conn, index=False, if_exists="replace", chunksize=10000)
# Commit so the newly written table is persisted.
conn.commit()

In [None]:
# pandas can run SQL queries too, and displays the result as a nicely formatted DataFrame
accounts_query = """
    SELECT * FROM accounts
"""
pd.read_sql(accounts_query, conn)   # don't forget to pass the connection!

In [12]:
# Step 1 of the transfer: withdraw $4 from account A
withdraw = text("""
    UPDATE accounts SET amount = amount - 4 WHERE name = 'A';
""")
conn.execute(withdraw)

<sqlalchemy.engine.cursor.CursorResult at 0x7f92c462d720>

In [13]:
# Step 2 of the transfer: deposit the $4 into account B
deposit = text("""
    UPDATE accounts SET amount = amount + 4 WHERE name = 'B';
""")
conn.execute(deposit)

<sqlalchemy.engine.cursor.CursorResult at 0x7f92c462d840>

In [16]:
# Look at the accounts table as this connection currently sees it
accounts_query = """
    SELECT * FROM accounts
"""
pd.read_sql(accounts_query, conn)

Unnamed: 0,user_id,name,amount
0,1,A,6
1,1,B,104


# But, the table is not updated in the other DB connection (in our terminal)!

#### This is because our intermediate progress is isolated from other connections until we commit the transaction.

In [18]:
# Decide whether to keep the pending transfer: commit only if account A did
# not go negative, otherwise roll the transaction back.
# .scalar() returns the first column of the first row, or None when no row
# matched (the previous list(...)[0][0] raised IndexError in that case and
# left the transaction open).
remaining = conn.execute(text("""
    SELECT amount FROM accounts WHERE name = 'A';
""")).scalar()
# >= 0 rather than > 0: draining the account to exactly $0 is still a valid transfer.
if remaining is not None and remaining >= 0:
    print("It worked!")
    conn.commit()    # at this point, the updated funds will show in the query results of all connections
else:
    # Overdrawn (or account A does not exist): undo the uncommitted UPDATEs.
    print("Not enough funds.")
    conn.rollback()

It worked!


# Let's combine the above steps:

In [20]:
# Transfer $4 from account A to account B as a single transaction:
# apply both UPDATEs, then commit only if A did not go negative.

# take $4 out of account A
conn.execute(text("""
    UPDATE accounts SET amount = amount - 4 WHERE name = 'A';
"""))

# and transfer the $4 to account B
conn.execute(text("""
    UPDATE accounts SET amount = amount + 4 WHERE name = 'B';
"""))

# .scalar() returns the first column of the first row, or None when no row
# matched (the previous list(...)[0][0] raised IndexError in that case and
# left the transaction open).
remaining = conn.execute(text("""
    SELECT amount FROM accounts WHERE name = 'A';
""")).scalar()
# >= 0 rather than > 0: draining the account to exactly $0 is still a valid transfer.
if remaining is not None and remaining >= 0:
    print("It worked!")
    conn.commit()    # at this point, the updated funds will show in the query results of all connections
else:
    # Overdrawn (or account A does not exist): undo both UPDATEs.
    print("Not enough funds.")
    conn.rollback()

# check the table now
pd.read_sql("""
    SELECT * FROM accounts
""", conn)

Not enough funds.


Unnamed: 0,user_id,name,amount
0,1,A,2
1,1,B,108
