# Join Tables
1. Create fake cpu purchase data
2. Create fake purchase date data
3. Combine two
4. Read user.csv data
5. Join user Table and combined Table on credit card info

- user.csv: fake user information
- noindx.csv: remove index column from user.csv == staging_fake_ppl
- cpu_purchase.csv: fake purchase information data  == staging_fake_cpu_purchases
- ppl_cpu_purchases: joined table

In [1]:
import psycopg2             # python->psql connection
import psycopg2.extras
import pandas as pd         # create dataframes 
import numpy as np
from faker import Factory,Faker # Create fake data to use for join-tables
import io
# Import the 'config' function from the config_user_dta.py file:
from config import config

# Get the config params
params_ = config()
# Connect to the Postgres_DB:
conn = psycopg2.connect(**params_)
# Create new_cursor allowing us to write Python to execute PSQL:
cur = conn.cursor()
conn.autocommit = True

In [4]:
'''
Creating Fake CPU's that customers purchased, with country of purchase,linking them 
for joining tables later by foreign keys.
'''
fake_data=Faker()

cpus=[]
for _ in range(len(fake_ppl)//2):# len//2 I want the same length as dataframe, 2 cpu types
    cpus.append(fake_data.numerify(text='Intel Core i%-%%##K'))
    cpus.append(fake_data.numerify(text='AMD Ryzen % %%##X'))
len(cpus)
# len(fake_ppl)
cpus[:5]

['Intel Core i7-4518K',
 'AMD Ryzen 4 6578X',
 'Intel Core i6-8742K',
 'AMD Ryzen 6 2489X',
 'Intel Core i5-2743K']

In [5]:
# Create List of fake purchase dates:

Faker.seed(10)
purchase_dates=[] 
for _ in range(len(fake_ppl)):
    purchase_dates.append(fake_data.date_between(start_date='-3y', end_date='today'))
    
purchase_dates[:6]

[datetime.date(2020, 3, 26),
 datetime.date(2017, 12, 10),
 datetime.date(2019, 8, 18),
 datetime.date(2019, 11, 9),
 datetime.date(2020, 4, 6),
 datetime.date(2017, 11, 13)]

In [6]:
# Combine tables: 'purchase_dates' and 'cpus' & Save in csv file
#Note:Can use np.column_stack instead of zip(),this is mainly for preference and speedup

join_table_=pd.DataFrame(np.column_stack([fake_ppl['credit_card'],cpus,purchase_dates]),
             columns=['credit_card','cpu','purchase_date'])

join_table_.to_csv('data/cpu_purchase.csv',index=False)
join_table_.head()

Unnamed: 0,credit_card,cpu,purchase_date
0,5399-3484-4724-7187,Intel Core i7-4518K,2020-03-26
1,1630-5261-6108-7631,AMD Ryzen 4 6578X,2017-12-10
2,4435-3866-1076-3595,Intel Core i6-8742K,2019-08-18
3,3489-7099-9906-8660,AMD Ryzen 6 2489X,2019-11-09
4,8631-4500-5666-1510,Intel Core i5-2743K,2020-04-06


In [7]:

# CREATE TABLE: staging_fake_cpu_purchases & staging_fake_ppl

def create_staging_table(cursor):
    cursor.execute("""
        DROP TABLE IF EXISTS staging_fake_cpu_purchases;
        CREATE UNLOGGED TABLE staging_fake_cpu_purchases (
            credit_card      TEXT PRIMARY KEY,
            cpu              TEXT,
            purchase_date    DATE NOT NULL
        );""")

with conn.cursor() as cursor:
    create_staging_table(cursor)

def create_fake_ppl_table(cursor):
    cursor.execute("""
        DROP TABLE IF EXISTS staging_fake_ppl;
        CREATE UNLOGGED TABLE staging_fake_ppl (
            credit_card         TEXT PRIMARY KEY,
            email               TEXT,
            first_name          TEXT,
            last_name           TEXT,
            primary_phone       TEXT
        );""")

with conn.cursor() as cursor:
    create_fake_ppl_table(cursor)

In [8]:
# Send Python data to PSQL
def send_csv_to_psql(connection,csv,table_):
    sql = "COPY %s FROM STDIN WITH CSV HEADER DELIMITER AS ','"
    file = open(csv, "r")
    table = table_
    with connection.cursor() as cur:
        cur.execute("truncate " + table + ";")  # avoiding uploading duplicate data!
        cur.copy_expert(sql=sql % table, file=file)
        conn.commit()
#         cur.close() # Omit these to lines because we don't want to finish connection yet
#         conn.close()
    return conn.commit()

In [9]:
# Sending Fake Purchases to PSQL From Python:
send_csv_to_psql(conn,'data/cpu_purchase.csv','staging_fake_cpu_purchases')
# Sending Fake People to PSQL FROM Python:
send_csv_to_psql(conn,'data/noIndx.csv','staging_fake_ppl')

In [10]:
# Join two tables into a New Table
sql_c= """CREATE TABLE ppl_cpu_purchases AS 
SELECT staging_fake_ppl .*,
        staging_fake_cpu_purchases.cpu,
        staging_fake_cpu_purchases.purchase_date 
FROM staging_fake_ppl 
INNER JOIN staging_fake_cpu_purchases 
ON staging_fake_ppl.credit_card=staging_fake_cpu_purchases.credit_card"""

cur.execute(sql_c)
# cur.fetchone()

DuplicateTable: relation "ppl_cpu_purchases" already exists


In [21]:
sq="SELECT * FROM ppl_cpu_purchases LIMIT 3"
cur.execute(sq)
cur.fetchall()

[('5399-3484-4724-7187',
  'gso@qiegan.sqe',
  'Donyell Ann',
  'Ospina',
  '5219459148',
  'Intel Core i1-9809K',
  datetime.date(2020, 3, 26)),
 ('1630-5261-6108-7631',
  'xnji@gfruaxqnvm.fha',
  'Bishop',
  'Siyed',
  '4164254716',
  'AMD Ryzen 2 7889X',
  datetime.date(2017, 12, 10)),
 ('4435-3866-1076-3595',
  'dvyco@tkzhsop.zxg',
  'Connor',
  'Powers',
  '3627413915',
  'Intel Core i1-5459K',
  datetime.date(2019, 8, 18))]