In [1]:
import pandas as pd
import boto3 
from botocore import UNSIGNED 
from botocore.client import Config

In [2]:
s3 = boto3.client('s3', config=Config(signature_version=UNSIGNED)) 
bucket_name = "d2b-internal-assessment-bucket" 
response = s3.list_objects(Bucket=bucket_name, Prefix="orders_data") 



In [3]:
for item in response.get('Contents'):
    print(item.get('Key'))

orders_data/
orders_data/orders.csv
orders_data/reviews.csv
orders_data/shipment_deliveries.csv


In [4]:
def download_files(response, filenames):
    for filename in filenames:
        try:
            s3.download_file(bucket_name, f"orders_data/{filename}", f"{filename}")
            print(f"{filename} downloaded!")
        except:
            print(f"Problem downloading {filename}")
    print("files download completed!")

In [5]:
filenames = ['orders.csv', 'reviews.csv', 'shipment_deliveries.csv']

download_files(response, filenames)

orders.csv downloaded!
reviews.csv downloaded!
shipment_deliveries.csv downloaded!
files download completed!


In [6]:
from psycopg2 import connect

In [7]:
# Create connection
def connection(params: dict):
    try:
        return connect(**params)
    except Exception as e:
        print(e)


In [8]:
# load credentials to environment variables
from dotenv import load_dotenv
from os import environ

if not load_dotenv(override=True):
    print("No evironment variables set")
    exit()


params = {
    "host": environ['DB_HOST'],
    "database": environ['DB_NAME'],
    "user": environ['DB_USER'],
    "password": environ['DB_PASSWD'],
    "port": environ['DB_PORT'],
}

In [9]:
print(params)

{'host': 'd2b-internal-assessment-dwh.cxeuj0ektqdz.eu-central-1.rds.amazonaws.com', 'database': 'd2b_assessment', 'user': 'okondivi4898', 'password': 'cwlo4unWbV', 'port': '5432'}


In [10]:
conn = connection(params)


In [11]:
from typing import Dict


def load_csv_todb(conn, schema_name, csvs: Dict):
    cursor = conn.cursor()
    cursor.execute(f"SET search_path TO {schema_name}")

    for item in csvs.items():
        try:
            table_name = item[0]
            filepath = item[1][0]
            columns_info = ",".join(item[1][-1])
            query = f'''
                DROP TABLE IF EXISTS {table_name};
                CREATE TABLE {table_name} ({columns_info});
            '''
            cursor.execute(query)

            with open(filepath) as f:
                data = f.read()
                data = data.strip('\n').split('\n')[1:]

            column_names = [line.split(' ')[0] for line in item[1][-1]]
            subs = ['%s' for _ in column_names]
            columns = ', '.join(column_names)
            subs = ', '.join(subs)
            for row in data:
                row = row.strip().split(',')
                row = tuple([_  if _ else None for _ in row ])
                print(row)
                query2 = f'''INSERT INTO {table_name} ({columns}) VALUES({subs});'''
                cursor.execute(query2, row)
            conn.commit()
            
        except Exception as e:
            raise Exception("".format(e))
    
    
    cursor.close()
    print("done")



In [12]:
csvs = {
    "orders":  [
        r'C:\Users\owner\Downloads\DE1\orders.csv',
        [
            'order_id INT NOT NULL',
            'customer_id INT NOT NULL',
            'order_date DATE NOT NULL',
            'product_id INT NOT NULL',
            'unit_price NUMERIC NOT NULL',
            'quantity NUMERIC NOT NULL',
            'total_price NUMERIC NOT NULL'
        ]
    ],

    'reviews': [
        r'C:\Users\owner\Downloads\DE1\reviews.csv',
        [
            'review INT NOT NULL',
            'product_id INT NOT NULL'
        ]
    ],

    'shipment_deliveries': [
        r'C:\Users\owner\Downloads\DE1\shipment_deliveries.csv',
        [
            'shipment_id INT NOT NULL',
            'order_id INT NOT NULL',
            'shipment_date DATE',
            'delivery_date DATE'
        ]
    ]
}

staging_schema = environ['STAGING_SCHEMA']

load_csv_todb(conn, staging_schema, csvs)

('1', '5', '2022-07-13', '24', '139', '10', '1390')
('2', '14', '2021-04-06', '2', '273', '4', '1092')
('3', '17', '2022-07-29', '20', '253', '9', '2277')
('4', '14', '2022-08-27', '8', '334', '1', '334')
('5', '25', '2021-12-15', '6', '334', '3', '1002')
('6', '7', '2021-04-15', '25', '144', '7', '1008')
('7', '4', '2021-09-28', '12', '876', '1', '876')
('8', '24', '2021-05-08', '15', '994', '7', '6958')
('9', '3', '2022-01-06', '18', '641', '4', '2564')
('10', '23', '2021-08-22', '3', '841', '5', '4205')
('11', '3', '2022-01-25', '8', '334', '8', '2672')
('12', '21', '2022-01-15', '12', '413', '4', '1652')
('13', '22', '2021-06-25', '9', '596', '4', '2384')
('14', '6', '2021-10-04', '15', '997', '9', '8973')
('15', '8', '2021-02-27', '22', '284', '1', '284')
('16', '11', '2021-03-09', '13', '266', '10', '2660')
('17', '23', '2022-05-27', '11', '455', '8', '3640')
('18', '5', '2021-09-10', '9', '812', '9', '7308')
('19', '18', '2022-03-15', '9', '969', '10', '9690')
('20', '13', '2021

In [None]:
'''
1. 
'''