In [1]:
from sqlalchemy import create_engine
from sqlalchemy.exc import OperationalError

import psycopg2
from psycopg2 import OperationalError

import pandas as pd

In [2]:
def create_db_connection(db_url='postgresql://postgres:superadmin@localhost:5432/ecommerce'):
    """
    Create a SQLAlchemy engine and verify that it can reach the database.

    A probe connection is opened and released straight away, so connection
    problems are reported here rather than on the first real query.

    Args:
        db_url (str): SQLAlchemy database URL. The default targets the local
            `ecommerce` database; prefer passing a URL built from environment
            variables over relying on the credentials embedded in the default.
    Returns:
        engine: SQLAlchemy Engine if the probe connection succeeds, None otherwise
    """
    # Imported locally under an unambiguous alias: this notebook imports the
    # bare name `OperationalError` from both sqlalchemy.exc and psycopg2, and
    # the later import wins, so the bare name is not SQLAlchemy's exception.
    from sqlalchemy.exc import OperationalError as SQLAlchemyOperationalError

    try:
        engine = create_engine(db_url)
        # the context manager releases the probe connection instead of leaking it
        with engine.connect():
            pass
    except SQLAlchemyOperationalError as e:
        print("Error occurred during connection:", e)
        return None

    print("Connection to PostgreSQL DB successful")
    return engine

# Test the connection and keep the result for the cells that load data.
# NOTE: despite its name, `connection` holds whatever create_db_connection()
# returns, and that function builds its return value with create_engine().
connection = create_db_connection()

Connection to PostgreSQL DB successful


In [3]:
# read the three raw CSV exports, one dataframe per table
df_products, df_productlist, df_transactions = [
    pd.read_csv(csv_path)
    for csv_path in (
        "../raw-datasets/products.csv",
        "../raw-datasets/product_list.csv",
        "../raw-datasets/rev_transactions.csv",
    )
]

# preview the first three rows of every dataframe
display(
    df_products.head(3),
    df_productlist.head(3),
    df_transactions.head(3),
)

Unnamed: 0,SKU,name,orderedQuantity,stockLevel,restockingLeadTime,sentimentScore,sentimentMagnitude
0,GGADFBSBKS42347,PC gaming speakers,0,100,1,,
1,GGOEGAAX0581,Women's Colorblock Tee White,0,0,8,0.8,2.0
2,GGOEGAAX0596,Men's Quilted Insulated Vest Black,26,32,8,0.8,2.0


Unnamed: 0,productSKU,v2ProductName
0,10 55401,Lip Balm
1,9180842,Maze Pen
2,GGOEGGOA017399,Maze Pen


Unnamed: 0,fullVisitorId,channelGrouping,hits_time,geoNetwork_country,geoNetwork_city,totals_totalTransactionRevenue,totals_transactions,totals_timeOnSite,totals_pageviews,date,...,hits_product_v2ProductCategory,hits_product_productVariant,hits_item_currencyCode,hits_item_itemQuantity,hits_item_itemRevenue,hits_transaction_transactionRevenue,hits_transaction_transactionId,hits_page_pageTitle,hits_page_searchKeyword,hits_page_pagePathLevel1
0,7851878234225373633,Organic Search,1819831,Peru,La Victoria,42000000,1,1834,40,20160807,...,(not set),BLUE,USD,,,42000000,ORD20160807983,Checkout Confirmation,,/ordercompleted.html
1,7851878234225373633,Organic Search,1819831,Peru,La Victoria,42000000,1,1834,40,20160807,...,(not set),Single Option Only,USD,,,42000000,ORD20160807983,Checkout Confirmation,,/ordercompleted.html
2,7851878234225373633,Organic Search,1819831,Peru,La Victoria,42000000,1,1834,40,20160807,...,(not set),GREEN,USD,,,42000000,ORD20160807983,Checkout Confirmation,,/ordercompleted.html


In [4]:
# displaying information for each dataframe
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping the calls in display() only appended three "None" outputs.
df_products.info()
df_productlist.info()
df_transactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1092 entries, 0 to 1091
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   SKU                 1092 non-null   object 
 1   name                1092 non-null   object 
 2   orderedQuantity     1092 non-null   int64  
 3   stockLevel          1092 non-null   int64  
 4   restockingLeadTime  1092 non-null   int64  
 5   sentimentScore      1091 non-null   float64
 6   sentimentMagnitude  1091 non-null   float64
dtypes: float64(2), int64(3), object(2)
memory usage: 59.8+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2273 entries, 0 to 2272
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   productSKU     2273 non-null   object
 1   v2ProductName  2273 non-null   object
dtypes: object(2)
memory usage: 35.6+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 1

None

None

None

In [5]:
def insert_data_to_db(connection, csv_file, table_name):
    """
    Load a CSV file and append its rows to a database table.

    Before writing, column names are lower-cased, a `date` column (if present)
    is parsed from the YYYYMMDD format, uint64 columns are cast to strings, and
    rows repeating a `productsku` value are dropped (the first one is kept).
    Any exception is caught and reported with print() instead of being raised.

    Args:
        connection: Connection object handed to DataFrame.to_sql
        csv_file (str): Path of the CSV file to load
        table_name (str): Destination table; rows are appended to it
    Returns:
        None
    """
    try:
        frame = pd.read_csv(csv_file)

        # normalise the header to lowercase
        frame.columns = [str.lower(label) for label in frame.columns]

        # parse 'date' (YYYYMMDD) into datetimes when the column is present
        if 'date' in frame.columns:
            frame['date'] = pd.to_datetime(frame['date'], format='%Y%m%d')

        # uint64 columns are written as text
        # (presumably because the values can exceed the signed 64-bit range; confirm)
        for label in frame.columns:
            if frame[label].dtype != 'uint64':
                continue
            frame[label] = frame[label].astype(str)

        # keep only the first row per product SKU when that column is present
        if 'productsku' in frame.columns:
            frame = frame.drop_duplicates(subset='productsku', keep='first')

        frame.to_sql(table_name, connection, if_exists='append', index=False)
        print(f"Data inserted successfully into table {table_name}")
    except Exception as err:
        print(f"Error occurred during data insertion into table {table_name}:", err)

In [6]:
# CSV source path -> destination table
csv_files_and_tables = dict([
    ('../raw-datasets/products.csv', 'products'),
    ('../raw-datasets/product_list.csv', 'product_list'),
    ('../raw-datasets/rev_transactions.csv', 'all_transactions'),
])

# load every file into its table, in the order listed above
for csv_file in csv_files_and_tables:
    table_name = csv_files_and_tables[csv_file]
    insert_data_to_db(connection, csv_file, table_name)

Data inserted successfully into table products
Data inserted successfully into table product_list
Data inserted successfully into table all_transactions


In [12]:
def create_conn(database="ecommerce", user="postgres", password="superadmin",
                host="localhost", port="5432"):
    """
    Open a psycopg2 connection to a PostgreSQL database.

    Called without arguments it connects to the local `ecommerce` database.
    Prefer passing credentials read from environment variables over relying
    on the defaults embedded here.

    Args:
        database (str): Name of the database to connect to
        user (str): Database user
        password (str): Password for `user`
        host (str): Server host name
        port (str): Server port
    Returns:
        conn: psycopg2 connection if the connection succeeds, None otherwise
    """
    try:
        conn = psycopg2.connect(
            database=database,
            user=user,
            password=password,
            host=host,
            port=port,
        )
    # Qualified on purpose: this notebook imports the bare name
    # `OperationalError` from two packages, so which class it refers to
    # depends on import order.
    except psycopg2.OperationalError as e:
        print(f"The error '{e}' occurred")
        return None

    print("Connection to PostgreSQL DB successful")
    return conn

# NOTE: despite its name, `engine` is bound to the return value of create_conn(),
# i.e. the result of psycopg2.connect(), or None when the connection attempt failed.
engine = create_conn()

Connection to PostgreSQL DB successful


In [13]:
def query_data_from_db(engine, query):
    """
    Run a SQL query and return its result set as a dataframe.

    Any exception raised while fetching is caught and reported with print()
    instead of being propagated.

    Args:
        engine: Database connection object passed to pandas as `con`
        query (str): SQL query to execute
    Returns:
        df: Dataframe with the selected rows, or None if fetching failed
    """
    try:
        fetched = pd.read_sql_query(query, con=engine)
        print("Data fetched successfully.")
        return fetched
    except Exception as err:
        print(f"Error occurred during data fetching: {err}")
        return None

In [14]:
# Fetch a 100-row sample of the loaded transactions.
# NOTE(review): the query has no ORDER BY, so which 100 rows come back is
# presumably left to the database; add one if a stable sample matters.
transaction_query = "SELECT * FROM all_transactions LIMIT 100;"
# `engine` is the value returned by create_conn(); query_data_from_db returns
# None if the fetch fails.
transaction_df = query_data_from_db(engine, transaction_query)

# bare last expression so the notebook renders the dataframe
transaction_df

Data fetched successfully.


  df = pd.read_sql_query(query, con=engine)


Unnamed: 0,fullvisitorid,channelgrouping,hits_time,geonetwork_country,geonetwork_city,totals_totaltransactionrevenue,totals_transactions,totals_timeonsite,totals_pageviews,date,...,hits_product_v2productcategory,hits_product_productvariant,hits_item_currencycode,hits_item_itemquantity,hits_item_itemrevenue,hits_transaction_transactionrevenue,hits_transaction_transactionid,hits_page_pagetitle,hits_page_searchkeyword,hits_page_pagepathlevel1
0,7851878234225373633,Organic Search,1819831,Peru,La Victoria,42000000.0,1,1834,40,2016-08-07,...,(not set),BLUE,USD,,,42000000.0,ORD20160807983,Checkout Confirmation,,/ordercompleted.html
1,7851878234225373633,Organic Search,1819831,Peru,La Victoria,42000000.0,1,1834,40,2016-08-07,...,(not set),Single Option Only,USD,,,42000000.0,ORD20160807983,Checkout Confirmation,,/ordercompleted.html
2,7851878234225373633,Organic Search,1819831,Peru,La Victoria,42000000.0,1,1834,40,2016-08-07,...,(not set),GREEN,USD,,,42000000.0,ORD20160807983,Checkout Confirmation,,/ordercompleted.html
3,7851878234225373633,Organic Search,1819831,Peru,La Victoria,42000000.0,1,1834,40,2016-08-07,...,(not set),Single Option Only,USD,,,42000000.0,ORD20160807983,Checkout Confirmation,,/ordercompleted.html
4,7851878234225373633,Organic Search,1819831,Peru,La Victoria,42000000.0,1,1834,40,2016-08-07,...,(not set),RED,USD,,,42000000.0,ORD20160807983,Checkout Confirmation,,/ordercompleted.html
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,9162599630650052136,Paid Search,1042298,India,Ahmedabad,30430000.0,1,1042,32,2017-06-06,...,Office,Single Option Only,USD,,,30430000.0,ORD201706062576,Checkout Confirmation,,/ordercompleted.html
96,9162599630650052136,Paid Search,1042298,India,Ahmedabad,30430000.0,1,1042,32,2017-06-06,...,Office,Single Option Only,USD,,,30430000.0,ORD201706062576,Checkout Confirmation,,/ordercompleted.html
97,9162599630650052136,Paid Search,1042298,India,Ahmedabad,30430000.0,1,1042,32,2017-06-06,...,Office,Single Option Only,USD,,,30430000.0,ORD201706062576,Checkout Confirmation,,/ordercompleted.html
98,9162599630650052136,Paid Search,1042298,India,Ahmedabad,30430000.0,1,1042,32,2017-06-06,...,Office,Single Option Only,USD,,,30430000.0,ORD201706062576,Checkout Confirmation,,/ordercompleted.html


In [15]:
# Close the connection returned by create_conn(). If that call failed, `engine`
# is None and this line raises AttributeError.
engine.close()