In [1]:
%%capture
# Install and start a local PostgreSQL server (needs apt-get and sudo)
!sudo apt-get -y -qq update
!sudo apt-get -y -qq install postgresql
!sudo service postgresql start

# Set the password `postgres` for the `postgres` role
# NOTE(review): hardcoded throwaway credential; only acceptable for a disposable sandbox.
!sudo -u postgres psql -U postgres -c "ALTER USER postgres PASSWORD 'postgres';"

# (Re)create an empty database named `db` for the notebook to use
!sudo -u postgres psql -U postgres -c 'DROP DATABASE IF EXISTS db;'
!sudo -u postgres psql -U postgres -c 'CREATE DATABASE db;'

In [2]:
import os
import subprocess
from time import time
from urllib.parse import quote

import pandas as pd
import yaml
from sqlalchemy import create_engine

In [3]:
# Read the notebook configuration from settings.yaml into `settings`.
with open('settings.yaml', encoding='utf8') as settings_file:
    settings = yaml.safe_load(settings_file)

In [4]:
# Pull the connection parameters out of the DB section of the settings.
# NOTE: `post` holds the PORT value; the name is kept as-is because the
# connection-string cell reads it under that name.
db_settings = settings['DB']
user, password, host, post, name = (
    db_settings[key] for key in ('USER', 'PASSWORD', 'HOST', 'PORT', 'NAME')
)

In [5]:
# Build the SQLAlchemy connection URL and create the engine.
# The user name and password are percent-encoded so that characters such as
# '@', ':' or '/' in the credentials cannot be mistaken for URL delimiters;
# plain alphanumeric values produce exactly the same URL as before.
point_connect = 'postgresql://{}:{}@{}:{}/{}'.format(
    quote(str(user), safe=''),
    quote(str(password), safe=''),
    host,
    post,
    name,
)
con = create_engine(point_connect)

In [6]:
# Target table in PostgreSQL, local file name for the download,
# and the remote location of the sample sales data.
db_tbl = "sales"
input_csv = "sales.csv"
url = "https://raw.githubusercontent.com/grishenkovp/databricks/main/sql/sales.csv"

In [7]:
# Download the CSV with wget.
# Arguments are passed as a list, so `url` and `input_csv` are never parsed by
# a shell, and check=True raises CalledProcessError on a non-zero exit instead
# of silently continuing with a missing or empty file.
# The return code is the cell's displayed value (0 on success).
subprocess.run(["wget", url, "-O", input_csv], check=True).returncode

0

In [14]:
!head -n 5 -v  /content/sales.csv

==> /content/sales.csv <==
date,id_client,id_product,sale_amount
01.01.2021,4,4,24
02.01.2021,1,1,1
03.01.2021,2,1,10
04.01.2021,10,3,80


In [15]:
!wc -l /content/sales.csv

182 /content/sales.csv


In [None]:
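# Unused alternative: parse the dates while reading the CSV.
# NOTE: the format below expects a time component ('%H:%M:%S'), which the dates
# in sales.csv (e.g. 01.01.2021) do not have, and `date_parser` is deprecated
# in pandas >= 2.0.
# from datetime import datetime
# dateparse = lambda x: datetime.strptime(x, '%d.%m.%Y %H:%M:%S')
# df = pd.read_csv(input_csv, parse_dates=['date'], date_parser=dateparse)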

In [16]:
# Load the whole CSV into `df` and give it the column names and dtypes used
# for the database table.
df = pd.read_csv(input_csv)
df.columns = ['dt', 'client_id', 'product_id', 'amount']
# Dates are day.month.year. Parse strictly: a malformed value raises here
# instead of silently leaving `dt` as strings (errors='ignore' is deprecated
# in recent pandas releases).
df['dt'] = pd.to_datetime(df['dt'], format='%d.%m.%Y')
# Cast the three integer columns to int32 in one pass.
df = df.astype({'client_id': 'int32', 'product_id': 'int32', 'amount': 'int32'})

In [17]:
# Display the first five rows (head() defaults to n=5).
df.head()

Unnamed: 0,dt,client_id,product_id,amount
0,2021-01-01,4,4,24
1,2021-01-02,1,1,1
2,2021-01-03,2,1,10
3,2021-01-04,10,3,80
4,2021-01-05,8,1,72


In [18]:
# Print the column dtypes, non-null counts and memory usage of `df`.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 181 entries, 0 to 180
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   dt          181 non-null    datetime64[ns]
 1   client_id   181 non-null    int32         
 2   product_id  181 non-null    int32         
 3   amount      181 non-null    int32         
dtypes: datetime64[ns](1), int32(3)
memory usage: 3.7 KB


In [19]:
# Show the CREATE TABLE statement pandas generates for `df` on this connection.
create_table_ddl = pd.io.sql.get_schema(df, name=db_tbl, con=con)
print(create_table_ddl)


CREATE TABLE sales (
	dt TIMESTAMP WITHOUT TIME ZONE, 
	client_id INTEGER, 
	product_id INTEGER, 
	amount INTEGER
)




In [20]:
# Rows read per CSV chunk, and the starting value of the chunk counter.
chunk_size, chunk_number = 50, 1

In [21]:
# Stream the CSV in chunks of `chunk_size` rows and append each chunk to the
# `db_tbl` table, printing how long each chunk took.
# NOTE: if_exists='append' means re-running this cell inserts the same rows again.
for df_chunk in pd.read_csv(input_csv, iterator=True, chunksize=chunk_size):
  t_start = time()
  df_chunk.columns = ['dt','client_id','product_id','amount']
  # Convert this chunk's own columns (not the fully loaded `df`), so the loop
  # works without the whole file in memory and without relying on index alignment.
  # Strict parsing: a malformed date raises instead of being inserted as text.
  df_chunk['dt'] = pd.to_datetime(df_chunk['dt'], format='%d.%m.%Y')
  df_chunk = df_chunk.astype({'client_id': 'int32', 'product_id': 'int32', 'amount': 'int32'})
  df_chunk.to_sql(name=db_tbl, index=False, con=con, if_exists='append')
  t_end = time()
  t = t_end - t_start
  print(f'Inserted chunk {chunk_number}, {t:0.3f} second')
  chunk_number += 1

Inserted chunk 1, 0.038 second
Inserted chunk 2, 0.010 second
Inserted chunk 3, 0.014 second
Inserted chunk 4, 0.009 second


In [22]:
def select_postgresql(sql):
    """Run `sql` on the notebook-level connection `con` and return the result as a DataFrame."""
    result_frame = pd.read_sql(sql, con)
    return result_frame

In [23]:
# Query: total number of rows loaded into the sales table.
sql = (
    "select count(*) "
    "from sales"
)

In [24]:
# Run the query held in `sql` and print the resulting frame.
count_frame = select_postgresql(sql)
print(count_frame)

   count
0    181


In [25]:
# Query: a five-row sample of the sales table.
sql = (
    "select * "
    "from sales "
    "limit 5"
)

In [26]:
# Run the query held in `sql` and print the resulting frame.
sample_frame = select_postgresql(sql)
print(sample_frame)

          dt  client_id  product_id  amount
0 2021-01-01          4           4      24
1 2021-01-02          1           1       1
2 2021-01-03          2           1      10
3 2021-01-04         10           3      80
4 2021-01-05          8           1      72
