# 01-Sample data

In [47]:
import os

import pandas as pd
import sqlalchemy as sa
from sqlalchemy import func
from sqlalchemy.orm import sessionmaker

import clickhouse_util as ch_util

In [48]:
# Enable autoreload so edits to imported modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [49]:
import ibis

Create SQLAlchemy engine

In [50]:
# ClickHouse connection URL for SQLAlchemy. Set the CLICKHOUSE_URL environment
# variable to point the notebook at another server; the default is the original
# hard-coded endpoint, so behavior is unchanged when the variable is unset.
ch_url = os.environ.get('CLICKHOUSE_URL', 'clickhouse://default@10.0.0.2:8123/default')

# Engine shared by the pandas and ORM cells below.
engine = sa.create_engine(ch_url)

In [51]:
# Name of the table to profile.
table = 'flight'
# Per-column NULL counts for the table, computed by the project helper module.
# NOTE(review): the helper receives the connection URL rather than `engine`, so it
# presumably opens its own connection — confirm in clickhouse_util.
flight_null_count = ch_util.get_null_count(ch_url, table)

In [52]:
# Display the column -> NULL count mapping returned by get_null_count.
flight_null_count

{'Year': 0,
 'Month': 0,
 'DayofMonth': 0,
 'DayOfWeek': 0,
 'DepTime': 2302136,
 'CRSDepTime': 0,
 'ArrTime': 2584478,
 'CRSArrTime': 0,
 'UniqueCarrier': 0,
 'FlightNum': 0,
 'TailNum': 37385420,
 'ActualElapsedTime': 2587529,
 'CRSElapsedTime': 26234,
 'AirTime': 39266398,
 'ArrDelay': 2587529,
 'DepDelay': 2302136,
 'Origin': 0,
 'Dest': 0,
 'Distance': 202000,
 'TaxiIn': 37397295,
 'TaxiOut': 37382704,
 'Cancelled': 0,
 'CancellationCode': 122800263,
 'Diverted': 0,
 'CarrierDelay': 89329433,
 'WeatherDelay': 89329433,
 'NASDelay': 89329433,
 'SecurityDelay': 89329433,
 'LateAircraftDelay': 89329433}

Use pandas to read from SQLAlchemy engine

In [17]:
# Number of rows pulled into memory for local exploration.
SAMPLE_ROWS = 100000

# NOTE(review): LIMIT without ORDER BY or explicit sampling returns whichever rows
# the server yields first, so `df` may not be representative of the full table.
sql = f'select * from flight limit {SAMPLE_ROWS}'
df = pd.read_sql(sql, con=engine)

Get column names and types for a table

In [18]:
# Reflect only the flight table. The engine is passed to reflect() rather than to
# the MetaData constructor: MetaData(bind=...) is deprecated in SQLAlchemy 1.4 and
# removed in 2.0, while reflect(bind=...) works on both.
metadata = sa.MetaData()
metadata.reflect(bind=engine, only=['flight'])
flight_tbl = metadata.tables['flight']

# cannot access type for nullable types in Clickhouse
# Only the first reflected column is printed; the loop stops after one iteration.
for column in flight_tbl.columns:
    print(column.name)
    break

Year


In [21]:
# Print the first Column object itself (rendered as table.column) and stop.
# The loop variable `col` stays bound afterwards and is inspected in the next cell.
for col in flight_tbl.columns:
    print(col)
    break

flight.Year


In [22]:
# `col` is the loop variable left bound by the previous cell's for-loop.
col.name

'Year'

Use SQLAlchemy ORM to query the database

In [10]:
# Session factory bound to the shared engine, plus one working session.
Session = sessionmaker(bind=engine)
session = Session()

# Base query selecting every column of the reflected flight table.
qry = session.query(flight_tbl)

# Restrict to flights dated 29 February and show the first ten rows.
is_february = flight_tbl.c.Month == 2
is_day_29 = flight_tbl.c.DayofMonth == 29
leap_day_rows = qry.filter(is_february, is_day_29).limit(10)
for row in leap_day_rows:
    print(row)

(1988, 2, 29, 1, 957, 1000, 1054, 1104, 'PI', 894, None, 57, 64, None, -10, -3, 'DCA', 'SYR', 298, None, None, 0, None, 0, None, None, None, None, None)
(1988, 2, 29, 1, 704, 705, 746, 749, 'PI', 894, None, 42, 44, None, -3, -1, 'JAX', 'CHS', 193, None, None, 0, None, 0, None, None, None, None, None)
(1988, 2, 29, 1, 1121, 1125, 1204, 1200, 'PI', 894, None, 43, 35, None, 4, -4, 'SYR', 'BUF', 134, None, None, 0, None, 0, None, None, None, None, None)
(1988, 2, 29, 1, 1624, 1625, 1711, 1710, 'PI', 895, None, 47, 45, None, 1, -1, 'JFK', 'BDL', 106, None, None, 0, None, 0, None, None, None, None, None)
(1988, 2, 29, 1, 2200, 2147, 2230, 2227, 'PI', 896, None, 30, 40, None, 3, 13, 'BWI', 'CHO', 120, None, None, 0, None, 0, None, None, None, None, None)
(1988, 2, 29, 1, 1825, 1820, 1929, 1930, 'PI', 896, None, 64, 70, None, -1, 5, 'LGA', 'ROC', 254, None, None, 0, None, 0, None, None, None, None, None)
(1988, 2, 29, 1, 1954, 2000, 2057, 2104, 'PI', 896, None, 63, 64, None, -7, -6, 'ROC', 'BW

In [11]:
# Second connection to the same host, this time through ibis. It uses port 9000,
# whereas the SQLAlchemy URL uses port 8123.
# NOTE(review): presumably the native TCP port vs. the HTTP port — confirm.
conn = ibis.clickhouse.connect(host='10.0.0.2', port=9000, database='default')

Get metadata for flight table

In [14]:
# Column names and ClickHouse types for default.flight, read from the
# system.columns catalog table. This rebinds `sql`, which earlier held the
# sample query.
sql = '''
select database, table, name, type
from system.columns
where database = 'default'
    and table = 'flight'
'''
df2 = pd.read_sql(sql, engine)

In [15]:
# Preview the catalog rows; the `type` column exposes wrapped types such as
# Nullable(Int16) as plain strings.
df2.head()

Unnamed: 0,database,table,name,type
0,default,flight,Year,Int16
1,default,flight,Month,Int8
2,default,flight,DayofMonth,Int16
3,default,flight,DayOfWeek,Int8
4,default,flight,DepTime,Nullable(Int16)


In [16]:
# NOTE: this rebinds `table`, which earlier held the string 'flight', to an
# ibis table expression.
table = conn.table('flight')
# Interactive mode: the bare expressions in the following cells display their
# computed results directly.
ibis.options.interactive = True

In [20]:
# Count of TailNum values. Only non-NULL values are counted: this result plus the
# TailNum NULL count from get_null_count equals the total from table.count().
table.TailNum.count()

86149549

In [21]:
# Total number of rows in the flight table.
table.count()

123534969

In [22]:
# Percentage of missing values per column in the in-memory sample `df`.
# NOTE(review): `df` was loaded with LIMIT and no ORDER BY/sampling, so these
# percentages describe that slice only, not the whole table.
null_counts = df.isna().sum()
null_counts * 100 / len(df)

Year                   0.000
Month                  0.000
DayofMonth             0.000
DayOfWeek              0.000
DepTime                2.763
CRSDepTime             0.000
ArrTime                3.085
CRSArrTime             0.000
UniqueCarrier          0.000
FlightNum              0.000
TailNum               49.152
ActualElapsedTime      3.085
CRSElapsedTime         0.001
AirTime               51.877
ArrDelay               3.085
DepDelay               2.763
Origin                 0.000
Dest                   0.000
Distance               0.000
TaxiIn                49.152
TaxiOut               49.152
Cancelled              0.000
CancellationCode     100.000
Diverted               0.000
CarrierDelay         100.000
WeatherDelay         100.000
NASDelay             100.000
SecurityDelay        100.000
LateAircraftDelay    100.000
dtype: float64