# Summary

Explore whether the chosen column types are correct and inspect some values in the sample data (this table was created with two months of data).

# Initialize

In [1]:
import pandas as pd
import psycopg2

In [2]:
# Open the connection used by every query in this notebook.
# No password is passed here, so authentication presumably comes from the
# environment (e.g. PGPASSWORD / ~/.pgpass) -- TODO confirm.
conn = psycopg2.connect(database='etldb',
                        user='airflow_etl',
                        host='localhost',
                        port=5432,
                        # Plain string: there is nothing to interpolate, so no f-prefix.
                        # Sets the session search_path so "immigration" resolves in schema "etl".
                        options='-c search_path=etl')
# Autocommit: each exploratory statement is committed on its own instead of
# being left inside an open transaction.
conn.autocommit = True


def run_query(sql, conn=conn, **read_sql_kwargs) -> pd.DataFrame:
    """Execute ``sql`` and return the result set as a DataFrame.

    Parameters
    ----------
    sql
        Query text handed to ``pd.read_sql``.
    conn
        Connection to run the query on. Defaults to the notebook-level
        ``conn`` object as it was bound when this function was defined.
    **read_sql_kwargs
        Extra keyword arguments forwarded unchanged to ``pd.read_sql``.

    Returns
    -------
    pd.DataFrame
        Whatever ``pd.read_sql`` returns for the query.
    """
    frame = pd.read_sql(sql, con=conn, **read_sql_kwargs)
    return frame

# Check range of values

In [3]:
# Columns checked below with MIN/MAX to pick an integer storage type.
integer_cols = [
    'cicid', 'i94yr', 'i94mon', 'i94cit', 'i94res', 'arrdate', 'i94mode',
    'depdate', 'i94bir', 'i94visa', 'count', 'biryear', 'admnum',
]

# Columns checked below with MIN/MAX of LENGTH to pick CHAR vs VARCHAR.
text_cols = [
    'i94port', 'i94addr', 'dtadfile', 'visapost', 'occup', 'entdepa',
    'entdepd', 'entdepu', 'matflag', 'dtaddto', 'gender', 'insnum',
    'airline', 'fltno', 'visatype',
]

In [4]:
# Fetch a single row; it is used for a quick look at the values and to
# obtain the table's column names.
one_row = run_query('SELECT * FROM immigration LIMIT 1;')
cols = list(one_row.columns)

In [23]:
# Transposed so the single row reads as one column-name/value pair per line.
one_row.T

Unnamed: 0,0
cicid,5912468
i94yr,2016
i94mon,1
i94cit,260
i94res,260
i94port,PEN
arrdate,20454
i94mode,2
i94addr,AL
depdate,


# Numerical types storage size

In [5]:
def _integer_fit_sql(col):
    """Build the query reporting MIN/MAX of ``col`` and the smallest
    PostgreSQL integer type (SMALLINT, INTEGER, BIGINT) whose range holds both.

    The query yields 'UNKNOWN' when none of the three ranges matches.
    """
    return f'''
    WITH
    min_max AS (
      SELECT MIN({col:s}) AS min_{col:s}, MAX({col:s}) AS max_{col:s}
      FROM immigration)
    
    SELECT
      CASE
        WHEN min_{col:s} >= -32768 AND max_{col:s} <= 32767 THEN 'SMALLINT'
        WHEN min_{col:s} >= -2147483648 AND max_{col:s} <= 2147483647 THEN 'INTEGER'
        WHEN min_{col:s} >= -9223372036854775808 AND max_{col:s} <= 9223372036854775807 THEN 'BIGINT'
        ELSE 'UNKNOWN'
      END AS numeric_type,
      min_{col:s} AS min_value, max_{col:s} AS max_value
    FROM min_max;'''


# One record per integer column: name, suggested type, observed min and max.
col_types_integer = []

for col in integer_cols:
    # The query returns exactly one row; turn it into a plain dict.
    stats = run_query(_integer_fit_sql(col)).loc[0].to_dict()
    col_types_integer.append({'col_name': col, **stats})

In [6]:
# The numerical types in the CREATE TABLE statement were decided from this table.
# "arrdate" and "depdate" fit in SMALLINT here, but their maxima (20819 and 31442)
# are close to the SMALLINT upper limit of 32767 and this table is only a sample,
# so their type was changed.

pd.DataFrame(col_types_integer)

Unnamed: 0,col_name,numeric_type,min_value,max_value
0,cicid,INTEGER,6,7318723
1,i94yr,SMALLINT,2016,2016
2,i94mon,SMALLINT,1,12
3,i94cit,SMALLINT,0,999
4,i94res,SMALLINT,101,760
5,arrdate,SMALLINT,20454,20819
6,i94mode,SMALLINT,0,9
7,depdate,SMALLINT,-11972,31442
8,i94bir,SMALLINT,-2,111
9,i94visa,SMALLINT,1,3


# Text columns length

In [7]:
def _text_length_sql(col):
    """Build the query returning the shortest and longest LENGTH of ``col``."""
    return f'''
      SELECT MIN(LENGTH({col:s})) AS min_length, MAX(LENGTH({col:s})) AS max_length
      FROM immigration;'''


# One record per text column: name, minimum length and maximum length.
col_types_text = []

for col in text_cols:
    # The query returns exactly one row; turn it into a plain dict.
    lengths = run_query(_text_length_sql(col)).loc[0].to_dict()
    col_types_text.append({'col_name': col, **lengths})

In [8]:
# The text types in the CREATE TABLE statement were decided from this table:
#   - min_length == max_length  -->  CHAR(min_length)
#   - otherwise                 -->  VARCHAR(max_length)
# Exceptions: visapost, occup, insnum, airline, fltno and visatype use VARCHAR(16)
# to leave room for longer values, since this table is only a sample.

pd.DataFrame(col_types_text)

Unnamed: 0,col_name,min_length,max_length
0,i94port,3,3
1,i94addr,1,2
2,dtadfile,8,8
3,visapost,3,3
4,occup,3,3
5,entdepa,1,1
6,entdepd,1,1
7,entdepu,1,1
8,matflag,1,1
9,dtaddto,3,8


# Missing data

In [17]:
def _null_count_sql(col):
    """Build the query counting the rows where ``col`` IS NULL."""
    return f'''SELECT COUNT(*)
    FROM immigration
    WHERE {col:s} IS NULL;'''


# Column name -> number of NULLs, in the table's column order.
missing_values_count = {
    # The result is a 1x1 frame; pull out its only cell.
    col: run_query(_null_count_sql(col)).values[0][0]
    for col in cols
}

In [18]:
# Columns with zero missing values can be declared NOT NULL in CREATE TABLE.

missing_values_count

{'cicid': 0,
 'i94yr': 0,
 'i94mon': 0,
 'i94cit': 969,
 'i94res': 0,
 'i94port': 0,
 'arrdate': 0,
 'i94mode': 60,
 'i94addr': 343954,
 'depdate': 737400,
 'i94bir': 1883,
 'i94visa': 0,
 'count': 0,
 'dtadfile': 90486,
 'visapost': 3170950,
 'occup': 6225834,
 'entdepa': 137,
 'entdepd': 725665,
 'entdepu': 6279309,
 'matflag': 725665,
 'biryear': 1883,
 'dtaddto': 1404,
 'gender': 284535,
 'insnum': 5211751,
 'airline': 155154,
 'admnum': 0,
 'fltno': 27762,
 'visatype': 0}

# Primary key

In [20]:
# A good guess is "cicid". List (up to 5) values that occur more than once;
# an empty result means cicid does not repeat in this sample.
# The threshold is COUNT(*) > 1: a value that appears exactly twice is already
# a duplicate and would be missed by "> 2".

run_query('''SELECT cicid, COUNT(*)
FROM immigration
GROUP BY 1
HAVING COUNT(*) > 1 LIMIT 5;''')

Unnamed: 0,cicid,count


# Cleanup

In [4]:
# Close the database connection now that the exploration is finished.
conn.close()