# Initialize

In [1]:
import os
import re

from IPython.display import display
import pandas as pd

In [2]:
DATA_PATH = os.path.realpath(os.path.join('..', '..', 'etl', 'airflow_home',
                                          'resources', 'data'))

# PostgreSQL integer types, narrowest first, with their inclusive value ranges
# (https://www.postgresql.org/docs/current/datatype-numeric.html).
_SQL_INTEGER_RANGES = (
    ('SMALLINT', -32768, 32767),
    ('INTEGER', -2147483648, 2147483647),
    ('BIGINT', -9223372036854775808, 9223372036854775807),
)


def _to_snake_case(name: str) -> str:
    """Convert a CSV header (CamelCase, spaces or hyphens) to snake_case."""
    return (re.sub('(?!^)[ _]?([A-Z]+)',
                   r'_\1',
                   name.replace('-', ' '))
            .replace(' ', '_')
            .lower())


def _infer_sql_type(column: pd.Series) -> str:
    """Pick a PostgreSQL column type from the dtype and values of ``column``.

    Missing values are ignored. A column without any non-missing value gets
    the widest type of its family (TEXT, DOUBLE PRECISION or BIGINT) because
    there is nothing to measure.

    Raises:
        ValueError: if an integer column does not fit into BIGINT.
        TypeError: if the dtype has no mapping here.
    """
    dtype = column.dtype
    values = column.dropna()

    if (pd.api.types.is_object_dtype(dtype)
            or pd.api.types.is_string_dtype(dtype)):
        # https://www.postgresql.org/docs/current/datatype-character.html
        if values.empty:
            return 'TEXT'
        # astype(str) so that non-string objects in an object column are
        # measured by their text form instead of producing NaN lengths.
        max_length = int(values.astype(str).str.len().max())
        # VARCHAR(0) is not a valid type, so use at least 1.
        return f'VARCHAR({max(max_length, 1):d})'

    if pd.api.types.is_float_dtype(dtype):
        # https://www.postgresql.org/docs/current/datatype-numeric.html
        if values.empty:
            return 'DOUBLE PRECISION'
        # NOTE(review): the thresholds below are applied to the number of
        # characters after the decimal point in str(value), not to the total
        # number of significant digits, and values rendered in exponent
        # notation are miscounted -- confirm this heuristic is adequate.
        scale = int(values.apply(lambda f: len(str(f).split('.')[-1])).max())
        if scale <= 6:
            return 'REAL'
        if scale <= 15:
            return 'DOUBLE PRECISION'
        precision = int(values.apply(lambda f: len(str(f)) - 1).max())
        return f'DECIMAL({precision:d}, {scale:d})'

    if pd.api.types.is_integer_dtype(dtype):
        # Covers both NumPy integers (int64, ...) and pandas nullable
        # integers (Int64, ...).
        if values.empty:
            return 'BIGINT'
        min_ = int(values.min())
        max_ = int(values.max())
        for sql_type, low, high in _SQL_INTEGER_RANGES:
            if low <= min_ and max_ <= high:
                return sql_type
        raise ValueError(f'Too large values in column {column.name}: min = {min_:d}, max = {max_:d}')

    if pd.api.types.is_bool_dtype(dtype):
        return 'BOOLEAN'

    if pd.api.types.is_datetime64_any_dtype(dtype):
        return 'TIMESTAMP'

    # Fail loudly instead of silently reusing another column's type.
    raise TypeError(f'Unsupported dtype {dtype} in column {column.name}')


def explore_and_create_sql(csv_file: str, data_path=DATA_PATH, **read_csv_args
                          ) -> pd.DataFrame:
    """Load a CSV file, print an exploration report and a CREATE TABLE draft.

    The report shows the data, its shape, the column dtypes, the distinct
    values of low-cardinality columns, the columns without repeated values and
    the count/percentage of missing values per column. It ends with a
    ``CREATE TABLE`` statement whose schema and table names are placeholders.

    Args:
        csv_file: File name, relative to ``data_path``.
        data_path: Directory that contains ``csv_file``.
        **read_csv_args: Extra keyword arguments passed to ``pd.read_csv``.

    Returns:
        The loaded DataFrame.

    Raises:
        ValueError: if an integer column does not fit into BIGINT.
        TypeError: if a column dtype cannot be mapped to an SQL type.
    """
    df = pd.read_csv(os.path.join(data_path, csv_file), **read_csv_args)

    print('Data')
    print('----')
    display(df)

    print()
    print('Number of lines and columns')
    print('---------------------------')
    print(df.shape)

    print()
    print('Column types')
    print('------------')
    print(df.dtypes)

    print()
    print('Count distinct values')
    print('---------------------')
    value_counts = {col: df.loc[:, col].value_counts()
                    for col in df.columns}
    thresh = 20
    for col, count in value_counts.items():
        distinct = count.index.tolist()
        print(col, end=': ')
        if len(distinct) <= thresh:
            print(distinct)
        else:
            print(f'<more than {thresh:d} values>')

    print()
    print('Columns with non-repeated values --> primary key')
    print('------------------------------------------------')
    # value_counts() is sorted in descending order and skips missing values,
    # so the first entry is the highest repetition count. It is empty for a
    # column without any value, which therefore cannot be a key candidate.
    primary_keys = [col for col, count in value_counts.items()
                    if not count.empty and count.iloc[0] == 1]
    print('\n'.join(primary_keys))

    print()
    print('Count and percentage of missing data')
    print('------------------------------------')
    print('Count:')
    null_count = df.isna().sum()
    print(null_count)
    print()
    print('Percentage:')
    print((100 * null_count / df.shape[0]).round(2))

    print()
    print('SQL statement')
    print('-------------')

    # Build CREATE TABLE statement:
    # - if no missing values: NOT NULL
    # - if no repeated values: PRIMARY KEY for the first such column without
    #   missing values (a table can have only one primary key), UNIQUE for
    #   the others
    # - string columns: VARCHAR with maximum length found
    # - float columns: REAL, DOUBLE PRECISION or DECIMAL, depending on the
    #   number of decimal places found
    # - integer columns: smallest of SMALLINT, INTEGER, BIGINT that fits
    column_defs = []
    has_primary_key = False
    for col in df.columns:
        not_null = null_count.loc[col] == 0
        sql_null = ' NOT NULL' if not_null else ''

        if col not in primary_keys:
            sql_key = ''
        elif not_null and not has_primary_key:
            sql_key = ' PRIMARY KEY'
            has_primary_key = True
        else:
            sql_key = ' UNIQUE'

        col_name = _to_snake_case(col)
        col_type = _infer_sql_type(df[col])
        column_defs.append(f'  {col_name:s} {col_type:s}{sql_null:s}{sql_key:s}')

    print('CREATE TABLE IF NOT EXISTS <schema>.<table> (')
    # Join with commas so that the last column has no trailing comma.
    print(',\n'.join(column_defs))
    print(');')

    return df

# Explore data and generate CREATE SQL statement

## Airport codes

In [3]:
# Load airport-codes.csv from the default DATA_PATH, print the exploration
# report with the generated CREATE TABLE statement, and keep the DataFrame.
airport_codes = explore_and_create_sql('airport-codes.csv')

Data
----


Unnamed: 0,ident,type,name,elevation_ft,continent,iso_country,iso_region,municipality,gps_code,iata_code,local_code,coordinates
0,00A,heliport,Total Rf Heliport,11.0,,US,US-PA,Bensalem,00A,,00A,"-74.93360137939453, 40.07080078125"
1,00AA,small_airport,Aero B Ranch Airport,3435.0,,US,US-KS,Leoti,00AA,,00AA,"-101.473911, 38.704022"
2,00AK,small_airport,Lowell Field,450.0,,US,US-AK,Anchor Point,00AK,,00AK,"-151.695999146, 59.94919968"
3,00AL,small_airport,Epps Airpark,820.0,,US,US-AL,Harvest,00AL,,00AL,"-86.77030181884766, 34.86479949951172"
4,00AR,closed,Newport Hospital & Clinic Heliport,237.0,,US,US-AR,Newport,,,,"-91.254898, 35.6087"
...,...,...,...,...,...,...,...,...,...,...,...,...
55070,ZYYK,medium_airport,Yingkou Lanqi Airport,0.0,AS,CN,CN-21,Yingkou,ZYYK,YKH,,"122.3586, 40.542524"
55071,ZYYY,medium_airport,Shenyang Dongta Airport,,AS,CN,CN-21,Shenyang,ZYYY,,,"123.49600219726562, 41.784400939941406"
55072,ZZ-0001,heliport,Sealand Helipad,40.0,EU,GB,GB-ENG,Sealand,,,,"1.4825, 51.894444"
55073,ZZ-0002,small_airport,Glorioso Islands Airstrip,11.0,AF,TF,TF-U-A,Grande Glorieuse,,,,"47.296388888900005, -11.584277777799999"



Number of lines and columns
---------------------------
(55075, 12)

Column types
------------
ident            object
type             object
name             object
elevation_ft    float64
continent        object
iso_country      object
iso_region       object
municipality     object
gps_code         object
iata_code        object
local_code       object
coordinates      object
dtype: object

Count distinct values
---------------------
ident: <more than 20 values>
type: ['small_airport', 'heliport', 'medium_airport', 'closed', 'seaplane_base', 'large_airport', 'balloonport']
name: <more than 20 values>
elevation_ft: <more than 20 values>
continent: ['EU', 'SA', 'AS', 'AF', 'OC', 'AN']
iso_country: <more than 20 values>
iso_region: <more than 20 values>
municipality: <more than 20 values>
gps_code: <more than 20 values>
iata_code: <more than 20 values>
local_code: <more than 20 values>
coordinates: <more than 20 values>

Columns with non-repeated values --> primary key
--------------

## Global temperatures

In [4]:
# Load GlobalLandTemperaturesByCity.csv from the default DATA_PATH, print the
# exploration report with the generated CREATE TABLE statement, and keep the
# DataFrame for the follow-up checks in the next cells.
global_temperatures = explore_and_create_sql('GlobalLandTemperaturesByCity.csv')

Data
----


Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude
0,1743-11-01,6.068,1.737,Århus,Denmark,57.05N,10.33E
1,1743-12-01,,,Århus,Denmark,57.05N,10.33E
2,1744-01-01,,,Århus,Denmark,57.05N,10.33E
3,1744-02-01,,,Århus,Denmark,57.05N,10.33E
4,1744-03-01,,,Århus,Denmark,57.05N,10.33E
...,...,...,...,...,...,...,...
8599207,2013-05-01,11.464,0.236,Zwolle,Netherlands,52.24N,5.26E
8599208,2013-06-01,15.043,0.261,Zwolle,Netherlands,52.24N,5.26E
8599209,2013-07-01,18.775,0.193,Zwolle,Netherlands,52.24N,5.26E
8599210,2013-08-01,18.025,0.298,Zwolle,Netherlands,52.24N,5.26E



Number of lines and columns
---------------------------
(8599212, 7)

Column types
------------
dt                                object
AverageTemperature               float64
AverageTemperatureUncertainty    float64
City                              object
Country                           object
Latitude                          object
Longitude                         object
dtype: object

Count distinct values
---------------------
dt: <more than 20 values>
AverageTemperature: <more than 20 values>
AverageTemperatureUncertainty: <more than 20 values>
City: <more than 20 values>
Country: <more than 20 values>
Latitude: <more than 20 values>
Longitude: <more than 20 values>

Columns with non-repeated values --> primary key
------------------------------------------------


Count and percentage of missing data
------------------------------------
Count:
dt                                    0
AverageTemperature               364130
AverageTemperatureUncertainty    364130
City      

In [5]:
# Trying to find the primary keys. For each candidate combination of columns,
# show the largest group sizes; a combination is unique when the top size is 1.
# * city + country + date --> there are multiple cities with the same name in the same country
# * latitude + longitude + date --> there are multiple cities with the same latitude and longitude
# * city + country + latitude + longitude + date --> this is unique

candidate_keys = (
    ['City', 'Country', 'dt'],
    ['Latitude', 'Longitude', 'dt'],
    ['City', 'Country', 'Latitude', 'Longitude', 'dt'],
)
for key_columns in candidate_keys:
    group_sizes = global_temperatures.groupby(key_columns).size()
    print(group_sizes.sort_values(ascending=False).head())

City         Country        dt        
Springfield  United States  1875-12-01    3
                            1866-05-01    3
                            1871-03-01    3
                            1871-02-01    3
                            1871-01-01    3
dtype: int64
Latitude  Longitude  dt        
36.17N    139.23E    1915-10-01    64
                     1901-10-01    64
                     1900-09-01    64
                     1900-10-01    64
                     1900-11-01    64
dtype: int64
City    Country  Latitude  Longitude  dt        
Ürümqi  China    44.20N    87.20E     2013-09-01    1
Hardoi  India    26.52N    80.60E     1835-04-01    1
                                      1835-06-01    1
                                      1835-07-01    1
                                      1835-08-01    1
dtype: int64


In [6]:
# Minimum and maximum temperatures

# The printed values appear to carry a floating-point rounding error. Let's
# use 4 decimal places (scale) and precision = 6 (4 decimal places + 2 leading
# digits).

average_temperature = global_temperatures.AverageTemperature
for extreme in (average_temperature.min(), average_temperature.max()):
    print(extreme)

-42.70399999999999
39.650999999999996


In [7]:
# "dt" is a date of the format YYYY-MM-DD (10 characters), so let's use DATE type
# (https://www.postgresql.org/docs/current/datatype-datetime.html)

## U.S. cities demographics

In [8]:
# These count columns are read with pandas' nullable 'Int64' dtype
# (presumably because they contain missing values -- confirm against the
# missing-data report printed below).
nullable_int_columns = ['Male Population', 'Female Population',
                        'Number of Veterans', 'Foreign-born']

us_cities = explore_and_create_sql(
    'us-cities-demographics.csv',
    sep=';',
    dtype=dict.fromkeys(nullable_int_columns, 'Int64'),
)

Data
----


Unnamed: 0,City,State,Median Age,Male Population,Female Population,Total Population,Number of Veterans,Foreign-born,Average Household Size,State Code,Race,Count
0,Silver Spring,Maryland,33.8,40601,41862,82463,1562,30908,2.60,MD,Hispanic or Latino,25924
1,Quincy,Massachusetts,41.0,44129,49500,93629,4147,32935,2.39,MA,White,58723
2,Hoover,Alabama,38.5,38040,46799,84839,4819,8229,2.58,AL,Asian,4759
3,Rancho Cucamonga,California,34.5,88127,87105,175232,5821,33878,3.18,CA,Black or African-American,24437
4,Newark,New Jersey,34.6,138040,143873,281913,5829,86253,2.73,NJ,White,76402
...,...,...,...,...,...,...,...,...,...,...,...,...
2886,Stockton,California,32.5,150976,154674,305650,12822,79583,3.16,CA,American Indian and Alaska Native,19834
2887,Southfield,Michigan,41.6,31369,41808,73177,4035,4011,2.27,MI,American Indian and Alaska Native,983
2888,Indianapolis,Indiana,34.1,410615,437808,848423,42186,72456,2.53,IN,White,553665
2889,Somerville,Massachusetts,31.0,41028,39306,80334,2103,22292,2.43,MA,American Indian and Alaska Native,374



Number of lines and columns
---------------------------
(2891, 12)

Column types
------------
City                       object
State                      object
Median Age                float64
Male Population             Int64
Female Population           Int64
Total Population            int64
Number of Veterans          Int64
Foreign-born                Int64
Average Household Size    float64
State Code                 object
Race                       object
Count                       int64
dtype: object

Count distinct values
---------------------
City: <more than 20 values>
State: <more than 20 values>
Median Age: <more than 20 values>
Male Population: <more than 20 values>
Female Population: <more than 20 values>
Total Population: <more than 20 values>
Number of Veterans: <more than 20 values>
Foreign-born: <more than 20 values>
Average Household Size: <more than 20 values>
State Code: <more than 20 values>
Race: ['Hispanic or Latino', 'White', 'Black or African-American', 'A



In [9]:
# Trying to find the primary keys: show the first rows sorted by city to see
# which columns differ between rows of the same city.

us_cities.sort_values('City').head(6)

Unnamed: 0,City,State,Median Age,Male Population,Female Population,Total Population,Number of Veterans,Foreign-born,Average Household Size,State Code,Race,Count
2727,Abilene,Texas,31.3,65212,60664,125876,9367,8129,2.64,TX,Asian,2929
1403,Abilene,Texas,31.3,65212,60664,125876,9367,8129,2.64,TX,Hispanic or Latino,33222
1533,Abilene,Texas,31.3,65212,60664,125876,9367,8129,2.64,TX,White,95487
245,Abilene,Texas,31.3,65212,60664,125876,9367,8129,2.64,TX,American Indian and Alaska Native,1813
2880,Abilene,Texas,31.3,65212,60664,125876,9367,8129,2.64,TX,Black or African-American,14449
2175,Akron,Ohio,38.1,96886,100667,197553,12878,10024,2.24,OH,Asian,9033


In [10]:
# Let's check if the combination city + state + race is unique: the largest
# group for that combination must contain exactly one row.

rows_per_key = us_cities.groupby(['City', 'State', 'Race']).size()
largest_group = rows_per_key.sort_values(ascending=False).iloc[0]
assert largest_group == 1