In [None]:
%config IPCompleter.greedy=True

import os
import os.path
import sys

import pandas as pd
import psycopg2
import yaml

# Locations used by the rest of the notebook.
# home_dir is the working directory at the time this cell runs;
# credential_dir is a path relative to it (two levels up, then config/).
home_dir = os.getcwd()
credential_dir = '../../config'

def create_pgconn(credentials_yaml):
    """Open a psycopg2 connection described by a YAML credentials file.

    Parameters
    ----------
    credentials_yaml : str
        Path to a YAML file providing the keys ``DB_name``, ``user``,
        ``host``, ``password`` and ``role``.

    Returns
    -------
    connection
        An open psycopg2 connection whose client encoding is ``latin_1`` and
        on which ``SET ROLE <role>`` has been executed and committed.

    Raises
    ------
    Exception
        Errors from opening/parsing the YAML file propagate as-is. Errors
        raised while reading the connection settings or connecting are
        re-raised unchanged after "Error connecting to db." is printed.
    """
    with open(credentials_yaml) as f:
        # safe_load only builds plain Python objects. A bare yaml.load(f) is
        # rejected by PyYAML >= 6 (Loader is mandatory) and can construct
        # arbitrary objects on older releases.
        configs = yaml.safe_load(f)
    try:
        # Keyword arguments let psycopg2 do the quoting, so a password that
        # contains quotes, spaces or backslashes no longer breaks the DSN.
        conn = psycopg2.connect(
            dbname=configs['DB_name'],
            user=configs['user'],
            host=configs['host'],
            password=configs['password'])
    except Exception:
        print("Error connecting to db.")
        raise  # bare raise keeps the original traceback
    conn.set_client_encoding('latin_1')
    # The role name is taken verbatim from the credentials file (trusted,
    # local configuration) and interpolated unquoted.
    with conn.cursor() as cur:
        cur.execute("SET ROLE " + configs['role'])
    # SET ROLE runs inside an implicit transaction; commit it so that a later
    # conn.rollback() by the caller does not silently revert the role.
    conn.commit()
    return conn

In [None]:
# Open the database connection shared by the cells below.
# The credentials file is looked up inside credential_dir; per the original
# note, an example of this file lives on the hitchikers repo.
creds_filename = 'db_creds.yml'
credentials_yaml = os.path.join(credential_dir, creds_filename)
conn = create_pgconn(credentials_yaml)

def sql(query, conn=conn, params=None):
    """Run ``query`` and return the result as a pandas DataFrame.

    Parameters
    ----------
    query : str
        SQL text to execute.
    conn : connection, optional
        Database connection to use. The default is the module-level ``conn``
        as it existed when this function was defined; if ``conn`` is rebound
        later (e.g. after a reconnect), pass the new connection explicitly or
        re-run this cell.
    params : sequence or mapping, optional
        Values bound to placeholders in ``query`` by the database driver.
        Prefer this over string formatting for any value that is not a
        hard-coded constant.

    Returns
    -------
    pandas.DataFrame
        The rows returned by the query.
    """
    return pd.read_sql(query, conn, params=params)

In [None]:
# Load the column-mapping file as one string. As parsed further down, each row
# holds a raw table name followed by the new column names in positional order.
# NOTE(review): the original note says the renaming relies on csvkit's
# automatic column naming (a,b,c...aa,bb,cc...aaa,bbb,...), but the mapping
# code below builds names of the form g0, g1, ... -- confirm which is current.
column_mapping_path = '../../../../garfield/1_column_mapping.csv'
with open(column_mapping_path, 'r') as fil:
    col_file = fil.read()
# Lowercase ASCII letters 'a'..'z', in order.
alphabet = [chr(code) for code in range(ord('a'), ord('z') + 1)]


def multi_alphabet(n_times):
    """Return repeated-letter column names: a..z, aa..zz, aaa..zzz, ...

    Every letter of ``alphabet`` is repeated 1, 2, ..., ``n_times`` times, so
    the result has ``26 * n_times`` entries ordered by repetition count first
    and by letter second. ``n_times <= 0`` yields an empty list.
    """
    return [
        letter * repeat
        for repeat in range(1, n_times + 1)
        for letter in alphabet
    ]
# Build, for every raw table listed in the mapping file:
#   col_map[table]['old'] -> source column names (g<position>)
#   col_map[table]['new'] -> the new names they are renamed to
# and `union`, the set of every new column name across all tables.
old_cols = multi_alphabet(12)  # not used below; the g<idx> naming replaced it
col_map = dict()
all_new_cols = []
# splitlines() handles both \n and \r\n endings, so no '\r' leaks into the
# last column name of a row.
for pair in col_file.splitlines():
    if not pair.strip():
        # A blank row (typically the trailing newline) used to create an
        # entry keyed by '' that later failed on int('') in the insert cell.
        continue
    lst = pair.split(',')
    table_key = lst[0]
    old = []
    new = []
    # idx counts every position after the table name, including blank ones,
    # so g<idx> always refers to the column's position in the raw table.
    for idx, new_col in enumerate(lst[1:]):
        if new_col != '':
            old.append('g%d' % idx)
            new.append(new_col)
    col_map[table_key] = {'old': old, 'new': new}
    all_new_cols += new
union = set(all_new_cols)

In [None]:
# (Re)create preproc."1_joined" with a smallint `year` column plus one varchar
# column per mapped name in `union`.
cur = conn.cursor()
# sorted() gives the table the same column order on every run (iterating a set
# of strings directly varies between kernels). Building the list first also
# keeps the DDL valid when `union` is empty.
# Column names are interpolated unquoted, matching how the later insert/alter
# statements reference them.
column_defs = ['year smallint'] + ['%s varchar' % col_name for col_name in sorted(union)]
cmnd = ', '.join(column_defs)
cur.execute("""drop table if exists preproc."1_joined";""")
cur.execute("""create table if not exists preproc."1_joined" (%s);""" %cmnd)
conn.commit()

In [None]:
# Iterates over every table listed in col_map, copying its mapped source
# columns (cols['old'], i.e. g0, g1, ...) from schema raw into
# preproc."1_joined" under their new names (cols['new']). The year is taken
# from the last four characters of the table name.
# Everything is committed once at the end, so a failure part-way leaves the
# joined table untouched by this cell.
cur = conn.cursor()
for table_name, cols in col_map.items():
    print(table_name)
    if not cols['new']:
        # No mapped columns would render "(year, ) select ..., from ...",
        # which is invalid SQL; skip instead of aborting the whole load.
        print('Skipping %r: no mapped columns' % table_name)
        continue
    year = table_name[-4:]
    cur.execute("""insert into preproc."1_joined" (year, %s) select %s, %s from raw."%s";""" %(','.join(cols['new']), int(year), ','.join(cols['old']), table_name))
conn.commit()

In [None]:
# Read the desired SQL type for each new column: each row is
# "<column>,<type>", and a missing or empty type means varchar.
# Note: this rebinds col_file, which an earlier cell used for the mapping CSV.
with open('../../../../garfield/1_column_mapping_types.csv', 'r') as fil:
    col_file = fil.read()
col_types = dict()
for pair in col_file.splitlines():
    if not pair.strip():
        # Blank rows (e.g. the trailing newline) used to raise IndexError.
        continue
    split = pair.split(',')
    # A row without a comma has no type field at all; treat it like an empty one.
    if len(split) > 1 and split[1] != '':
        col_types[split[0]] = split[1]
    else:
        col_types[split[0]] = 'varchar'

In [None]:
# Changing the column types and removing empty strings.
# Each ALTER is committed on its own: a failing column is rolled back and
# reported, without discarding the conversions that already succeeded and
# without leaving the connection stuck in an aborted transaction.
cur = conn.cursor()
for col, col_typ in col_types.items():
    print(col)
    if col not in union:
        print('Skipped %s: not among the mapped columns' % col)
        continue
    try:
        if col_typ == 'bool':
            # Values other than the four listed fall through the CASE and
            # become NULL.
            cur.execute("""ALTER TABLE preproc."1_joined" ALTER COLUMN %s TYPE bool
        USING CASE %s 
        WHEN 'Sí' THEN '1'::bool
        WHEN 'No' THEN '0'::bool
        WHEN '1' then '1'::bool
        WHEN '0' then '0'::bool
        END;""" %(col, col))
        elif 'int' in col_typ:
            # Cast through numeric so text such as '3.0' converts to an integer type.
            cur.execute("""ALTER TABLE preproc."1_joined" ALTER COLUMN %s TYPE %s using NULLIF(%s, '')::numeric::%s;""" %(col, col_typ, col, col_typ))
        else:
            cur.execute("""ALTER TABLE preproc."1_joined" ALTER COLUMN %s TYPE %s using NULLIF(%s, '')::%s;""" %(col, col_typ, col, col_typ))
        conn.commit()
    except Exception as e:
        print('Failed because: %s' %e)
        if conn.closed:
            # The connection itself is gone: open a fresh one, as before.
            conn = create_pgconn(credentials_yaml)
            cur = conn.cursor()
        else:
            # Usual case (bad cast, unknown type, ...): undo only this ALTER.
            conn.rollback()

In [None]:
# Load the raw 2016 school-census CSV as a list of lines for inspection.
# NOTE(review): absolute, machine-specific path -- consider deriving it from a
# configurable data directory.
with open('/mnt/data/projects/el_salvador_mined_education/garfield/1._Censo_matricular_-_Centros_educativos/Base_de_Centros_Escolares_Censo_2016.csv') as fil:
    read = list(fil)

In [None]:
# Line index 8 of the file is taken as the header line
# (presumably preceded by preamble rows -- TODO confirm).
header_line_idx = 8
headers = read[header_line_idx]

In [None]:
# Show the header field that sits 22nd from the end ('^'-delimited).
header_fields = headers.split('^')
header_fields[-22]

In [None]:
# Show the same field position (22nd from the end) for the line at index 16.
sample_fields = read[16].split('^')
sample_fields[-22]