# Read in CSV files to DB

## Monkey patching pandas sql IO

There is an existing issue with pandas that limits its ability to insert multiple rows at a time. With the default of one row at a time, this load would take far too long. See the link below for more about the issue.

[Pandas to_sql issue link.](https://github.com/pandas-dev/pandas/issues/8953)

Thanks to GitHub user `nhockham` for suggesting the monkey patch below.

In [None]:
from pandas.io.sql import SQLTable

def _execute_insert(self, conn, keys, data_iter):
    """Insert all rows of ``data_iter`` with a single multi-row INSERT.

    Intended as a replacement for ``SQLTable._execute_insert``.

    Args:
        conn: Object whose ``execute`` method runs the built statement.
        keys: Column names, in the same order as the values in each row.
        data_iter: Iterable of rows; each row is a sequence of values.
    """
    # Print one dot per call as a progress marker.
    print('.', end='')
    rows = [dict(zip(keys, row)) for row in data_iter]
    conn.execute(self.insert_statement().values(rows))


# Install the replacement on pandas' SQLTable class.
SQLTable._execute_insert = _execute_insert

In [None]:
from getpass import getpass, getuser
from os import listdir
from os.path import join
from urllib.parse import quote

import pandas as pd
import psycopg2
from sqlalchemy import create_engine

In [None]:
# NOTE(review): `df` is not assigned in any cell above this one, so this preview
# only works after one of the later cells that assigns `df` has been run.
df.head()

In [None]:
# Load every CSV in ../data/ into its own table in the assessor_data schema.
csv_files = [name for name in listdir('../data/') if name.endswith('.csv')]

u = input('Database user:')
p = getpass('Input database password')
# Percent-encode the credentials: a raw '@', ':' or '/' in the user name or
# password would otherwise corrupt the connection URL.
engine_string = 'postgresql://{0}:{1}@handelstaccato.homenet.org:5432/king_county'.format(
    quote(u, safe=''), quote(p, safe='')
)
engine = create_engine(engine_string)

for csv_file in csv_files:
    # The table name is the file name up to its first '.'.
    table_name = csv_file.split('.')[0]
    df = pd.read_csv(join('../data', csv_file), quotechar='"', encoding='latin1')
    df.to_sql(table_name, engine, schema='assessor_data', index=False, chunksize=1000)
    # Just in case. I've been hurt too many times.
    print('"Finished"', table_name)
    



In [None]:
# Load every CSV in ../data/seattle_open_data/ into the extra_info schema.
open_data_csvs = [
    name for name in listdir('../data/seattle_open_data/') if name.endswith('.csv')
]

u = input('Database user:')
p = getpass('Input database password')
# Percent-encode the credentials: a raw '@', ':' or '/' in the user name or
# password would otherwise corrupt the connection URL.
engine_string = 'postgresql://{0}:{1}@handelstaccato.homenet.org:5432/king_county'.format(
    quote(u, safe=''), quote(p, safe='')
)
engine = create_engine(engine_string)

for csv_file in open_data_csvs:
    # The table name is the file name up to its first '.'.
    table_name = csv_file.split('.')[0]
    df = pd.read_csv(join('../data/seattle_open_data/', csv_file))
    try:
        # Drop the first and last characters of Shape (presumably enclosing
        # parentheses -- confirm against the data), split on ', ' and expose
        # the first two pieces as float columns 'lat' and 'long'.
        coords = (
            df.Shape.str[1:-1].str.split(', ').apply(pd.Series).astype(float)
            .rename(columns={0: 'lat', 1: 'long'})
        )
    except AttributeError:
        # No Shape column (or it is not string-typed): load the file as-is.
        pass
    else:
        # Keep ALL rows. A stray .head() here used to truncate the frame to
        # five rows before it was written to the database.
        df = pd.concat([df, coords], axis=1)
    try:
        df.to_sql(table_name, engine, schema='extra_info', index=False, chunksize=1000)
    except ValueError:
        # to_sql raises ValueError when the table already exists
        # (if_exists defaults to 'fail').
        print('Already in DB')
    except Exception as err:
        # Stay best-effort across files, but do not mislabel other failures.
        print('Failed to load', table_name, '-', err)
    # Just in case. I've been hurt too many times.
    print('"Finished"', table_name)


## One-offs

Issues discovered in the data that were easier to fix in Python than in SQL.

In [None]:
# One-off reload of RPSale with cleaned, integer Major/Minor ids.
csv_file = '/Users/glpeterkin/codespace/python/king_county_property/data/RPSale.csv'

table_name = 'RPSale'
# csv_file is an absolute path, so it is read directly; joining it onto
# '../data' was a no-op because os.path.join discards the leading part.
df = pd.read_csv(csv_file, quotechar='"', encoding='latin1')

# Drop rows whose Major or Minor is null or blank.
df.Major = df.Major.str.strip()
df.Minor = df.Minor.str.strip()

has_null_ids = (
    df.Major.isnull() | df.Minor.isnull() | (df.Major == '') | (df.Minor == '')
)
df.drop(df[has_null_ids].index, inplace=True)

# Convert the remaining ids to integers (vectorised over the whole column).
df.Major = pd.to_numeric(df.Major).astype(int)
df.Minor = pd.to_numeric(df.Minor).astype(int)

print(df.dtypes)

u = input('Database user:')
p = getpass('Input database password')
# Percent-encode the credentials: a raw '@', ':' or '/' in the user name or
# password would otherwise corrupt the connection URL.
engine_string = 'postgresql://{0}:{1}@handelstaccato.homenet.org:5432/king_county'.format(
    quote(u, safe=''), quote(p, safe='')
)
engine = create_engine(engine_string)

df.to_sql(table_name, engine, schema='assessor_data', index=False, chunksize=1000)

In [None]:
# One-off reload of Landmarks with lat/long columns derived from Shape.
filename = '/Users/glpeterkin/codespace/python/king_county_property/data/seattle_open_data/Landmarks.csv'

table_name = 'Landmarks'

df = pd.read_csv(filename)

# Drop the first and last characters of Shape (presumably enclosing
# parentheses -- confirm against the data), split on ', ' and expose the
# first two pieces as float columns 'lat' and 'long'.
coords = (
    df.Shape.str[1:-1].str.split(', ').apply(pd.Series).astype(float)
    .rename(columns={0: 'lat', 1: 'long'})
)
df = pd.concat([df, coords], axis=1)

u = input('Database user:')
p = getpass('Input database password')
# Percent-encode the credentials: a raw '@', ':' or '/' in the user name or
# password would otherwise corrupt the connection URL.
engine_string = 'postgresql://{0}:{1}@handelstaccato.homenet.org:5432/king_county'.format(
    quote(u, safe=''), quote(p, safe='')
)
engine = create_engine(engine_string)

df.to_sql(table_name, engine, schema='extra_info', index=False, chunksize=1000)


In [None]:
# Fixing issue with public and private schools only having ...
# TODO: this note was left unfinished; complete the description of the issue.

In [None]:
# Update residential with a pin column and string major/minor columns so that
# joins between tables are easier.

# Data in

u = input('Database user:')
p = getpass('Input database password')
# Percent-encode the credentials: a raw '@', ':' or '/' in the user name or
# password would otherwise corrupt the connection URL.
engine_string = 'postgresql://{0}:{1}@handelstaccato.homenet.org:5432/king_county'.format(
    quote(u, safe=''), quote(p, safe='')
)
engine = create_engine(engine_string)

residential_q = 'SELECT * FROM project."residential";'

# Pull the whole residential table into memory.
parcel = pd.read_sql(residential_q, engine)

In [None]:
# Create a new pin string column

def add_pin_col(df):
    """Add zero-padded ``major_str``, ``minor_str`` and ``pin`` columns.

    ``Major`` is converted to text and left-padded with zeros to 6
    characters, ``Minor`` to 4 characters; ``pin`` is the two padded
    strings concatenated.

    Args:
        df: DataFrame with ``Major`` and ``Minor`` columns. It is modified
            in place.

    Returns:
        The same DataFrame object, with the three columns added.
    """
    # Series.str.zfill pads the whole column without a per-row Python lambda.
    df['major_str'] = df.Major.astype(str).str.zfill(6)
    df['minor_str'] = df.Minor.astype(str).str.zfill(4)
    df['pin'] = df['major_str'] + df['minor_str']
    return df

# Add the major_str, minor_str and pin columns to the residential rows.
# add_pin_col modifies the frame in place and returns that same frame.
parcel = add_pin_col(parcel)

In [None]:
# Write the frame with the added pin columns to project.parcel_new.
u = input('Database user:')
p = getpass('Input database password')
# Percent-encode the credentials: a raw '@', ':' or '/' in the user name or
# password would otherwise corrupt the connection URL.
engine_string = 'postgresql://{0}:{1}@handelstaccato.homenet.org:5432/king_county'.format(
    quote(u, safe=''), quote(p, safe='')
)
engine = create_engine(engine_string)

parcel.to_sql('parcel_new', engine, schema='project', index=False, chunksize=5000)