# Import Tables

TO PREPARE:
 - The .csv files referenced below are STRAIGHT exports from the Socrata Files. 
 - Each Socrata file is linked below. 

 - The 311 data .csv import implies that you downloaded a monolithic 311 file from Socrata and then ran bash "split" on the file. 
 - Once the split files were obtained, you then further processed them to `cat columns.311 [split_file] > [split_file]_c` to obtain properly "headered" split files. 

TO DO: 
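 - Please note that the guess_sqlcol helper below sizes every varchar field at MAX_COLUMN_LENGTH characters; columns listed in truncate_columns are trimmed to that length before upload to cope with large description fields in some data tables (specifically HPD Violations NOVDescription)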
 - Need to clean up the field names for 311

NOTES: 
 - Far below are some random SQL SELECT statements
 - Far below are SQL statements for creating table indices
 - Questions: jpf321@gmail.com slack: jpfreeley

# Import desired libraries

In [45]:
import pandas as pd
import mysql.connector
import sqlalchemy
from sqlalchemy import create_engine
import datetime
import pickle
import logging

# Root folder that holds the downloaded data files and the import log.
# NOTE: keep the trailing slash -- every path in this notebook is built by
# plain string concatenation onto BASE_DIR.
BASE_DIR = '/Users/jfreeley/Desktop/HeatSeek/'

LOG_FILE = BASE_DIR + 'db_import.log'

logging.basicConfig(
    format='[%(asctime)s] {%(pathname)s:%(lineno)d} %(levelname)s - %(message)s',
    datefmt='%H:%M:%S',
    filename=LOG_FILE,
    level=logging.INFO)

log = logging.getLogger(__name__)

# Build the message once and send it to both the notebook output and the log.
startup_message = "This notebook will log to {}".format(LOG_FILE)
# The parenthesised single-argument form prints the same text under
# Python 2 and Python 3 (the bare `print "..."` statement is Python 2 only).
print(startup_message)
log.info(startup_message)

This notebook will log to /Users/jfreeley/Desktop/HeatSeek/db_import.log


# Initialize connection to mySQL DB (AWS or localhost)

In [46]:
### AWS
### Credentials are deliberately NOT kept in this notebook. To target the AWS
### RDS instance, export the full SQLAlchemy URL before starting Jupyter, e.g.
###   export HEATSEEK_DB_URL='mysql+mysqlconnector://<user>:<password>@hsdb.cjjva3uq32na.us-west-2.rds.amazonaws.com:3306/heatseek'

### LOCALHOST
### INSTALL ON MAC
### brew update
### brew doctor
### brew upgrade
### brew install mysql
### brew services start mysql

import os

# Falls back to the local MySQL instance when HEATSEEK_DB_URL is not set.
DB_URL = os.environ.get('HEATSEEK_DB_URL',
                        'mysql+mysqlconnector://root@localhost/heatseek')

engine = create_engine(DB_URL, echo=False)

# Helper functions

In [47]:
# Length used for every NVARCHAR column created by guess_sqlcol().
MAX_COLUMN_LENGTH = 255


def guess_sqlcol(dfparam):
    """Guess a SQLAlchemy column type for each DataFrame column.

    The guess is made from the textual name of the column's pandas dtype.
    Columns whose dtype name matches none of the rules are left out of the
    result.

    Returns a dict mapping column name -> SQLAlchemy type instance.
    """
    # (substring of the dtype name, factory for the SQL type).
    # Every rule is tried for every column, in this order, and a later match
    # overwrites an earlier one.
    type_rules = (
        # text columns
        ("object", lambda: sqlalchemy.types.NVARCHAR(length=MAX_COLUMN_LENGTH)),
        ("datetime", lambda: sqlalchemy.types.DateTime()),
        # big precision for LAT/LONG fields
        ("float", lambda: sqlalchemy.types.Float(precision=20, asdecimal=True)),
        ("int", lambda: sqlalchemy.types.INT()),
    )

    sql_types = {}
    for column, dtype in zip(dfparam.columns, dfparam.dtypes):
        dtype_name = str(dtype)
        for token, make_type in type_rules:
            if token in dtype_name:
                sql_types[column] = make_type()

    return sql_types


def _clean_column_name(name):
    """Turn a raw CSV header into the lower-case, SQL-friendly column name."""
    return name.lower().replace(" ", "_").replace("'", "").replace("\xe2\x80\x99", "").\
        replace("#", "num").replace("&", "and").replace("(", "").\
        replace(")", "")


def hpd_csv2sql(description, input_csv_file, sep_char, output_pickle,
                table_name, dtype_dict, parse_dates, load_pickle,
                input_pickle, db_action, truncate_columns, date_time_columns,
                chunk_size, keep_cols):
    """Load one dataset (cached pickle or CSV export) and upload it to the DB.

    description       -- label used in the log messages
    input_csv_file    -- CSV path, read only when load_pickle is falsy
    sep_char          -- CSV field separator
    output_pickle     -- where the freshly read CSV is cached as a pickle
    table_name        -- destination SQL table
    dtype_dict        -- raw header -> pandas dtype, passed to pd.read_csv
    parse_dates       -- raw headers for pd.read_csv to parse as dates;
                         '' / None / [] disables date parsing
    load_pickle       -- truthy: load input_pickle instead of reading the CSV
    input_pickle      -- pickle path used when load_pickle is truthy
    db_action         -- 'replace' replaces the table; anything else appends
    truncate_columns  -- text columns to trim to MAX_COLUMN_LENGTH characters
    date_time_columns -- columns to convert with pd.to_datetime
    chunk_size        -- rows per batch for DataFrame.to_sql
    keep_cols         -- columns to upload

    truncate_columns, date_time_columns and keep_cols may be given either as
    raw CSV headers or as already-cleaned names; both are normalised the same
    way as the DataFrame's own headers. '' or None means "no columns" for the
    first two.

    Uses the module-level `engine`, `log`, `guess_sqlcol` and
    `MAX_COLUMN_LENGTH`.
    """
    log.info("Beginning {} Import {}".format(description, datetime.datetime.now()))

    if load_pickle:
        log.info("Flagged load of PICKLE: {} = True".format(input_pickle))

        # Pickles are binary data, so open in 'rb' (text mode breaks on
        # Python 3 and on platforms that translate newlines).
        # NOTE: unpickling can execute arbitrary code -- only load pickle
        # files that this notebook wrote itself.
        with open(input_pickle, 'rb') as picklefile:
            log.info("Begin OPEN {} Pickle: {}".format(input_pickle, datetime.datetime.now()))
            log.info("Great we have a pickle file...Loading from {}".format(input_pickle))
            df = pickle.load(picklefile)

    else:
        log.info("Reading CSV from {} .. This may take a while...".format(input_csv_file))

        # read_csv raises on a missing file by itself, so no separate open()
        # is needed. `or False` turns ''/None/[] into "do not parse dates".
        df = pd.read_csv(input_csv_file, sep=sep_char, dtype=dtype_dict,
                         parse_dates=parse_dates or False)

        log.info("Why don't we save {} for next time".format(output_pickle))

        with open(output_pickle, 'wb') as picklefile:
            log.info("Begin {} Pickle: {}".format(description, datetime.datetime.now()))
            pickle.dump(df, picklefile)

    ## CLEAN COLUMN NAMES
    df.columns = [_clean_column_name(col) for col in df.columns]

    # Cleaning is idempotent, so already-cleaned names pass through unchanged
    # while raw headers (e.g. 'Resolution Description') are mapped onto the
    # cleaned DataFrame columns instead of raising KeyError.
    keep_cols = [_clean_column_name(col) for col in keep_cols]
    truncate_columns = [_clean_column_name(col) for col in (truncate_columns or [])]
    date_time_columns = [_clean_column_name(col) for col in (date_time_columns or [])]

    ## KEEP ONLY THE COLUMNS OF INTEREST
    df = df[keep_cols]

    ## TRIM COLUMN DATA TO MAX_LENGTH
    for col in truncate_columns:
        df[col] = df[col].str[:MAX_COLUMN_LENGTH]

    ## CONVERT DATETIME COLS TO DATETIME
    for col in date_time_columns:
        df[col] = pd.to_datetime(df[col])

    log.info("Let's now try to send it to the DB")
    outputdict = guess_sqlcol(df)  # Guess at SQL columns based on DF dtypes

    # The timestamp argument was previously dropped because the message had
    # no placeholder for it.
    log.info("Begin Upload {} SQL {}".format(description, datetime.datetime.now()))
    log.info("Let's see if we should replace or append our table ...")

    # Anything other than an explicit 'replace' falls back to 'append'.
    action = 'replace' if db_action == 'replace' else 'append'

    log.info("We're going with db_action = {}".format(action))
    log.info("Sending our df to {}".format(table_name))
    df.to_sql(name=table_name, con=engine, if_exists=action,
              index=False, chunksize=chunk_size, dtype=outputdict)

    log.info("Completed {} Import {}".format(description, datetime.datetime.now()))
    log.info("Imported: {} rows".format(df.shape[0]))

#%load_ext sql
#%sql postgresql://jfreeley@localhost:5432/inspections

## HPD Violations
https://data.cityofnewyork.us/Housing-Development/Housing-Maintenance-Code-Violations/wvxf-dwi5

In [70]:
vio_dtype_dict = {
'ViolationID':                'int64',
'BuildingID':                 'int64',
'RegistrationID':             'int64',
'BoroID':                     'int64',
'Boro':                      'object',
'HouseNumber':               'object',
'LowHouseNumber':            'object',
'HighHouseNumber':           'object',
'StreetName':                'object',
'StreetCode':                 'int64',
'Zip':                      'float64',
'Apartment':                 'object',
'Story':                     'object',
'Block':                      'int64',
'Lot':                        'int64',
'Class':                     'object',
'InspectionDate':            'object',
'ApprovedDate':              'object',
'OriginalCertifyByDate':     'object',
'OriginalCorrectByDate':     'object',
'NewCertifyByDate':          'object',
'NewCorrectByDate':          'object',
'CertifiedDate':             'object',
'OrderNumber':               'object',
'NOVID':                    'float64',
'NOVDescription':            'object',
'NOVIssuedDate':             'object',
'CurrentStatusID':            'int64',
'CurrentStatus':             'object',
'CurrentStatusDate':         'object'
}    

vio_parse_dates = ['InspectionDate',
'ApprovedDate',
'OriginalCertifyByDate',
'OriginalCorrectByDate',
'NewCertifyByDate',
'NewCorrectByDate',
'CertifiedDate',
'NOVIssuedDate',
'CurrentStatusDate'] 

vio_date_time_columns = ['inspectiondate',
'approveddate',
'originalcertifybydate',
'originalcorrectbydate',
'newcertifybydate',
'newcorrectbydate',
'certifieddate',
'novissueddate',
'currentstatusdate'] 
    
vio_df_keep_cols = [
    'violationid',
    'buildingid',
    'registrationid',
    'boroid',
    'boro',
    'housenumber',
    'lowhousenumber',
    'highhousenumber',
    'streetname',
    'streetcode',
    'zip',
    'apartment',
    'story',
    'block',
    'lot',
    'class',
    'inspectiondate',
    'approveddate',
    'originalcertifybydate',
    'originalcorrectbydate',
    'newcertifybydate',
    'newcorrectbydate',
    'certifieddate',
    'ordernumber',
    'novid',
    'novdescription',
    'novissueddate',
    'currentstatusid',
    'currentstatus',
    'currentstatusdate'
]
# Settings for the HPD Violations import.
vio_description = "HPD Violations"
vio_table_name = 'hpd_violations'

# Source CSV and its pickle cache live side by side in the same folder.
vio_data_dir = BASE_DIR + 'HPD/Data Files/Violations/'
vio_input_csv_file = vio_data_dir + 'Housing_Maintenance_Code_Violations.csv'
vio_sep_char = ","
vio_input_pickle = vio_data_dir + 'df_violations.pkl'
vio_output_pickle = vio_data_dir + 'df_violations.pkl'

vio_load_pickle = True      # attempt to load vio_input_pickle rather than the CSV
vio_db_action = 'replace'   # anything other than 'replace' means 'append'
vio_truncate_columns = ['novdescription']
vio_chunk_size = 5000

hpd_csv2sql(
    description=vio_description,
    input_csv_file=vio_input_csv_file,
    sep_char=vio_sep_char,
    output_pickle=vio_output_pickle,
    table_name=vio_table_name,
    dtype_dict=vio_dtype_dict,
    parse_dates=vio_parse_dates,
    load_pickle=vio_load_pickle,
    input_pickle=vio_input_pickle,
    db_action=vio_db_action,
    truncate_columns=vio_truncate_columns,
    date_time_columns=vio_date_time_columns,
    chunk_size=vio_chunk_size,
    keep_cols=vio_df_keep_cols,
)

## HPD Buildings 
https://data.cityofnewyork.us/Housing-Development/Buildings-Subject-to-HPD-Jurisdiction/kj4p-ruqc

In [50]:
# pandas dtype for each raw CSV header of the HPD Buildings export.
bld_dtype_dict = {
    'BuildingID': 'int64',
    'BoroID': 'int64',
    'Boro': 'object',
    'HouseNumber': 'object',
    'LowHouseNumber': 'object',
    'HighHouseNumber': 'object',
    'StreetName': 'object',
    'Zip': 'object',
    'Block': 'int64',
    'Lot': 'int64',
    'BIN': 'float64',
    'CommunityBoard': 'int64',
    'CensusTract': 'float64',
    'ManagementProgram': 'object',
    'DoBBuildingClassID': 'float64',
    'DoBBuildingClass': 'object',
    'LegalStories': 'float64',
    'LegalClassA': 'float64',
    'LegalClassB': 'float64',
    'RegistrationID': 'int64',
    'LifeCycle': 'object',
    'RecordStatusID': 'int64',
    'RecordStatus': 'object',
}

# Cleaned (lower-case) names of the columns that are uploaded.
bld_df_keep_cols = [
    'buildingid', 'boroid', 'boro',
    'housenumber', 'lowhousenumber', 'highhousenumber',
    'streetname', 'zip', 'block', 'lot', 'bin',
    'communityboard', 'censustract', 'managementprogram',
    'dobbuildingclassid', 'dobbuildingclass',
    'legalstories', 'legalclassa', 'legalclassb',
    'registrationid', 'lifecycle',
    'recordstatusid', 'recordstatus',
]

# Settings for the HPD Buildings import. This dataset has no date columns.
bld_parse_dates = None

bld_description = "HPD Buildings"
bld_input_csv_file = BASE_DIR + 'HPD/Data Files/Buildings/Buildings_Subject_to_HPD_Jurisdiction.csv'
bld_sep_char = ","
bld_output_pickle = BASE_DIR + 'HPD/Data Files/Buildings/df_buildings.pkl'
bld_table_name = 'hpd_buildings'
bld_load_pickle = True      # attempt to load bld_input_pickle rather than the CSV
bld_input_pickle = BASE_DIR + 'HPD/Data Files/Buildings/df_buildings.pkl'
bld_db_action = 'replace'   # anything other than 'replace' means 'append'
bld_truncate_columns = []   # nothing to truncate
bld_date_time_columns = []  # nothing to convert
bld_chunk_size = 5000

# Pass the settings defined above instead of repeating literal True/'replace',
# so editing bld_load_pickle / bld_db_action actually takes effect.
hpd_csv2sql(
    description=bld_description,
    input_csv_file=bld_input_csv_file,
    sep_char=bld_sep_char,
    output_pickle=bld_output_pickle,
    table_name=bld_table_name,
    dtype_dict=bld_dtype_dict,
    parse_dates=bld_parse_dates,
    load_pickle=bld_load_pickle,
    input_pickle=bld_input_pickle,
    db_action=bld_db_action,
    truncate_columns=bld_truncate_columns,
    date_time_columns=bld_date_time_columns,
    chunk_size=bld_chunk_size,
    keep_cols=bld_df_keep_cols,
)

## HPD Complaints
https://data.cityofnewyork.us/Housing-Development/Housing-Maintenance-Code-Complaints/uwyv-629c

In [55]:
# pandas dtype for each raw CSV header of the HPD Complaints export.
cmp_dtype_dict = {
    'ComplaintID': 'int64',
    'BuildingID': 'int64',
    'BoroughID': 'int64',
    'Borough': 'object',
    'HouseNumber': 'object',
    'StreetName': 'object',
    'Zip': 'float64',
    'Block': 'int64',
    'Lot': 'int64',
    'Apartment': 'object',
    'CommunityBoard': 'int64',
    'ReceivedDate': 'object',
    'StatusID': 'int64',
    'Status': 'object',
    'StatusDate': 'object',
}

# Cleaned (lower-case) names of the columns that are uploaded.
cmp_df_keep_cols = [
    'complaintid', 'buildingid',
    'boroughid', 'borough',
    'housenumber', 'streetname', 'zip',
    'block', 'lot', 'apartment',
    'communityboard', 'receiveddate',
    'statusid', 'status', 'statusdate',
]

# Settings for the HPD Complaints import.
# parse_dates uses the raw CSV headers; date_time_columns uses cleaned names.
cmp_parse_dates = ['StatusDate', 'ReceivedDate']
cmp_date_time_columns = ['statusdate', 'receiveddate']

cmp_truncate_columns = []   # nothing to truncate

cmp_description = "HPD Complaints"
cmp_input_csv_file = BASE_DIR + 'HPD/Data Files/Complaints/Housing_Maintenance_Code_Complaints.csv'
cmp_sep_char = ","
cmp_output_pickle = BASE_DIR + 'HPD/Data Files/Complaints/df_complaints.pkl'
cmp_table_name = 'hpd_complaints'
cmp_load_pickle = True      # attempt to load cmp_input_pickle rather than the CSV
cmp_input_pickle = BASE_DIR + 'HPD/Data Files/Complaints/df_complaints.pkl'
cmp_db_action = 'replace'   # anything other than 'replace' means 'append'
cmp_chunk_size = 5000

# Pass the settings defined above instead of repeating literal True/'replace',
# so editing cmp_load_pickle / cmp_db_action actually takes effect.
hpd_csv2sql(
    description=cmp_description,
    input_csv_file=cmp_input_csv_file,
    sep_char=cmp_sep_char,
    output_pickle=cmp_output_pickle,
    table_name=cmp_table_name,
    dtype_dict=cmp_dtype_dict,
    parse_dates=cmp_parse_dates,
    load_pickle=cmp_load_pickle,
    input_pickle=cmp_input_pickle,
    db_action=cmp_db_action,
    truncate_columns=cmp_truncate_columns,
    date_time_columns=cmp_date_time_columns,
    chunk_size=cmp_chunk_size,
    keep_cols=cmp_df_keep_cols,
)

## HPD Complaint - Problems
https://data.cityofnewyork.us/Housing-Development/Complaint-Problems/a2nx-4u46

In [59]:
# pandas dtype for each raw CSV header of the HPD Complaint Problems export.
cpb_dtype_dict = {
    'ProblemID': 'int64',
    'ComplaintID': 'int64',
    'UnitTypeID': 'int64',
    'UnitType': 'object',
    'SpaceTypeID': 'int64',
    'SpaceType': 'object',
    'TypeID': 'int64',
    'Type': 'object',
    'MajorCategoryID': 'int64',
    'MajorCategory': 'object',
    'MinorCategoryID': 'int64',
    'MinorCategory': 'object',
    'CodeID': 'int64',
    'Code': 'object',
    'StatusID': 'int64',
    'Status': 'object',
    'StatusDate': 'object',
    'StatusDescription': 'object',
}

# The single date column: raw CSV header first, cleaned name second.
cpb_parse_dates = ['StatusDate']
cpb_date_time_columns = ['statusdate']

# Cleaned (lower-case) names of the columns that are uploaded.
cpb_df_keep_cols = [
    'problemid', 'complaintid',
    'unittypeid', 'unittype',
    'spacetypeid', 'spacetype',
    'typeid', 'type',
    'majorcategoryid', 'majorcategory',
    'minorcategoryid', 'minorcategory',
    'codeid', 'code',
    'statusid', 'status', 'statusdate', 'statusdescription',
]


# Settings for the HPD Complaint Problems import.
cpb_description = "HPD ComplaintProblems"
cpb_input_csv_file = BASE_DIR + 'HPD/Data Files/Complaints/Complaint_Problems.csv'
cpb_sep_char = ","
cpb_output_pickle = BASE_DIR + 'HPD/Data Files/Complaints/df_prob.pkl'
cpb_table_name = 'hpd_complaintsProb'
cpb_load_pickle = True      # attempt to load cpb_input_pickle rather than the CSV
cpb_input_pickle = BASE_DIR + 'HPD/Data Files/Complaints/df_prob.pkl'
# No trailing comma here: `'replace',` would create the tuple ('replace',),
# which does not compare equal to 'replace'.
cpb_db_action = 'replace'   # anything other than 'replace' means 'append'
cpb_chunk_size = 5000
cpb_truncate_columns = ['statusdescription']

# Pass the settings defined above instead of repeating literal True/'replace',
# so editing cpb_load_pickle / cpb_db_action actually takes effect.
hpd_csv2sql(
    description=cpb_description,
    input_csv_file=cpb_input_csv_file,
    sep_char=cpb_sep_char,
    output_pickle=cpb_output_pickle,
    table_name=cpb_table_name,
    dtype_dict=cpb_dtype_dict,
    parse_dates=cpb_parse_dates,
    load_pickle=cpb_load_pickle,
    input_pickle=cpb_input_pickle,
    db_action=cpb_db_action,
    truncate_columns=cpb_truncate_columns,
    date_time_columns=cpb_date_time_columns,
    chunk_size=cpb_chunk_size,
    keep_cols=cpb_df_keep_cols,
)

## Registrations
https://data.cityofnewyork.us/Housing-Development/Multiple-Dwelling-Registrations/tesw-yqqr

In [60]:
# pandas dtype for each raw CSV header of the Multiple Dwelling Registrations export.
reg_dtype_dict = {
    'RegistrationID': 'int64',
    'BuildingID': 'int64',
    'BoroID': 'int64',
    'Boro': 'object',
    'HouseNumber': 'object',
    'LowHouseNumber': 'object',
    'HighHouseNumber': 'object',
    'StreetName': 'object',
    'StreetCode': 'int64',
    'Zip': 'float64',
    'Block': 'int64',
    'Lot': 'int64',
    'BIN': 'float64',
    'CommunityBoard': 'int64',
    'LastRegistrationDate': 'object',
    'RegistrationEndDate': 'object',
}

# Cleaned (lower-case) names of the columns that are uploaded.
reg_df_keep_cols = [
    'registrationid', 'buildingid',
    'boroid', 'boro',
    'housenumber', 'lowhousenumber', 'highhousenumber',
    'streetname', 'streetcode', 'zip',
    'block', 'lot', 'bin',
    'communityboard',
    'lastregistrationdate', 'registrationenddate',
]

# Settings for the HPD Registrations import.
# parse_dates uses the raw CSV headers; date_time_columns uses cleaned names.
reg_parse_dates = ['LastRegistrationDate', 'RegistrationEndDate']
reg_date_time_columns = ['lastregistrationdate', 'registrationenddate']
reg_truncate_columns = []   # nothing to truncate

reg_description = "HPD Registrations"
reg_input_csv_file = BASE_DIR + 'HPD/Data Files/Registrations/Multiple_Dwelling_Registrations.csv'
reg_sep_char = ","
reg_output_pickle = BASE_DIR + 'HPD/Data Files/Registrations/df_reg.pkl'
reg_table_name = 'hpd_registrations'
reg_load_pickle = True      # attempt to load reg_input_pickle rather than the CSV
reg_input_pickle = BASE_DIR + 'HPD/Data Files/Registrations/df_reg.pkl'
reg_db_action = 'replace'   # anything other than 'replace' means 'append'
reg_chunk_size = 5000

# Pass the settings defined above instead of repeating literal True/'replace',
# so editing reg_load_pickle / reg_db_action actually takes effect.
hpd_csv2sql(
    description=reg_description,
    input_csv_file=reg_input_csv_file,
    sep_char=reg_sep_char,
    output_pickle=reg_output_pickle,
    table_name=reg_table_name,
    dtype_dict=reg_dtype_dict,
    parse_dates=reg_parse_dates,
    load_pickle=reg_load_pickle,
    input_pickle=reg_input_pickle,
    db_action=reg_db_action,
    truncate_columns=reg_truncate_columns,
    date_time_columns=reg_date_time_columns,
    chunk_size=reg_chunk_size,
    keep_cols=reg_df_keep_cols,
)

## Registration Contacts
https://data.cityofnewyork.us/Housing-Development/Registration-Contacts/feu5-w2e2

In [61]:
# pandas dtype for each raw CSV header of the Registration Contacts export.
rcn_dtype_dict = {
    'RegistrationContactID': 'int64',
    'RegistrationID': 'int64',
    'Type': 'object',
    'ContactDescription': 'object',
    'CorporationName': 'object',
    'Title': 'object',
    'FirstName': 'object',
    'MiddleInitial': 'object',
    'LastName': 'object',
    'BusinessHouseNumber': 'object',
    'BusinessStreetName': 'object',
    'BusinessApartment': 'object',
    'BusinessCity': 'object',
    'BusinessState': 'object',
    'BusinessZip': 'object',
}

# Cleaned (lower-case) names of the columns that are uploaded.
rcn_df_keep_cols = [
    'registrationcontactid', 'registrationid',
    'type', 'contactdescription', 'corporationname',
    'title', 'firstname', 'middleinitial', 'lastname',
    'businesshousenumber', 'businessstreetname', 'businessapartment',
    'businesscity', 'businessstate', 'businesszip',
]

# Settings for the HPD Registration Contacts import.
# This dataset has no date columns and nothing to truncate.
rcn_truncate_columns = []

rcn_parse_dates = None
rcn_date_time_columns = []

rcn_description = "HPD RegistrationsContacts"
rcn_input_csv_file = BASE_DIR + 'HPD/Data Files/Registrations/Registration_Contacts.csv'
rcn_sep_char = ","
rcn_output_pickle = BASE_DIR + 'HPD/Data Files/Registrations/df_regCon.pkl'
rcn_table_name = 'hpd_registrationContact'
rcn_load_pickle = True      # attempt to load rcn_input_pickle rather than the CSV
rcn_input_pickle = BASE_DIR + 'HPD/Data Files/Registrations/df_regCon.pkl'
rcn_db_action = 'replace'   # anything other than 'replace' means 'append'
rcn_chunk_size = 5000

# Pass the settings defined above instead of repeating literal True/'replace',
# so editing rcn_load_pickle / rcn_db_action actually takes effect.
hpd_csv2sql(
    description=rcn_description,
    input_csv_file=rcn_input_csv_file,
    sep_char=rcn_sep_char,
    output_pickle=rcn_output_pickle,
    table_name=rcn_table_name,
    dtype_dict=rcn_dtype_dict,
    parse_dates=rcn_parse_dates,
    load_pickle=rcn_load_pickle,
    input_pickle=rcn_input_pickle,
    db_action=rcn_db_action,
    truncate_columns=rcn_truncate_columns,
    date_time_columns=rcn_date_time_columns,
    chunk_size=rcn_chunk_size,
    keep_cols=rcn_df_keep_cols,
)

## 311 Import
https://nycopendata.socrata.com/Social-Services/311-Service-Requests-from-2010-to-Present/erm2-nwe9

In [12]:
# pandas dtype for each raw CSV header of the 311 Service Requests export.
# Date columns are read as plain objects here and converted later.
call_311_dtype_dict = {
    'Unique Key': 'int64',
    'Created Date': 'object',
    'Closed Date': 'object',
    'Agency': 'object',
    'Agency Name': 'object',
    'Complaint Type': 'object',
    'Descriptor': 'object',
    'Location Type': 'object',
    'Incident Zip': 'object',
    'Incident Address': 'object',
    'Street Name': 'object',
    'Cross Street 1': 'object',
    'Cross Street 2': 'object',
    'Intersection Street 1': 'object',
    'Intersection Street 2': 'object',
    'Address Type': 'object',
    'City': 'object',
    'Landmark': 'object',
    'Facility Type': 'object',
    'Status': 'object',
    'Due Date': 'object',
    'Resolution Description': 'object',
    'Resolution Action Updated Date': 'object',
    'Community Board': 'object',
    'Borough': 'object',
    'X Coordinate (State Plane)': 'float64',
    'Y Coordinate (State Plane)': 'float64',
    'Park Facility Name': 'object',
    'Park Borough': 'object',
    'School Name': 'object',
    'School Number': 'object',
    'School Region': 'object',
    'School Code': 'object',
    'School Phone Number': 'object',
    'School Address': 'object',
    'School City': 'object',
    'School State': 'object',
    'School Zip': 'object',
    'School Not Found': 'object',
    'School or Citywide Complaint': 'float64',
    'Vehicle Type': 'object',
    'Taxi Company Borough': 'object',
    'Taxi Pick Up Location': 'object',
    'Bridge Highway Name': 'object',
    'Bridge Highway Direction': 'object',
    'Road Ramp': 'object',
    'Bridge Highway Segment': 'object',
    'Garage Lot Name': 'object',
    'Ferry Direction': 'object',
    'Ferry Terminal Name': 'object',
    'Latitude': 'float64',
    'Longitude': 'float64',
    'Location': 'object',
}

# Cleaned names (lower-case, spaces replaced by underscores) of the subset of
# columns that is uploaded.
call_311_df_keep_cols = [
    "unique_key", "created_date", "closed_date",
    "agency", "complaint_type", "descriptor",
    "incident_zip", "incident_address", "street_name",
    "cross_street_1", "cross_street_2",
    "intersection_street_1", "intersection_street_2",
    "city", "status", "due_date",
    "resolution_description", "resolution_action_updated_date",
    "borough", "latitude", "longitude", "location",
]

### NOTE: each "split" of 250K rows takes about 15min on a macbook air laptop over wifi. 

In [34]:
# First 311 split: (re)creates the call_311 table.
call_311_description = "311_xaa"
call_311_input_csv_file = BASE_DIR + '311/Data Files/2016_Jan1-Nov14/xaa_c'
call_311_input_pickle = call_311_input_csv_file + '.pkl'
call_311_output_pickle = call_311_input_csv_file + '.pkl'
call_311_sep_char = ","
call_311_table_name = "call_311"
call_311_load_pickle = True
call_311_db_action = 'replace' ## if not = 'replace' then 'append'
# Raw CSV headers: used by pd.read_csv, before the headers are cleaned.
# (Previously this name was never defined, so the call raised NameError.)
call_311_parse_dates = ['Created Date', 'Closed Date', 'Due Date', 'Resolution Action Updated Date']
# Cleaned names: hpd_csv2sql indexes these columns after lower-casing the
# headers and replacing spaces with underscores, so the raw headers raise KeyError.
call_311_truncate_columns = ['resolution_description']
call_311_date_time_columns = ['created_date', 'closed_date', 'due_date', 'resolution_action_updated_date']
call_311_chunk_size = 2500

hpd_csv2sql(
    description=call_311_description,
    input_csv_file=call_311_input_csv_file,
    sep_char=call_311_sep_char,
    output_pickle=call_311_output_pickle,
    table_name=call_311_table_name,
    dtype_dict=call_311_dtype_dict,
    parse_dates=call_311_parse_dates,
    load_pickle=call_311_load_pickle,
    input_pickle=call_311_input_pickle,
    db_action=call_311_db_action,
    truncate_columns=call_311_truncate_columns,
    date_time_columns=call_311_date_time_columns,
    chunk_size=call_311_chunk_size,
    keep_cols=call_311_df_keep_cols,
)


In [36]:
# 311 split "xab": appended to the existing call_311 table.
call_311_description = "311_xab"
call_311_input_csv_file = BASE_DIR + '311/Data Files/2016_Jan1-Nov14/xab_c'
call_311_input_pickle = call_311_input_csv_file + '.pkl'
call_311_output_pickle = call_311_input_csv_file + '.pkl'
call_311_sep_char = ","
call_311_table_name = "call_311"
call_311_load_pickle = False
call_311_db_action = 'append' ## if not = 'replace' then 'append'
# Raw CSV headers: used by pd.read_csv, before the headers are cleaned.
# (Previously this name was never defined, so the call raised NameError.)
call_311_parse_dates = ['Created Date', 'Closed Date', 'Due Date', 'Resolution Action Updated Date']
# Cleaned names: hpd_csv2sql indexes these columns after lower-casing the
# headers and replacing spaces with underscores, so the raw headers raise KeyError.
call_311_truncate_columns = ['resolution_description']
call_311_date_time_columns = ['created_date', 'closed_date', 'due_date', 'resolution_action_updated_date']
call_311_chunk_size = 2500

hpd_csv2sql(
    description=call_311_description,
    input_csv_file=call_311_input_csv_file,
    sep_char=call_311_sep_char,
    output_pickle=call_311_output_pickle,
    table_name=call_311_table_name,
    dtype_dict=call_311_dtype_dict,
    parse_dates=call_311_parse_dates,
    load_pickle=call_311_load_pickle,
    input_pickle=call_311_input_pickle,
    db_action=call_311_db_action,
    truncate_columns=call_311_truncate_columns,
    date_time_columns=call_311_date_time_columns,
    chunk_size=call_311_chunk_size,
    keep_cols=call_311_df_keep_cols,
)

In [37]:
# 311 split "xac": appended to the existing call_311 table.
call_311_description = "311_xac"
call_311_input_csv_file = BASE_DIR + '311/Data Files/2016_Jan1-Nov14/xac_c'
call_311_input_pickle = call_311_input_csv_file + '.pkl'
call_311_output_pickle = call_311_input_csv_file + '.pkl'
call_311_sep_char = ","
call_311_table_name = "call_311"
call_311_load_pickle = False
call_311_db_action = 'append' ## if not = 'replace' then 'append'
# Raw CSV headers: used by pd.read_csv, before the headers are cleaned.
# (Previously this name was never defined, so the call raised NameError.)
call_311_parse_dates = ['Created Date', 'Closed Date', 'Due Date', 'Resolution Action Updated Date']
# Cleaned names: hpd_csv2sql indexes these columns after lower-casing the
# headers and replacing spaces with underscores, so the raw headers raise KeyError.
call_311_truncate_columns = ['resolution_description']
call_311_date_time_columns = ['created_date', 'closed_date', 'due_date', 'resolution_action_updated_date']
call_311_chunk_size = 2500

hpd_csv2sql(
    description=call_311_description,
    input_csv_file=call_311_input_csv_file,
    sep_char=call_311_sep_char,
    output_pickle=call_311_output_pickle,
    table_name=call_311_table_name,
    dtype_dict=call_311_dtype_dict,
    parse_dates=call_311_parse_dates,
    load_pickle=call_311_load_pickle,
    input_pickle=call_311_input_pickle,
    db_action=call_311_db_action,
    truncate_columns=call_311_truncate_columns,
    date_time_columns=call_311_date_time_columns,
    chunk_size=call_311_chunk_size,
    keep_cols=call_311_df_keep_cols,
)

In [38]:
# 311 split "xad": appended to the existing call_311 table.
call_311_description = "311_xad"
call_311_input_csv_file = BASE_DIR + '311/Data Files/2016_Jan1-Nov14/xad_c'
call_311_input_pickle = call_311_input_csv_file + '.pkl'
call_311_output_pickle = call_311_input_csv_file + '.pkl'
call_311_sep_char = ","
call_311_table_name = "call_311"
call_311_load_pickle = False
call_311_db_action = 'append' ## if not = 'replace' then 'append'
# Raw CSV headers: used by pd.read_csv, before the headers are cleaned.
# (Previously this name was never defined, so the call raised NameError.)
call_311_parse_dates = ['Created Date', 'Closed Date', 'Due Date', 'Resolution Action Updated Date']
# Cleaned names: hpd_csv2sql indexes these columns after lower-casing the
# headers and replacing spaces with underscores, so the raw headers raise KeyError.
call_311_truncate_columns = ['resolution_description']
call_311_date_time_columns = ['created_date', 'closed_date', 'due_date', 'resolution_action_updated_date']
call_311_chunk_size = 2500

hpd_csv2sql(
    description=call_311_description,
    input_csv_file=call_311_input_csv_file,
    sep_char=call_311_sep_char,
    output_pickle=call_311_output_pickle,
    table_name=call_311_table_name,
    dtype_dict=call_311_dtype_dict,
    parse_dates=call_311_parse_dates,
    load_pickle=call_311_load_pickle,
    input_pickle=call_311_input_pickle,
    db_action=call_311_db_action,
    truncate_columns=call_311_truncate_columns,
    date_time_columns=call_311_date_time_columns,
    chunk_size=call_311_chunk_size,
    keep_cols=call_311_df_keep_cols,
)

In [39]:
# 311 split "xae": appended to the existing call_311 table.
call_311_description = "311_xae"
call_311_input_csv_file = BASE_DIR + '311/Data Files/2016_Jan1-Nov14/xae_c'
call_311_input_pickle = call_311_input_csv_file + '.pkl'
call_311_output_pickle = call_311_input_csv_file + '.pkl'
call_311_sep_char = ","
call_311_table_name = "call_311"
call_311_load_pickle = False
call_311_db_action = 'append' ## if not = 'replace' then 'append'
# Raw CSV headers: used by pd.read_csv, before the headers are cleaned.
# (Previously this name was never defined, so the call raised NameError.)
call_311_parse_dates = ['Created Date', 'Closed Date', 'Due Date', 'Resolution Action Updated Date']
# Cleaned names: hpd_csv2sql indexes these columns after lower-casing the
# headers and replacing spaces with underscores, so the raw headers raise KeyError.
call_311_truncate_columns = ['resolution_description']
call_311_date_time_columns = ['created_date', 'closed_date', 'due_date', 'resolution_action_updated_date']
call_311_chunk_size = 2500

hpd_csv2sql(
    description=call_311_description,
    input_csv_file=call_311_input_csv_file,
    sep_char=call_311_sep_char,
    output_pickle=call_311_output_pickle,
    table_name=call_311_table_name,
    dtype_dict=call_311_dtype_dict,
    parse_dates=call_311_parse_dates,
    load_pickle=call_311_load_pickle,
    input_pickle=call_311_input_pickle,
    db_action=call_311_db_action,
    truncate_columns=call_311_truncate_columns,
    date_time_columns=call_311_date_time_columns,
    chunk_size=call_311_chunk_size,
    keep_cols=call_311_df_keep_cols,
)

In [40]:
# 311 split "xaf": appended to the existing call_311 table.
call_311_description = "311_xaf"
call_311_input_csv_file = BASE_DIR + '311/Data Files/2016_Jan1-Nov14/xaf_c'
call_311_input_pickle = call_311_input_csv_file + '.pkl'
call_311_output_pickle = call_311_input_csv_file + '.pkl'
call_311_sep_char = ","
call_311_table_name = "call_311"
call_311_load_pickle = False
call_311_db_action = 'append' ## if not = 'replace' then 'append'
# Raw CSV headers: used by pd.read_csv, before the headers are cleaned.
# (Previously this name was never defined, so the call raised NameError.)
call_311_parse_dates = ['Created Date', 'Closed Date', 'Due Date', 'Resolution Action Updated Date']
# Cleaned names: hpd_csv2sql indexes these columns after lower-casing the
# headers and replacing spaces with underscores, so the raw headers raise KeyError.
call_311_truncate_columns = ['resolution_description']
call_311_date_time_columns = ['created_date', 'closed_date', 'due_date', 'resolution_action_updated_date']
call_311_chunk_size = 2500

hpd_csv2sql(
    description=call_311_description,
    input_csv_file=call_311_input_csv_file,
    sep_char=call_311_sep_char,
    output_pickle=call_311_output_pickle,
    table_name=call_311_table_name,
    dtype_dict=call_311_dtype_dict,
    parse_dates=call_311_parse_dates,
    load_pickle=call_311_load_pickle,
    input_pickle=call_311_input_pickle,
    db_action=call_311_db_action,
    truncate_columns=call_311_truncate_columns,
    date_time_columns=call_311_date_time_columns,
    chunk_size=call_311_chunk_size,
    keep_cols=call_311_df_keep_cols,
)

In [41]:
# 311 split "xag": appended to the existing call_311 table.
call_311_description = "311_xag"
call_311_input_csv_file = BASE_DIR + '311/Data Files/2016_Jan1-Nov14/xag_c'
call_311_input_pickle = call_311_input_csv_file + '.pkl'
call_311_output_pickle = call_311_input_csv_file + '.pkl'
call_311_sep_char = ","
call_311_table_name = "call_311"
call_311_load_pickle = False
call_311_db_action = 'append' ## if not = 'replace' then 'append'
# Raw CSV headers: used by pd.read_csv, before the headers are cleaned.
# (Previously this name was never defined, so the call raised NameError.)
call_311_parse_dates = ['Created Date', 'Closed Date', 'Due Date', 'Resolution Action Updated Date']
# Cleaned names: hpd_csv2sql indexes these columns after lower-casing the
# headers and replacing spaces with underscores, so the raw headers raise KeyError.
call_311_truncate_columns = ['resolution_description']
call_311_date_time_columns = ['created_date', 'closed_date', 'due_date', 'resolution_action_updated_date']
call_311_chunk_size = 2500

hpd_csv2sql(
    description=call_311_description,
    input_csv_file=call_311_input_csv_file,
    sep_char=call_311_sep_char,
    output_pickle=call_311_output_pickle,
    table_name=call_311_table_name,
    dtype_dict=call_311_dtype_dict,
    parse_dates=call_311_parse_dates,
    load_pickle=call_311_load_pickle,
    input_pickle=call_311_input_pickle,
    db_action=call_311_db_action,
    truncate_columns=call_311_truncate_columns,
    date_time_columns=call_311_date_time_columns,
    chunk_size=call_311_chunk_size,
    keep_cols=call_311_df_keep_cols,
)

In [42]:
# 311 split "xah": appended to the existing call_311 table.
call_311_description = "311_xah"
call_311_input_csv_file = BASE_DIR + '311/Data Files/2016_Jan1-Nov14/xah_c'
call_311_input_pickle = call_311_input_csv_file + '.pkl'
call_311_output_pickle = call_311_input_csv_file + '.pkl'
call_311_sep_char = ","
call_311_table_name = "call_311"
call_311_load_pickle = False
call_311_db_action = 'append' ## if not = 'replace' then 'append'
# Raw CSV headers: used by pd.read_csv, before the headers are cleaned.
# (Previously this name was never defined, so the call raised NameError.)
call_311_parse_dates = ['Created Date', 'Closed Date', 'Due Date', 'Resolution Action Updated Date']
# Cleaned names: hpd_csv2sql indexes these columns after lower-casing the
# headers and replacing spaces with underscores, so the raw headers raise KeyError.
call_311_truncate_columns = ['resolution_description']
call_311_date_time_columns = ['created_date', 'closed_date', 'due_date', 'resolution_action_updated_date']
call_311_chunk_size = 2500

hpd_csv2sql(
    description=call_311_description,
    input_csv_file=call_311_input_csv_file,
    sep_char=call_311_sep_char,
    output_pickle=call_311_output_pickle,
    table_name=call_311_table_name,
    dtype_dict=call_311_dtype_dict,
    parse_dates=call_311_parse_dates,
    load_pickle=call_311_load_pickle,
    input_pickle=call_311_input_pickle,
    db_action=call_311_db_action,
    truncate_columns=call_311_truncate_columns,
    date_time_columns=call_311_date_time_columns,
    chunk_size=call_311_chunk_size,
    keep_cols=call_311_df_keep_cols,
)

In [43]:
# 311 split "xai": appended to the existing call_311 table.
call_311_description = "311_xai"
call_311_input_csv_file = BASE_DIR + '311/Data Files/2016_Jan1-Nov14/xai_c'
call_311_input_pickle = call_311_input_csv_file + '.pkl'
call_311_output_pickle = call_311_input_csv_file + '.pkl'
call_311_sep_char = ","
call_311_table_name = "call_311"
call_311_load_pickle = False
call_311_db_action = 'append' ## if not = 'replace' then 'append'
# Raw CSV headers: used by pd.read_csv, before the headers are cleaned.
# (Previously this name was never defined, so the call raised NameError.)
call_311_parse_dates = ['Created Date', 'Closed Date', 'Due Date', 'Resolution Action Updated Date']
# Cleaned names: hpd_csv2sql indexes these columns after lower-casing the
# headers and replacing spaces with underscores, so the raw headers raise KeyError.
call_311_truncate_columns = ['resolution_description']
call_311_date_time_columns = ['created_date', 'closed_date', 'due_date', 'resolution_action_updated_date']
call_311_chunk_size = 2500

hpd_csv2sql(
    description=call_311_description,
    input_csv_file=call_311_input_csv_file,
    sep_char=call_311_sep_char,
    output_pickle=call_311_output_pickle,
    table_name=call_311_table_name,
    dtype_dict=call_311_dtype_dict,
    parse_dates=call_311_parse_dates,
    load_pickle=call_311_load_pickle,
    input_pickle=call_311_input_pickle,
    db_action=call_311_db_action,
    truncate_columns=call_311_truncate_columns,
    date_time_columns=call_311_date_time_columns,
    chunk_size=call_311_chunk_size,
    keep_cols=call_311_df_keep_cols,
)

In [None]:
# Legacy direct loader: reads each headered 311 split file straight into a
# DataFrame and appends it to the `call_311` table, one split at a time.
#
# The splits are processed in the order listed. `xaa` is intentionally not in
# the list because the original cell never loaded it here (presumably it is
# loaded elsewhere -- TODO confirm).
#
# NOTE(review): this cell relies on `parse_dates` and `call_311_col_dict`,
# whereas the hpd_csv2sql cells use `call_311_parse_dates` and
# `call_311_dtype_dict`. Confirm the names used here are still defined before
# running this cell.
call_311_split_dir = BASE_DIR + '311/Data Files/2016_Jan1-Nov14/'
call_311_split_names = ['xab', 'xac', 'xad', 'xae', 'xaf', 'xag', 'xah', 'xai']

for call_311_split_name in call_311_split_names:
    df_311 = pd.read_csv(call_311_split_dir + call_311_split_name + '_c', sep=',', encoding='utf8',
                         infer_datetime_format=True, parse_dates=parse_dates, dtype=call_311_col_dict)
    # Column types are re-guessed per split because each split is a separate frame.
    outputdict = guess_sqlcol(df_311)
    # Single-argument print(...) behaves identically under Python 2 and 3.
    print("Uploading SQL")
    df_311.to_sql(name='call_311', con=engine, if_exists='append', index=False, chunksize=2500, dtype=outputdict)

In [None]:
# Large 311 Select with ages
#
# SELECT 
# `Unique Key`,
# `Created Date`, 
# `Closed Date`,
# timestampdiff(day,`Created Date`,`Closed Date`) as AgeDays,
# timestampdiff(hour,`Created Date`,`Closed Date`) as AgeHr,
# `Agency`,
#  `Complaint Type`
# ,`Descriptor`,
# `Location Type`,
#  `Incident Zip`, 
# `Incident Address`,
# `Facility Type`,
#  `Status`,
# `Due Date`,
# `Borough`,
#  `Resolution Description`,
# `Resolution Action Updated Date`,
# `Latitude`,
#  `Longitude` 
# FROM `call_311` 
# WHERE Agency = "HPD" and `Complaint Type` = "HEAT/HOT WATER" and `Status` != "Closed"



# COUNT OF HEAT/HW with locations
#
# SELECT * 
# FROM (
# SELECT  `Incident Address` ,  `Borough` ,  `Latitude` ,  `Longitude` , COUNT(  `Unique Key` ) AS count, AVG( timestampdiff(
# DAY ,  `Created Date` ,  `Closed Date`
# ) ) AS average_day_age
# FROM call_311
# WHERE Agency =  "HPD"
# AND  `Complaint Type` =  "HEAT/HOT WATER"
# AND  `Status` =  'Closed'
# GROUP BY  `Incident Address`
# ) AS count_table
# ORDER BY average_day_age DESC


#
#SELECT TABLE_ROWS, TABLE_NAME
#      FROM INFORMATION_SCHEMA.TABLES 
#      WHERE TABLE_SCHEMA = 'heatseek' AND
#         TABLE_NAME NOT LIKE '%pma_%';

## LIST ALL COLUMNS OF ALL TABLES WITH COL TYPE
## One row per column of every table in the `heatseek` schema, ordered by
## table and then by the column's ordinal position within its table.
SELECT
    table_name,
    column_name,
    data_type
FROM information_schema.columns
WHERE table_schema = 'heatseek'
ORDER BY table_name, ordinal_position;

## LIST COUNT OF VIOLATION STATUS AND CLASS HPD VIOLATIONS
## Counts rows with a non-NULL violationid for each (currentstatus, class) pair.
SELECT
    class,
    currentstatus,
    COUNT(violationid) AS count
FROM hpd_violations
GROUP BY currentstatus, class;

In [None]:
# hpd_buildings indexes
## `BuildingID` becomes the table's primary key; each remaining statement adds
## a separate single-column secondary index. No index names are given, so
## MySQL assigns them. Every statement is independent of the others.
ALTER TABLE `hpd_buildings` ADD PRIMARY KEY(`BuildingID`);
ALTER TABLE `hpd_buildings` ADD INDEX(`BoroID`);
ALTER TABLE `hpd_buildings` ADD INDEX(`RecordStatus`);
ALTER TABLE `hpd_buildings` ADD INDEX(`BIN`);
ALTER TABLE `hpd_buildings` ADD INDEX(`Lot`);
ALTER TABLE `hpd_buildings` ADD INDEX(`Block`);
ALTER TABLE `hpd_buildings` ADD INDEX(`StreetName`);
ALTER TABLE `hpd_buildings` ADD INDEX(`HouseNumber`);

## hpd_complaints indexes
## `ComplaintID` becomes the primary key; the status, address (Lot / Block /
## StreetName / HouseNumber), borough and building columns each get their own
## single-column secondary index.
ALTER TABLE `hpd_complaints` ADD PRIMARY KEY(`ComplaintID`);
ALTER TABLE `hpd_complaints` ADD INDEX(`StatusDate`);
ALTER TABLE `hpd_complaints` ADD INDEX(`Status`);
ALTER TABLE `hpd_complaints` ADD INDEX(`Lot`);
ALTER TABLE `hpd_complaints` ADD INDEX(`Block`);
ALTER TABLE `hpd_complaints` ADD INDEX(`StreetName`);
ALTER TABLE `hpd_complaints` ADD INDEX(`HouseNumber`);
ALTER TABLE `hpd_complaints` ADD INDEX(`BoroughID`);
ALTER TABLE `hpd_complaints` ADD INDEX(`BuildingID`);

## hpd_complaint_problem indexes
## `ProblemID` becomes the primary key. `ComplaintID` is indexed as well
## (presumably to join back to hpd_complaints -- confirm), along with the
## category and status columns.
ALTER TABLE `hpd_complaint_problem` ADD PRIMARY KEY(`ProblemID`);
ALTER TABLE `hpd_complaint_problem` ADD INDEX(`ComplaintID`);
ALTER TABLE `hpd_complaint_problem` ADD INDEX(`MajorCategory`);
ALTER TABLE `hpd_complaint_problem` ADD INDEX(`MinorCategory`);
ALTER TABLE `hpd_complaint_problem` ADD INDEX(`Status`);
ALTER TABLE `hpd_complaint_problem` ADD INDEX(`StatusDate`);

## hpd_registration indexes
## This table gets no primary key. `RegistrationID` only receives a plain
## (non-unique) index: the trailing note on that line records MySQL error 1062
## (duplicate entry for key 'PRIMARY'), apparently from an earlier attempt to
## make it the primary key.
ALTER TABLE `hpd_registration` ADD INDEX(`RegistrationID`); #1062 - Duplicate entry '913236' for key 'PRIMARY'
ALTER TABLE `hpd_registration` ADD INDEX(`BuildingID`);
ALTER TABLE `hpd_registration` ADD INDEX(`BoroID`);
ALTER TABLE `hpd_registration` ADD INDEX(`HouseNumber`);
ALTER TABLE `hpd_registration` ADD INDEX(`StreetName`);
ALTER TABLE `hpd_registration` ADD INDEX(`Block`);
ALTER TABLE `hpd_registration` ADD INDEX(`Lot`);
ALTER TABLE `hpd_registration` ADD INDEX(`BIN`);
ALTER TABLE `hpd_registration` ADD INDEX(`RegistrationEndDate`);

## hpd_registrationContact indexes
## As with hpd_registration, the ID column only gets a plain index: the
## trailing note records error 1062 (duplicate entry for key 'PRIMARY'),
## apparently from an earlier primary-key attempt.
ALTER TABLE `hpd_registrationContact` ADD INDEX(`RegistrationContactID`); ##1062 - Duplicate entry '91323603' for key 'PRIMARY'
ALTER TABLE `hpd_registrationContact` ADD INDEX(`RegistrationID`);

## call_311 indexes
## `Unique Key` becomes the primary key; each remaining statement adds a
## single-column secondary index. Column names contain spaces, hence the
## backtick quoting.
## NOTE(review): `Latitude` and `Longitude` are indexed separately, not as one
## composite index. `Resolution Description` is a long free-text column;
## depending on its column type the index may need a prefix length -- confirm.
ALTER TABLE `call_311` ADD PRIMARY KEY(`Unique Key`);
ALTER TABLE `call_311` ADD INDEX(`Created Date`);
ALTER TABLE `call_311` ADD INDEX(`Agency`);
ALTER TABLE `call_311` ADD INDEX(`Complaint Type`);
ALTER TABLE `call_311` ADD INDEX(`Descriptor`);
ALTER TABLE `call_311` ADD INDEX(`Incident Address`);
ALTER TABLE `call_311` ADD INDEX(`Status`);
ALTER TABLE `call_311` ADD INDEX(`Latitude`);
ALTER TABLE `call_311` ADD INDEX(`Longitude`);
ALTER TABLE `call_311` ADD INDEX(`Resolution Description`);
ALTER TABLE `call_311` ADD INDEX(`Resolution Action Updated Date`);
ALTER TABLE `call_311` ADD INDEX(`Borough`);
