# Purpose

# Outline

In [1]:
import math
import os
import re
import sys
import warnings
from urllib.parse import quote

import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import psycopg2
from sqlalchemy import create_engine

from sklearn import linear_model
from sklearn import metrics
from sklearn.model_selection import TimeSeriesSplit, train_test_split
from IPython.core.interactiveshell import InteractiveShell

from dd_credentials import *

# Show each distinct warning only the first time it is raised
warnings.filterwarnings('once')
# Display the value of every top-level expression in a cell, not just the last one
InteractiveShell.ast_node_interactivity = 'all'

In [3]:
def execute_sql(statement, dbname, user, host, password, port, isolation = False, results = True):
    """
    Use psycopg2 to execute a PostgreSQL statement and optionally fetch its results.

    A new connection is opened for every call and is always closed again,
    even when the statement raises. No explicit commit is issued, so a
    statement that must persist its effects has to be run with isolation=True.

    Input:
        statement (str): SQL statement to run in database
        dbname, user, host, password (str): database credentials
        port (str or int): port of the database server
        isolation (bool): indicator for whether to change isolation level to autocommit; True for queries that cannot be run
            from within a transaction (see https://wiki.postgresql.org/wiki/Psycopg2_Tutorial), e.g. queries other than SELECT
        results (bool): indicator for whether the query is expected to output results;
            for example, True for SELECT statements and False for CREATE TABLE statements

    Output:
        relation (dataframe): query results or empty dataframe if results = False
    """
    # Keyword arguments instead of a hand-formatted DSN string: credentials that
    # contain spaces or quotes would otherwise corrupt the connection string.
    conn = psycopg2.connect(dbname=dbname, user=user, host=host, password=password, port=port)
    try:
        if isolation:
            conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
        cur = conn.cursor()
        try:
            cur.execute(statement)
            if results:
                rows = cur.fetchall()
                # cursor.description holds one entry per result column; item 0 is its name
                colnames = [desc[0] for desc in cur.description]
                relation = pd.DataFrame(rows, columns=colnames)
            else:
                relation = pd.DataFrame()
        finally:
            cur.close()
    finally:
        # Release the connection even if execute()/fetchall() failed
        conn.close()
    return relation


In [4]:
def randomize(df, do_not_randomize = None, random_state = None):
    """
    Randomize column values of a dataframe. Each column is shuffled independently,
    so every column keeps its own set of values while the relationship between
    columns within a row is broken. The input dataframe is not modified.

    Inputs:
        df (dataframe): dataframe to randomize
        do_not_randomize (list or str): optional list of strings indicating names of
            columns that should not be randomized; a single column name may also
            be passed as a plain string
        random_state (int): optional seed that makes the shuffle reproducible;
            when None (default) numpy's global random state is used
    Outputs:
        df_random (dataframe): copy of df with the selected columns shuffled
    """
    if do_not_randomize is None:
        protected = []
    elif isinstance(do_not_randomize, str):
        # A bare string would otherwise be substring-matched against column names
        protected = [do_not_randomize]
    else:
        protected = list(do_not_randomize)

    if random_state is None:
        permute = np.random.permutation
    else:
        # One generator for all columns: reproducible, yet each column gets a different order
        permute = np.random.RandomState(random_state).permutation

    df_random = df.copy()
    for col in df.columns:
        if col in protected:
            continue
        df_random[col] = permute(df_random[col])

    return df_random

In [16]:
# source: https://overlaid.net/2016/02/08/replace-words-in-files-or-strings-using-python/
def do_replacement(base_text, word_map):
    """
    Helper function for replace_words_in_file. Make replacements in base_text, as
    indicated in word_map.

    All replacements are made in a single pass over the original text, so text
    inserted by one mapping is never replaced again by another mapping
    (e.g. {'a': 'b', 'b': 'c'} turns 'a b' into 'b c', not 'c c'). Keys are matched
    literally, as substrings; when several keys match at the same position the
    longest one wins. Empty keys are ignored.

    Inputs
        base_text (str): text to make replacements in
        word_map (dict): mapping of substrings to their replacements
    Outputs
        str: base_text with every occurrence of each key replaced by its value
    """
    # Longest keys first so that a key which is a prefix of another cannot shadow it
    keys = sorted((key for key in word_map if key), key=len, reverse=True)
    if not keys:
        return base_text
    pattern = re.compile("|".join(re.escape(key) for key in keys))
    return pattern.sub(lambda match: word_map[match.group(0)], base_text)

def replace_words_in_file(read_from, write_to, word_map):
    """
    Create copy of a file with certain words replaced

    Inputs
        read_from: name of file to read from
        write_to: name of new file to be created (overwritten if it already exists)
        word_map: dictionary of mappings between words and their replacements
            (e.g. {'old_word': 'new_word'})
    Outputs
        None. Will create a new file with the name given in write_to

    """
    # Read the whole source file; the context manager closes it even on error
    with open(read_from, 'r') as source_file:
        original_text = source_file.read()

    # Apply every mapping in word_map to the text
    replaced_text = do_replacement(original_text, word_map)

    # Write the modified text to the new file
    with open(write_to, 'w') as target_file:
        target_file.write(replaced_text)

## I. Create a randomized version of the raw schema

Pull the list of tables from the selected database's `information_schema`.

In [None]:
# Database schema that holds the raw tables to be randomized
RAW_SCHEMA = "raw"

In [5]:
# List the tables and views registered in information_schema, across all schemas
statement = "SELECT * FROM information_schema.tables;"
tables = execute_sql(
    statement=statement,
    dbname=dbname, user=user, host=host, password=password, port=port,
)

In [6]:
# Preview only: the full listing also contains system schemas (e.g. pg_catalog)
tables.head(10)

Unnamed: 0,table_catalog,table_schema,table_name,table_type,self_referencing_column_name,reference_generation,user_defined_type_catalog,user_defined_type_schema,user_defined_type_name,is_insertable_into,is_typed,commit_action
0,food,postgis,geography_columns,VIEW,,,,,,NO,NO,
1,food,postgis,geometry_columns,VIEW,,,,,,YES,NO,
2,food,postgis,spatial_ref_sys,BASE TABLE,,,,,,YES,NO,
3,food,postgis,raster_columns,VIEW,,,,,,NO,NO,
4,food,postgis,raster_overviews,VIEW,,,,,,NO,NO,
5,food,cleaned,inspections,BASE TABLE,,,,,,YES,NO,
6,food,pg_catalog,pg_statistic,BASE TABLE,,,,,,YES,NO,
7,food,pg_catalog,pg_type,BASE TABLE,,,,,,YES,NO,
8,food,cleaned,violations,BASE TABLE,,,,,,YES,NO,
9,food,pg_catalog,pg_policy,BASE TABLE,,,,,,YES,NO,


Select tables from the raw schema; they will be randomized. In this tutorial, there is only one (raw.inspections).

In [7]:
# Keep only the catalog rows that belong to RAW_SCHEMA
raw = tables.loc[tables["table_schema"] == RAW_SCHEMA]
raw.head()

Unnamed: 0,table_catalog,table_schema,table_name,table_type,self_referencing_column_name,reference_generation,user_defined_type_catalog,user_defined_type_schema,user_defined_type_name,is_insertable_into,is_typed,commit_action
12,food,raw,inspections,BASE TABLE,,,,,,YES,NO,


First, create the `_randomized` schema if it doesn't exist yet.

In [14]:
# Run in autocommit mode (execute_sql never commits on its own); nothing to fetch
statement = "CREATE SCHEMA IF NOT EXISTS {schema}_randomized;".format(schema=RAW_SCHEMA)
output = execute_sql(statement, dbname, user, host, password, port, results=False, isolation=True)

Randomize every table in the raw schema and write the output to the `_randomized` schema.

In [15]:
# Build the SQLAlchemy engine once for all tables. The credentials are
# percent-encoded so that characters such as '@', ':' or '/' in the user name
# or password cannot corrupt the connection URL.
engine = create_engine('postgresql://{}:{}@{}:{}/{}'.format(
    quote(str(user), safe=''), quote(str(password), safe=''), host, port, dbname))
randomized_schema = '{}_randomized'.format(RAW_SCHEMA)

try:
    for table_name in raw.table_name:
        print("Working on table {}".format(table_name))

        # Make sure the target table exists, with the same column definitions as the original
        statement = "CREATE TABLE IF NOT EXISTS {}.{} (LIKE {}.{});".format(randomized_schema, table_name, RAW_SCHEMA, table_name)
        output = execute_sql(statement, dbname, user, host, password, port, isolation = True, results = False)

        # Check the target before pulling anything: if it already contains data,
        # skip the table without downloading and shuffling the original
        statement = "SELECT COUNT(*) FROM {}.{};".format(randomized_schema, table_name)
        output = execute_sql(statement, dbname, user, host, password, port)
        if output.iloc[0,0] > 0:
            print("\t*****SKIPPING TABLE {} -- it already has data".format(table_name))
            continue

        # Pull the table from original schema
        print("\tPulling table")
        statement = "SELECT * FROM {}.{};".format(RAW_SCHEMA, table_name)
        table = execute_sql(statement, dbname, user, host, password, port)

        # Randomize the table
        print("\tRandomizing")
        randomized_table = randomize(table)

        # Write results into the (empty) table in the _randomized schema
        print("\tUploading randomized version")
        randomized_table.to_sql(table_name, engine, schema = randomized_schema, index = False, if_exists='append')
finally:
    # Return pooled connections to the server once all tables are processed
    engine.dispose()
    

Working on table inspections
	Pulling table
	Randomizing
	Uploading randomized version


## II. Edit preprocessing files to point to newly randomized raw schema

In [18]:
# Directory that holds the preprocessing SQL scripts. Defaults to the original
# location; set the DIRTYDUCK_SQL_DIR environment variable to use another checkout.
SQL_DIR = os.environ.get('DIRTYDUCK_SQL_DIR', '/home/ubuntu/dirtyduck/sql')

# list of preprocessing files that get RAW_SCHEMA ready for triage experiments
# list should be in correct execution order
PREPROC_FILES = [os.path.join(SQL_DIR, name) for name in (
    'create_cleaned_inspections_table.sql',
    'create_violations_table.sql',
    'create_semantic_tables.sql',
    'create_inspections_schema.sql',
)]

# dictionary of schema name replacements to apply across every preprocessing file
# NOTE(review): replacements are plain substring matches, so the 'inspections' entry
# also rewrites table references such as raw.inspections, while the randomized copy
# of that table keeps its original name -- confirm against the generated SQL files.
WORD_MAP = {RAW_SCHEMA: RAW_SCHEMA+'_randomized',
    'cleaned': 'cleaned_randomized',
    'semantic': 'semantic_randomized',
    'inspections': 'inspections_randomized'}

Apply find+replace to every preprocessing file. Be sure to open the outputted files (with _randomized appended to the name) to check that no unexpected replacements were made.

In [13]:
# Write a "<original path>_randomized" copy of every preprocessing script
for sql_path in PREPROC_FILES:
    replace_words_in_file(read_from=sql_path, write_to=sql_path + '_randomized', word_map=WORD_MAP)

Execute the preprocessing files