# About

Connecting to Redshift to pull down raw data is a common need. This notebook helps with that by prompting for the required connection details. The resulting data is returned as a pandas DataFrame, which can be explored in the notebook or saved to a CSV file for use outside of it.

Of note, if the dataset is extremely large, there may be problems converting it to CSV. Chunking is a possible solution.

# Imports

In [None]:
import json

import pandas as pd

from sqlalchemy import create_engine

# User Input Section

## Define Server Connection

In [None]:
# User input: connection details are typed in at run time.
from getpass import getpass

# Surrounding whitespace (easy to pick up when pasting) is removed from the
# non-secret fields so it does not end up inside the connection URL.
SERVER = input("Server: ").strip()
DATABASE = input("Database: ").strip()
USERNAME = input("Username: ").strip()
# getpass() reads the password without echoing it, so it is not left behind in
# the cell output the way a value typed into input() is.
PASSWORD = getpass("Password: ")
PORT = input("Port: ").strip()

In [None]:
from urllib.parse import quote_plus

# Build the SQLAlchemy engine from the values entered above.
# The username and password are percent-encoded because characters such as
# '@', ':' or '/' would otherwise be read as URL delimiters and produce a
# malformed connection string. Enter credentials as-is, not pre-encoded.
ENGINE = create_engine(
    'postgresql://' + quote_plus(USERNAME) + ':' + quote_plus(PASSWORD)
    + '@' + SERVER + ':' + PORT + '/' + DATABASE)

## Constants

In [None]:
# Target table; fill these in before running the query cells.
SCHEMA = ""
TABLE = ""

# Inputs read by the "Get Data" cell. They are given placeholder values here
# so that cell does not fail with a NameError; replace them before running it.
# Date bounds for the report_date filter (presumably 'YYYY-MM-DD' strings --
# confirm against the format the table's report_date column expects).
START_DATE = ""
END_DATE = ""
# Mapping of column name -> list of allowed values, turned into IN (...) filters.
where_in_dict = {}

# Functions

In [None]:
'''Specific data queries'''

def get_columns(schema, table):
    """Return all available column names of the given schema.table.

    One row is selected with ``SELECT TOP 1 *`` and the column labels of the
    resulting DataFrame are returned.
    """
    sql = f'SELECT TOP 1 * FROM {schema}.{table};'
    sample = pd.read_sql_query(sql, ENGINE)
    return sample.columns

# Currently, start_date and end_date are requirements for pulling down data.
# The thought is that extremely large datasets should not be pulled in just one query in a notebook.
# TODO: adjust to allow end_date to be omitted, in which case it is assumed to be the current date
def get_data(schema, table, start_date, end_date, columns="", filters="", like=""):
    """Fetch rows of schema.table whose report_date lies in a date range.

    The query always filters on ``report_date BETWEEN start_date AND
    end_date``; ``filters`` and ``like`` append further ``AND`` conditions.
    All values are sent as bound parameters. ``schema``, ``table``, the column
    names and the keys of ``filters``/``like`` are inserted into the SQL text
    unchanged, so they must come from a trusted source.

    Args:
        schema: Name of the schema holding the table.
        table: Name of the table to query.
        start_date: Lower bound passed to the BETWEEN condition.
        end_date: Upper bound passed to the BETWEEN condition.
        columns: Iterable of column names to select, or a single string that
            is used as the select list verbatim. Empty selects all columns.
        filters: Mapping of column name -> collection of accepted values; each
            entry becomes ``AND column IN (...)``. Empty means no IN filters.
        like: Mapping of column name -> pattern; each entry becomes
            ``AND column LIKE pattern``. Empty means no LIKE filters.

    Returns:
        A pandas DataFrame with the matching rows, re-indexed from 0.

    Raises:
        ValueError: If an entry in ``filters`` has no values, since that would
            produce a placeholder with nothing bound to it.
    """
    # The two date bounds are always the first bound parameters.
    params = [start_date, end_date]

    if not columns:
        select_list = "*"
    elif isinstance(columns, str):
        # Joining a bare string would split it into single characters.
        select_list = columns
    else:
        select_list = ', '.join(columns)

    # The date range is already the first WHERE condition, so every extra
    # condition can start with AND.
    conditions = []
    for column, values in (filters or {}).items():
        values = list(values)
        if not values:
            raise ValueError(
                "filters[%r] is empty; give at least one value or drop the key"
                % (column,))
        placeholders = ','.join(['%s'] * len(values))
        conditions.append(' AND ' + column + ' IN (' + placeholders + ')')
        params.extend(values)

    for column, pattern in (like or {}).items():
        conditions.append(' AND ' + column + ' LIKE %s')
        params.append(pattern)

    query = "".join(['SELECT ', select_list, ' FROM ', schema, ".", table,
                     ' WHERE report_date BETWEEN %s AND %s',
                     "".join(conditions), ';'])

    print('\nThe query string is: ' + query)

    # Read in chunks of 10000 rows and combine them once at the end;
    # DataFrame.append no longer exists in pandas 2.x and re-copied the
    # accumulated data on every iteration. The parameters go in as a tuple so
    # they are treated as one parameter set.
    chunks = list(pd.read_sql_query(
        query, ENGINE, params=tuple(params), chunksize=10000))
    if not chunks:
        return pd.DataFrame()
    return pd.concat(chunks, ignore_index=True)


def get_schemas():
    """Return the names of the database's non-temporary schemas.

    Every ``nspname`` is read from ``pg_namespace``; names containing the
    text ``temp`` are dropped from the returned Series.
    """
    namespaces = pd.read_sql_query('select nspname from pg_namespace', ENGINE)
    names = namespaces['nspname']
    is_temp = names.str.contains('temp')
    return names[~is_temp]
    
    

def get_distinct_values(schema, table, column):
    """Return a DataFrame holding the distinct values of one column.

    Runs ``SELECT DISTINCT column`` against ``schema.table``.
    """
    sql = 'SELECT DISTINCT {} FROM {}.{};'.format(column, schema, table)
    return pd.read_sql_query(sql, ENGINE)

## Get Data

In [None]:
# Pull the rows for the configured table and date range, restricted by the
# IN-filters held in where_in_dict.
data = get_data(
    schema=SCHEMA,
    table=TABLE,
    start_date=START_DATE,
    end_date=END_DATE,
    filters=where_in_dict,
)
# print(data.shape)  # uncomment to check how many rows/columns came back

# Save as csv

In [None]:
# Base name of the output file; the ".csv" extension is added when saving.
CSV_NAME = input(
    'Please input what you would like to name the file. '
    'Do not include the .csv ending.'
)

In [None]:
# Write the DataFrame to <CSV_NAME>.csv without the row index.
# The previous index='None' was a non-empty string, which is truthy, so the
# index column was still written; index=False actually leaves it out.
data.to_csv(CSV_NAME + '.csv', index=False)