# Explore Data

In [1]:
import psycopg2
import pandas as pd
import redshift_connector
from config_loader import *

In [8]:
def get_data_redshift(query):
    """Run ``query`` against Redshift and return every row it yields.

    A fresh connection is opened on each call using the ``db_host``,
    ``db_name``, ``db_user`` and ``db_pass`` names already in scope; the
    connection and its cursor are both managed by a ``with`` statement.

    Args:
        query: SQL text passed to ``cursor.execute`` unchanged.

    Returns:
        The value of ``cursor.fetchall()`` for the executed query.
    """
    connection_settings = {
        "host": db_host,
        "database": db_name,
        "user": db_user,
        "password": db_pass,
    }
    with redshift_connector.connect(**connection_settings) as connection, \
            connection.cursor() as cur:
        cur.execute(query)
        rows = cur.fetchall()
    return rows


def get_cols_redshift(table, schema=None):
    """Return the column names of ``table`` in ordinal-position order.

    The names are read from ``information_schema.columns``. The table (and
    optional schema) name is sent as a bound parameter rather than being
    formatted into the SQL text, so a name containing quotes can neither
    break the statement nor inject SQL.

    Args:
        table: Name of the table to inspect.
        schema: Optional schema name. When ``None`` (the default) no schema
            filter is applied, matching the previous behaviour; pass it when
            the same table name may exist in more than one schema.

    Returns:
        A list of column names ordered by ``ordinal_position``; empty when
        no matching table is found.
    """
    # ``%s`` placeholders follow the driver's "format" paramstyle, which is
    # redshift_connector's default.
    query = """
            select column_name
            from information_schema.columns
            where table_name = %s
            """
    params = [table]
    if schema is not None:
        query += " and table_schema = %s"
        params.append(schema)
    query += " order by ordinal_position;"

    with redshift_connector.connect(
        host=db_host,
        database=db_name,
        user=db_user,
        password=db_pass
    ) as conn:
        with conn.cursor() as cursor:
            cursor.execute(query, params)
            result = cursor.fetchall()
    # Each row holds a single value: the column name.
    return [row[0] for row in result]


def create_df(table_cols, table_data):
    """Build a DataFrame from ``table_data`` with ``table_cols`` as headers.

    Args:
        table_cols: Column labels, one per field in each row.
        table_data: Row data accepted by the ``pandas.DataFrame`` constructor.

    Returns:
        The resulting ``pandas.DataFrame``.
    """
    return pd.DataFrame(data=table_data, columns=table_cols)

# Premise data

In [9]:
# Column names of premise_dim; reused later as the DataFrame header.
premise_cols = get_cols_redshift("premise_dim")
premise_cols

['premise_id', 'premise_location']

In [14]:
# Premise data: fetch every row of premise_dim (no filter or LIMIT).
query = """
SELECT *
FROM premise_dim
"""
premise_data = get_data_redshift(query)

In [28]:
# Combine the fetched premise_dim rows with their column names.
df_premise = create_df(premise_cols,premise_data)

In [16]:
# Column names of crime_fact; reused later as the DataFrame header.
crime_cols = get_cols_redshift("crime_fact")
crime_cols

['crime_fact_id',
 'numoffenses',
 'temp',
 'feels_like',
 'humidity',
 'rain',
 'snow',
 'offense_dim_id',
 'police_beat_dim_id',
 'premises_dim_id',
 'address_dim_id',
 'datetime_id']

In [17]:
# Crime data: fetch every row of crime_fact (no filter or LIMIT).
query = """
SELECT *
FROM crime_fact
"""
crime_data = get_data_redshift(query)

In [19]:
# Combine the fetched crime_fact rows with their column names; the bare
# `df` expression displays the frame as the cell output.
df = create_df(crime_cols,crime_data)
df

Unnamed: 0,crime_fact_id,numoffenses,temp,feels_like,humidity,rain,snow,offense_dim_id,police_beat_dim_id,premises_dim_id,address_dim_id,datetime_id
0,1,1,86,91,60.0,0,0,1,1,1,1,1
1,2,1,58,57,62.0,0,0,1,14,1,2,2
2,3,1,90,96,53.0,0,0,1,35,1,3,3
3,4,1,50,45,87.0,0,0,4,46,1,4,4
4,5,1,57,54,39.0,0,0,1,2,2,5,5
...,...,...,...,...,...,...,...,...,...,...,...,...
95,96,2,60,58,69.0,0,0,4,56,40,95,96
96,97,1,63,62,79.0,0,0,4,57,41,96,97
97,98,1,70,70,84.0,0,0,6,57,42,97,98
98,99,1,75,76,75.0,1,0,5,59,43,98,99


# Join tables

In [26]:
# Join each crime_fact row to its address_dim row on the address key and
# return the temp value alongside the full address, limited to 10 rows.
query = """
SELECT crime_fact.temp, address_dim.full_address
FROM crime_fact
INNER JOIN address_dim
ON crime_fact.address_dim_id = address_dim.address_id
LIMIT 10;
"""

In [27]:
 # Execute the join query; the returned rows are shown as the cell output.
 get_data_redshift(query)

([86, '8150 Lynn Houston, TX'],
 [58, '12050 Pecan meadow Houston, TX'],
 [90, '750 Leicester Houston, TX'],
 [50, '6650 Loma vista Houston, TX'],
 [57, '7450 Fondren Houston, TX'],
 [85, '3950 Southwest fwy ib Houston, TX'],
 [91, '1750 West lp s Houston, TX'],
 [80, '1450 Texas Houston, TX'],
 [77, '12250 Cliffgate Houston, TX'],
 [87, '11550 South sam houston Houston, TX'])