In [1]:
import os
import glob
import psycopg2
import pandas as pd
import numpy as np
from sql_queries import *
# AWS
import boto3
import awswrangler as wr

# Database connection settings come from the environment so that no
# credentials are stored in the notebook; an unset variable yields None.
db_host, db_name, db_user, db_pass, db_port = (
    os.environ.get(env_key)
    for env_key in ('DB_HOST', 'DB_NAME', 'DB_USER', 'DB_PASS', 'DB_PORT')
)


# helper functions
def block_range_split(df):
    '''Build one address string per row from block_range and street_name.

    Every block_range value is split on '-', both ends are cast to int,
    and the rounded-up midpoint becomes the house number in
    "<midpoint> <street_name> Houston, TX".
    '''
    bounds = df.block_range.str.split(pat='-', expand=True)
    low_end = bounds[0].astype('int')
    high_end = bounds[1].astype('int')
    # ceil so that a half-way midpoint (e.g. 149.5) rounds up to a whole number
    midpoint = np.ceil((high_end + low_end)/2).astype('int').astype('str')
    return midpoint + " " + df.street_name + " Houston, TX"



# count_files = len(premise_table.values.tolist())
# files = premise_table.values.tolist()
# for count, value in enumerate(files):
#     #print(count, value)
#     print('{}/{} files processed.'.format(count, count_files))


def upload_table(table_df, upload_table_sql, table_name):
    '''Insert every row of ``table_df`` into Postgres, one statement per row.

    Parameters
    ----------
    table_df : pandas.DataFrame
        Rows to upload. Each row is converted to a plain list and passed to
        the query as its parameters, so the column order has to line up with
        the placeholders in ``upload_table_sql``.
    upload_table_sql : str
        Parameterised statement executed once per row.
    table_name : str
        Only used in the progress message.

    The connection is opened from the module-level ``db_*`` settings and is
    always closed, even when one of the statements fails.
    '''
    print(f'uploading:{table_name}')
    rows = table_df.values.tolist()
    conn = psycopg2.connect(database=db_name, user=db_user,
                            password=db_pass, host=db_host, port=db_port)
    try:
        # With autocommit on, each statement is committed as soon as it
        # executes, so no explicit commit() is needed inside the loop.
        conn.set_session(autocommit=True)
        cur = conn.cursor()
        for row in rows:
            cur.execute(upload_table_sql, row)
    finally:
        # Closing in `finally` prevents a leaked connection when execute() raises.
        conn.close()


def bucket_raw_path(bucket_name, path_dir):
    '''Return the ``s3://`` URI for ``path_dir`` inside ``bucket_name``.'''
    return f's3://{bucket_name}/{path_dir}'


def load_data(raw_s3, file_index=1):
    '''Read one CSV object found under an S3 path into a DataFrame.

    Parameters
    ----------
    raw_s3 : str
        S3 path handed to ``wr.s3.list_objects``.
    file_index : int, default 1
        Position of the object to read within the listing. The default keeps
        the previously hard-coded choice (the second listed object).
        NOTE(review): presumably index 0 is the folder placeholder key --
        confirm against the bucket contents.

    Returns
    -------
    The result of ``wr.s3.read_csv`` for the selected object.

    Raises
    ------
    IndexError
        If the listing has no object at ``file_index``.
    '''
    objects = wr.s3.list_objects(raw_s3)
    try:
        target = objects[file_index]
    except IndexError:
        raise IndexError(
            f'no object at position {file_index} under {raw_s3!r} '
            f'({len(objects)} object(s) listed)') from None
    return wr.s3.read_csv(target)


def create_table_from_df(dataframe, column_name, new_col_name, new_pk_name):
    """Build a lookup table from the distinct values of one column.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Source data.
    column_name : str
        Column whose unique values become the rows of the lookup table
        (in order of first appearance).
    new_col_name : str
        Name given to the value column in the result.
    new_pk_name : str
        Name given to the surrogate key column in the result.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``new_pk_name`` (1, 2, 3, ...) followed by
        ``new_col_name``.
    """
    unique_values = dataframe[column_name].unique().tolist()
    df_table = pd.DataFrame(unique_values, columns=[new_col_name])
    # The key is inserted directly rather than via reset_index() + rename:
    # reset_index() raises when the value column is itself called 'index'.
    df_table.insert(0, new_pk_name,
                    np.arange(1, len(df_table) + 1, dtype='int64'))
    return df_table


def drop_add_pk(data, data_table, lo, ro):
    '''Swap a natural-key column of ``data`` for the columns of a lookup table.

    ``data`` is merged with ``data_table`` (pandas' default inner join) on
    ``data[lo] == data_table[ro]``; both join columns are then dropped so
    that only the remaining lookup columns (e.g. the surrogate key) stay.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the natural key in column ``lo``.
    data_table : pandas.DataFrame
        Lookup table holding the same values in column ``ro``.
    lo, ro : str
        Join column names on the left and right side.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``data`` itself is not modified.
    '''
    # The previous suffixes=(True, True) is not a valid suffix pair: any
    # overlapping non-key column would be renamed to "<name>True" on both
    # sides, producing duplicate labels. The pandas defaults are used instead.
    merged = data.merge(data_table, left_on=lo, right_on=ro)
    join_cols = [lo] if lo == ro else [lo, ro]
    return merged.drop(columns=join_cols)


def get_data(bucket_name='dend-data', path_dir='capstone/load-data/'):
    '''Load the raw CSV from S3 and keep only the columns used downstream.

    Parameters
    ----------
    bucket_name : str, default 'dend-data'
        Bucket to read from.
    path_dir : str, default 'capstone/load-data/'
        Prefix inside the bucket; the file is picked by ``load_data``.

    Returns
    -------
    pandas.DataFrame
        An independent copy restricted to the selected columns, so later
        cells can add or replace columns without a SettingWithCopyWarning.
    '''
    raw = bucket_raw_path(bucket_name, path_dir)
    df = load_data(raw)
    wanted_columns = [
        'date_time', 'offenses', 'offense_type', 'block_range',
        'street_name', 'beat', 'premise_description', 'temp', 'feels_like',
        'humidity_per', 'rain_vol_1h_mm', 'snow_vol_1h_mm',
    ]
    return df[wanted_columns].copy()

In [2]:
# Load the raw dataset.
data = get_data()

# OFFENSE lookup table: one row per distinct offense_type, keyed by offense_pk.
offense_table = create_table_from_df(
    dataframe=data,
    column_name='offense_type',
    new_col_name='offense_name',
    new_pk_name='offense_pk',
)


In [4]:
# Display the offense lookup table (surrogate key plus offense name).
offense_table

Unnamed: 0,offense_pk,offense_name
0,1,Theft
1,2,Aggravated Assault
2,3,Burglary
3,4,Auto Theft
4,5,Robbery
5,6,Rape


In [5]:
# POLICE BEAT lookup table: one row per distinct beat, keyed by beat_pk.
police_beat_table = create_table_from_df(
    dataframe=data,
    column_name='beat',
    new_col_name='beat_name',
    new_pk_name='beat_pk',
)

police_beat_table.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60 entries, 0 to 59
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   beat_pk    60 non-null     int64 
 1   beat_name  60 non-null     object
dtypes: int64(1), object(1)
memory usage: 1.1+ KB


In [6]:
# PREMISE lookup table: one row per distinct premise_description,
# keyed by premise_pk.
premise_table = create_table_from_df(
    dataframe=data,
    column_name='premise_description',
    new_col_name='premise_name',
    new_pk_name='premise_pk',
)
premise_table.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41 entries, 0 to 40
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   premise_pk    41 non-null     int64 
 1   premise_name  41 non-null     object
dtypes: int64(1), object(1)
memory usage: 784.0+ bytes


In [7]:
# ADDRESS
# Block ranges that block_range_split cannot parse as "<int>-<int>" are
# mapped to the placeholder range '10-100'. The result is assigned back to
# the column: the earlier `data.block_range.replace(..., inplace=True)` was a
# chained in-place update, which pandas warns about and which no longer
# changes `data` once Copy-on-Write is enabled.
data['block_range'] = data['block_range'].replace({
    'UNK': '10-100',
    '1.1103e+006-1.1104e+006': '10-100',
})
data['og_address'] = block_range_split(data)
# block_range and street_name are now folded into og_address.
# NOTE: this makes the cell non-idempotent -- re-running it needs a fresh `data`.
data = data.drop(columns=['block_range', 'street_name'])

address_table = create_table_from_df(
    data, 'og_address', 'full_address', 'address_pk')


address_table.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   address_pk    100 non-null    int64 
 1   full_address  100 non-null    object
dtypes: int64(1), object(1)
memory usage: 1.7+ KB


In [8]:
# DATETIME
# Lookup table of distinct timestamps, keyed by date_time_pk.
dt_table = create_table_from_df(
    data, 'date_time', 'date_time', 'date_time_pk')

# insert time data records
t = pd.to_datetime(dt_table['date_time'])

# Series.dt.week is deprecated and was removed in pandas 2.0;
# isocalendar().week yields the same ISO week number as a nullable UInt32,
# so it is cast back to a plain integer column.
# NOTE(review): the cast assumes no missing timestamps -- confirm.
iso_week = t.dt.isocalendar().week.astype('int64')

time_data = (dt_table.date_time_pk, dt_table.date_time, t.dt.hour,
             t.dt.day, iso_week, t.dt.month, t.dt.year, t.dt.weekday)
column_labels = ('date_time_pk', 'date_time', 'hour',
                 'day', 'week', 'month', 'year', 'weekday')
date_time_table = pd.DataFrame.from_dict(
    dict(zip(column_labels, time_data)))
date_time_table.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   date_time_pk  100 non-null    int64 
 1   date_time     100 non-null    object
 2   hour          100 non-null    int64 
 3   day           100 non-null    int64 
 4   week          100 non-null    int64 
 5   month         100 non-null    int64 
 6   year          100 non-null    int64 
 7   weekday       100 non-null    int64 
dtypes: int64(7), object(1)
memory usage: 6.4+ KB


  t.dt.day, t.dt.week, t.dt.month, t.dt.year, t.dt.weekday)


In [None]:
# CRIME FACT
# NOTE(review): this cell builds crime_fact before the natural-key columns
# are swapped for surrogate keys; a later cell rebuilds crime_fact from the
# merged `data` and overwrites this result -- confirm whether this cell is
# still needed.
crime_fact = data.reset_index().rename(columns={'index': 'pk'})
crime_fact['pk'] += 1  # 1-based primary key

In [12]:
# Swap each natural-key column in `data` for the surrogate key of its lookup
# table. The upload_table calls are left disabled, as before.
# NOTE: every step drops the column it joins on, so this cell cannot be
# re-run without rebuilding `data` first.

# NOTE(review): the offense step is disabled, so `data` keeps the text column
# offense_type instead of offense_pk -- confirm this is intended.
# data = drop_add_pk(data, offense_table, 'offense_type', 'offense_name')
# # upload table
# upload_table(offense_table, offense_table_insert, 'offense_table')


data = drop_add_pk(data, police_beat_table, 'beat', 'beat_name')
# upload table
#upload_table(police_beat_table, police_beat_table_insert,'police_beat_table')


data = drop_add_pk(data, premise_table,
                   'premise_description', 'premise_name')
# upload table
#upload_table(premise_table, premise_table_insert, 'premise_table')


data = drop_add_pk(data, address_table, 'og_address', 'full_address')
# upload table
#upload_table(address_table, address_table_insert, 'address_table')


# Only the key pair is merged in: the derived hour/day/week/... columns were
# previously joined and immediately dropped again. The invalid
# suffixes=(True, True) argument is gone as well.
data = (
    data
    .merge(date_time_table[['date_time_pk', 'date_time']], on='date_time')
    .drop(columns=['date_time'])
)
# upload table
#upload_table(date_time_table, datetime_table_insert, 'date_time_table')

In [13]:
# CRIME FACT
# The fact table is `data` plus a 1-based primary key taken from the row index.
crime_fact = data.reset_index().rename(columns={'index': 'pk'})
crime_fact['pk'] += 1

# upload table
#upload_table(crime_fact, crime_fact_table_insert, 'crime_fact_table')

In [14]:
# Column/dtype summary of the fact table.
# NOTE(review): offense_type is still a text column here because the offense
# key swap is commented out in the merge cell -- confirm this is intended.
crime_fact.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   pk              100 non-null    int64  
 1   offenses        100 non-null    int64  
 2   offense_type    100 non-null    object 
 3   temp            100 non-null    float64
 4   feels_like      100 non-null    float64
 5   humidity_per    100 non-null    int64  
 6   rain_vol_1h_mm  100 non-null    float64
 7   snow_vol_1h_mm  100 non-null    float64
 8   beat_pk         100 non-null    int64  
 9   premise_pk      100 non-null    int64  
 10  address_pk      100 non-null    int64  
 11  date_time_pk    100 non-null    int64  
dtypes: float64(4), int64(7), object(1)
memory usage: 9.5+ KB


In [15]:
# Preview the first rows of the fact table.
crime_fact.head()

Unnamed: 0,pk,offenses,offense_type,temp,feels_like,humidity_per,rain_vol_1h_mm,snow_vol_1h_mm,beat_pk,premise_pk,address_pk,date_time_pk
0,1,1,Theft,91.652,101.246,56,0.0,0.0,1,1,1,1
1,2,1,Auto Theft,98.438,102.524,35,0.0,0.0,11,1,12,12
2,3,1,Theft,89.996,90.032,38,0.0,0.0,33,1,56,56
3,4,1,Theft,52.808,52.34,97,0.0,0.0,56,1,88,88
4,5,1,Auto Theft,42.746,39.146,84,0.76,0.0,57,1,91,91
