In [1]:
import os
import glob
import psycopg2
import pandas as pd
import numpy as np
from sql_queries import *
## AWS
import boto3
import awswrangler as wr

# Database connection settings are read from the environment so that no
# credentials live in the notebook; an unset variable yields None.
db_host, db_name, db_user, db_pass, db_port = (
    os.environ.get(env_key)
    for env_key in ('DB_HOST', 'DB_NAME', 'DB_USER', 'DB_PASS', 'DB_PORT')
)

In [2]:
# Open the PostgreSQL connection used by the following cells, with the
# settings read from the environment.
conn = psycopg2.connect(
    host=db_host,
    port=db_port,
    database=db_name,
    user=db_user,
    password=db_pass,
)
# Autocommit mode: statements take effect without an explicit conn.commit().
conn.set_session(autocommit=True)
cur = conn.cursor()

In [3]:
# load data
# helper function
def bucket_raw_path(bucket_name,path_dir):
    """Build the ``s3://`` URI for a path inside a bucket.

    Parameters
    ----------
    bucket_name
        Name of the S3 bucket.
    path_dir
        Key or key prefix inside the bucket; it is appended verbatim after
        a single ``/``.

    Returns
    -------
    str
        ``s3://<bucket_name>/<path_dir>``.
    """
    return 's3://{}/{}'.format(bucket_name, path_dir)


def load_data(raw_s3, file_index=1):
    """Read one CSV object found under an S3 prefix.

    Parameters
    ----------
    raw_s3 : str
        S3 prefix passed to ``wr.s3.list_objects``.
    file_index : int, default 1
        Position, within the returned listing, of the object to read. The
        default keeps the previous behaviour of reading the second listed
        object; pass another index to load a different file.

    Returns
    -------
    Whatever ``wr.s3.read_csv`` returns for the selected object.

    Raises
    ------
    IndexError
        If the listing has no entry at ``file_index``.
    """
    objects = wr.s3.list_objects(raw_s3)
    try:
        file = objects[file_index]
    except IndexError:
        # Same exception type as before, but say what was looked for and where.
        raise IndexError(
            f'no object at index {file_index} under {raw_s3!r} '
            f'({len(objects)} object(s) listed)'
        ) from None
    return wr.s3.read_csv(file)

def create_table_from_df(dataframe,column_name,new_col_name,new_pk_name):
    """Build a lookup table from the distinct values of one column.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Source frame; it is not modified.
    column_name : str
        Column of ``dataframe`` whose distinct values become the rows.
    new_col_name : str
        Name given to the value column of the new table.
    new_pk_name : str
        Name given to the key column of the new table.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``new_pk_name`` then ``new_col_name``, with one row per
        distinct value in order of first appearance. Keys start at 1.
    """
    distinct_values = dataframe[column_name].unique().tolist()
    df_table = pd.DataFrame(distinct_values, columns=[new_col_name])
    # 1-based surrogate key placed as the first column.
    df_table.insert(0, new_pk_name, range(1, len(df_table) + 1))
    return df_table




def drop_add_pk(data,data_table,lo,ro):
    """Swap a descriptive column of ``data`` for the columns of a lookup table.

    ``data`` is inner-joined to ``data_table`` on ``data[lo] == data_table[ro]``
    and both join columns are then removed, so the remaining columns of
    ``data_table`` (typically its primary key) take the place of ``lo``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the descriptive column; it is not modified.
    data_table : pandas.DataFrame
        Lookup table containing column ``ro``.
    lo : str
        Join column in ``data``.
    ro : str
        Join column in ``data_table``.

    Returns
    -------
    pandas.DataFrame
        The merged frame without ``lo`` and ``ro``. Rows of ``data`` with no
        match in ``data_table`` are absent because the join is an inner join.
    """
    # The previous suffixes=(True, True) is not a valid suffix pair: any other
    # column shared by both frames would be renamed to '<name>True' on both
    # sides, producing duplicate labels. The pandas defaults ('_x', '_y') keep
    # such columns distinct.
    merged = data.merge(data_table, left_on=lo, right_on=ro)
    # lo and ro may be the same label; list each column to drop only once.
    return merged.drop(columns=list(dict.fromkeys([lo, ro])))

# LOAD DATA

In [4]:
# S3 prefix holding the capstone input CSV files (a plain string literal:
# the previous f-prefix had no placeholders to format).
raw = bucket_raw_path('dend-data', 'capstone/load-data/')
# List the objects under that prefix; the listing is the cell's output.
wr.s3.list_objects(raw)

['s3://dend-data/capstone/load-data/crime-weather-09-18.csv',
 's3://dend-data/capstone/load-data/crime-weather-sample-100-09-18.csv',
 's3://dend-data/capstone/load-data/crime-weather-sample-1000-09-18.csv']

In [5]:
# Entry at index 1 of the listing, i.e. the object load_data() reads.
wr.s3.list_objects(raw)[1]

's3://dend-data/capstone/load-data/crime-weather-sample-100-09-18.csv'

In [6]:
%%time
# Read the selected CSV under the `raw` prefix into `data`; %%time reports the cost.
data = load_data(raw)

CPU times: user 122 ms, sys: 40.5 ms, total: 163 ms
Wall time: 1.17 s


In [7]:
# Summarise the loaded frame: column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 24 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   date_time            100 non-null    object 
 1   offenses             100 non-null    int64  
 2   offense_type         100 non-null    object 
 3   block_range          100 non-null    object 
 4   street_name          100 non-null    object 
 5   type                 100 non-null    object 
 6   suffix               100 non-null    object 
 7   beat                 100 non-null    object 
 8   premise              100 non-null    object 
 9   date                 100 non-null    object 
 10  hour                 100 non-null    int64  
 11  year                 100 non-null    int64  
 12  premise_description  100 non-null    object 
 13  temp                 100 non-null    float64
 14  feels_like           100 non-null    float64
 15  temp_min             100 non-null    floa

In [8]:
# Preview the first rows of the raw frame.
data.head()

Unnamed: 0,date_time,offenses,offense_type,block_range,street_name,type,suffix,beat,premise,date,...,feels_like,temp_min,temp_max,humidity_per,wind_speed,rain_vol_1h_mm,snow_vol_1h_mm,clouds_all_per,weather_main,weather_description
0,2018-04-13 19:00:00,1,Theft,5100-5199,buffalo speedway,-,-,1A50,Restaurant or Cafeteria Parking Lot,04/13/2018,...,84.65,80.654,83.102,59,7.7,0.56,0.0,75,Rain,light rain
1,2009-11-10 10:00:00,1,Burglary,5400-5499,lymbar,dr,-,15E10,20R,11/10/09 00:00:00,...,63.5,62.654,64.418,92,4.1,0.0,0.0,1,Clear,sky is clear
2,2016-02-27 17:00:00,1,Burglary,6300-6399,bellfort,st,W,17E40,Apartment,2016-02-27 00:00:00,...,64.184,64.094,66.902,31,2.1,0.0,0.0,1,Clear,sky is clear
3,2013-10-26 17:00:00,1,Theft,7900-7999,wallisville,rd,-,9C20,250,2013-10-26 00:00:00,...,76.406,73.616,78.818,62,5.7,0.0,0.0,1,Clear,sky is clear
4,2018-03-10 00:00:00,1,Robbery,8900-8999,braesmont,dr,-,15E10,Apartment Parking Lot,03/10/2018,...,68.036,68.0,70.016,65,4.1,0.0,0.0,90,Clouds,overcast clouds


# CREATE TABLES FROM DF

## CREATE offense_table

In [9]:
# Lookup table with one row per distinct offense type and an offense_pk key.
offense_table = create_table_from_df(
    dataframe=data,
    column_name='offense_type',
    new_col_name='offense_name',
    new_pk_name='offense_pk',
)
offense_table.head()

Unnamed: 0,offense_pk,offense_name
0,1,Theft
1,2,Burglary
2,3,Robbery
3,4,Aggravated Assault
4,5,Auto Theft


In [None]:
# Show one randomly chosen row of `data` before the offense column is replaced.
data.sample()

In [None]:
# Replace the offense_type text column with the columns of offense_table.
# NOTE: this rebinds `data`; running the cell a second time fails because
# 'offense_type' no longer exists after the first run.
data = drop_add_pk(
    data=data,
    data_table=offense_table,
    lo='offense_type',
    ro='offense_name',
)

## CREATE police_beat_table

In [None]:
# Lookup table with one row per distinct police beat and a beat_pk key.
police_beat_table = create_table_from_df(
    dataframe=data,
    column_name='beat',
    new_col_name='beat_name',
    new_pk_name='beat_pk',
)
police_beat_table.head()

## Replace the beat column with beat_pk

In [None]:
# Replace the beat text column with the columns of police_beat_table.
# NOTE: this rebinds `data`; the cell cannot be re-run once 'beat' is dropped.
data = drop_add_pk(
    data=data,
    data_table=police_beat_table,
    lo='beat',
    ro='beat_name',
)
data.sample()

## CREATE premise_table (from premise_description)

In [None]:
# Lookup table with one row per distinct premise description and a premise_pk key.
premise_table = create_table_from_df(
    dataframe=data,
    column_name='premise_description',
    new_col_name='premise_name',
    new_pk_name='premise_pk',
)
premise_table.head()

## Replace the premise_description column with premise_pk

In [None]:
# Replace the premise_description text column with the columns of premise_table.
# NOTE: this rebinds `data`; the cell cannot be re-run once
# 'premise_description' is dropped.
data = drop_add_pk(
    data=data,
    data_table=premise_table,
    lo='premise_description',
    ro='premise_name',
)
data.sample()

# CREATE address_table

In [None]:
# Clean up block_range: the placeholder 'UNK' and one malformed value are
# mapped to the dummy range '10-100' so that they parse as '<low>-<high>'.
# The result is assigned back to the column: calling replace(..., inplace=True)
# on `data.block_range` acts on an intermediate Series and is not guaranteed
# to update `data` itself.
data['block_range'] = data['block_range'].replace(
    {'UNK': '10-100', '1.1103e+006-1.1104e+006': '10-100'}
)

def block_range_split(df):
    '''Build one address string per row from ``block_range`` and ``street_name``.

    Each ``block_range`` value is expected to look like ``'<low>-<high>'`` with
    integer bounds. The house number used is the midpoint of the two bounds,
    rounded up, and the result is ``'<midpoint> <street_name> Houston, TX'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with string columns ``block_range`` and ``street_name``; it is
        not modified.

    Returns
    -------
    pandas.Series
        Address strings aligned with the index of ``df``.

    Raises
    ------
    ValueError
        If a bound cannot be converted to an integer.
    '''
    # Split once and reuse both halves.
    bounds = df.block_range.str.split(pat='-', expand=True)
    low = bounds[0].astype('int')
    high = bounds[1].astype('int')
    midpoint = np.ceil((low + high) / 2).astype('int').astype('str')
    return midpoint + " " + df.street_name + " Houston, TX"


# Add an address string per row, built from block_range and street_name.
data['og_address'] = block_range_split(data)

In [None]:
# Preview the frame with the new og_address column.
data.head()

In [None]:
# Lookup table with one row per distinct address string and an address_pk key.
address_table = create_table_from_df(
    dataframe=data,
    column_name='og_address',
    new_col_name='full_address',
    new_pk_name='address_pk',
)
address_table.head()

In [None]:
## 

In [None]:
# Replace the og_address text column with the columns of address_table.
# NOTE: this rebinds `data`; the cell cannot be re-run once 'og_address'
# is dropped.
data = drop_add_pk(
    data=data,
    data_table=address_table,
    lo='og_address',
    ro='full_address',
)
data.sample()

# CREATE datetime_table

In [None]:
# One row per distinct timestamp string, keyed by date_time_pk.
dt_table = create_table_from_df(data,'date_time','date_time','date_time_pk')

# Parse the timestamp strings once so the calendar parts can be derived.
t = pd.to_datetime(dt_table['date_time'])

# Series.dt.week was deprecated in pandas 1.1 and removed in 2.0; the ISO week
# number now comes from dt.isocalendar().week. It is cast to int64 to match the
# other derived columns (assumes date_time has no missing values — TODO confirm
# on the full data set).
time_data = (
    dt_table.date_time_pk,
    dt_table.date_time,
    t.dt.hour,
    t.dt.day,
    t.dt.isocalendar().week.astype('int64'),
    t.dt.month,
    t.dt.year,
    t.dt.weekday,
)
column_labels = ('date_time_pk','date_time', 'hour', 'day','week','month','year','weekday')
date_time_table = pd.DataFrame.from_dict(dict(zip(column_labels, time_data)))
date_time_table.head()

In [None]:
# Show one randomly chosen row of `data` before the date/time columns are merged in.
data.sample()

In [None]:
# Attach date_time_pk and the derived calendar columns to every row of `data`.
# `data` already has columns with the same names as some date_time_table
# columns ('hour', 'year'); the previous suffixes=(True, True) renamed both
# copies to the same label (e.g. 'hourTrue'), which yields duplicate columns.
# Only the columns `data` does not have yet are merged in, so the existing
# 'hour' and 'year' keep their names.
data = data.merge(
    date_time_table.drop(
        columns=[
            col for col in date_time_table.columns
            if col in data.columns and col != 'date_time'
        ]
    ),
    on='date_time',
)
# TODO(review): decide whether the natural key and calendar columns should be
# dropped from `data` once date_time_pk is in place, e.g.:
#data.drop(['date_time','hour','day','week','month','year','weekday'], axis=1,inplace=True)

In [None]:
# Preview `data` after the date/time merge.
data.head()

In [None]:
# NOTE(review): same preview as the preceding cell; this cell looks redundant.
data.head()