In [7]:
#Loaded Libraries
#-------------------
import pandas as pd
import numpy as np
import sys,os
import re
import glob
import pickle

# AWS
import boto3
import awswrangler as wr


# Helper functions

def bucket_raw_path(bucket_name,path_dir):
    '''get raw path of s3 for download'''
    raw_path = f's3://{bucket_name}/{path_dir}'
    return raw_path


def s3_files_to_df(s3_files_path):
    ''' load s3 file path from wr result
    returns a dataframe of concat files '''
    df_list = []
    for i in wr.s3.list_objects(s3_files_path):
        temp = wr.s3.read_excel(i)
        df_list.append(temp)
    # create df from list files
    df = pd.concat(df_list, ignore_index=True)
    return df



# Helper functions for cleaning data
def clean_2009(df):
    '''clean crime data from 2009'''
    df.columns = df.columns.str.lower().str.replace(' ','_').str.replace('\n','_').str.strip()
    # change column name
    df = df.rename(columns={'#_of_offenses': 'offenses'})
    # convert two columns into one datetime
    df['date_time'] = pd.to_datetime(df['date']) + pd.to_timedelta(df['hour'], unit='h')
    # convert offenses to int
    df['offenses'] = df['offenses'].astype('int64')
    col_ord =['date_time','offenses','offense_type', 'block_range', 'street_name', 'type', 'suffix','beat', 'premise','date','hour']
    df = df[col_ord]
    return df


def clean_2010(df):
    '''clean crime data from 2010'''
    df['offenses'] = pd.concat([df['. Of Offenses'].dropna(),
                              df['. of Offenses'].dropna()]).reindex_like(df)

    # drop columns
    df = df.drop(['. Of Offenses','. of Offenses', 'Field11',
    'Field12',
    'Field13'], axis = 1)
    ## cleanup columns
    df.columns = df.columns.str.lower().str.replace(' ','_').str.replace('\n','_').str.strip()

    # convert two columns into one datetime
    df['date_time'] = pd.to_datetime(df['date']) + pd.to_timedelta(df['hour'], unit='h')
    # convert offenses to int
    df['offenses'] = df['offenses'].astype('int64')

    col_ord =['date_time','offenses','offense_type', 'block_range', 'street_name', 'type', 'suffix','beat', 'premise','date','hour']
    df = df[col_ord]
    return df


def clean_2011(df):
    '''clean crime data from 2011'''
    ## cleanup columns
    df.columns = df.columns.str.lower().str.replace(' ','_').str.replace('\n','_').str.strip()
    df = df.drop([ 'field11','field12'], axis = 1)
    #change column name

    df.rename(columns={
    '._of_offenses': 'offenses',
    }, inplace=True)

    # convert offenses to int
    df['offenses'] = df['offenses'].astype('int64')

    # convert two columns into one datetime
    df['date_time'] = pd.to_datetime(df['date']) + pd.to_timedelta(df['hour'], unit='h')
    col_ord =['date_time','offenses','offense_type', 'block_range', 'street_name', 'type', 'suffix','beat', 'premise','date','hour']
    df = df[col_ord]
    return df

def clean_2012(df):
    '''clean crime data from 2012'''
    ## cleanup columns
    df.columns = df.columns.str.lower().str.replace(' ','_').str.replace('\n','_').str.strip()
    df = df.drop([ 'field11'], axis = 1)
    #change column name

    df.rename(columns={
        '._of_offenses': 'offenses',
    }, inplace=True)

    # convert two columns into one datetime
    df['date_time'] = pd.to_datetime(df['date']) + pd.to_timedelta(df['hour'], unit='h')
    # convert offenses to int
    df['offenses'] = df['offenses'].astype('int64')

    col_ord =['date_time','offenses','offense_type', 'block_range', 'street_name', 'type', 'suffix','beat', 'premise','date','hour']
    df = df[col_ord]
    return df


def clean_2013(df):
    '''clean crime data from 2013'''
    ## cleanup columns
    df.columns = df.columns.str.lower().str.replace(' ','_').str.replace('\n','_').str.strip()
    df = df.drop([  'field11','field2','field12','field13','field14'], axis = 1)
    #change column name

    df.rename(columns={
        '._of_offenses': 'offenses',
    }, inplace=True)
    # convert two columns into one datetime
    df['date_time'] = pd.to_datetime(df['date']) + pd.to_timedelta(df['hour'], unit='h')
    # convert offenses to int
    df['offenses'] = df['offenses'].astype('int64')
    col_ord =['date_time','offenses','offense_type', 'block_range', 'street_name', 'type', 'suffix','beat', 'premise','date','hour']
    df = df[col_ord]
    return df


def clean_2014(df):
    '''clean crime data from 2014'''
    # clean hour col
    print(f'unique hours:{df.Hour.unique()}')
    df['Hour'] =  df['Hour'].str.replace('\'', '')
    # Change hour to int
    df['Hour'] = df['Hour'].astype('int64')
    print(f'unique hours:{len(df.Hour.unique())}')
    df['block_range'] = pd.concat([df['Block Range'].dropna(),
                              df['BlockRange'].dropna()]).reindex_like(df)
    df['street_name'] = pd.concat([df['Street Name'].dropna(),
                                df['StreetName'].dropna()]).reindex_like(df)
    df['offenses'] = pd.concat([df['. Of Offenses'].dropna(),
                                df['. offenses'].dropna(),
                                df['. Offenses'].dropna(),
                                df['. Of'].dropna()]).reindex_like(df)
    # drop unused cols
    df = df.drop([  'Block Range','BlockRange','Street Name','StreetName','. Of Offenses','. offenses','. Offenses','. Of','Field11',
    ], axis = 1)

    ## cleanup col names
    df.columns = df.columns.str.lower().str.replace(' ','_').str.replace('\n','_').str.strip()

    # remove values with nat date
    df = df[~df.date.isna()].reset_index(drop=True)     
    df["premise"].fillna("UNK", inplace = True)
    df["block_range"].fillna("UNK", inplace = True)
    df["beat"].fillna("UNK", inplace = True)

    # convert two columns into one datetime
    df['date_time'] = pd.to_datetime(df['date']) + pd.to_timedelta(df['hour'], unit='h')

    # convert offenses to int
    df['offenses'] = df['offenses'].astype('int64')

    col_ord =['date_time','offenses','offense_type', 'block_range', 'street_name', 'type', 'suffix','beat', 'premise','date','hour']
    df = df[col_ord]
    return df





def s3_files_to_df_2014(s3_files_path):
    ''' load s3 file path from wr result
    returns a dataframe of concat files '''
    df_list = []
    for i in wr.s3.list_objects(s3_files_path):
        temp = wr.s3.read_excel(i, engine='openpyxl', dtype={'Hour': str})  # added dtype error fix for Hour
        df_list.append(temp)
    # create df from list files
    df = pd.concat(df_list, ignore_index=True)
    return df

In [47]:
def clean_2015(df):
    '''clean crime data from 2015'''
    ## cleanup columns
    df.columns = df.columns.str.lower().str.replace(' ','_').str.replace('\n','_').str.strip()
    df.rename(columns={'._offenses': 'offenses',}, inplace=True)
    df.rename(columns={'blockrange': 'block_range', 'streetname': 'street_name'}, inplace=True)
    ## Change null values to UNK
    df["premise"].fillna("UNK", inplace = True)
    # convert two columns into one datetime
    df['date_time'] = pd.to_datetime(df['date']) + pd.to_timedelta(df['hour'], unit='h')
    # convert offenses to int
    df['offenses'] = df['offenses'].astype('int64')

    col_ord =['date_time','offenses','offense_type', 'block_range', 'street_name', 'type', 'suffix','beat', 'premise','date','hour']
    df = df[col_ord]
    return df

def clean_2016(df):
    '''clean crime data from 2016'''
    ## cleanup columns
    df.columns = df.columns.str.lower().str.replace(' ','_').str.replace('\n','_').str.strip()
    #change column name
    df.rename(columns={'._offenses': 'offenses',}, inplace=True)
    df.rename(columns={'blockrange': 'block_range', 'streetname': 'street_name'}, inplace=True)
    ## Change null values to UNK

    df["premise"].fillna("UNK", inplace = True)
    # convert two columns into one datetime
    df['date_time'] = pd.to_datetime(df['date']) + pd.to_timedelta(df['hour'], unit='h')


    # convert offenses to int
    df['offenses'] = df['offenses'].astype('int64')

    col_ord =['date_time','offenses','offense_type', 'block_range', 'street_name', 'type', 'suffix','beat', 'premise','date','hour']
    df = df[col_ord]
    return df


def clean_2017(df):
    '''clean crime data from 2017'''
    df['block_range'] = pd.concat([df['Block Range'].dropna(),
                                df['BlockRange'].dropna()]).reindex_like(df)

    df['street_name'] = pd.concat([df['Street Name'].dropna(),
                                df['StreetName'].dropna()]).reindex_like(df)

    df['offenses'] = pd.concat([df['. offenses'].dropna(),
                                df['Offenses'].dropna()]).reindex_like(df)
    # drop unused columns
    df = df.drop([  'Block Range',
    'BlockRange','Street Name','StreetName','. offenses','Offenses'], axis = 1)
    df.columns = df.columns.str.lower().str.replace(' ','_').str.replace('\n','_').str.strip()
    ## Change null values to UNK

    df["premise"].fillna("UNK", inplace = True)
    df["street_name"].fillna("UNK", inplace = True)
    # convert two columns into one datetime
    df['date_time'] = pd.to_datetime(df['date']) + pd.to_timedelta(df['hour'], unit='h')

    col_ord =['date_time','offenses','offense_type', 'block_range', 'street_name', 'type', 'suffix','beat', 'premise','date','hour']
    df = df[col_ord]
    return df


def clean_2018(df):
    '''clean crime data from 2018'''
    ## cleanup columns
    df.columns = df.columns.str.lower().str.replace(' ','_').str.replace('\n','_').str.strip()
    df["street_name"].fillna("UNK", inplace = True)
    # convert two columns into one datetime
    df['date_time'] = pd.to_datetime(df['date']) + pd.to_timedelta(df['hour'], unit='h')
    # convert offenses to int
    df['offenses'] = df['offenses'].astype('int64')

    col_ord =['date_time','offenses','offense_type', 'block_range', 'street_name', 'type', 'suffix','beat', 'premise','date','hour']
    df = df[col_ord]
    return df



def Celsius_to_Kelvin(C):
    return (C + 273.15)

def Kelvin_to_Celsius(K):
    return (K - 273.15)


def Kelvin_to_Farh(K):
    return (K -273.15) * 9/5 + 32 


def fahr_to_celsius(temp_fahr):
    """Convert Fahrenheit to Celsius
    
    Return Celsius conversion of input"""
    temp_celsius = (temp_fahr - 32) * 5 / 9
    return temp_celsius


# Clean weather data
# source: https://openweathermap.org/history-bulk

def clean_weather(df):
    ## cleanup columns
    df.columns = df.columns.str.lower().str.replace(' ','_').str.replace('\n','_').str.strip()
    df = df[['dt','temp', 'feels_like',
    'temp_min',
    'temp_max', 'humidity',
    'wind_speed', 'rain_1h','snow_1h',
    'clouds_all',
    'weather_main',
    'weather_description']]

    df.rename(columns = {'dt':'date_time', 'clouds_all': 'clouds_all_per','humidity':'humidity_per','rain_1h':'rain_vol_1h_mm',
                        'snow_1h':'snow_vol_1h_mm'}, inplace = True)

    # change nan to zero
    df['rain_vol_1h_mm'] = df['rain_vol_1h_mm'].fillna(0)
    df['snow_vol_1h_mm'] = df['snow_vol_1h_mm'].fillna(0)

    # convert to datetime
    df.date_time = pd.to_datetime(df['date_time'], unit='s')

    df['temp'] = Kelvin_to_Farh(df["temp"])
    df['temp_min'] = Kelvin_to_Farh(df["temp_min"])
    df['feels_like'] = Kelvin_to_Farh(df["feels_like"])
    df['temp_max'] = Kelvin_to_Farh(df["temp_max"])
    # drop a duplicate weather rows with samde datetime
    df.drop_duplicates(subset=['date_time'],keep='first',inplace=True)
    return df


def clean_premise(df):
    # load premise dataset
    raw_directory = os.path.join('..','..','data','raw','crime_data','premise_codes.csv')
    premise_df = pd.read_csv(raw_directory)
    premise_df.columns = premise_df.columns.str.strip().str.lower().str.replace('-', '_').str.replace(' ', '_').str.replace('(', '').str.replace(')', '')
    #  rename premise column for easy merge
    premise_df.rename(columns={'premise_type': 'premise',}, inplace=True)
    # merge with main dataframe
    df = pd.merge(df,premise_df, on='premise', how='outer')
    # combine both columns and remove any nulls in between 
    # if value is null in premise_descriptiom, copy the matching row of premise to it
    df.loc[df['premise_description'].isnull(),'premise_description'] = df['premise']
    # bit of str cleanup
    df.premise_description = df.premise_description.str.replace('/', ' ').str.replace(',', ' ').str.replace('(', '').str.replace(')', '').str.strip()
    # drop df.premise column
    df.drop(['premise'], axis=1, inplace=True)
    return df

def save_to_S3_c(df,year):
    '''save crme data to S3'''
    file_name = f'crime_{year}.csv'
    path_to_save = f"s3://dend-data/capstone/inter-data/crime-data/{file_name}"
    wr.s3.to_csv(df, path_to_save, index=False)

def save_to_S3_w(df):
    '''save weather data to S3'''
    file_name = f'weather-08-18.csv'
    path_to_save = f"s3://dend-data/capstone/inter-data/weather-data/{file_name}"
    wr.s3.to_csv(df, path_to_save, index=False)

# load data

In [13]:
year = '2009'
raw = bucket_raw_path('dend-data',f'capstone/raw-data/crime-data/{year}')
d09 = s3_files_to_df(raw)

df09 = clean_2009(d09)
df09.head()

Unnamed: 0,date_time,offenses,offense_type,block_range,street_name,type,suffix,beat,premise,date,hour
0,2009-08-01,1,Robbery,3000-3099,DURHAM,DR,N,3B30,210,08/01/09 00:00:00,0
1,2009-08-01,1,Robbery,500-599,SEMINAR,DR,-,6B60,20A,08/01/09 00:00:00,0
2,2009-08-01,1,Robbery,8300-8399,NORTH HOUSTON ROSSLYN,RD,-,6B30,18N,08/01/09 00:00:00,0
3,2009-08-01,1,Robbery,6300-6399,SKYLINE,DR,-,18F30,18A,08/01/09 00:00:00,0
4,2009-08-01,1,Robbery,5900-5999,FULTON,ST,-,2A20,18N,08/01/09 00:00:00,0


In [19]:
# save
save_to_S3_c(df09,'2009')

In [11]:
# 2010
year = '2010'
raw = bucket_raw_path('dend-data',f'capstone/raw-data/crime-data/{year}')
c10 = s3_files_to_df(raw)
df10 = clean_2010(c10)
df10.head()

Unnamed: 0,date_time,offenses,offense_type,block_range,street_name,type,suffix,beat,premise,date,hour
0,2010-04-17 00:00:00,1,Murder,6600-6699,HEFFERNAN,-,-,13D20,05W,2010-04-17,0
1,2010-04-08 20:00:00,1,Murder,10100-10199,LUCORE,-,-,11H40,20R,2010-04-08,20
2,2010-04-01 22:00:00,2,Murder,11400-11499,CARVEL,LN,-,19G20,13R,2010-04-01,22
3,2010-04-17 01:00:00,1,Murder,3700-3799,WHEELER,-,-,10H60,13R,2010-04-17,1
4,2010-04-08 23:00:00,1,Murder,5100-5199,MYRTLEWOOD,DR,-,14D30,20R,2010-04-08,23


In [20]:
save_to_S3_c(df10,'2010')

In [12]:
year = '2011'
raw = bucket_raw_path('dend-data',f'capstone/raw-data/crime-data/{year}')
d11 = s3_files_to_df(raw)
df11 = clean_2011(d11)
df11.head()

Unnamed: 0,date_time,offenses,offense_type,block_range,street_name,type,suffix,beat,premise,date,hour
0,2011-04-13 01:00:00,1,Murder,6400-6499,BANKSIDE,DR,-,17E40,20A,2011-04-13,1
1,2011-04-06 14:00:00,1,Murder,2900-2999,HAYES,RD,-,20G30,20A,2011-04-06,14
2,2011-01-10 19:00:00,1,Murder,8400-8499,GLENSCOT,-,-,13D20,20R,2011-01-10,19
3,2011-04-03 23:00:00,1,Murder,10700-10799,BELLFORT,ST,W,19G50,18N,2011-04-03,23
4,2011-04-17 23:00:00,1,Murder,2500-2599,BROADWAY,ST,-,11H20,20A,2011-04-17,23


In [21]:
save_to_S3_c(df11,'2011')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 127659 entries, 0 to 127658
Data columns (total 11 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   date_time     127659 non-null  datetime64[ns]
 1   offenses      127659 non-null  int64         
 2   offense_type  127659 non-null  object        
 3   block_range   127659 non-null  object        
 4   street_name   127659 non-null  object        
 5   type          127659 non-null  object        
 6   suffix        127659 non-null  object        
 7   beat          127659 non-null  object        
 8   premise       127659 non-null  object        
 9   date          127659 non-null  datetime64[ns]
 10  hour          127659 non-null  int64         
dtypes: datetime64[ns](2), int64(2), object(7)
memory usage: 10.7+ MB


In [14]:
year = '2012'
raw = bucket_raw_path('dend-data',f'capstone/raw-data/crime-data/{year}')
d12 = s3_files_to_df(raw)
df12 = clean_2012(d12)
df12.head()

Unnamed: 0,date_time,offenses,offense_type,block_range,street_name,type,suffix,beat,premise,date,hour
0,2012-04-05 02:00:00,1,Murder,6100-6199,CLARIDGE,DR,-,17E40,20R,2012-04-05,2
1,2012-04-04 22:00:00,1,Murder,11700-11799,HEMPSTEAD,HWY,-,3B10,18A,2012-04-04,22
2,2012-04-01 23:00:00,1,Murder,7500-7599,CORPORATE,DR,-,19G10,20A,2012-04-01,23
3,2012-04-21 23:00:00,1,Murder,6200-6299,RIETTA,-,-,8C10,20R,2012-04-21,23
4,2012-04-01 06:00:00,1,Murder,4200-4299,34TH,ST,W,3B10,20A,2012-04-01,6


In [22]:
save_to_S3_c(df12,'2012')

In [16]:
year = '2013'
raw = bucket_raw_path('dend-data',f'capstone/raw-data/crime-data/{year}')
d13 = s3_files_to_df(raw)
df13 = clean_2013(d13)
df13.head()

Unnamed: 0,date_time,offenses,offense_type,block_range,street_name,type,suffix,beat,premise,date,hour
0,2013-04-25 05:00:00,1,Murder,9400-9499,WOODFAIR,DR,-,19G10,13R,2013-04-25,5
1,2013-04-14 00:00:00,1,Murder,6100-6199,BELLFORT,ST,W,17E40,18A,2013-04-14,0
2,2013-04-02 16:00:00,1,Murder,9900-9999,RICHMOND,AVE,-,20G10,20A,2013-04-02,16
3,2013-04-19 22:00:00,1,Murder,1300-1399,29TH,ST,E,2A20,13R,2013-04-19,22
4,2013-04-23 00:00:00,1,Murder,500-599,RUSK,-,-,1A10,190,2013-04-23,0


In [23]:
save_to_S3_c(df13,'2013')

In [10]:
year = '2014'
raw = bucket_raw_path('dend-data',f'capstone/raw-data/crime-data/{year}')
d14 = s3_files_to_df_2014(raw)
df14 = clean_2014(d14)
df14.head()

unique hours:['17' '05' '18' '03' '06' '21' '13' '14' '02' '23' '12' '00' '08' '19'
 '20' '01' '09' '16' '15' '10' '22' '07' '11' '04' '24' "'15" "'07" "'19"
 "'18" "'00" "'05" "'13" "'14" "'22" "'23" "'20" "'16" "'17" "'06" "'03"
 "'01" "'09" "'02" "'10" "'11" "'12" "'21" "'08" "'04"]
unique hours:25


Unnamed: 0,date_time,offenses,offense_type,block_range,street_name,type,suffix,beat,premise,date,hour
0,2014-04-19 17:00:00,1,Murder,8500-8599,MARTIN LUTHER KING,BLVD,-,14D30,18D,2014-04-19,17
1,2014-04-28 05:00:00,1,Murder,3600-3699,MCKINNEY,ST,-,10H20,13R,2014-04-28,5
2,2014-04-27 18:00:00,3,Murder,7400-7499,HILLMONT,-,-,5F30,20A,2014-04-27,18
3,2014-04-09 18:00:00,2,Murder,5400-5499,RENWICK,-,-,17E10,20A,2014-04-09,18
4,2014-04-24 03:00:00,1,Murder,9300-9399,RICHMOND,AVE,-,18F50,03B,2014-04-24,3


In [24]:
save_to_S3_c(df14,'2014')

In [3]:
year = '2015'
raw = bucket_raw_path('dend-data',f'capstone/raw-data/crime-data/{year}')
d15 = s3_files_to_df(raw)
df15 = clean_2015(d15)
df15.head()

Unnamed: 0,date_time,offenses,offense_type,block_range,street_name,type,suffix,beat,premise,date,hour
0,2015-02-26 12:00:00,1,Theft,4900-4999,CANAL,ST,-,10H10,"Road, Street, or Sidewalk",2015-02-26,12
1,2015-04-05 16:00:00,1,Burglary,100-199,YORK,-,-,10H10,Residence or House,2015-04-05,16
2,2015-04-06 20:00:00,1,Rape,UNK,CANAL,CT,-,10H10,Residence or House,2015-04-06,20
3,2015-04-16 08:00:00,1,Theft,400-499,ENNIS,ST,-,10H10,Construction Site,2015-04-16,8
4,2015-04-01 19:00:00,1,Theft,UNK,SIDNEY,-,-,10H10,UNK,2015-04-01,19


In [25]:
save_to_S3_c(df15,'2015')

In [4]:
year = '2016'
raw = bucket_raw_path('dend-data',f'capstone/raw-data/crime-data/{year}')
d16 = s3_files_to_df(raw)
df16 = clean_2016(d16)
df16.head()

Unnamed: 0,date_time,offenses,offense_type,block_range,street_name,type,suffix,beat,premise,date,hour
0,2016-04-10 20:00:00,1,Robbery,4000-4099,MILBY,-,-,10H10,"Road, Street, or Sidewalk",2016-04-10,20
1,2016-04-11 19:00:00,2,Aggravated Assault,400-499,YORK,-,-,10H10,"Road, Street, or Sidewalk",2016-04-11,19
2,2016-04-12 20:00:00,1,Robbery,1900-1999,RUNNELS,-,-,10H10,Apartment,2016-04-12,20
3,2016-04-13 02:00:00,1,Auto Theft,100-199,SIDNEY,-,-,10H10,Driveway,2016-04-13,2
4,2016-04-14 03:00:00,1,Burglary,3300-3399,CANAL,ST,-,10H10,Service or Gas Station,2016-04-14,3


In [26]:
save_to_S3_c(df16,'2016')

In [5]:
year = '2017'
raw = bucket_raw_path('dend-data',f'capstone/raw-data/crime-data/{year}')
d17 = s3_files_to_df(raw)
df17 = clean_2017(d17)
df17.head()

Unnamed: 0,date_time,offenses,offense_type,block_range,street_name,type,suffix,beat,premise,date,hour
0,2017-04-10 15:00:00,1.0,Burglary,200-299,CLIFTON,-,-,10H10,Residence or House,2017-04-10 00:00:00,15
1,2017-04-11 15:00:00,1.0,Theft,2300-2399,CANAL,ST,-,10H10,Restaurant or Cafeteria Parking Lot,2017-04-11 00:00:00,15
2,2017-04-11 17:00:00,1.0,Theft,2300-2399,CANAL,ST,-,10H10,Restaurant or Cafeteria Parking Lot,2017-04-11 00:00:00,17
3,2017-04-12 09:00:00,1.0,Burglary,4600-4699,CANAL,ST,-,10H10,Miscellaneous Business (Non-Specific),2017-04-12 00:00:00,9
4,2017-04-12 19:00:00,1.0,Theft,100-199,ADAM,LN,-,10H10,"Other, Unknown, or Not Listed",2017-04-12 00:00:00,19


In [27]:
save_to_S3_c(df17,'2017')

In [6]:
year = '2018'
raw = bucket_raw_path('dend-data',f'capstone/raw-data/crime-data/{year}')
d18 = s3_files_to_df(raw)
df18 = clean_2018(d18)
df18.head()

Unnamed: 0,date_time,offenses,offense_type,block_range,street_name,type,suffix,beat,premise,date,hour
0,2018-04-03 14:00:00,1,Theft,5900-5999,HARRISBURG,-,-,10H10,Commercial Building,04/03/2018,14
1,2018-04-05 07:00:00,1,Theft,300-399,DELMAR,ST,-,10H10,"Road, Street, or Sidewalk",04/05/2018,7
2,2018-04-05 17:00:00,1,Theft,2100-2199,RUNNELS,-,-,10H10,Apartment,04/05/2018,17
3,2018-04-05 17:00:00,1,Theft,2400-2499,NAVIGATION,BLVD,-,10H10,Bar or Night Club Parking Lot,04/05/2018,17
4,2018-04-07 04:00:00,1,Aggravated Assault,3300-3399,NAVIGATION,BLVD,-,10H10,Residence or House,04/07/2018,4


In [28]:
save_to_S3_c(df18,'2018')

## weather

In [34]:

raw = bucket_raw_path('dend-data',f'capstone/raw-data/weather-data/')

's3://dend-data/capstone/raw-data/weather-data/'

In [37]:
wr.s3.list_objects(raw)[0]

's3://dend-data/capstone/raw-data/weather-data/b5af47a41a784be4c6fca0b53302f0a1.csv'

In [40]:
dfw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 151265 entries, 0 to 151264
Data columns (total 25 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   dt                   151265 non-null  int64  
 1   dt_iso               151265 non-null  object 
 2   timezone             151265 non-null  int64  
 3   city_name            151265 non-null  object 
 4   lat                  151265 non-null  float64
 5   lon                  151265 non-null  float64
 6   temp                 151265 non-null  float64
 7   feels_like           151265 non-null  float64
 8   temp_min             151265 non-null  float64
 9   temp_max             151265 non-null  float64
 10  pressure             151265 non-null  int64  
 11  sea_level            0 non-null       float64
 12  grnd_level           0 non-null       float64
 13  humidity             151265 non-null  int64  
 14  wind_speed           151265 non-null  float64
 15  wind_deg         

In [41]:
dfweather = clean_weather(dfw)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


In [48]:
# save data
save_to_S3_w(dfweather)

In [49]:
dfweather.head()

Unnamed: 0,date_time,temp,feels_like,temp_min,temp_max,humidity_per,wind_speed,rain_vol_1h_mm,snow_vol_1h_mm,clouds_all_per,weather_main,weather_description
0,2005-01-01 00:00:00,68.918,69.692,66.218,69.134,89,3.1,0.0,0.0,75,Clouds,broken clouds
1,2005-01-01 01:00:00,66.092,67.1,64.418,66.254,100,2.6,0.0,0.0,40,Clouds,scattered clouds
2,2005-01-01 02:00:00,66.182,67.208,62.654,66.254,100,3.6,0.0,0.0,90,Fog,fog
3,2005-01-01 03:00:00,64.742,65.624,64.418,65.174,100,3.1,0.0,0.0,90,Fog,fog
4,2005-01-01 04:00:00,64.256,65.084,62.618,65.174,100,3.1,0.0,0.0,90,Fog,fog


In [55]:
df10.date_time.dt.year.unique()

array([2010, 2009, 1995, 2006, 2008, 2007, 1966, 2003, 1999, 2004, 2000,
       2002, 1990, 1985, 2001, 1974, 2005, 2011, 1983, 1977, 1998, 1991,
       1969, 1959, 1980, 1996])

### 