# ET Flood and Crime Data

In [1]:
# The following code reads the flood data from CSV files extracted from APIs, transforms it, and loads it into SQL.
# It also extracts the crime data, transforms it, and loads it into SQL.

In [2]:
# Import Dependencies
import pandas as pd
import scipy.stats as st
import gmaps
import numpy as np
import os

# Flood Data

### Extract Flood Data Files

In [3]:
# List the flood CSV extracts. os.listdir returns names in arbitrary order and
# includes every directory entry, so keep only *.csv files and sort them: the
# de-duplication of addresses keeps the first occurrence, which makes the
# result depend on the order in which the files are read.
entries = sorted(
    name for name in os.listdir("Flood_Data/") if name.lower().endswith(".csv")
)
entries

['flood_data_77002.csv',
 'flood_data_77005.csv',
 'flood_data_77006.csv',
 'flood_data_77019.csv',
 'flood_data_77025.csv',
 'flood_data_77027.csv',
 'flood_data_77030.csv',
 'flood_data_77054.csv',
 'flood_data_77098.csv',
 'flood_data_batch0.csv',
 'flood_data_batch1.csv',
 'flood_data_batch2.csv',
 'flood_data_batch3.csv',
 'flood_data_batch4.csv',
 'flood_data_batch5.csv',
 'flood_data_batch6.csv',
 'flood_data_batch7.csv',
 'flood_data_batch8.csv']

## Transformation

In [4]:
# read all files with flood data
# Read each file once and concatenate in a single call; growing the frame with
# pd.concat inside the loop re-copies all previously loaded rows on every pass.
flood_frames = [pd.read_csv(f'Flood_Data/{entry}') for entry in entries]
flood_data_df = pd.concat(flood_frames, ignore_index=True)

#Remove all duplicate addresses (the first occurrence of each address is kept)
flood_data_df = (
    flood_data_df
    .drop_duplicates(subset="Address")
    .reset_index(drop=True)
    # 'Unnamed: 0' is presumably the row index saved into the source CSVs;
    # tolerate files that do not carry it.
    .drop(columns='Unnamed: 0', errors='ignore')
    .rename(columns={"Flood Description": "Flood_Description", "Flood Zone": "Flood_Zone"})
)

# Rows without a description are labelled as high-risk. Assign the result back
# instead of calling fillna(inplace=True) on the selected column: that chained
# form works on an intermediate object, triggers a pandas warning, and leaves
# the frame unchanged when Copy-on-Write is enabled.
flood_data_df['Flood_Description'] = flood_data_df['Flood_Description'].fillna("High-Risk Flood Zone")
flood_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30687 entries, 0 to 30686
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Address            30687 non-null  object 
 1   Latitude           30687 non-null  float64
 2   Longitude          30687 non-null  float64
 3   Flood_Description  30687 non-null  object 
 4   Flood_Zone         30687 non-null  object 
dtypes: float64(2), object(3)
memory usage: 1.2+ MB


In [5]:
#Generate ZIP Code to Merge with Crime Data
# Addresses end with the ZIP code (e.g. "... Houston, Texas 77002"), so take the
# last space-separated token. `n` is passed by keyword because recent pandas
# versions reject it as a positional argument. Staying on the .str accessor
# keeps every value aligned with flood_data_df's own index instead of relying
# on a second DataFrame happening to share the same row labels.
# NOTE(review): Zip_Code is text here, while the crime table stores it as an
# integer - cast one side before merging the two tables.
flood_data_df['Zip_Code'] = (
    flood_data_df['Address']
    .str.strip()
    .str.rsplit(' ', n=1)
    .str[-1]
)
flood_data_df.head()

Unnamed: 0,Address,Latitude,Longitude,Flood_Description,Flood_Zone,Zip_Code
0,"2202 CAROLINE ST Houston, Texas 77002",29.74614,-95.36987,AREA OF MINIMAL FLOOD HAZARD,X,77002
1,"2204 CAROLINE ST Houston, Texas 77002",29.74619,-95.36996,AREA OF MINIMAL FLOOD HAZARD,X,77002
2,"2206 CAROLINE ST Houston, Texas 77002",29.74624,-95.37004,AREA OF MINIMAL FLOOD HAZARD,X,77002
3,"2251 AUSTIN ST Houston, Texas 77002",29.7453,-95.36882,AREA OF MINIMAL FLOOD HAZARD,X,77002
4,"2255 AUSTIN ST Houston, Texas 77002",29.74525,-95.36874,AREA OF MINIMAL FLOOD HAZARD,X,77002


In [6]:
# Save the cleaned flood table to disk. With the default settings the DataFrame
# index is written as the first, unnamed column of the file.
flood_data_df.to_csv(path_or_buf='flood_data.csv')

In [7]:
#Create flood description table
# Count addresses per (description, zone) pair. Only the distinct group keys are
# needed for the lookup table, so it is built from the index of the grouped
# result and the counts themselves are not carried over.
zone_keys = ['Flood_Description', 'Flood_Zone']
flood_description = flood_data_df.groupby(zone_keys)['Address'].count()
flood_description_df = flood_description.index.to_frame(index=False)
flood_description_df.head()

Unnamed: 0,Flood_Description,Flood_Zone
0,0.2 PCT ANNUAL CHANCE FLOOD HAZARD,X
1,AREA OF MINIMAL FLOOD HAZARD,X
2,FLOODWAY,AE
3,High-Risk Flood Zone,AE


In [8]:
# Save the flood zone lookup table to disk. With the default settings the
# DataFrame index is written as the first, unnamed column of the file.
flood_description_df.to_csv(path_or_buf='flood_zones_description.csv')

# Crime Data

## Extract Crime Data Files

In [9]:
# Read Crime Data file
# Read "ZIP Code" as text. The column mixes plain ZIPs, ZIP+4 values such as
# 77026-0000 and blanks; when pandas infers the type, numeric values can come
# back as floats, which later stringify as e.g. "77002.0" and would then fail
# the exact-match ZIP filter.
crime_data_df = pd.read_csv(
    "Crime_Data/2019_Houston_Crimes.csv",
    dtype={"ZIP Code": str},
)
crime_data_df.head()

Unnamed: 0,Incident,Occurrence\nDate,Occurrence\nHour,NIBRS\nClass,NIBRSDescription,Offense Count,Beat,Premise,Block Range,StreetName,Street\nType,Suffix,City,ZIP Code
0,5619,1/1/2019,0,290,"Destruction, damage, vandalism",1,9C30,"Residence, Home (Includes Apartment)",9622.0,SAN CARLOS,,,HOUSTON,77013
1,17319,1/1/2019,0,35A,"Drug, narcotic violations",1,7C10,"Highway, Road, Street, Alley",,EAST,FWY,,HOUSTON,77020
2,18119,1/1/2019,0,290,"Destruction, damage, vandalism",1,1.60E+41,"Residence, Home (Includes Apartment)",16718.0,LONE QUAIL,CT,,HOUSTON,77489
3,19019,1/1/2019,0,520,Weapon law violations,1,,"Residence, Home (Includes Apartment)",1909.0,MELBOURNE,,,HOUSTON,77026-0000
4,20519,1/1/2019,0,13A,Aggravated Assault,1,1.50E+31,"Residence, Home (Includes Apartment)",4034.0,OSBY,DR,,HOUSTON,77025


## Transformation

In [10]:
#Clean Zip-Code Data
# ZIP codes of interest: the same nine areas covered by the per-ZIP flood data files.
TARGET_ZIP_CODES = ['77002', '77005', '77006', '77019', '77025',
                    '77027', '77030', '77054', '77098']

#Get 5 digit Zip Code
# Take the leading five digits of each value, so "77026-0000" becomes "77026".
# The pattern also tolerates padding and a float-style suffix ("77002.0") and
# rejects longer digit runs. Values that do not match (including blanks) become
# missing and are removed by the ZIP filter below. This replaces
# str.rsplit('-', 1), whose positional `n` is rejected by recent pandas versions.
zip5 = (
    crime_data_df['ZIP Code']
    .astype(str)
    .str.extract(r'^\s*(\d{5})(?!\d)', expand=False)
)

#Delete unnecessary columns
crime_data_df = (
    crime_data_df
    .assign(Zip_Code=zip5)
    .drop(columns=['ZIP Code', 'Beat', 'Suffix'])
)

#Clean ZIP Code Columns
# Keep only the target ZIP codes. The integer cast is done with DataFrame.astype,
# which returns a new frame, rather than by assigning into a filtered slice
# (which triggers SettingWithCopyWarning).
crime_data_df = (
    crime_data_df
    .loc[crime_data_df['Zip_Code'].isin(TARGET_ZIP_CODES)]
    .astype({'Zip_Code': int})
    .reset_index(drop=True)
)

#Rename Columns
crime_data_df = crime_data_df.rename(columns={"Occurrence\nDate": "Date",
                                              "Occurrence\nHour": "Hour",
                                              "NIBRS\nClass": "NIBRS_Class",
                                              "NIBRSDescription": "NIBRS_Description",
                                              "Offense Count": "Offense_Count",
                                              "Block Range": "Block_Range",
                                              "StreetName": "Street_Name",
                                              "Street\nType": "Street_Type"
                                              })

#Create CSV for QC
crime_data_df.to_csv('crime_data.csv')
crime_data_df.head()

Unnamed: 0,Incident,Date,Hour,NIBRS_Class,NIBRS_Description,Offense_Count,Premise,Block_Range,Street_Name,Street_Type,City,Zip_Code
0,20519,1/1/2019,0,13A,Aggravated Assault,1,"Residence, Home (Includes Apartment)",4034,OSBY,DR,HOUSTON,77025
1,20519,1/1/2019,0,23H,All other larceny,1,"Residence, Home (Includes Apartment)",4034,OSBY,DR,HOUSTON,77025
2,20519,1/1/2019,0,290,"Destruction, damage, vandalism",1,"Residence, Home (Includes Apartment)",4034,OSBY,DR,HOUSTON,77025
3,20519,1/1/2019,0,35A,"Drug, narcotic violations",1,"Residence, Home (Includes Apartment)",4034,OSBY,DR,HOUSTON,77025
4,34819,1/1/2019,0,290,"Destruction, damage, vandalism",1,"Residence, Home (Includes Apartment)",4065,SILVERWOOD,DR,HOUSTON,77025


In [11]:
# Print a summary of the transformed crime table (column dtypes and non-null counts) for QC
crime_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22373 entries, 0 to 22372
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Incident           22373 non-null  int64 
 1   Date               22373 non-null  object
 2   Hour               22373 non-null  int64 
 3   NIBRS_Class        22373 non-null  object
 4   NIBRS_Description  22373 non-null  object
 5   Offense_Count      22373 non-null  int64 
 6   Premise            22373 non-null  object
 7   Block_Range        22249 non-null  object
 8   Street_Name        22373 non-null  object
 9   Street_Type        20152 non-null  object
 10  City               22373 non-null  object
 11  Zip_Code           22373 non-null  int32 
dtypes: int32(1), int64(3), object(8)
memory usage: 2.0+ MB
