In [1]:
import numpy as np
import pandas as pd
import geopandas as gpd
import matplotlib
import matplotlib.cm as cm
import shapefile
from shapely.geometry import shape, mapping, Point, Polygon
from zipfile import ZipFile
from io import BytesIO
import descartes
import matplotlib.pyplot as plt
import re

In [2]:
# Excel workbook holding the train/test points; the cells below load it with
# pd.read_excel and use its 'Latitude' and 'Longitude' columns.
train_test_path = 'Data/test_data_042819.xlsx'

# Tornado Data Code

In [3]:
# Coordinates for tornado disasters in Columbus, 1950-2010: http://www.usa.com/columbus-oh-natural-disasters-extremes.htm
# Load the historical events and discard every row that contains a missing value.
tornado_df = pd.read_csv("Data/HistoricalTornadoEvents.csv").dropna(axis=0)

def deg_to_decimal(d, m, s, direction):
    """Convert a degrees/minutes/seconds reading to signed decimal degrees.

    Parameters
    ----------
    d, m, s : str or number
        Degrees (converted with ``int``), minutes and seconds (converted with ``float``).
    direction : str
        Hemisphere letter; 'S' and 'W' yield a negative result, anything else positive.

    Returns
    -------
    float
        The reading in decimal degrees.
    """
    magnitude = int(d) + float(m)/60 + float(s)/3600
    sign = -1 if direction in ('S', 'W') else 1
    return magnitude * sign

def tornado_df_coords(str_col):
    """Parse a "lat / long" text column of ``tornado_df`` into decimal degrees.

    Each cell is split on " / "; the first two parts must each start with a
    degrees°minutes'hemisphere reading (e.g. ``39°58'N``). The data carries no
    seconds, so 0 is passed for them.

    Parameters
    ----------
    str_col : str
        Name of the ``tornado_df`` column to parse.

    Returns
    -------
    list of [float, float]
        One ``[latitude, longitude]`` pair per row, in row order.

    Raises
    ------
    ValueError
        If a cell does not split into two parts or a part is not a recognisable reading.
    """
    # Raw string keeps "\d" a regex escape instead of an invalid string escape.
    # The class lists only hemisphere letters: the old "[N|S|E|W]" also accepted "|".
    reading = re.compile(r"(\d+)°(\d+)'([NSEW])")

    def _to_decimal(text):
        # Fail with the offending text instead of an AttributeError on None.
        found = reading.match(text)
        if found is None:
            raise ValueError("Unrecognised coordinate %r in column %r" % (text, str_col))
        return deg_to_decimal(found.group(1), found.group(2), 0, found.group(3))

    coords_new = []
    for cell in tornado_df[str_col]:
        lat_text, long_text = cell.split(" / ")[:2]
        coords_new.append([_to_decimal(lat_text), _to_decimal(long_text)])

    return coords_new

# Decimal-degree [lat, long] pairs for where each tornado track begins and ends.
starts, ends = (tornado_df_coords(column) for column in ('Start Lat/Log', 'End Lat/Log'))

def convert(coords):
    """Build the tornado feature frame, one row per coordinate pair in ``coords``.

    The per-tornado columns of ``tornado_df`` are repeated twice, so ``coords``
    must be the start coordinates followed by the end coordinates of every row
    of ``tornado_df`` (i.e. ``starts + ends``).

    Parameters
    ----------
    coords : list of [float, float]
        ``[latitude, longitude]`` pairs.

    Returns
    -------
    pandas.DataFrame
        Columns "Lat", "Long", "Distance", "Magnitude" and "Length"
        (the leading number of the "Length" text, as a float).

    Raises
    ------
    ValueError
        If a "Length" value does not start with ``<number> Mile(s)``.
    """
    lat = [pair[0] for pair in coords]
    long = [pair[1] for pair in coords]
    dist = 2*list(tornado_df["Distance (miles)"])
    magnitude = 2*list(tornado_df["Magnitude"])

    # Raw string with an escaped, optional decimal part. The previous "\d+.\d+"
    # let "." match any character and rejected whole numbers such as "7 Miles".
    length_pattern = re.compile(r'(\d+(?:\.\d+)?)\s*Miles?')

    def _miles(text):
        found = length_pattern.match(text)
        if found is None:
            raise ValueError("Unrecognised tornado length %r" % (text,))
        return float(found.group(1))

    length = [_miles(text) for text in 2*list(tornado_df["Length"])]

    d = {"Lat": lat, "Long": long, "Distance": dist, "Magnitude": magnitude, "Length": length}
    return pd.DataFrame(data=d)

# Stack start and end coordinates into one list so each tornado contributes two
# candidate points; convert() repeats the per-tornado columns twice to match.
dist_mag_tornado_df = convert(starts+ends)

In [4]:
# Load the points to be featurised. calc_smallest_dist_tornado below reads this
# module-level frame directly rather than receiving it as an argument.
traintest = pd.read_excel(train_test_path) # traintest has 158 rows

def calc_smallest_dist_tornado(comp):
    """Find, for every row of the global ``traintest``, the nearest row of ``comp``.

    "Nearest" is the straight-line distance between (Latitude, Longitude) and
    (Lat, Long), computed directly on the coordinate values (no great-circle
    correction). Ties go to the earliest row of ``comp``.

    Parameters
    ----------
    comp : pandas.DataFrame
        Needs 'Lat', 'Long', 'Distance', 'Magnitude' and 'Length' columns.

    Returns
    -------
    tuple
        ``(d, vals)``. ``d`` holds the smallest distance per ``traintest`` row
        (``np.inf`` when no row of ``comp`` yields a finite distance). ``vals``
        maps "Distance", "Magnitude" (as ``int``) and "Length" to the matched
        row's values, or 0 when there is no match.
    """
    # Pull the coordinates out once and work positionally, so the result does
    # not depend on either frame carrying a default 0..n-1 index.
    comp_lat = comp['Lat'].to_numpy(dtype=float)
    comp_long = comp['Long'].to_numpy(dtype=float)
    point_lat = traintest['Latitude'].to_numpy(dtype=float)
    point_long = traintest['Longitude'].to_numpy(dtype=float)

    d = []
    vals = {"Distance": [], "Magnitude": [], "Length": []}
    for lat, long in zip(point_lat, point_long):
        smallest = np.inf
        tornado_dist = 0
        tornado_mag = 0
        tornado_length = 0

        dists = np.sqrt((lat - comp_lat) ** 2 + (long - comp_long) ** 2)
        # Only finite distances can beat the initial np.inf (NaN compares False).
        reachable = np.flatnonzero(dists < np.inf)
        if reachable.size:
            # argmin returns the first minimum, i.e. the earliest row on ties.
            nearest = reachable[np.argmin(dists[reachable])]
            smallest = dists[nearest]
            tornado_dist = comp['Distance'].iloc[nearest]
            tornado_mag = int(comp['Magnitude'].iloc[nearest])
            tornado_length = comp['Length'].iloc[nearest]

        d.append(smallest)
        vals["Distance"].append(tornado_dist)
        vals["Magnitude"].append(tornado_mag)
        vals["Length"].append(tornado_length)
    return (d, vals)

# Nearest-tornado distance and attributes for each train/test row.
# NOTE: `distances` is reassigned by the point-of-interest cell further down, so the
# thresholding cell that follows this one must run before that cell does.
distances, vals = calc_smallest_dist_tornado(dist_mag_tornado_df)

In [5]:
# Keep the nearest tornado's Distance/Magnitude only when the point lies closer to it
# than the tornado's own Length (miles, converted to degrees at 69 miles per degree);
# otherwise record 0 for both.
miles_per_deg = 69
final_vals = {"Distance": [], "Magnitude": []}

for nearest_deg, t_dist, t_mag, t_len in zip(distances, vals["Distance"], vals["Magnitude"], vals["Length"]):
    within_track = nearest_deg < t_len/miles_per_deg
    final_vals["Distance"].append(t_dist if within_track else 0)
    final_vals["Magnitude"].append(t_mag if within_track else 0)
    
# df here is going to be the final DF with distance/magnitude for each point in train/test data
# d = {"Lat": traintest['Latitude'], "Long": traintest['Longitude'], "Distance": final_vals["Distance"], "Magnitude": final_vals["Magnitude"]}

# unnecessary
# df = pd.DataFrame(data=d)

# Population Data Code

In [6]:
def getMatchingGEOIDData(geoid):
    """Return the rows of the global ``gdf`` whose GEOID equals ``int(geoid)``.

    The result is a frame restricted to the "Area" and "geometry" columns; it
    is empty when no row matches.
    """
    is_match = gdf["GEOID"] == int(geoid)
    matching_rows = gdf[is_match]
    return matching_rows[["Area", "geometry"]]

def geoidToCountyLatLong(geoid):
    """Return the "NAME" column of the ``county_data`` rows for the county containing ``geoid``.

    The county code is obtained by dropping the last seven digits of ``geoid``
    (division by 10,000,000, truncated to int). Despite the function's name the
    result holds only the county name, as a one-column frame; it is empty when
    the code is not found.
    """
    county_code = int(geoid / 10000000)
    in_county = county_data["GEOID"] == county_code
    return county_data[in_county][["NAME"]]

def pointToGeoid(long, lat):
    """Return the GEOID of the ``gdf_original`` row whose geometry contains the point.

    ``long`` and ``lat`` must be of the same type: either two scalars or two
    lists of equal length. Returns 0.0 when no geometry contains the point.

    NOTE(review): for list input a Point is built for every pair, but only the
    first point's containment result is consulted, so a single GEOID (that of
    the first point) is returned. Confirm whether list callers expect more.
    """
    assert(type(long) == type(lat)), "Parameters must be the same type"

    if type(long) == list:
        assert(len(long) == len(lat)), "Parameters must have same length"
        candidates = [Point(x, y) for x, y in zip(long, lat)]
    else:
        candidates = [Point(long, lat)]

    pnts = gpd.GeoDataFrame(geometry=candidates)
    for _, block in gdf_original.iterrows():
        contains_first_point = pnts.within(block.geometry)[0]
        if contains_first_point:
            return block.GEOID

    return 0.0

def getGeoidPopulation(geoid):
    """Return the "2010 Total Population" recorded for ``geoid`` in ``ohio_population_data``.

    Returns 0.0 unless exactly one row has a GEOID equal to ``int(geoid)``.
    """
    population = ohio_population_data[ohio_population_data['GEOID'] == int(geoid)]["2010 Total Population"]
    # .iloc[0] takes the single matching row by position. population[0] is a
    # label lookup on an integer index and only worked via a positional fallback
    # that pandas has deprecated.
    return population.iloc[0] if population.size == 1 else 0.0

def getGeoidArea(geoid):
    """Return the "Area (square miles)" recorded for ``geoid`` in ``ohio_population_data``.

    Returns 0.0 unless exactly one row has a GEOID equal to ``int(geoid)``.
    """
    area = ohio_population_data[ohio_population_data['GEOID'] == int(geoid)]["Area (square miles)"]
    # .iloc[0] takes the single matching row by position. area[0] is a label
    # lookup on an integer index and only worked via a positional fallback that
    # pandas has deprecated.
    return area.iloc[0] if area.size == 1 else 0.0

def getGeoidCountyName(geoid):
    """Return the "CountyNames" value recorded for ``geoid`` in ``ohio_population_data``.

    Returns "" unless exactly one row has a GEOID equal to ``int(geoid)``.
    """
    county = ohio_population_data[ohio_population_data['GEOID'] == int(geoid)]["CountyNames"]
    # .iloc[0] takes the single matching row by position. county[0] is a label
    # lookup on an integer index and only worked via a positional fallback that
    # pandas has deprecated.
    return county.iloc[0] if county.size == 1 else ""

def addGeoidColumns(df):
    """Add a 'Pop_Den' (population / area) column to ``df`` and return ``df``.

    Each row's (Longitude, Latitude) is resolved to a GEOID, whose population
    and area are then looked up. The GEOID itself is kept only on an internal
    working copy; ``df`` is modified in place and gains just 'Pop_Den'.
    A point without a GEOID gets population 0.0 and area 0.0, so its density
    is NaN.

    Precondition: ``df`` is a dataframe with 'Longitude' and 'Latitude' columns.
    """
    assert('Longitude' in df.columns), "Cannot find longitude column"
    assert('Latitude' in df.columns), "Cannot find latitude column"

    def _row_geoid(row):
        return pointToGeoid(row['Longitude'], row['Latitude'])

    working = df.copy(deep=False)
    working = working.assign(GEOID=working.apply(_row_geoid, axis=1).values)

    population = working.apply(lambda row: getGeoidPopulation(row['GEOID']), axis=1)
    area = working.apply(lambda row: getGeoidArea(row['GEOID']), axis=1)

    df['Pop_Den'] = population / area
    return df

In [7]:
# Census table: only the first two columns are kept; the first one holds the GEOIDs.
census_data = pd.read_excel("Data/Columbus_Population.xlsx").iloc[:,0:2]
geoids = census_data.iloc[:,0]

# CREDIT TO http://andrewgaidus.com/Reading_Zipped_Shapefiles/, used to parse census data taken from
# .dbf, .prj, .shp, and .shx files
# The archive is opened in a with-block so its file handle is closed once the members
# have been copied into memory. Names are sorted, so the four members unpack in
# dbf/prj/shp/shx order as long as exactly one file per extension is present.
with ZipFile("Data/ohio_tigerfiles.zip") as zipFile:
    filenames = [y for y in sorted(zipFile.namelist()) for ending in ['dbf', 'prj', 'shp', 'shx'] if y.endswith(ending)]
    dbf, prj, shp, shx = [BytesIO(zipFile.read(filename)) for filename in filenames]

reader = shapefile.Reader(shp=shp, shx=shx, dbf=dbf)
attributes, geometry = [], []
field_names = [field[0] for field in reader.fields[1:]]  # fields[0] is skipped; the rest name the record values
for row in reader.shapeRecords():
    geometry.append(shape(row.shape.__geo_interface__))
    attributes.append(dict(zip(field_names, row.record)))

# Put tigerfiles into GeoDataFrame
gdf = gpd.GeoDataFrame(data = attributes, geometry = geometry)[["ALAND10", "GEOID10", "geometry"]]
gdf = gdf.rename(index=str, columns={"ALAND10": "Area", "GEOID10": "GEOID"})
gdf.GEOID = gdf.GEOID.astype(int)
gdf = gdf[gdf["GEOID"].isin(geoids)] # Only get data on GEOIDs that match census data above

# Keep an untouched deep copy (with the original geometry objects) before the
# "geometry" column is overwritten below; pointToGeoid searches this copy.
gdf_original = gdf.copy(deep=True)

# Latitude/Longitude ordering is switched: reverse every coordinate of each shape's
# first coordinate sequence and store the resulting lists as the "geometry" column.
block_coord_array = [
    [list(reversed(vertex)) for vertex in mapping(block_shape)['coordinates'][0]]
    for block_shape in gdf['geometry']
]
gdf["geometry"] = pd.Series(block_coord_array, index=gdf.index)

# One frame of matching Area/geometry rows per census GEOID, in census order.
blockRows = geoids.apply(getMatchingGEOIDData) #An array of DF rows

# Concatenate once: DataFrame.append was removed in pandas 2.0, and appending
# inside a loop re-copied the growing frame on every iteration.
block_frames = list(blockRows)
block_df = pd.concat(block_frames, ignore_index=True) if block_frames else pd.DataFrame()

# join() aligns on the 0..n-1 index, so this assumes every census GEOID matched
# exactly one shapefile row -- TODO confirm.
ohio_population_data = census_data.join(block_df)

county_data = pd.read_excel("Data/Ohio_GEOID_Conversion.xlsx").iloc[:,1:4]

# geoids is only restricted to columbus
countyRows = geoids.apply(geoidToCountyLatLong)

# Concatenate once: DataFrame.append was removed in pandas 2.0, and appending
# inside a loop re-copied the growing frame on every iteration.
county_frames = list(countyRows)
county_df = pd.concat(county_frames, ignore_index=True) if county_frames else pd.DataFrame(columns=["NAME"])

# Names are attached by position, so there must be exactly one county row per census row.
ohio_population_data["CountyNames"] = pd.Series(county_df.NAME.values, index=ohio_population_data.index)
ohio_population_data = ohio_population_data.rename(index=str, columns={"Area": "Area (square miles)"})

# Convert square meters to square miles
SQ_METERS_PER_SQ_MILE = 2589988
ohio_population_data["Area (square miles)"] = ohio_population_data["Area (square miles)"] / SQ_METERS_PER_SQ_MILE

In [8]:
### Here, endingDF is DF with population density for each coordinate
# test_data = pd.read_csv("./Data/Training.csv")
# endingDF = addGeoidColumns(test_data)
# endingDF.head()

# Point of Interest Data Code

In [9]:
# Points of Interest file - includes all popular areas in Columbus, OH
# Source: http://opendata.columbus.gov/datasets/86458e5d8a264dff9204518e109c0f93_10?geometry=-83.926%2C39.846%2C-82.168%2C40.214&page=7
poi_df = pd.read_csv('Data/Points_of_Interest.csv')

#drop irrelevant columns
poi_df = poi_df.drop(['POI_SOURCE', 'WEBSITE', 'OB_GYN', 'PEDIATRICS', 'PRIMARY_CARE'], axis=1)

# Categories grouped by the class code (0, 1 or 2) they are mapped to.
zeros = ['Emergency', 'Medical', 'Industrial']
ones = ['Government', 'Group Quarters', 'Education']
twos = ['Transportation', 'Public Places', 'Retail', 'Office']

# Build the category -> code mapping from the lists above so the two cannot drift apart.
poi_class_by_type = {
    category: code
    for code, group in enumerate((zeros, ones, twos))
    for category in group
}

poi = poi_df[['X', 'Y', 'POI_TYPE']].copy()

# match all strings = 'category - subcategory' and remove the part immediately following the
# '-' to end with 'category'
new_poi = poi.replace(to_replace=r' - .*', value='', regex=True)

# Assign the mapped column instead of calling replace(inplace=True) on a column
# selection: that chained in-place form does not update the frame under pandas
# Copy-on-Write. Categories missing from the mapping are left unchanged.
new_poi['Classification'] = new_poi['POI_TYPE'].replace(poi_class_by_type)


In [10]:
# TODO: Make this the previous DF generated by past cells
traintest = pd.read_excel(train_test_path) # traintest has 158 rows

# Result holders for this cell; the first pair is replaced by the search below,
# the second pair is filled by the distance-threshold loop.
distances, classifications = [], []
final_distances, final_classifications = [], []

def calc_smallest_dist_poi(comp):
    """Find, for every row of the global ``traintest``, the nearest row of ``comp``.

    "Nearest" is the straight-line distance between (Latitude, Longitude) and
    (Y, X), computed directly on the coordinate values (no great-circle
    correction). Ties go to the earliest row of ``comp``.

    Parameters
    ----------
    comp : pandas.DataFrame
        Needs 'X', 'Y' and 'Classification' columns.

    Returns
    -------
    tuple
        ``(d, c)``. ``d`` holds the smallest distance per ``traintest`` row
        (``np.inf`` when no row of ``comp`` yields a finite distance). ``c``
        holds the matched row's 'Classification', or 0 when there is no match.
    """
    # Pull the coordinates out once and work positionally, so the result does
    # not depend on either frame carrying a default 0..n-1 index. This also
    # avoids one .loc lookup per (point, POI) pair.
    comp_lat = comp['Y'].to_numpy(dtype=float)
    comp_long = comp['X'].to_numpy(dtype=float)
    point_lat = traintest['Latitude'].to_numpy(dtype=float)
    point_long = traintest['Longitude'].to_numpy(dtype=float)

    d = []
    c = []
    for lat, long in zip(point_lat, point_long):
        smallest = np.inf
        cls = 0 # 0, 1, or 2

        dists = np.sqrt((lat - comp_lat) ** 2 + (long - comp_long) ** 2)
        # Only finite distances can beat the initial np.inf (NaN compares False).
        reachable = np.flatnonzero(dists < np.inf)
        if reachable.size:
            # argmin returns the first minimum, i.e. the earliest row on ties.
            nearest = reachable[np.argmin(dists[reachable])]
            smallest = dists[nearest]
            cls = comp['Classification'].iloc[nearest]

        d.append(smallest)
        c.append(cls)
    return (d, c)

# Nearest point of interest and its class for every train/test row.
distances, classifications = calc_smallest_dist_poi(new_poi)

# Keep a match only when it is closer than 1/69 (roughly one mile at 69 miles
# per degree); anything further away is recorded as None in both lists.
for nearest_dist, nearest_cls in zip(distances, classifications):
    near_enough = nearest_dist < 1/69
    final_distances.append(nearest_dist if near_enough else None)
    final_classifications.append(nearest_cls if near_enough else None)

# TODO: Add final_classifications as column in DF
# final_classifications

# Consolidation

In [11]:
def addFeatureData(df):
    """Attach the tornado, point-of-interest and population-density features to ``df``.

    The tornado and POI values come from the module-level ``final_vals`` and
    ``final_classifications`` lists and are matched to the rows of ``df`` by
    position. The columns are written onto ``df`` itself before it is handed
    to ``addGeoidColumns``, whose result is returned.

    Precondition: ``df`` has 'Longitude' and 'Latitude' columns.
    """
    assert('Longitude' in df.columns), "Cannot find longitude column"
    assert('Latitude' in df.columns), "Cannot find latitude column"

    # Column name -> precomputed per-row values, in the order they are added.
    precomputed = {
        'Tornado Distance': final_vals['Distance'],
        'Tornado Magnitude': final_vals['Magnitude'],
        'POI_TYPE': final_classifications,
    }
    for column, values in precomputed.items():
        df[column] = pd.Series(values, index=df.index)

    # Population density is looked up from each row's coordinates.
    return addGeoidColumns(df)

# Re-read the train/test points and attach every engineered feature.
# addFeatureData writes the new columns onto the frame it is given, so
# total_train_test_data itself is modified as well.
total_train_test_data = pd.read_excel(train_test_path)
allFeatureDF = addFeatureData(total_train_test_data)
allFeatureDF.head()

Unnamed: 0,Latitude,Longitude,Tornado Distance,Tornado Magnitude,POI_TYPE,Pop_Den
0,40.087601,-83.003476,13.1,2,2.0,3240.893082
1,39.983963,-83.165229,5.2,3,1.0,1552.374779
2,40.129978,-82.806906,15.2,2,,176.61685
3,40.058561,-82.86206,12.6,2,2.0,3536.998504
4,39.884917,-82.957369,5.5,3,0.0,1122.177611


In [12]:
# Points that resolved to no GEOID have a NaN density (0.0 / 0.0); use the mean instead.
allFeatureDF['Pop_Den'] = allFeatureDF['Pop_Den'].fillna(allFeatureDF['Pop_Den'].mean())

# Points with no nearby POI were stored as None, which becomes NaN in the column.
# Replace those (and any literal 'None' strings) with the most common class.
# fillna is used because the np.NaN alias was removed in NumPy 2.0.
poi_type_mode = allFeatureDF['POI_TYPE'].mode()[0]
allFeatureDF['POI_TYPE'] = allFeatureDF['POI_TYPE'].replace('None', poi_type_mode).fillna(poi_type_mode)

In [15]:
# Write the consolidated feature table to REAL_DATA.csv in the working directory.
allFeatureDF.to_csv("REAL_DATA.csv")