# Classification of Flood & Crime Dataset Using KNN
## Training dataset = School Ratings Data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
import os

In [2]:
# Load the school ratings CSV from the Output folder and wrap it in a DataFrame,
# then print the column/dtype summary as a quick sanity check of what was read.
schools_file = pd.read_csv(filepath_or_buffer='Output/school_ratings.csv')
school_ratings_df = pd.DataFrame(data=schools_file)
school_ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 822 entries, 0 to 821
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Unnamed: 0     822 non-null    int64  
 1   school_id      822 non-null    int64  
 2   name           822 non-null    object 
 3   address        822 non-null    object 
 4   city           822 non-null    object 
 5   zip_code       822 non-null    int64  
 6   district_name  822 non-null    object 
 7   school_type    822 non-null    object 
 8   latitude       822 non-null    float64
 9   longitude      822 non-null    float64
 10  school_rating  822 non-null    float64
dtypes: float64(3), int64(3), object(5)
memory usage: 70.8+ KB


In [3]:
# Read every file under Resources/flood/ and stack them into a single DataFrame.
# The listing is sorted because os.listdir returns names in arbitrary order and
# drop_duplicates below keeps the FIRST row seen for each address; without a
# fixed order the surviving row could differ from one run/machine to the next.
entries = sorted(os.listdir("Resources/flood/"))

# Collect the per-file frames and concatenate once. Growing the frame with
# pd.concat inside the loop re-copies all accumulated rows on every iteration
# and starts from an empty DataFrame, which newer pandas versions warn about.
frames = []
for entry in entries:
    file = f'Resources/flood/{entry}'
    new_data = pd.read_csv(file)
    frames.append(new_data)
flood_data_df = pd.concat(frames)

# Remove all duplicate addresses (first occurrence wins), then renumber rows 0..n-1
flood_data_df = flood_data_df.drop_duplicates(subset="Address")
flood_data_df = flood_data_df.reset_index(drop=True)
# 'Unnamed: 0' is the saved index column carried in by the input CSVs; it is not data
del flood_data_df['Unnamed: 0']
# Use underscore column names so they are valid identifiers downstream
flood_data_df = flood_data_df.rename(columns={"Flood Description": "Flood_Description", "Flood Zone":"Flood_Zone"})
flood_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23230 entries, 0 to 23229
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Address            23230 non-null  object 
 1   Latitude           23230 non-null  float64
 2   Longitude          23230 non-null  float64
 3   Flood_Description  19160 non-null  object 
 4   Flood_Zone         23230 non-null  object 
dtypes: float64(2), object(3)
memory usage: 907.5+ KB


## Zoning for public elementary schools

In [4]:
# Training rows for the elementary-school model: every row of the ratings table
# whose school_type is exactly 'Elementary' (all columns are kept).
is_elementary = school_ratings_df['school_type'].eq('Elementary')
elem_pub = school_ratings_df.loc[is_elementary]

In [5]:
# Elementary-school zoning model: a 1-nearest-neighbour classifier whose features
# are each school's (latitude, longitude) and whose label is its school_id, so a
# prediction is the id of the closest elementary school to the query point.
# NOTE(review): the default metric is plain Euclidean distance on raw degrees,
# which weights a degree of longitude the same as a degree of latitude — confirm
# this approximation is acceptable for the area covered.
elem_pub_coords = elem_pub[["latitude", "longitude"]]
elem_pub_ids = elem_pub["school_id"]
elem_pub_classifier = KNeighborsClassifier(n_neighbors=1)
elem_pub_classifier.fit(elem_pub_coords, elem_pub_ids)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=1, p=2,
                     weights='uniform')

In [6]:
# Accuracy of the elementary model on its own training rows (sanity check only).
# NOTE(review): with n_neighbors=1 a value below 1.0 presumably means some schools
# with different ids share identical coordinates — confirm against the data.
elem_pub_classifier.score(elem_pub[["latitude", "longitude"]], elem_pub["school_id"])

0.9817850637522769

In [7]:
# elementary public schools classification results: nearest elementary school_id
# for every flood-dataset address.
# The classifier was fitted on columns named 'latitude'/'longitude', while the
# flood data uses 'Latitude'/'Longitude'. scikit-learn checks DataFrame column
# names at predict time (a warning in 1.0-1.1, a ValueError from 1.2 on), so the
# coordinates are renamed to the fitted feature names before predicting.
flood_coords = flood_data_df.loc[:, ['Latitude', 'Longitude']].rename(
    columns={'Latitude': 'latitude', 'Longitude': 'longitude'}
)
elem_pub_pred = elem_pub_classifier.predict(flood_coords)

## Zoning for public middle schools

In [8]:
# Training rows for the middle-school model: every row of the ratings table
# whose school_type is exactly 'Middle' (all columns are kept).
is_middle = school_ratings_df['school_type'].eq('Middle')
middle_pub = school_ratings_df.loc[is_middle]

In [9]:
# Middle-school zoning model: a 1-nearest-neighbour classifier whose features are
# each school's (latitude, longitude) and whose label is its school_id, so a
# prediction is the id of the closest middle school to the query point.
# NOTE(review): the default metric is plain Euclidean distance on raw degrees —
# confirm this approximation is acceptable for the area covered.
middle_pub_coords = middle_pub[["latitude", "longitude"]]
middle_pub_ids = middle_pub["school_id"]
middle_pub_classifier = KNeighborsClassifier(n_neighbors=1)
middle_pub_classifier.fit(middle_pub_coords, middle_pub_ids)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=1, p=2,
                     weights='uniform')

In [10]:
# Accuracy of the middle-school model on its own training rows (sanity check only).
# NOTE(review): with n_neighbors=1 a value below 1.0 presumably means some schools
# with different ids share identical coordinates — confirm against the data.
middle_pub_classifier.score(middle_pub[["latitude", "longitude"]], middle_pub["school_id"])

0.9942528735632183

In [11]:
# public middle schools classification results: nearest middle school_id for
# every flood-dataset address.
# The classifier was fitted on columns named 'latitude'/'longitude', while the
# flood data uses 'Latitude'/'Longitude'. scikit-learn checks DataFrame column
# names at predict time (a warning in 1.0-1.1, a ValueError from 1.2 on), so the
# coordinates are renamed to the fitted feature names before predicting.
flood_coords = flood_data_df.loc[:, ['Latitude', 'Longitude']].rename(
    columns={'Latitude': 'latitude', 'Longitude': 'longitude'}
)
middle_pub_pred = middle_pub_classifier.predict(flood_coords)

## Zoning for public high schools

In [12]:
# Training rows for the high-school model: every row of the ratings table
# whose school_type is exactly 'High' (all columns are kept).
is_high = school_ratings_df['school_type'].eq('High')
high_pub = school_ratings_df.loc[is_high]

In [13]:
# High-school zoning model: a 1-nearest-neighbour classifier whose features are
# each school's (latitude, longitude) and whose label is its school_id, so a
# prediction is the id of the closest high school to the query point.
# NOTE(review): the default metric is plain Euclidean distance on raw degrees —
# confirm this approximation is acceptable for the area covered.
high_pub_coords = high_pub[["latitude", "longitude"]]
high_pub_ids = high_pub["school_id"]
high_pub_classifier = KNeighborsClassifier(n_neighbors=1)
high_pub_classifier.fit(high_pub_coords, high_pub_ids)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=1, p=2,
                     weights='uniform')

In [14]:
# Accuracy of the high-school model on its own training rows (sanity check only).
high_pub_classifier.score(high_pub[["latitude", "longitude"]], high_pub["school_id"])

1.0

In [15]:
# public high schools classification results: nearest high school_id for every
# flood-dataset address.
# The classifier was fitted on columns named 'latitude'/'longitude', while the
# flood data uses 'Latitude'/'Longitude'. scikit-learn checks DataFrame column
# names at predict time (a warning in 1.0-1.1, a ValueError from 1.2 on), so the
# coordinates are renamed to the fitted feature names before predicting.
flood_coords = flood_data_df.loc[:, ['Latitude', 'Longitude']].rename(
    columns={'Latitude': 'latitude', 'Longitude': 'longitude'}
)
high_pub_pred = high_pub_classifier.predict(flood_coords)

## Adding classification and school ratings to main dataframe

In [16]:
# Attach the predicted school ids to the flood dataset, one new column per
# school level. The prediction arrays are row-aligned with flood_data_df
# because each was produced from its Latitude/Longitude columns.
school_id_columns = {
    "Elem_School_ID": elem_pub_pred,
    "Middle_School_ID": middle_pub_pred,
    "High_School_ID": high_pub_pred,
}
for column_name, predicted_ids in school_id_columns.items():
    flood_data_df[column_name] = predicted_ids

In [17]:
# Persist the flood rows together with their assigned school ids.
# The row index is written as well (pandas default), so the file starts with an
# unnamed index column that shows up as 'Unnamed: 0' when read back with read_csv.
flood_data_df.to_csv(path_or_buf="Output/flood_data_and_school_id.csv")

In [18]:
# Preview the first five rows to confirm the three school-id columns were added.
flood_data_df.head(n=5)

Unnamed: 0,Address,Latitude,Longitude,Flood_Description,Flood_Zone,Elem_School_ID,Middle_School_ID,High_School_ID
0,"2202 CAROLINE ST Houston, Texas 77002",29.74614,-95.36987,AREA OF MINIMAL FLOOD HAZARD,X,101912110,101912467,101912025
1,"2204 CAROLINE ST Houston, Texas 77002",29.74619,-95.36996,AREA OF MINIMAL FLOOD HAZARD,X,101912110,101912467,101912025
2,"2206 CAROLINE ST Houston, Texas 77002",29.74624,-95.37004,AREA OF MINIMAL FLOOD HAZARD,X,101912058,101912463,101912025
3,"2251 AUSTIN ST Houston, Texas 77002",29.7453,-95.36882,AREA OF MINIMAL FLOOD HAZARD,X,101912110,101912467,101912025
4,"2255 AUSTIN ST Houston, Texas 77002",29.74525,-95.36874,AREA OF MINIMAL FLOOD HAZARD,X,101912110,101912467,101912025
