# Classification of Flood & Crime Dataset Using KNN
## Training dataset = School Ratings Data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
import os

In [2]:
# Read the school ratings file. `read_csv` already returns a DataFrame, so the
# extra `pd.DataFrame(...)` wrapper is unnecessary; that constructor does not
# copy DataFrame input by default, so the two names could share data.
# An explicit copy keeps the raw `schools_file` frame independent of the
# working frame used by the rest of the notebook.
schools_file = pd.read_csv('Output/school_ratings.csv')
school_ratings_df = schools_file.copy()
school_ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 822 entries, 0 to 821
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   school_id      822 non-null    int64  
 1   name           822 non-null    object 
 2   address        822 non-null    object 
 3   city           822 non-null    object 
 4   zip_code       822 non-null    int64  
 5   school_type    822 non-null    object 
 6   latitude       822 non-null    float64
 7   longitude      822 non-null    float64
 8   school_rating  822 non-null    float64
 9   district_id    822 non-null    float64
dtypes: float64(4), int64(2), object(4)
memory usage: 64.3+ KB


In [3]:
# Read the flood data file. `read_csv` already returns a DataFrame, so the
# extra `pd.DataFrame(...)` wrapper is unnecessary; that constructor does not
# copy DataFrame input by default, so the two names could share data.
# The working frame is edited in later cells (a column is removed), so take an
# explicit copy to guarantee the raw `flood_data` frame is never altered.
flood_data = pd.read_csv('Resources/flood_data.csv')
flood_data_df = flood_data.copy()
flood_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23230 entries, 0 to 23229
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         23230 non-null  int64  
 1   Address            23230 non-null  object 
 2   Latitude           23230 non-null  float64
 3   Longitude          23230 non-null  float64
 4   Flood_Description  19160 non-null  object 
 5   Flood_Zone         23230 non-null  object 
 6   Zip_Code           23230 non-null  int64  
dtypes: float64(2), int64(2), object(3)
memory usage: 1.2+ MB


In [4]:
# Remove the 'Unnamed: 0' column.
# `drop(..., errors='ignore')` is used instead of `del` so the cell can be
# re-run safely: once the column is gone a second execution is a no-op rather
# than a KeyError, and the frame is rebound instead of mutated in place.
flood_data_df = flood_data_df.drop(columns=['Unnamed: 0'], errors='ignore')

## Zoning for public elementary schools

In [5]:
# Training rows for the elementary-school classifier: every school whose
# `school_type` is exactly 'Elementary'.
# NOTE(review): the filter only looks at `school_type`; that these are all
# public schools is assumed from the source data -- confirm.
is_elementary = school_ratings_df['school_type'] == 'Elementary'
elem_pub = school_ratings_df.loc[is_elementary, :]

In [6]:
# Elementary-school classifier: a 1-nearest-neighbour model that maps a
# (latitude, longitude) pair to the `school_id` of the closest training row.
# NOTE(review): the default metric is applied to raw degree values, not to
# great-circle distance -- confirm this approximation is acceptable.
elem_train_coords = elem_pub[["latitude", "longitude"]]
elem_train_ids = elem_pub["school_id"]
elem_pub_classifier = KNeighborsClassifier(n_neighbors=1)
elem_pub_classifier.fit(elem_train_coords, elem_train_ids)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=1, p=2,
                     weights='uniform')

In [7]:
# Accuracy measured on the same rows the classifier was fitted on (this is
# not a held-out evaluation).
# NOTE(review): with n_neighbors=1 a value below 1.0 presumably means some
# schools share identical coordinates -- verify against the data.
elem_score_coords = elem_pub[["latitude", "longitude"]]
elem_pub_classifier.score(elem_score_coords, elem_pub["school_id"])

0.9817850637522769

In [8]:
# Elementary-school classification results: one predicted `school_id` per
# flood-data row, in the same row order as `flood_data_df`.
# The classifier was fitted on columns named 'latitude'/'longitude', while the
# flood data uses 'Latitude'/'Longitude'. Recent scikit-learn releases reject
# (or warn about) feature names that differ from those seen during fit, so the
# query columns are renamed to match before predicting.
elem_query_coords = flood_data_df[['Latitude', 'Longitude']].rename(
    columns={'Latitude': 'latitude', 'Longitude': 'longitude'}
)
elem_pub_pred = elem_pub_classifier.predict(elem_query_coords)

## Zoning for public middle schools

In [9]:
# Training rows for the middle-school classifier: every school whose
# `school_type` is exactly 'Middle'.
# NOTE(review): the filter only looks at `school_type`; that these are all
# public schools is assumed from the source data -- confirm.
is_middle = school_ratings_df['school_type'] == 'Middle'
middle_pub = school_ratings_df.loc[is_middle, :]

In [10]:
# Middle-school classifier: a 1-nearest-neighbour model that maps a
# (latitude, longitude) pair to the `school_id` of the closest training row.
# NOTE(review): the default metric is applied to raw degree values, not to
# great-circle distance -- confirm this approximation is acceptable.
middle_train_coords = middle_pub[["latitude", "longitude"]]
middle_train_ids = middle_pub["school_id"]
middle_pub_classifier = KNeighborsClassifier(n_neighbors=1)
middle_pub_classifier.fit(middle_train_coords, middle_train_ids)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=1, p=2,
                     weights='uniform')

In [11]:
# Accuracy measured on the same rows the classifier was fitted on (this is
# not a held-out evaluation).
# NOTE(review): with n_neighbors=1 a value below 1.0 presumably means some
# schools share identical coordinates -- verify against the data.
middle_score_coords = middle_pub[["latitude", "longitude"]]
middle_pub_classifier.score(middle_score_coords, middle_pub["school_id"])

0.9942528735632183

In [12]:
# Middle-school classification results: one predicted `school_id` per
# flood-data row, in the same row order as `flood_data_df`.
# The classifier was fitted on columns named 'latitude'/'longitude', while the
# flood data uses 'Latitude'/'Longitude'. Recent scikit-learn releases reject
# (or warn about) feature names that differ from those seen during fit, so the
# query columns are renamed to match before predicting.
middle_query_coords = flood_data_df[['Latitude', 'Longitude']].rename(
    columns={'Latitude': 'latitude', 'Longitude': 'longitude'}
)
middle_pub_pred = middle_pub_classifier.predict(middle_query_coords)

## Zoning for public high schools

In [13]:
# Training rows for the high-school classifier: every school whose
# `school_type` is exactly 'High'.
# NOTE(review): the filter only looks at `school_type`; that these are all
# public schools is assumed from the source data -- confirm.
is_high = school_ratings_df['school_type'] == 'High'
high_pub = school_ratings_df.loc[is_high, :]

In [14]:
# High-school classifier: a 1-nearest-neighbour model that maps a
# (latitude, longitude) pair to the `school_id` of the closest training row.
# NOTE(review): the default metric is applied to raw degree values, not to
# great-circle distance -- confirm this approximation is acceptable.
high_train_coords = high_pub[["latitude", "longitude"]]
high_train_ids = high_pub["school_id"]
high_pub_classifier = KNeighborsClassifier(n_neighbors=1)
high_pub_classifier.fit(high_train_coords, high_train_ids)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=1, p=2,
                     weights='uniform')

In [15]:
# Accuracy measured on the same rows the classifier was fitted on (this is
# not a held-out evaluation).
high_score_coords = high_pub[["latitude", "longitude"]]
high_pub_classifier.score(high_score_coords, high_pub["school_id"])

1.0

In [16]:
# High-school classification results: one predicted `school_id` per
# flood-data row, in the same row order as `flood_data_df`.
# The classifier was fitted on columns named 'latitude'/'longitude', while the
# flood data uses 'Latitude'/'Longitude'. Recent scikit-learn releases reject
# (or warn about) feature names that differ from those seen during fit, so the
# query columns are renamed to match before predicting.
high_query_coords = flood_data_df[['Latitude', 'Longitude']].rename(
    columns={'Latitude': 'latitude', 'Longitude': 'longitude'}
)
high_pub_pred = high_pub_classifier.predict(high_query_coords)

## Adding classification and school ratings to main dataframe

In [17]:
# Each school type gets its own independent copy of the flood data so that
# the per-type columns added afterwards do not touch `flood_data_df`.
elem_schools = flood_data_df.copy(deep=True)

In [18]:
# Attach the predicted elementary `school_id` to every address and tag the
# rows with their school type, then show the resulting schema.
elem_schools = elem_schools.assign(
    school_id=elem_pub_pred,
    school_type='Elementary',
)
elem_schools.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23230 entries, 0 to 23229
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Address            23230 non-null  object 
 1   Latitude           23230 non-null  float64
 2   Longitude          23230 non-null  float64
 3   Flood_Description  19160 non-null  object 
 4   Flood_Zone         23230 non-null  object 
 5   Zip_Code           23230 non-null  int64  
 6   school_id          23230 non-null  int64  
 7   school_type        23230 non-null  object 
dtypes: float64(2), int64(2), object(4)
memory usage: 1.4+ MB


In [19]:
# Independent copy of the flood data for the middle-school assignments.
middle_schools = flood_data_df.copy(deep=True)

In [20]:
# Attach the predicted middle `school_id` to every address and tag the rows
# with their school type, then show the resulting schema.
middle_schools = middle_schools.assign(
    school_id=middle_pub_pred,
    school_type='Middle',
)
middle_schools.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23230 entries, 0 to 23229
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Address            23230 non-null  object 
 1   Latitude           23230 non-null  float64
 2   Longitude          23230 non-null  float64
 3   Flood_Description  19160 non-null  object 
 4   Flood_Zone         23230 non-null  object 
 5   Zip_Code           23230 non-null  int64  
 6   school_id          23230 non-null  int64  
 7   school_type        23230 non-null  object 
dtypes: float64(2), int64(2), object(4)
memory usage: 1.4+ MB


In [21]:
# Independent copy of the flood data for the high-school assignments.
high_schools = flood_data_df.copy(deep=True)

In [22]:
# Attach the predicted high `school_id` to every address and tag the rows
# with their school type, then show the resulting schema.
high_schools = high_schools.assign(
    school_id=high_pub_pred,
    school_type='High',
)
high_schools.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23230 entries, 0 to 23229
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Address            23230 non-null  object 
 1   Latitude           23230 non-null  float64
 2   Longitude          23230 non-null  float64
 3   Flood_Description  19160 non-null  object 
 4   Flood_Zone         23230 non-null  object 
 5   Zip_Code           23230 non-null  int64  
 6   school_id          23230 non-null  int64  
 7   school_type        23230 non-null  object 
dtypes: float64(2), int64(2), object(4)
memory usage: 1.4+ MB


In [23]:
# Start combining the per-school-type dataframes into one: stack the
# elementary rows followed by the middle-school rows, with a fresh 0..n-1 index.
# `DataFrame.append` was deprecated in pandas 1.4 and removed in pandas 2.0;
# `pd.concat` is the supported equivalent and yields the same frame.
flood_school_data = pd.concat([elem_schools, middle_schools], ignore_index=True)

In [24]:
# Final combined table: elementary, middle and high rows stacked in that
# order with a fresh 0..n-1 index.
# Built directly from the three source frames with `pd.concat`, because
# `DataFrame.append` was removed in pandas 2.0 and appending to the running
# `flood_school_data` would add the high-school rows again on every re-run.
flood_school_data = pd.concat(
    [elem_schools, middle_schools, high_schools],
    ignore_index=True,
)
flood_school_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 69690 entries, 0 to 69689
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Address            69690 non-null  object 
 1   Latitude           69690 non-null  float64
 2   Longitude          69690 non-null  float64
 3   Flood_Description  57480 non-null  object 
 4   Flood_Zone         69690 non-null  object 
 5   Zip_Code           69690 non-null  int64  
 6   school_id          69690 non-null  int64  
 7   school_type        69690 non-null  object 
dtypes: float64(2), int64(2), object(4)
memory usage: 4.3+ MB


In [30]:
# Persist the combined table; index=False keeps the row index out of the file.
combined_csv_path = "Output/flood_data_and_school_id.csv"
flood_school_data.to_csv(combined_csv_path, index=False)

In [26]:
# Preview the first five rows of the combined table.
flood_school_data.head(n=5)

Unnamed: 0,Address,Latitude,Longitude,Flood_Description,Flood_Zone,Zip_Code,school_id,school_type
0,"2202 CAROLINE ST Houston, Texas 77002",29.74614,-95.36987,AREA OF MINIMAL FLOOD HAZARD,X,77002,101912110,Elementary
1,"2204 CAROLINE ST Houston, Texas 77002",29.74619,-95.36996,AREA OF MINIMAL FLOOD HAZARD,X,77002,101912110,Elementary
2,"2206 CAROLINE ST Houston, Texas 77002",29.74624,-95.37004,AREA OF MINIMAL FLOOD HAZARD,X,77002,101912058,Elementary
3,"2251 AUSTIN ST Houston, Texas 77002",29.7453,-95.36882,AREA OF MINIMAL FLOOD HAZARD,X,77002,101912110,Elementary
4,"2255 AUSTIN ST Houston, Texas 77002",29.74525,-95.36874,AREA OF MINIMAL FLOOD HAZARD,X,77002,101912110,Elementary
