In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.cluster import KMeans

import warnings
# Install a blanket 'ignore' filter so warnings of every category are
# suppressed from this point on.
# NOTE(review): this also hides deprecation and numerical warnings from the
# libraries imported above -- confirm that is intended.
warnings.filterwarnings('ignore')

%matplotlib inline

import seaborn as sns
# Use seaborn's 'whitegrid' style for the plots drawn later on
sns.set(style='whitegrid')
# Let pandas print up to 1500 characters per line and show up to 100 columns
# when a frame is displayed
pd.set_option('display.width', 1500)
pd.set_option('display.max_columns', 100)

In [None]:
# Use `conda install shapely` in your CS109a environment before attempting to import these libraries
from shapely.geometry import Point
from shapely.geometry.polygon import Polygon
from shapely.geometry import shape, GeometryCollection
import json
import scipy as sp

In [None]:
# Build the master dataframe in one pass:
#   1. read the CSV,
#   2. discard every row that contains at least one NaN,
#   3. remove the first column (the index that was written out with the file),
#   4. one-hot encode the day-of-week and month columns.
master_df = (
    pd.read_csv('data/master_df.csv')
    .dropna(axis=0, how='any')
    .pipe(lambda frame: frame.drop(columns=frame.columns[0]))
    .pipe(pd.get_dummies, columns=['DAY_OF_WEEK', 'MONTH'])
)

In [None]:
# Read in the GeoJSON file. The encoding is given explicitly because GeoJSON
# is specified as UTF-8 and the platform default may differ.
with open("data/neighborhoods.geojson", encoding="utf-8") as f:
    features = json.load(f)["features"]

# Feature 23 is special-cased: its coordinates are replaced by an ndarray of
# shape (1, n_points, 2) built from the first element of its coordinate list.
# NOTE(review): presumably this feature is nested differently from the others
# in the file -- confirm against the data.
# The number of points is inferred (-1) rather than hard-coded (it used to be
# 1635), so a boundary file with a different vertex count no longer raises.
features[23]['geometry']['coordinates'] = np.reshape(
    features[23]['geometry']['coordinates'][0], (1, -1, 2)
)

In [None]:
# Collect one (neighborhood name, Polygon) pair per GeoJSON feature
polygons = []
for feature in features:
    coords = feature['geometry']['coordinates']
    # When the coordinates are nested one level deeper (the innermost probed
    # element is itself a list), only the first member is used.
    # NOTE(review): for such features any further members are ignored --
    # confirm this is intended.
    rings = coords[0] if isinstance(coords[0][0][0], list) else coords
    name = feature['properties']['Name']
    polygons.append((name, Polygon(np.column_stack(rings))))

In [None]:
# FUNC: Given a point, will return the neighborhood that the point belongs to
#       Returns None if no neighborhood is found
def get_neighborhood(latitude, longitude):
    """Return the name of the first entry in `polygons` that contains the point.

    Parameters
    ----------
    latitude, longitude : float
        Coordinates of the point to look up.

    Returns
    -------
    str or None
        The name paired with the first polygon (in list order) for which
        `point.within(polygon)` is true, or None when no polygon matches.

    Notes
    -----
    Reads the module-level `polygons` list of (name, polygon) pairs and scans
    it linearly for every point.
    """
    # The point is built as (x, y) = (longitude, latitude).
    point = Point(longitude, latitude)

    for name, polygon in polygons:
        if point.within(polygon):
            return name
    return None


# Vectorize so whole columns can be passed at once. `otypes=[object]` is
# required: without it np.vectorize infers the output dtype from the first
# result, so if the first point matches a neighborhood every later None is
# converted to the string 'None'. It also allows zero-length input, which
# np.vectorize rejects when the output type is not declared.
get_neighborhood = np.vectorize(get_neighborhood, otypes=[object])

In [None]:
# Look up the neighborhood for every row from its latitude/longitude pair
neighborhoods = get_neighborhood(master_df['Lat'], master_df['Long'])

In [None]:
# Store the per-row lookup results as a new 'neighborhood' column
master_df['neighborhood'] = neighborhoods

In [None]:
# Show how many rows were assigned to each neighborhood
master_df['neighborhood'].value_counts()