In [63]:
#important libraries
%matplotlib inline
import sys
import os
import re
import time
import json
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
sns.set(rc={'figure.figsize':(12, 6),"font.size":20,"axes.titlesize":20,"axes.labelsize":20},style="darkgrid")
import matplotlib.dates as mdates
from datetime import datetime, date, time, timedelta
from geopy import distance
import geopy.distance
from sklearn.ensemble import GradientBoostingClassifier  #GBM algorithm
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder

In [64]:
# Read the five pre-processed CSV extracts. Each call lists the columns that
# pandas should try to parse as dates while reading.
food_data = pd.read_csv(
    '../data/processed/food_data_processed.csv',
    parse_dates=['inspection_date'],
)
weather_data = pd.read_csv(
    '../data/processed/weather_data_processed.csv',
    parse_dates=['DATE'],
)
burglary = pd.read_csv(
    '../data/processed/burglary_data_processed.csv',
    parse_dates=['event_date', 'date'],
)
sanitation = pd.read_csv(
    '../data/processed/service_request_data_processed.csv',
    parse_dates=['created_date'],
)

# Date columns of the business-license extract.
date_cols = [
    'expiration_date',
    'license_start_date',
    'application_requirements_complete',
    'date_issued',
    'license_status_change_date',
]
business = pd.read_csv(
    '../data/processed/business_data_processed.csv',
    parse_dates=date_cols,
)

In [65]:
# Attach the weather observations to each inspection by matching
# inspection_date against the weather DATE column (pandas' default inner join).
# The shapes are printed before and after so a change in row count is visible.
print(food_data.shape)
print(weather_data.shape)
food_data = food_data.merge(weather_data,
                            left_on='inspection_date',
                            right_on='DATE')
print(food_data.shape)

(8234, 29)
(924, 8)
(8234, 37)


In [66]:
# Join each inspection to its business-license records on the license number.
# validate='1:m' makes pandas raise if a license value repeats in food_data.
# The result is ordered chronologically by inspection date.
print(business.shape)
food_business = (
    food_data
    .merge(business,
           left_on='license',
           right_on='license_number',
           validate='1:m')
    .sort_values('inspection_date')
)
print(food_business.shape)

(12701, 17)
(7450, 54)


In [73]:
# pandarallel provides DataFrame.parallel_apply, which the feature cells below
# use to run the row-wise count functions.
from pandarallel import pandarallel
# Initialise with default settings (no arguments are passed); the worker count
# and shared-memory size it picked are reported in the cell output.
pandarallel.initialize()


def get_distance(a_lat, a_lng, b_lat, b_lng, radius=3956):
    """Haversine (great-circle) distance between point(s) a and point(s) b.

    Parameters
    ----------
    a_lat, a_lng, b_lat, b_lng : float or array-like
        Latitudes and longitudes in decimal degrees (converted to radians
        internally). Scalars and arrays may be mixed; numpy broadcasting
        applies.
    radius : float, optional
        Radius of the sphere; the result is expressed in the same unit.
        Defaults to 3956, the earth radius in miles used by this notebook.

    Returns
    -------
    float or numpy.ndarray
        Distance(s) between a and b, in the unit of ``radius`` (miles by
        default).
    """
    a_lat = np.radians(a_lat)
    a_lng = np.radians(a_lng)
    b_lat = np.radians(b_lat)
    b_lng = np.radians(b_lng)
    d_lat = b_lat - a_lat
    d_lng = b_lng - a_lng

    d_lat_sq = np.sin(d_lat / 2)**2
    d_lng_sq = np.sin(d_lng / 2)**2

    a = d_lat_sq + np.cos(a_lat) * np.cos(b_lat) * d_lng_sq
    # Floating-point rounding can push `a` marginally outside [0, 1] (e.g. for
    # near-antipodal points), which would make sqrt(1 - a) return NaN.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

    return radius * c  # distance between a and b, in the unit of `radius`

def count_burglaries(row, time_window=30, distance_window=1):
    """Count burglaries near an inspection in the days leading up to it.

    Reads the module-level ``burglary`` frame (columns ``date``, ``latitude``,
    ``longitude``) and does not modify it.

    Parameters
    ----------
    row : pandas.Series
        One inspection record providing ``inspection_date``, ``latitude`` and
        ``longitude``.
    time_window : int, optional
        Look-back period in days. Only events strictly after
        ``inspection_date - time_window`` and strictly before
        ``inspection_date`` are considered.
    distance_window : float, optional
        Maximum distance from the inspection, in the unit returned by
        ``get_distance`` (miles); the comparison is strict.

    Returns
    -------
    int
        Number of burglaries inside both windows.
    """
    inspected_on = row['inspection_date']
    window_start = inspected_on - pd.Timedelta(days=time_window)

    # Select with a boolean mask rather than deep-copying the whole burglary
    # frame for every inspection row; nothing is written back to a slice.
    event_dates = burglary['date']
    in_time_window = (event_dates > window_start) & (event_dates < inspected_on)
    recent = burglary.loc[in_time_window, ['latitude', 'longitude']]

    distances = get_distance(row['latitude'], row['longitude'],
                             recent['latitude'].values,
                             recent['longitude'].values)
    # NaN distances (missing coordinates) compare False and are not counted.
    return int((distances < distance_window).sum())


def count_sanitation_complaints(row, time_window=30, distance_window=1):
    """Count sanitation service requests near an inspection before it happened.

    Reads the module-level ``sanitation`` frame (columns ``created_date``,
    ``latitude``, ``longitude``) and does not modify it.

    Parameters
    ----------
    row : pandas.Series
        One inspection record providing ``inspection_date``, ``latitude`` and
        ``longitude``.
    time_window : int, optional
        Look-back period in days. Only requests created strictly after
        ``inspection_date - time_window`` and strictly before
        ``inspection_date`` are considered.
    distance_window : float, optional
        Maximum distance from the inspection, in the unit returned by
        ``get_distance`` (miles); the comparison is strict.

    Returns
    -------
    int
        Number of service requests inside both windows.
    """
    inspected_on = row['inspection_date']
    window_start = inspected_on - pd.Timedelta(days=time_window)

    # Select with a boolean mask rather than deep-copying the whole sanitation
    # frame for every inspection row; nothing is written back to a slice.
    created = sanitation['created_date']
    in_time_window = (created > window_start) & (created < inspected_on)
    recent = sanitation.loc[in_time_window, ['latitude', 'longitude']]

    distances = get_distance(row['latitude'], row['longitude'],
                             recent['latitude'].values,
                             recent['longitude'].values)
    # NaN distances (missing coordinates) compare False and are not counted.
    # TODO: a per-category breakdown (value counts of 'sr_short_code') was
    # sketched here previously but never enabled.
    return int((distances < distance_window).sum())

New pandarallel memory created - Size: 2000 MB
Pandarallel will run on 4 workers


In [69]:
%timeit food_business['burglaries_last_month'] = food_business.parallel_apply(count_burglaries, axis=1)

3min 4s ± 1.15 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [74]:
%timeit food_business['burglaries_last_month'] = food_business.parallel_apply(count_sanitation_complaints, axis=1)

6min 9s ± 2min 14s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
# Number of missing values in each column of the merged frame.
food_business.isnull().sum()

In [None]:
# Append one indicator column per neighborhood value, each prefixed 'town'.
# Dropping rows with a missing neighborhood is left disabled:
#food_business = food_business[~food_business['neighborhood'].isna()]
town_dummies = pd.get_dummies(food_business['neighborhood'], prefix='town')
food_business = pd.concat([food_business, town_dummies], axis=1)