In [463]:
# Add table of contents

# Libraries and Notebook Options

In [519]:
import pandas as pd
import numpy as np
from sodapy import Socrata

from collections import Counter

# Exploratory Data Analysis
from pandas_profiling import ProfileReport

# Modeling
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier

# Visualization
# import seaborn as sns
# import matplotlib.pyplot as plt


In [408]:
np.set_printoptions(suppress=True) # Suppress scientific notation where possible

def start():
    options = {
        'display': {
            'max_columns': None,
            'max_colwidth': -1,
            'expand_frame_repr': False,  # Don't wrap to multiple pages
            'max_rows': 10,
            'max_seq_items': 25,         # Max length of printed sequence
            'precision': 5,
            'show_dimensions': False
        },
        'mode': {
            'chained_assignment': None   # Controls SettingWithCopyWarning
        }
    }

    for category, option in options.items():
        for op, value in option.items():
            pd.set_option(f'{category}.{op}', value)  # Python 3.6+

if __name__ == '__main__':
    start()

del start  # Clean up namespace in the interpreter

# Get Data

In [53]:
# Unauthenticated client only works with public data sets. Note 'None'
# in place of application token, and no username or password:
client = Socrata('data.sfgov.org', None)

# Authenticate the client
domain = 'data.sfgov.org'
app_token = 'tZuHs7ko1xQezRBAgUN10D0i9'
username = 'corraljrmiguel@gmail.com'
password = 'Xd9@rzwPhZjKsNg8cima'

client = Socrata(domain,
                 app_token,
                 username=username,
                 password=password)

# Returns as JSON from API
# converted to Python list of dictionaries by sodapy.
results = client.get('vw6y-z8j6', limit=10000)
# results = client.get('vw6y-z8j6', where='service_request_id=12167455')

# Convert to pandas dataframe
df = pd.DataFrame.from_records(results)



In [60]:
df.shape

(10000, 47)

# Data Cleaning

In [491]:
# # Pickle dataframe
# df.to_pickle('data/df_raw_10k.pkl')

# Load dataframe
df = pd.read_pickle('data/df_raw_10k.pkl')

In [675]:
# Import CSV – 4.1M rows
df = pd.read_csv('data/311_Cases.csv', nrows=10000)

# Sort dataframe by CaseID (newest first)
df.sort_values(by='CaseID', ascending=False, inplace=True)

# Reset the index
df.reset_index(drop=True, inplace=True)

In [676]:
# Drop all undocumented columns
df = df.iloc[:, 0:19]
# df = df.loc[:, ~df.columns.str.contains('^:@')]

df.shape

(10000, 19)

In [677]:
df.head()

Unnamed: 0,CaseID,Opened,Closed,Updated,Status,Status Notes,Responsible Agency,Category,Request Type,Request Details,Address,Street,Supervisor District,Neighborhood,Police District,Latitude,Longitude,Point,Source
0,12145810,02/27/2020 02:57:00 PM,02/27/2020 03:40:00 PM,02/27/2020 03:40:00 PM,Closed,"Case Resolved - Maribel Jaldon ""I Emailed Customer.""",County Clerk - G,General Request - COUNTY CLERK,request_for_service,county_clerk - request_for_service,Not associated with a specific address,,,,,0.0,0.0,"(0.0, 0.0)",Phone
1,12145806,02/27/2020 02:56:00 PM,,02/27/2020 03:46:52 PM,Open,sent,Duplicate Case Hold Queue,Street and Sidewalk Cleaning,Human or Animal Waste,Human or Animal Waste,"1400 17TH ST, SAN FRANCISCO, CA, 94107",17TH ST,10.0,Potrero Hill,SOUTHERN,37.76535,-122.39787,"(37.76535231, -122.39786653)",Web
2,12145798,02/27/2020 02:56:26 PM,02/28/2020 07:30:11 AM,02/28/2020 07:30:11 AM,Closed,Case is a Duplicate,DPW BSSR Queue,Street Defects,Pavement_Defect,Pavement_Defect,Intersection of 33RD AVE and PACHECO ST,33RD AVE,4.0,Outer Sunset,TARAVAL,37.74985,-122.49121,"(37.74984741, -122.49121094)",Integrated Agency
3,12145586,02/27/2020 02:21:00 PM,02/27/2020 04:03:57 PM,02/27/2020 04:03:57 PM,Closed,Case Resolved - Pickup completed.,Recology_Abandoned,Street and Sidewalk Cleaning,Bulky Items,Furniture,Intersection of TURK ST and LAGUNA ST,TURK ST,5.0,Cathedral Hill,NORTHERN,37.78112,-122.4272,"(37.78111649, -122.42720032)",Phone
4,12145581,02/27/2020 02:20:00 PM,,02/27/2020 02:45:03 PM,Open,open,MUNI Work Queue,MUNI Feedback,MUNI - Services_Service_Delivery_Facilities,611_Signs_Maps_and_Auto_Announcements,Intersection of 3RD ST and PALOU AVE,3RD ST,10.0,Bayview,BAYVIEW,37.73401,-122.39097,"(37.7340126, -122.39096832)",Phone


In [678]:
# Strip column names and format
df.columns = df.columns.str.strip().str.lower().str.replace(
    ' ', '_').str.replace('(', '').str.replace(')', '')

# Rename 'caseid' column
df.rename({'caseid': 'case_id'}, axis='columns', inplace=True)

In [679]:
# Convert time columns to datetime
df['opened'] = pd.to_datetime(df['opened'])
df['closed'] = pd.to_datetime(df['closed'])
df['updated'] = pd.to_datetime(df['updated'])

# df.iloc[:, 1:4] = pd.to_datetime(df.iloc[:, 1:4].stack()).unstack()

In [680]:
df.shape

(10000, 19)

In [681]:
# Remove incomplete years (2008 & 2020)
df = df.loc[(df['opened'] >= '2009-01-01') & (df['opened'] <= '2019-12-31')]

# Drop all rows with any missing value
df.dropna(how='any', inplace=True)

# Drop 'status' column as all incidents are 'closed' after running the code above
df.drop('status', axis=1, inplace=True)

df.shape

(2238, 18)

In [697]:
# Cast correct type for 'supervisor_district'
df['supervisor_district'] = df['supervisor_district'].astype('category')

In [698]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2238 entries, 7182 to 9999
Data columns (total 19 columns):
case_id                2238 non-null int64
opened                 2238 non-null datetime64[ns]
closed                 2238 non-null datetime64[ns]
updated                2238 non-null datetime64[ns]
status_notes           2238 non-null object
responsible_agency     2238 non-null object
category               2238 non-null object
request_type           2238 non-null object
request_details        2238 non-null object
address                2238 non-null object
street                 2238 non-null object
supervisor_district    2238 non-null category
neighborhood           2238 non-null object
police_district        2238 non-null object
latitude               2238 non-null float64
longitude              2238 non-null float64
point                  2238 non-null object
source                 2238 non-null object
is_duplicate           2238 non-null bool
dtypes: bool(1), category(1),

In [683]:
from pandas_profiling import ProfileReport
profile = ProfileReport(df, title='Pandas Profiling Report', html={'style':{'full_width':True}})
profile.to_notebook_iframe()

In [None]:
# # Define datatypes
# cols = df[['responsible_agency', 'category', 'supervisor_district', 'request_type', 'neighborhoods_sffind_boundaries', 'police_district', ]]
# for col in cols:
#     df[col] = df[col].astype('category')

# df.info()

In [688]:
# Create is_duplicate column
def create_is_duplicate(df):
    '''Create 'is_duplicate' column'''
    
    # Search for 'duplicate' in status_notes
    df['is_duplicate'] = df['status_notes'].str.lower().str.contains('duplicate')
    
    # Fill 'nan' values with False
    df['is_duplicate'].fillna(False, inplace=True)
    
    return df

create_is_duplicate(df)
df.head(5)

Unnamed: 0,case_id,opened,closed,updated,status_notes,responsible_agency,category,request_type,request_details,address,street,supervisor_district,neighborhood,police_district,latitude,longitude,point,source,is_duplicate
7182,11879660,2019-12-30 23:38:00,2020-01-04 14:12:01,2020-01-04 14:12:01,Case Transferred - Should be routed to 311 Call Center,DPW Ops Queue,Tree Maintenance,Trees - Overgrown_Tree,Pruning_request,"43 CHENERY ST, SAN FRANCISCO, CA, 94131",CHENERY ST,8.0,Fairmount,INGLESIDE,37.74109,-122.42565,"(37.74109, -122.42565)",Mobile/Open311,False
7183,11879654,2019-12-30 23:36:00,2019-12-31 14:06:24,2019-12-31 14:06:24,Case Resolved - SES Graffiti Crew - Scrape - Pole - Sign (Comment),DPW Ops Queue,Illegal Postings,Illegal Postings - Affixed_Improperly,Affixed Improperly,"5828 GEARY BLVD, SAN FRANCISCO, CA, 94121",GEARY BLVD,1.0,Outer Richmond,RICHMOND,37.78036,-122.48195,"(37.78036333, -122.481955)",Mobile/Open311,False
7184,11879642,2019-12-30 23:30:00,2020-01-03 10:14:44,2020-01-03 10:14:44,Encampment Removed,HSOC Queue,General Request - PUBLIC WORKS,request_for_service,bsm - request_for_service,"49 ISIS ST, SAN FRANCISCO, CA, 94103",ISIS ST,6.0,South of Market,SOUTHERN,37.77003,-122.41459,"(37.77003037, -122.41458782)",Web,False
7185,11879593,2019-12-30 22:48:00,2020-01-03 09:54:29,2020-01-03 09:54:29,Encampment Removed,Duplicate Case Hold Queue,Encampments,Encampment Reports,Encampment Cleanup,"3515 16TH ST, SAN FRANCISCO, CA, 94114",16TH ST,8.0,Castro,MISSION,37.76424,-122.43125,"(37.7642427, -122.43125)",Web,False
7186,11879571,2019-12-30 22:19:33,2020-01-03 00:00:00,2020-01-03 00:00:00,Case Resolved - Abated Tags,DPW Ops Queue,Graffiti,Graffiti on Fire_Police_Callbox,Fire_Police_Callbox - Not_Offensive,Intersection of POST ST and TAYLOR ST,POST ST,3.0,Lower Nob Hill,CENTRAL,37.78789,-122.41171,"(37.78788736, -122.41170562)",Mobile/Open311,False


## Balancing Data

In [692]:
# Target variable
target_count = df['is_duplicate'].value_counts()

# Print class balance
print(f'Class 0: {target_count[0]}')
print(f'Class 1: {target_count[1]}')
print(f'Proportion: {round(target_count[0] / target_count[1], 2)} : 1')
print(f'Percentage of Majority Class: {round(target_count[0] / sum(target_count), 4)*100}')

Class 0: 2096
Class 1: 142
Proportion: 14.76 : 1
Percentage of Majority Class: 93.66


# Pre-Processing

In [693]:
df.head()

Unnamed: 0,case_id,opened,closed,updated,status_notes,responsible_agency,category,request_type,request_details,address,street,supervisor_district,neighborhood,police_district,latitude,longitude,point,source,is_duplicate
7182,11879660,2019-12-30 23:38:00,2020-01-04 14:12:01,2020-01-04 14:12:01,Case Transferred - Should be routed to 311 Call Center,DPW Ops Queue,Tree Maintenance,Trees - Overgrown_Tree,Pruning_request,"43 CHENERY ST, SAN FRANCISCO, CA, 94131",CHENERY ST,8.0,Fairmount,INGLESIDE,37.74109,-122.42565,"(37.74109, -122.42565)",Mobile/Open311,False
7183,11879654,2019-12-30 23:36:00,2019-12-31 14:06:24,2019-12-31 14:06:24,Case Resolved - SES Graffiti Crew - Scrape - Pole - Sign (Comment),DPW Ops Queue,Illegal Postings,Illegal Postings - Affixed_Improperly,Affixed Improperly,"5828 GEARY BLVD, SAN FRANCISCO, CA, 94121",GEARY BLVD,1.0,Outer Richmond,RICHMOND,37.78036,-122.48195,"(37.78036333, -122.481955)",Mobile/Open311,False
7184,11879642,2019-12-30 23:30:00,2020-01-03 10:14:44,2020-01-03 10:14:44,Encampment Removed,HSOC Queue,General Request - PUBLIC WORKS,request_for_service,bsm - request_for_service,"49 ISIS ST, SAN FRANCISCO, CA, 94103",ISIS ST,6.0,South of Market,SOUTHERN,37.77003,-122.41459,"(37.77003037, -122.41458782)",Web,False
7185,11879593,2019-12-30 22:48:00,2020-01-03 09:54:29,2020-01-03 09:54:29,Encampment Removed,Duplicate Case Hold Queue,Encampments,Encampment Reports,Encampment Cleanup,"3515 16TH ST, SAN FRANCISCO, CA, 94114",16TH ST,8.0,Castro,MISSION,37.76424,-122.43125,"(37.7642427, -122.43125)",Web,False
7186,11879571,2019-12-30 22:19:33,2020-01-03 00:00:00,2020-01-03 00:00:00,Case Resolved - Abated Tags,DPW Ops Queue,Graffiti,Graffiti on Fire_Police_Callbox,Fire_Police_Callbox - Not_Offensive,Intersection of POST ST and TAYLOR ST,POST ST,3.0,Lower Nob Hill,CENTRAL,37.78789,-122.41171,"(37.78788736, -122.41170562)",Mobile/Open311,False


In [523]:
# exclude
'case_id',
'status_notes' # Needs NLP
'request_type' # Needs NLP
'request_details' # Needs NLP
'address' # Needs NLP
'


Unnamed: 0,service_request_id,requested_datetime,updated_datetime,closed_date,status_description,status_notes,agency_responsible,service_name,service_subtype,service_details,address,street,supervisor_district,neighborhoods_sffind_boundaries,police_district,lat,long,source,media_url,is_duplicate
0,12190946,2020-03-09 01:11:00,2020-03-09 01:13:08,NaT,Open,,Noise Report Queue,Noise Report,other_excessive_noise,Noise Report - other_excessive_noise,"1005 MARKET ST, SAN FRANCISCO, CA, 94103",MARKET ST,6,South of Market,TENDERLOIN,37.781860351563,-122.410179138184,Phone,,False
1,12190944,2020-03-09 01:10:00,2020-03-09 03:34:13,NaT,Open,closed,Parking Enforcement Dispatch Queue,Parking Enforcement,Other_Illegal_Parking,Unknown - Mercedes Benz - Unknown,"37 PHELAN AVE, SAN FRANCISCO, CA, 94112 (Virtual)",PHELAN AVE,7,Sunnyside,INGLESIDE,37.724251469399,-122.452882728836,Web,,False
2,12190940,2020-03-09 01:08:03,2020-03-09 01:08:04,NaT,Open,open,Entertainment Commission - G,Noise Report,entertainment,Noise Report - entertainment,"752 LARKIN ST, SAN FRANCISCO, CA, 94109",LARKIN ST,6,Tenderloin,TENDERLOIN,37.78490415,-122.41786379,Mobile/Open311,,False
3,12190935,2020-03-09 01:02:48,2020-03-09 01:02:51,NaT,Open,,DPT Abandoned Vehicles Work Queue,Abandoned Vehicle,Abandoned Vehicles,DPT Abandoned Vehicles Low,"790 43RD AVE, SAN FRANCISCO, CA, 94121",43RD AVE,1,Outer Richmond,RICHMOND,37.7737818,-122.5034043,Web,{'url': 'http://mobile311.sfgov.org/reports/12190935/photos'},False
4,12190918,2020-03-09 00:41:23,2020-03-09 02:10:01,NaT,Open,accepted,DPW Ops Queue,Litter Receptacles,Add_remove_garbage_can,,Intersection of WAGNER ALY and END (000 BLOCK OF),WAGNER ALY,6,Tenderloin,TENDERLOIN,37.783569,-122.413326,Web,,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,12168816,2020-03-03 14:51:00,2020-03-08 10:03:34,2020-03-08 10:03:34,Closed,Encampment Removed\nEncampment Cleared from sidewalk.,Duplicate Case Hold Queue,Street and Sidewalk Cleaning,Bulky Items,Mattress,"1402 17TH ST, SAN FRANCISCO, CA, 94107",17TH ST,10,Potrero Hill,SOUTHERN,37.765233571552,-122.397823615344,Web,,False
9996,12168817,2020-03-03 14:51:00,2020-03-03 17:48:33,2020-03-03 17:48:33,Closed,Case Resolved - Pickup completed.,Recology_Abandoned,Street and Sidewalk Cleaning,Bulky Items,Boxed or Bagged Items,Intersection of FOERSTER ST and MANGELS AVE,FOERSTER ST,7,Sunnyside,INGLESIDE,37.733070373535,-122.448875427246,Phone,,False
9997,12168810,2020-03-03 14:50:40,2020-03-03 15:15:04,NaT,Open,accepted,DPT Meter_Bike Queue,Graffiti,Graffiti on Parking_meter,Parking_meter - Not_Offensive,"550 TOWNSEND ST, SAN FRANCISCO, CA, 94103",TOWNSEND ST,6,South of Market,SOUTHERN,37.77282802,-122.40047436,Mobile/Open311,{'url': 'http://mobile311.sfgov.org/reports/12168810/photos'},False
9998,12168808,2020-03-03 14:50:20,2020-03-03 16:00:54,2020-03-03 16:00:54,Closed,Case Resolved - Pickup completed.,Recology_Overflowing,Street and Sidewalk Cleaning,City_garbage_can_overflowing,City_garbage_can_overflowing,"2256 GREAT HWY, SAN FRANCISCO, CA, 94116",GREAT HWY,4,Outer Sunset,TARAVAL,37.744293212891,-122.508027926202,Mobile/Open311,{'url': 'http://mobile311.sfgov.org/reports/12168808/photos'},False


In [522]:
for idx, col in enumerate(df.columns):
    print(idx, col)

0 service_request_id
1 requested_datetime
2 updated_datetime
3 closed_date
4 status_description
5 status_notes
6 agency_responsible
7 service_name
8 service_subtype
9 service_details
10 address
11 street
12 supervisor_district
13 neighborhoods_sffind_boundaries
14 police_district
15 lat
16 long
17 source
18 media_url
19 is_duplicate


In [512]:
X = df.iloc[:, 1:-1]
y = df['is_duplicate']

# split train, test for calibration
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2020, stratify=y)

In [520]:
X_smoted, y_smoted = SMOTE(random_state=2020).fit_sample(X,y)

TypeError: float() argument must be a string or a number, not 'Timestamp'

In [None]:
Counter(y_smoted)

# Feature Engineering

In [694]:
df.head()

Unnamed: 0,case_id,opened,closed,updated,status_notes,responsible_agency,category,request_type,request_details,address,street,supervisor_district,neighborhood,police_district,latitude,longitude,point,source,is_duplicate
7182,11879660,2019-12-30 23:38:00,2020-01-04 14:12:01,2020-01-04 14:12:01,Case Transferred - Should be routed to 311 Call Center,DPW Ops Queue,Tree Maintenance,Trees - Overgrown_Tree,Pruning_request,"43 CHENERY ST, SAN FRANCISCO, CA, 94131",CHENERY ST,8.0,Fairmount,INGLESIDE,37.74109,-122.42565,"(37.74109, -122.42565)",Mobile/Open311,False
7183,11879654,2019-12-30 23:36:00,2019-12-31 14:06:24,2019-12-31 14:06:24,Case Resolved - SES Graffiti Crew - Scrape - Pole - Sign (Comment),DPW Ops Queue,Illegal Postings,Illegal Postings - Affixed_Improperly,Affixed Improperly,"5828 GEARY BLVD, SAN FRANCISCO, CA, 94121",GEARY BLVD,1.0,Outer Richmond,RICHMOND,37.78036,-122.48195,"(37.78036333, -122.481955)",Mobile/Open311,False
7184,11879642,2019-12-30 23:30:00,2020-01-03 10:14:44,2020-01-03 10:14:44,Encampment Removed,HSOC Queue,General Request - PUBLIC WORKS,request_for_service,bsm - request_for_service,"49 ISIS ST, SAN FRANCISCO, CA, 94103",ISIS ST,6.0,South of Market,SOUTHERN,37.77003,-122.41459,"(37.77003037, -122.41458782)",Web,False
7185,11879593,2019-12-30 22:48:00,2020-01-03 09:54:29,2020-01-03 09:54:29,Encampment Removed,Duplicate Case Hold Queue,Encampments,Encampment Reports,Encampment Cleanup,"3515 16TH ST, SAN FRANCISCO, CA, 94114",16TH ST,8.0,Castro,MISSION,37.76424,-122.43125,"(37.7642427, -122.43125)",Web,False
7186,11879571,2019-12-30 22:19:33,2020-01-03 00:00:00,2020-01-03 00:00:00,Case Resolved - Abated Tags,DPW Ops Queue,Graffiti,Graffiti on Fire_Police_Callbox,Fire_Police_Callbox - Not_Offensive,Intersection of POST ST and TAYLOR ST,POST ST,3.0,Lower Nob Hill,CENTRAL,37.78789,-122.41171,"(37.78788736, -122.41170562)",Mobile/Open311,False


* Cosine similarity of service_subtype
* Time of day
* Day of the week
* Date
* Block type / building permit
* Lat/long rounded to 3rd decimal place
* Zip code
* Duplication ratio by category or something similar
* 