In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
# Note: if at least one file was found, `dirname` and `filename` remain bound to
# the last file visited after the loop finishes.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

In [None]:
# Locate the dataset explicitly instead of reusing loop variables left over from
# the file-listing cell: those hold whichever file os.walk happened to visit last
# and are undefined when the input directory is empty.
csv_paths = sorted(
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
    if name.lower().endswith('.csv')
)
if not csv_paths:
    raise FileNotFoundError("No CSV file found under /kaggle/input")
if len(csv_paths) > 1:
    # Refuse to guess; an arbitrary pick would make the notebook irreproducible.
    raise ValueError(f"Expected exactly one CSV under /kaggle/input, found: {csv_paths}")

# Raw, unmodified table; later cells derive `df` from it.
orig_data = pd.read_csv(csv_paths[0])

In [None]:
# Preview the first rows of the raw table as loaded from the CSV.
orig_data.head()

In [None]:
# Column roles used by the rest of the notebook.

# Continuous inputs. The three duration columns at the end are computed by a
# later cell from the date columns.
cont_cols = [
    "x_coordinate_state_plane", "y_coordinate_state_plane",
    "latitude", "longitude",
    "time_to_close", "due_len", "time_over",
]

# Categorical inputs.
cat_cols = [
    "agency", "borough", "location_type", "incident_zip",
    "street_name", "cross_street_1", "cross_street_2",
    "intersection_street_1", "intersection_street_2",
    "address_type", "city", "landmark", "facility_type", "status",
    "community_board", "open_data_channel_type",
    "park_facility_name", "park_borough",
    "vehicle_type", "taxi_company_borough", "taxi_pick_up_location",
    "bridge_highway_name", "bridge_highway_direction",
    "road_ramp", "bridge_highway_segment",
]

# Columns that are parsed into datetimes.
date_cols = ["created_date", "closed_date", "due_date"]

# Prediction target.
dep_var = ["complaint_type"]

# Columns removed before modelling, with the reason for each.
drop_cols = [
    "unique_key",                      # possible leakage source; worth investigating
    "agency_name",                     # redundant with `agency`
    "descriptor",                      # gives away the complaint type (too easy)
    "incident_address",                # street numbers would make the task too easy
    "bbl",                             # enough geolocation columns already
    "location",                        # redundant with latitude / longitude
    "resolution_action_updated_date",  # caused errors
    # Candidate for later use:
    "resolution_description",          # probably very useful, but no time to use it
]


In [None]:
# Complaint types kept for modelling; rows whose `complaint_type` is not in this
# list are filtered out in a later cell.
# Entries are matched as exact strings, so near-duplicate spellings
# (e.g. 'PLUMBING' / 'Plumbing', 'Derelict Vehicle' / 'Derelict Vehicles')
# are distinct classes.
labels = ['APPLIANCE', 'Abandoned Vehicle', 'Air Quality', 'Animal Abuse',
       'Animal in a Park', 'Asbestos', 'BEST/Site Safety',
       'Benefit Card Replacement', 'Blocked Driveway', 'Boilers',
       'Borough Office', 'Broken Muni Meter', 'Broken Parking Meter',
       'Building Marshals office', 'Building/Use',
       'Bus Stop Shelter Complaint', 'Construction',
       'Construction Safety Enforcement', 'Consumer Complaint',
       'Curb Condition', 'DCA / DOH New License Application Request',
       'DHS Advantage - Tenant', 'DHS Advantage -Landlord/Broker',
       'DOF Parking - Payment Issue', 'DOF Parking - Request Copy',
       'DOF Parking - Request Status', 'DOF Property - Owner Issue',
       'DOF Property - Payment Issue', 'DOF Property - Reduction Issue',
       'DOF Property - Request Copy', 'DOF Property - Update Account',
       'DOOR/WINDOW', 'DPR Internal', 'Damaged Tree', 'Dead Tree',
       'Dead/Dying Tree', 'Derelict Bicycle', 'Derelict Vehicle',
       'Derelict Vehicles', 'Dirty Conditions', 'Drinking', 'ELECTRIC',
       'Electrical', 'Electronics Waste', 'Elevator',
       'Emergency Response Team (ERT)', 'FLOORING/STAIRS',
       'Food Establishment', 'Food Poisoning',
       'For Hire Vehicle Complaint', 'GENERAL', 'GENERAL CONSTRUCTION',
       'General Construction/Plumbing', 'Graffiti', 'HEAT/HOT WATER',
       'HEATING', 'HPD Literature Request', 'Hazardous Materials',
       'Highway Condition', 'Homeless Encampment',
       'Homeless Person Assistance', 'Housing - Low Income Senior',
       'Housing Options', 'Illegal Parking', 'Illegal Tree Damage',
       'Indoor Air Quality', 'Indoor Sewage', 'Industrial Waste',
       'Investigations and Discipline (IAD)', 'Lead',
       'Litter Basket / Request', 'Maintenance or Facility',
       'Miscellaneous Categories', 'Missed Collection (All Materials)',
       'NONCONST', 'New Tree Request', 'Noise', 'Noise - Commercial',
       'Noise - Helicopter', 'Noise - House of Worship', 'Noise - Park',
       'Noise - Residential', 'Noise - Street/Sidewalk',
       'Noise - Vehicle', 'Noise Survey', 'Non-Emergency Police Matter',
       'Non-Residential Heat', 'OEM Literature Request',
       'OUTSIDE BUILDING', 'Other Enforcement',
       'Overflowing Litter Baskets', 'Overgrown Tree/Branches',
       'PAINT - PLASTER', 'PAINT/PLASTER', 'PLUMBING', 'Plumbing',
       'Recycling Enforcement', 'Request Large Bulky Item Collection',
       'Rodent', 'Root/Sewer/Sidewalk Condition', 'SAFETY', 'SCRIE',
       'Sanitation Condition', 'School Maintenance', 'Sewer',
       'Sidewalk Condition', 'Smoking', 'Snow',
       'Special Projects Inspection Team (SPIT)', 'Standing Water',
       'Street Condition', 'Street Light Condition',
       'Street Sign - Damaged', 'Street Sign - Dangling',
       'Street Sign - Missing', 'Taxi Complaint', 'Taxi Report',
       'Traffic', 'Traffic Signal Condition', 'Traffic/Illegal Parking',
       'UNSANITARY CONDITION', 'Unleashed Dog',
       'Unsanitary Animal Pvt Property', 'Vacant Lot', 'Vending',
       'Violation of Park Rules', 'WATER LEAK', 'Water Conservation',
       'Water Quality', 'Water System']

In [None]:
# Working frame: remove the excluded columns, then keep only the rows whose
# complaint type is one of the known labels.
df = (
    orig_data
    .drop(columns=drop_cols)
    .loc[lambda frame: frame['complaint_type'].isin(labels)]
)

In [None]:
# Preview the working frame after dropping columns and filtering by label.
df.head()

In [None]:
# Convert every column named in `date_cols` to pandas datetimes, column by column.
df[date_cols] = df[date_cols].apply(pd.to_datetime)


In [None]:
# Derive three duration features, each in whole hours (floored). Rows where one
# of the two dates is missing yield NaN.
#
# `.dt.total_seconds() // 3600` replaces `.astype('timedelta64[h]')`: the astype
# form produced floored float hours on older pandas but raises on pandas >= 2.0,
# which only accepts s/ms/us/ns timedelta resolutions.
SECONDS_PER_HOUR = 3600
df['time_to_close'] = (df['closed_date'] - df['created_date']).dt.total_seconds() // SECONDS_PER_HOUR
df['due_len'] = (df['due_date'] - df['created_date']).dt.total_seconds() // SECONDS_PER_HOUR
# NOTE(review): this is positive when the request closed *before* its due date;
# confirm that is the intended sign for a column named `time_over`.
df['time_over'] = (df['due_date'] - df['closed_date']).dt.total_seconds() // SECONDS_PER_HOUR

# Register the features as continuous inputs only when they are not listed yet,
# so that a list which already names them, or a re-run of this cell, does not
# hand duplicated columns to the model.
cont_cols += [
    col for col in ('time_to_close', 'due_len', 'time_over')
    if col not in cont_cols
]

In [None]:
# Share of rows assigned to training.
train_perc = 0.8
# Row position marking the end of the first `train_perc` share of `df`
# (truncated to an integer).
valid_idx = int(train_perc * len(df))

In [None]:
from fastai import *
from fastai.tabular import *

In [None]:
# Name of the target column the model predicts.
dep_var = 'complaint_type'
# Preprocessing steps passed as `procs` when the tabular data is built.
procs = [FillMissing, Categorify, Normalize]

In [None]:
# Build the fastai DataBunch from the working frame.
#
# * Validation rows are drawn at random (fixed seed) instead of taking the
#   unshuffled tail of the frame. With the tail split the recorded run showed a
#   validation loss of ~1064 against a training loss of ~1.05 and ~2% accuracy,
#   which points to the tail not being representative of the training rows.
# * `cont_names` is de-duplicated (order preserved) so that a continuous column
#   listed twice is not fed to the model twice.
data = (
    TabularList.from_df(
        df,
        path='.',
        cat_names=cat_cols,
        cont_names=list(dict.fromkeys(cont_cols)),
        procs=procs,
    )
    .split_by_rand_pct(valid_pct=1 - train_perc, seed=42)
    .label_from_df(cols=dep_var)
    .databunch()
)

In [None]:
# Create a tabular learner with layer sizes 200 and 100 that reports accuracy,
# then train it for one epoch at a learning rate of 1e-4.
hidden_layers = [200, 100]
n_epochs = 1
learning_rate = 1e-4

learn = tabular_learner(data, layers=hidden_layers, metrics=accuracy)
learn.fit(n_epochs, learning_rate)

Train Loss: 1.053598	Val Loss: 1064.030518	Accuracy: 0.022021