In [1]:
%pylab nbagg

Populating the interactive namespace from numpy and matplotlib


In [2]:
from pathlib import Path
from os import listdir

In [3]:
import pandas as pd
from datetime import datetime, timedelta

In [4]:
from importlib import reload
import utils
import pickle

In [24]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from scipy.stats import mode

In [6]:
reload(utils)

<module 'utils' from '/storage/new_york_311/Notebooks/utils.py'>

# Load the data
- We are only dealing with 10% of the data for memory and time purposes

In [7]:
data_folder = Path('../../data/new_york_311')
data_path = data_folder / 'data_311.csv'

In [12]:
orig_data = pd.read_csv(data_path)

  interactivity=interactivity, compiler=compiler, result=result)


# Data Processing

### Drop Certain Columns

One of the main columns that would be useful is resolution description. Depending on the scenario or application this could be considered to have leakage. However, if we were to include it in the features, we can try several approaches:
- Drop stop words
- Convert words to vectors using word2vec, take the average of the vectors
- Sklearn HashingVectorizer
- Use Tf-Idf to find important words, use word2vec on top N words

In [None]:
drop_cols = [
    'unique_key',                     # If interested in finding hidden leakage, investigate this column
    'agency_name',                    # Redundant to agancy
    'descriptor',                     # This gives away the complaint type, leakage
    'incident_address',               # Didnt want street numbers
    'bbl',                            # We already have enough goelocaions
    'location',                       # Redundant to Lat and Lng
    'resolution_action_updated_date', # caused errors
    # Maybe Use if there is time
    'resolution_description'         # Probably very useful but not enough time to use and maybe leakage
]


In [13]:
df = utils.drop_useless_cols(orig_data, drop_cols)       

### Filter Rows
Only use rows that contain the 130 complaint types

In [15]:
df = utils.filter_rows(df)

### Define Continuous, Categorical, Date Columns

In [16]:
cont_cols = [
    'x_coordinate_state_plane',
    'y_coordinate_state_plane',
    'latitude',
    'longitude',
    'time_to_close',
    'due_len',
    'time_over'
]
cat_cols = [
    'agency',
    'borough',
    'location_type',
    'incident_zip',
    'street_name',
    'cross_street_1',
    'cross_street_2',
    'intersection_street_1',
    'intersection_street_2',
    'address_type',
    'city',
    'landmark',
    'facility_type',
    'status',
    'community_board',
    'open_data_channel_type',
    'park_facility_name',
    'park_borough',
    'vehicle_type',
    'taxi_company_borough',
    'taxi_pick_up_location',
    'bridge_highway_name',
    'bridge_highway_direction',
    'road_ramp',
    'bridge_highway_segment',
]
date_cols = [
    'created_date',
    'closed_date',
    'due_date',
]
dep_var = ['complaint_type']

### Process Date Columns
- Add columns for time elapsed between dates
- Add features for day of week, end of year, etc

In [17]:
 df, cont_cols, cat_cols = utils.process_date_cols(df, cont_cols, cat_cols, date_cols, dep_var)

### Process Continuous Columns
- Fill in missing values with the median of the column, then add another feature that idicates which rows had missing values
- Normalize Z-score

In [18]:
df, cont_cols, cat_cols = utils.process_cont_cols(df, cont_cols, cat_cols)

### Process Categorical Columns
- Ordinal Encoding for columns with number of classes greater than 20
- One hot encoding for columns with number of classes 20 or less

In [19]:
df, cat_cols, label_encoders = utils.process_cat_cols(df, cat_cols)

### Split Dataframe into Features and Target

In [20]:
x, y, label_encoders = utils.split_target(df, dep_var[0], label_encoders)

# Train a Random Forest Classifier and Compare to Naive Solution

In [22]:
seed = 33
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.2, random_state=seed)
clf = RandomForestClassifier(max_depth=4, random_state=seed)

In [23]:
clf.fit(train_x, train_y)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=4, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=33, verbose=0,
                       warm_start=False)

In [25]:
preds = clf.predict(test_x)
correct_loc = np.where(preds == test_y)[0]

In [28]:
perc_correct = len(correct_loc) / len(preds)
perc_correct_naive = len(np.where(test_y == mode(test_y)[0][0])[0]) / len(test_y)

In [30]:
print('Accuracy:')
print(f'Random Forest: {round(perc_correct, 4)}, Mode: {round(perc_correct_naive, 4)}')

Accuracy:
Random Forest: 0.4234, Mode: 0.0819


### Next Step 
- Find the hyperparameters using a randomized search like below
- This uses k fold cross validation, which can cause problems when trying to predict the future
- Use randomized search with k fold to find the right parameters, then train a model using the same parameters on data 2010 - 2018 and validate on 2019

In [None]:
# rf_random = RandomizedSearchCV(
#     estimator = clf, 
#     param_distributions = random_grid, 
#     n_iter = 10,                            # number of random trials 
#     cv = 3,                                 # K in k fold
#     verbose=10,                             # How much updating and printing
#     random_state=seed, 
#     n_jobs = -1
# )
# rf_random.fit(train_x, train_y)