In [1]:
%pylab nbagg

Populating the interactive namespace from numpy and matplotlib


In [2]:
from pathlib import Path
from os import listdir
import pandas as pd
from datetime import datetime, timedelta
from importlib import reload
import utils
import pickle
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from scipy.stats import mode

In [63]:
# Re-import the local utils module so edits to utils.py take effect without
# restarting the kernel.
reload(utils)

<module 'utils' from 'C:\\Users\\efreiling\\Desktop\\new_york_311\\Notebooks\\utils.py'>

# Load the data
- We are only dealing with 10% of the data for memory and time purposes

In [4]:
# Where the 311 extract lives, relative to the notebook's working directory.
data_folder = Path('../../data/new_york_311')
data_path = data_folder.joinpath('data_311.csv')

In [5]:
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk. That avoids columns holding a mix of types (and
# the DtypeWarning pandas emits for them), at the cost of more memory while
# parsing.
# NOTE(review): passing an explicit dtype= mapping would be cheaper still, but
# the column types are not visible from this notebook.
orig_data = pd.read_csv(data_path, low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


# Data Processing

### Drop Certain Columns

One of the main columns that would be useful is resolution description. Depending on the scenario or application this could be considered to have leakage. However, if we were to include it in the features, we can try several approaches:
- Drop stop words
- Convert words to vectors using word2vec, take the average of the vectors
- Sklearn HashingVectorizer
- Use Tf-Idf to find important words, use word2vec on top N words

In [6]:
# Raw columns removed before feature engineering. The note above each entry
# records why that column is dropped.
drop_cols = [
    # If interested in finding hidden leakage, investigate this column
    "unique_key",
    # Redundant with agency
    "agency_name",
    # This gives away the complaint type (leakage)
    "descriptor",
    # Didn't want street numbers
    "incident_address",
    # We already have enough geolocation columns
    "bbl",
    # Redundant with latitude and longitude
    "location",
    # Caused errors
    "resolution_action_updated_date",
    # Probably very useful, but not enough time to use it and it may be leakage
    "resolution_description",
]


In [76]:
# Build the working frame from the raw data and the drop_cols list.
# NOTE(review): presumably returns a copy of orig_data without those columns;
# confirm against utils.drop_useless_cols.
df = utils.drop_useless_cols(orig_data, drop_cols)

### Filter Rows
- Only use rows that contain the 130 complaint types
- Filter rows that are all NaN

In [77]:
# Apply the row filters described in this section's notes; the actual rules
# live in utils.filter_rows and are not visible here.
df = utils.filter_rows(df)

### Define Continuous, Categorical, Date Columns

In [78]:
# Continuous (numeric) features.
cont_cols = [
    # Coordinates
    "x_coordinate_state_plane",
    "y_coordinate_state_plane",
    "latitude",
    "longitude",
    # Durations -- presumably derived later by utils.process_date_cols; TODO confirm
    "time_to_close",
    "due_len",
    "time_over",
]

# Categorical features.
cat_cols = [
    "agency",
    "borough",
    "location_type",
    "incident_zip",
    # Street / intersection descriptors
    "street_name",
    "cross_street_1",
    "cross_street_2",
    "intersection_street_1",
    "intersection_street_2",
    "address_type",
    "city",
    "landmark",
    "facility_type",
    "status",
    "community_board",
    "open_data_channel_type",
    # Parks
    "park_facility_name",
    "park_borough",
    # Taxi / vehicle
    "vehicle_type",
    "taxi_company_borough",
    "taxi_pick_up_location",
    # Bridges and highways
    "bridge_highway_name",
    "bridge_highway_direction",
    "road_ramp",
    "bridge_highway_segment",
]

# Date columns, handled separately from the two groups above.
date_cols = ["created_date", "closed_date", "due_date"]

# Prediction target (kept as a one-element list; later cells use dep_var[0]).
dep_var = ["complaint_type"]

### Process Date Columns
- Add columns for time elapsed between dates
- Add features for day of week, end of year, etc

In [79]:
 # Date processing returns the frame together with updated continuous and
 # categorical column lists, which replace the ones defined above.
 df, cont_cols, cat_cols = utils.process_date_cols(df, cont_cols, cat_cols, date_cols, dep_var)

### Process Continuous Columns
- Fill in missing values with the median of the column, then add another feature that indicates which rows had missing values
- Normalize Z-score

In [80]:
# Continuous-column processing also hands back both column lists.
# NOTE(review): presumably the missing-value indicator columns described above
# are added to cat_cols; confirm in utils.process_cont_cols.
df, cont_cols, cat_cols = utils.process_cont_cols(df, cont_cols, cat_cols)

### Process Categorical Columns
- Ordinal Encoding for columns with number of classes greater than 20
- One hot encoding for columns with number of classes 20 or less

In [82]:
# Encode the categorical columns; label_encoders is kept because the
# target-split step below takes it as an argument.
df, cat_cols, label_encoders = utils.process_cat_cols(df, cat_cols)

### Split Dataframe into Features and Target

In [None]:
# Separate features (x) from the target named by dep_var[0] (y).
# NOTE(review): label_encoders is passed in and returned, presumably so the
# target's encoder can be added to it; confirm in utils.split_target.
x, y, label_encoders = utils.split_target(df, dep_var[0], label_encoders)

# Train a Random Forest Classifier and Compare to Naive Solution

In [37]:
# Single seed shared by the train/test split and the forest.
seed = 45

# Hold out 20% of the rows for testing.
train_x, test_x, train_y, test_y = train_test_split(
    x,
    y,
    random_state=seed,
    test_size=0.2,
)

# Deliberately shallow first model; every other setting is the library default.
clf = RandomForestClassifier(random_state=seed, max_depth=4)

In [38]:
# Fit the shallow forest on the training split (fit returns the estimator,
# which is why its repr is shown as the cell output).
clf.fit(train_x, train_y)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=4, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=45, verbose=0,
                       warm_start=False)

In [21]:
# Predict the held-out rows and keep the positions of the correct predictions.
preds = clf.predict(test_x)
correct_loc = np.asarray(preds == test_y).nonzero()[0]

In [22]:
# Fraction of held-out rows the forest classified correctly.
perc_correct = len(correct_loc) / len(preds)

# Naive baseline: always predict the single most frequent class.
# The class is taken from the *training* labels, so the baseline gets no more
# access to the test set than the model does. Series.mode() is used instead of
# scipy.stats.mode(...)[0][0], which fails on recent SciPy releases where mode
# returns scalars for 1-D input.
majority_class = pd.Series(train_y).mode().iloc[0]
perc_correct_naive = float(np.mean(np.asarray(test_y) == majority_class))

In [23]:
# Report the forest's test accuracy next to the constant-prediction baseline,
# both rounded to four decimals.
print('Accuracy:')
print(f'Random Forest: {round(perc_correct, 4)}, Mode: {round(perc_correct_naive, 4)}')

Accuracy:
Random Forest: 0.3709, Mode: 0.0819


### Look at Feature Importance

In [25]:
# Per-feature importances of the fitted forest, in the column order it was
# trained on.
importances = clf.feature_importances_

In [27]:
# argsort is ascending; reversing it gives feature positions from most to
# least important.
indices = np.argsort(importances)[::-1]

In [36]:
# (feature name, importance) pairs, most important first.
list(zip(train_x.columns[indices], importances[indices]))

[('location_type', 0.13413712905088399),
 ('facility_type=missing', 0.08736788701911924),
 ('agency=NYPD', 0.06943029322972669),
 ('due_Elapsed', 0.059404650462870266),
 ('due_len_missing=0.0', 0.04644923446253123),
 ('created_Elapsed', 0.045418975289304775),
 ('due_Dayofyear', 0.04341417287545382),
 ('address_type=INTERSECTION', 0.039566238582605),
 ('time_over', 0.037816529474270874),
 ('agency=DOT', 0.03548596458764168),
 ('cross_street_2', 0.02782223136862763),
 ('facility_type=Precinct', 0.026923240392949195),
 ('time_to_close', 0.02299322794584701),
 ('street_name', 0.022283453225225958),
 ('status=Closed', 0.01853681468623733),
 ('closed_Elapsed', 0.01832531226289153),
 ('facility_type=DSNY Garage', 0.01786498417948957),
 ('borough=Unspecified', 0.017473911383111107),
 ('city', 0.014650682921886052),
 ('y_coordinate_state_plane_missing=0.0', 0.013212402361475247),
 ('due_Dayofweek=missing', 0.013122251922591158),
 ('time_over_missing=1.0', 0.012595844129405395),
 ('intersection_

### Drop Features that aren't Important

In [60]:
# Positions of features whose importance is effectively zero.
drop_cols_loc = np.flatnonzero(importances < 1e-6)

# NOTE(review): this rebinds drop_cols, which earlier held the raw columns to
# discard; re-running the raw-column drop cell after this one would use the
# wrong list.
drop_cols = train_x.columns[drop_cols_loc].tolist()

# Remove the same columns from both splits.
train_x, test_x = (split.drop(columns=drop_cols) for split in (train_x, test_x))

In [61]:
# Rows and remaining feature count after dropping the unimportant columns.
train_x.shape

(1583727, 73)

In [62]:
# Refit the same classifier object on the reduced feature set.
clf.fit(train_x, train_y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=4, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=45, verbose=0,
                       warm_start=False)

In [63]:
# Score the refit forest on the held-out rows.
preds = clf.predict(test_x)
correct_loc = np.where(preds == test_y)[0]
perc_correct = len(correct_loc) / len(preds)

# Naive baseline: always predict the most frequent class. The class is taken
# from the training labels (not the test labels), and Series.mode() replaces
# scipy.stats.mode(...)[0][0], which fails on recent SciPy releases where mode
# returns scalars for 1-D input.
majority_class = pd.Series(train_y).mode().iloc[0]
perc_correct_naive = float(np.mean(np.asarray(test_y) == majority_class))

print('Accuracy:')
print(f'Random Forest: {round(perc_correct, 4)}, Mode: {round(perc_correct_naive, 4)}')

Accuracy:
Random Forest: 0.3843, Mode: 0.0812


# Hyperparameter Tuning

In [67]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

### Down Sample Data for Speed Up

In [76]:
# Draw a reproducible 10% sample of the training rows for the slow CV sweeps.
# assign() attaches the label to a copy, so train_x itself never gains a
# 'target' column that could leak the label into a later fit, and the cell can
# be re-run safely. random_state makes the sample repeatable.
sampled = train_x.assign(target=train_y).sample(frac=0.1, replace=False, random_state=seed)
y_train = sampled['target']
x_train = sampled.drop(columns=['target'])

### Vary Number of Trees

In [84]:
n_trees = [50, 100, 150, 200]

# Depth grid used by the notebook's depth sweep (the values shown in its
# output). It is defined here so this cell runs on a fresh kernel; this sweep
# holds the depth at the shallowest value, depth[0].
depth = [4, 8, 12, 16]

tree_scores = []
for t in n_trees:
    print(f'Num Trees: {t}')
    # random_state makes the 3-fold scores repeatable between runs.
    rf = RandomForestClassifier(
        max_depth=depth[0],
        n_estimators=t,
        class_weight="balanced_subsample",
        random_state=seed,
    )
    cross_val = cross_val_score(rf, x_train, y_train, cv=3, scoring="balanced_accuracy")
    print(cross_val)
    tree_scores.append(cross_val)

# Rows are tree counts, columns are CV folds; saved to tree_scores.npy.
tree_scores = np.array(tree_scores)
np.save("tree_scores", tree_scores)

# Pick the tree count with the best mean balanced accuracy across folds.
tree_avg = np.mean(tree_scores, axis=1)
idx = np.argmax(tree_avg)
num_trees = n_trees[idx]
print("Best num trees: ", num_trees)


Num Trees: 50
[0.391306 0.399848 0.384107]
Num Trees: 100
[0.396151 0.405488 0.393057]
Num Trees: 150
[0.416462 0.40443  0.406666]
Num Trees: 200
[0.426623 0.425043 0.407703]
Best num trees:  200


### Vary Tree Depth

In [83]:
# Depth grid for the sweep, defined in this cell so it runs on a fresh kernel
# (the values are the ones shown in the recorded output).
depth = [4, 8, 12, 16]

depth_scores = []
for d in depth:
    print(f"Depth: {d}")
    # The tree count is held at 50; random_state makes the scores repeatable.
    rf = RandomForestClassifier(
        max_depth=d,
        n_estimators=50,
        class_weight="balanced_subsample",
        random_state=seed,
    )
    cross_val = cross_val_score(rf, x_train, y_train, cv=3, scoring="balanced_accuracy")
    print(cross_val)
    depth_scores.append(cross_val)

# Rows are depths, columns are CV folds; saved to depth_scores.npy.
depth_scores = np.array(depth_scores)
np.save("depth_scores", depth_scores)

# Pick the depth with the best mean balanced accuracy across folds.
depth_avg = np.mean(depth_scores, axis=1)
idx = np.argmax(depth_avg)
print("Best depth: ", depth[idx])

Depth: 4
[0.373872 0.392568 0.366777]
Depth: 8
[0.4994   0.494036 0.481254]
Depth: 12
[0.549855 0.540086 0.549453]
Depth: 16
[0.553707 0.549908 0.553146]
Best depth:  16


### Double Check Number of Trees

In [85]:
n_trees = [50, 100]

tree_scores = []
for t in n_trees:
    print(f'Num Trees: {t}')
    # Depth is fixed at 12 for this re-check; random_state makes the 3-fold
    # scores repeatable between runs.
    rf = RandomForestClassifier(
        max_depth=12,
        n_estimators=t,
        class_weight="balanced_subsample",
        random_state=seed,
    )
    cross_val = cross_val_score(rf, x_train, y_train, cv=3, scoring="balanced_accuracy")
    print(cross_val)
    tree_scores.append(cross_val)

# Rows are tree counts, columns are CV folds.
tree_scores = np.array(tree_scores)

# Pick the tree count with the best mean balanced accuracy across folds.
tree_avg = np.mean(tree_scores, axis=1)
idx = np.argmax(tree_avg)
num_trees = n_trees[idx]
print("Best num trees: ", num_trees)

Num Trees: 50
[0.551502 0.545424 0.54451 ]
Num Trees: 100
[0.555039 0.547201 0.548143]
Best num trees:  100


### Apply New Hyperparameters

In [87]:
# Re-create the full train/test split; the down-sampling cell above modified
# train_x, so the splits are rebuilt from x and y with the same seed.
train_x, test_x, train_y, test_y = train_test_split(
    x,
    y,
    random_state=seed,
    test_size=0.2,
)

# NOTE(review): the sweeps above reported depth 16 and 100 trees as best; this
# model uses depth 12 and 50 trees, and no class weighting. Confirm that the
# cheaper settings are intentional.
clf = RandomForestClassifier(random_state=seed, n_estimators=50, max_depth=12)

In [88]:
# Remove the low-importance features (listed in drop_cols) from both new splits.
train_x, test_x = (split.drop(columns=drop_cols) for split in (train_x, test_x))

In [89]:
# Fit the deeper forest on the full training split.
clf.fit(train_x, train_y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=12, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=50,
                       n_jobs=None, oob_score=False, random_state=45, verbose=0,
                       warm_start=False)

In [90]:
# Score the tuned forest on the held-out rows.
preds = clf.predict(test_x)

# Positions where the prediction matches the label, and the resulting accuracy.
correct_loc = np.asarray(preds == test_y).nonzero()[0]
perc_correct = correct_loc.shape[0] / preds.shape[0]

print('Accuracy:')
print(f'Random Forest: {round(perc_correct, 4)}')

Accuracy:
Random Forest: 0.6386


# Date Validation

In [2]:
# Validation set: rows whose one-hot created-year flag is 2018 or 2019.
is_2018 = x['created_Year=2018'].eq(1)
is_2019 = x['created_Year=2019'].eq(1)
val_filt = is_2018 | is_2019

val_x, val_y = x.loc[val_filt], y[val_filt]
val_x.shape, val_y.shape

In [None]:
# Training set: rows created in 2010-2017.
# The mask is built from x, the frame it is applied to and the same frame the
# validation mask uses, instead of from df, so the mask and the data it
# selects cannot get out of step.
train_years = range(2010, 2018)
trn_filt = x[[f'created_Year={year}' for year in train_years]].eq(1).any(axis=1)

trn_x = x.loc[trn_filt]
trn_y = y[trn_filt]
trn_x.shape, trn_y.shape

In [None]:
# Remove the low-importance features (listed in drop_cols) from both
# date-based splits.
trn_x, val_x = (split.drop(columns=drop_cols) for split in (trn_x, val_x))

In [None]:
# Same settings as the tuned model above, trained only on the earlier years so
# the later years act as an out-of-time validation set.
clf = RandomForestClassifier(max_depth=12, n_estimators=50, random_state=seed)
clf.fit(trn_x, trn_y)

In [None]:
# Output recorded from an earlier run of the fit above. It is an estimator
# repr, not code to execute; several of its arguments no longer exist in newer
# scikit-learn releases, so running it would fail.
#
# RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
#                        max_depth=12, max_features='auto', max_leaf_nodes=None,
#                        min_impurity_decrease=0.0, min_impurity_split=None,
#                        min_samples_leaf=1, min_samples_split=2,
#                        min_weight_fraction_leaf=0.0, n_estimators=50,
#                        n_jobs=None, oob_score=False, random_state=45, verbose=0,
#                        warm_start=False)

In [None]:
# Score the model trained on earlier years against the 2018-2019 rows.
preds = clf.predict(val_x)

# Positions where the prediction matches the label, and the resulting accuracy.
correct_loc = np.asarray(preds == val_y).nonzero()[0]
perc_correct = correct_loc.shape[0] / preds.shape[0]

print('Accuracy:')
print(f'Random Forest: {round(perc_correct, 4)}')

In [None]:
# Output recorded from an earlier run of the evaluation above (kept as a
# comment because the raw text is not valid Python):
#
# Accuracy:
# Random Forest: 0.578

### Accuracy suffers, but note that this split also leaves out the rows where created_Year = NaN

# Next Steps
- Look at confusion matrix 
- Try combining categories, e.g. 'Noise - Street/Sidewalk' and 'Noise - Vehicle'
- Search other hyperparameters (max_features, etc) using a randomized search, RandomizedSearchCV 
- Experiment with different ways to handle categorical, try nn.Embeddings
- Figure out why fastai over trains