In [1]:
%pylab nbagg

Populating the interactive namespace from numpy and matplotlib


In [80]:
from pathlib import Path
from os import listdir
import pandas as pd
from datetime import datetime, timedelta
from importlib import reload
import utils
import pickle
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from scipy.stats import mode

In [81]:
from fastai.tabular import *

In [82]:
reload(utils)

<module 'utils' from 'C:\\Users\\efreiling\\Desktop\\new_york_311\\Notebooks\\utils.py'>

# Load the data
- We are only dealing with 10% of the data for memory and time purposes

In [4]:
data_folder = Path('../../data/new_york_311')
data_path = data_folder / 'data_311.csv'

In [5]:
orig_data = pd.read_csv(data_path)

  interactivity=interactivity, compiler=compiler, result=result)


# Data Processing

### Drop Certain Columns

One of the main columns that would be useful is resolution description. Depending on the scenario or application this could be considered to have leakage. However, if we were to include it in the features, we can try several approaches:
- Drop stop words
- Convert words to vectors using word2vec, take the average of the vectors
- Sklearn HashingVectorizer
- Use Tf-Idf to find important words, use word2vec on top N words

In [6]:
drop_cols = [
    'unique_key',                     # If interested in finding hidden leakage, investigate this column
    'agency_name',                    # Redundant to agancy
    'descriptor',                     # This gives away the complaint type, leakage
    'incident_address',               # Didnt want street numbers
    'bbl',                            # We already have enough goelocaions
    'location',                       # Redundant to Lat and Lng
    'resolution_action_updated_date', # caused errors
    'resolution_description'          # Probably very useful but not enough time to use and maybe leakage
]


In [102]:
df = utils.drop_useless_cols(orig_data, drop_cols)       

### Filter Rows
- Only use rows that contain the 130 complaint types
- Filter rows that are all NaN

In [103]:
df = utils.filter_rows(df)

### Define Continuous, Categorical, Date Columns

# Data Product 

Instead of building a model on the complaint type, I think it would be interesting to build a model on predicting the time elapsed between date created and closed date. I think this would be more product focused. Given a complaint, a user/manager can use the model to determine if it is going to take a long time to close. Maybe resources could be better allocated for quicler resolutions.

### Complaint type is a Cat Col

In [104]:
cont_cols = [
    'x_coordinate_state_plane',
    'y_coordinate_state_plane',
    'latitude',
    'longitude'
]
cat_cols = [
    'complaint_type',
    'agency',
    'borough',
    'location_type',
    'incident_zip',
    'street_name',
    'cross_street_1',
    'cross_street_2',
    'intersection_street_1',
    'intersection_street_2',
    'address_type',
    'city',
    'landmark',
    'facility_type',
    'status',
    'community_board',
    'open_data_channel_type',
    'park_facility_name',
    'park_borough',
    'vehicle_type',
    'taxi_company_borough',
    'taxi_pick_up_location',
    'bridge_highway_name',
    'bridge_highway_direction',
    'road_ramp',
    'bridge_highway_segment',
]
date_cols = [
    'created_date',
    'closed_date',
    'due_date',
]


### Process All Date Columns Except Closed Date
- Add columns for time elapsed between dates
- Add features for day of week, end of year, etc

In [105]:
# Convert all date cols to datetimes
for dc in date_cols:
    df[dc] = pd.to_datetime(df[dc])

# add columns for time elapsed between dates
df['time_to_close'] = (df['closed_date']-df['created_date']).astype('timedelta64[h]')
df['due_len'] = (df['due_date']-df['created_date']).astype('timedelta64[h]')
# df['time_over'] = (df['due_date']-df['closed_date']).astype('timedelta64[h]')
cont_cols += ['time_to_close', 'due_len']
df, cont_cols = utils.create_date_lengths(df, cont_cols)
date_cols.remove('closed_date')
# Add date boolean features, day of week, end of year, etc
for d in date_cols:
    add_datepart(df, d, drop=True)

# Keep track of created categorical columns
cat_cols += list(set(df.columns) - set(cont_cols) - set(cat_cols) - set(dep_var))

In [106]:
cat_cols.remove('closed_date')
df = df.drop(columns=['closed_date'])
dep_var = ['time_to_close']

### Process Continuous Columns
- Fill in missing values with the median of the column, then add another feature that idicates which rows had missing values
- Normalize Z-score

In [107]:
df, cont_cols, cat_cols = utils.process_cont_cols(df, cont_cols, cat_cols)

### Process Categorical Columns
- Ordinal Encoding for columns with number of classes greater than 20
- One hot encoding for columns with number of classes 20 or less

In [108]:
df, cat_cols, label_encoders = utils.process_cat_cols(df, cat_cols)

### Split Dataframe into Features and Target

In [109]:
y = df[dep_var[0]]
x = df.drop(columns=dep_var)

# Train a Random Forest Regressor and Compare to Naive Solution

In [133]:
seed = 45
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.2, random_state=seed)
clf = RandomForestRegressor(max_depth=4, random_state=seed)

In [111]:
clf.fit(train_x, train_y)



RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=4,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=10,
                      n_jobs=None, oob_score=False, random_state=45, verbose=0,
                      warm_start=False)

In [120]:
preds = clf.predict(test_x)
rf_error = np.mean(np.abs(preds - np.array(test_y)))
naive_error = np.mean(np.abs(np.array(test_y)))
print('Accuracy')
print(f'Random Forest: {rf_error}, Naive: {naive_error}')

Accuracy
Random Forest: 0.022147717147440557, Naive: 0.03878168471282954


### Look at Feature Importance

In [114]:
importances = clf.feature_importances_

In [115]:
indices = np.argsort(importances)[::-1]

In [116]:
list(zip(train_x.columns[indices], importances[indices]))

[('time_over', 0.999262727257816),
 ('agency=DOB', 0.0004515258965276997),
 ('created_Elapsed', 0.00017768196978527315),
 ('due_Elapsed', 4.7922780976852544e-05),
 ('created_Year=2017', 2.4968938565360684e-05),
 ('due_len', 2.12454244010075e-05),
 ('due_Year=2017.0', 1.3913108836627094e-05),
 ('created_Dayofyear', 4.223636638326871e-09),
 ('due_Year=2018.0', 3.1382220175657653e-09),
 ('due_Week', 2.9819429626778257e-09),
 ('created_Month=7', 2.863481000812537e-09),
 ('created_Day', 5.416056945449304e-10),
 ('created_Month=4', 3.551070150067043e-10),
 ('created_Week', 3.373685296890105e-10),
 ('created_Month=5', 1.781644959481562e-10),
 ('park_borough=QUEENS', 3.562818857003088e-12),
 ('status=Started', 0.0),
 ('status=Unassigned', 0.0),
 ('time_over_missing=1.0', 0.0),
 ('status=Pending', 0.0),
 ('status=missing', 0.0),
 ('status=Open', 0.0),
 ('status=In Progress', 0.0),
 ('status=Draft', 0.0),
 ('status=Closed - Testing', 0.0),
 ('status=Closed', 0.0),
 ('status=Assigned', 0.0),
 ('f

### Drop Features that aren't Important

In [117]:
drop_cols_loc = np.where(importances < 1e-12)[0]
drop_cols = list(np.array(train_x.columns)[drop_cols_loc])
train_x = train_x.drop(columns=drop_cols)
test_x = test_x.drop(columns=drop_cols)

In [118]:
train_x.shape

(1583727, 16)

In [119]:
clf.fit(train_x, train_y)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=4,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=10,
                      n_jobs=None, oob_score=False, random_state=45, verbose=0,
                      warm_start=False)

In [121]:
preds = clf.predict(test_x)
rf_error = np.mean(np.abs(preds - np.array(test_y)))
naive_error = np.mean(np.abs(np.array(test_y)))
print('Accuracy')
print(f'Random Forest: {rf_error}, Naive: {naive_error}')

Accuracy
Random Forest: 0.022147717147441452, Naive: 0.03878168471282954


# Hyperparameter Tuning

In [122]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

### Down Sample Data for Speed Up

In [123]:
train_x['target'] = train_y
x_train = train_x.sample(frac=0.1, replace=False)
y_train = x_train['target']
x_train = x_train.drop(columns=['target'])

### Vary Number of Trees

In [130]:
n_trees = [50, 100, 150, 200]
depth = [4, 8, 12]
tree_scores = []
for t in n_trees:
    print(f'Num Trees: {t}')
    rf = RandomForestRegressor(max_depth=depth[0], n_estimators=t)
    cross_val = cross_val_score(rf, x_train, y_train, cv=3, scoring='neg_mean_absolute_error')
    print(cross_val)
    tree_scores.append(cross_val)
tree_scores = np.array(tree_scores)
tree_avg = np.mean(tree_scores, axis=1)
idx = np.argmax(tree_avg)
num_trees = n_trees[idx]
print("Best num trees: ", num_trees)


Num Trees: 50
[-0.021753 -0.022265 -0.021911]
Num Trees: 100
[-0.021728 -0.022243 -0.021877]
Num Trees: 150
[-0.021719 -0.022225 -0.021918]
Num Trees: 200
[-0.0217   -0.022264 -0.021897]
Best num trees:  100


### Vary Tree Depth

In [132]:
depth_scores = []
for d in depth:
    print(f"Depth: {d}")
    rf = RandomForestRegressor(max_depth=d, n_estimators=50)
    cross_val = cross_val_score(rf, x_train, y_train, cv=3, scoring='neg_mean_absolute_error')
    print(cross_val)
    depth_scores.append(cross_val)
depth_scores = np.array(depth_scores)
np.save("depth_scores", depth_scores)
depth_avg = np.mean(depth_scores, axis=1)
idx = np.argmax(depth_avg)
print("Best depth: ", depth[idx])

Depth: 4
[-0.021713 -0.022313 -0.021914]
Depth: 8
[-0.017878 -0.018449 -0.017788]
Depth: 12
[-0.016409 -0.016786 -0.016294]
Best depth:  12


# Dont drop other columns

In [134]:
seed = 45
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.2, random_state=seed)

In [139]:
train_x['target'] = train_y
x_train = train_x.sample(frac=0.1, replace=False)
y_train = x_train['target']
x_train = x_train.drop(columns=['target'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [136]:
depth_scores = []
for d in depth:
    print(f"Depth: {d}")
    rf = RandomForestRegressor(max_depth=d, n_estimators=50)
    cross_val = cross_val_score(rf, x_train, y_train, cv=3, scoring='neg_mean_absolute_error')
    print(cross_val)
    depth_scores.append(cross_val)
depth_scores = np.array(depth_scores)
np.save("depth_scores", depth_scores)
depth_avg = np.mean(depth_scores, axis=1)
idx = np.argmax(depth_avg)
print("Best depth: ", depth[idx])

Depth: 4
[-0.022304 -0.022537 -0.02185 ]
Depth: 8
[-0.017478 -0.017565 -0.017211]
Depth: 12
[-0.015421 -0.015557 -0.015158]
Best depth:  12


In [140]:
rf = RandomForestRegressor(max_depth=12, n_estimators=50)
rf.fit(x_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=12,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=50,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [141]:
importances = rf.feature_importances_
indices = np.argsort(importances)[::-1]
list(zip(train_x.columns[indices], importances[indices]))

[('time_over', 0.9975609891733058),
 ('due_len', 0.0004319570794085206),
 ('agency=DOB', 0.0003555415613521718),
 ('created_Elapsed', 0.0003318855595745658),
 ('complaint_type', 0.00016321353491971047),
 ('created_Day', 8.896632661640047e-05),
 ('street_name', 8.842144145503708e-05),
 ('longitude', 7.943391257544284e-05),
 ('y_coordinate_state_plane', 7.067044789836401e-05),
 ('x_coordinate_state_plane', 6.548168692913363e-05),
 ('created_Dayofyear', 6.349475948380524e-05),
 ('community_board', 6.084915796843199e-05),
 ('latitude', 5.8606471998068886e-05),
 ('incident_zip', 5.606748169897259e-05),
 ('cross_street_1', 5.22899393843127e-05),
 ('created_Week', 3.500279149162649e-05),
 ('cross_street_2', 3.418263283379376e-05),
 ('facility_type=missing', 3.3467264473625715e-05),
 ('city', 3.129475922002533e-05),
 ('created_Dayofweek=6', 3.0075477801048807e-05),
 ('due_Elapsed', 2.552879421044456e-05),
 ('created_Month=10', 2.0941607122615773e-05),
 ('time_to_close_missing=0.0', 2.039865097