In [None]:
cat /proc/cpuinfo

In [None]:
# from dask.distributed import Client, progress
# from sklearn.externals.joblib import parallel_backend

# client = Client(processes=False)
# # client = Client(processes=False, n_workers=4, threads_per_worker=8)
# client
# # client.close()

In [16]:
import pandas as pd
import numpy as np

# Tools
from collections import Counter
import pickle

# Preprocessing & Selections
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.feature_selection import SelectKBest, chi2, f_classif
from sklearn.model_selection import train_test_split

# Sampling
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss
from imblearn.under_sampling import RandomUnderSampler

In [17]:
# Load dataframe
df = pd.read_pickle('../data/02_df_pre_model_2018.pkl')

# # Convert to Dask dataframe
# df = dd.from_pandas(df_pd, npartitions=16)
df.head()

Unnamed: 0,case_id,target,opened,closed,updated,status,status_notes,responsible_agency,category,request_type,...,opened_year,opened_month_sin,opened_month_cos,opened_week_sin,opened_week_cos,opened_day_sin,opened_day_cos,opened_hour_sin,opened_hour_cos,resolution_time
811474,9993791,1,2018-12-30 22:40:00,2018-12-30 23:13:06,2018-12-30 23:13:06,Closed,Case Resolved - Officer responded to request u...,Parking Enforcement Dispatch Queue,Parking Enforcement,Parking_on_Sidewalk,...,2018,-2.449294e-16,1.0,-2.449294e-16,1.0,-0.781831,0.62349,-0.5,0.866025,33.1
811481,9993771,1,2018-12-30 22:18:00,2018-12-30 22:19:29,2018-12-30 22:19:29,Closed,Case is Invalid - Contact name and phone numbe...,Parking Enforcement Dispatch Queue,Parking Enforcement,Blocking_Driveway_Cite_Only,...,2018,-2.449294e-16,1.0,-2.449294e-16,1.0,-0.781831,0.62349,-0.5,0.866025,1.483333
811482,9993764,0,2018-12-30 22:14:00,2018-12-30 23:12:50,2018-12-30 23:12:50,Closed,Case Resolved - Officer responded to request u...,Parking Enforcement Dispatch Queue,Parking Enforcement,Blocking_Driveway_Cite_Tow,...,2018,-2.449294e-16,1.0,-2.449294e-16,1.0,-0.781831,0.62349,-0.5,0.866025,58.833333
811494,9993737,0,2018-12-30 21:53:14,2018-12-30 22:18:00,2018-12-30 22:18:00,Closed,Case Transferred - See encampment sr#9993757,311 Supervisor Queue,General Request - PUBLIC WORKS,request_for_service,...,2018,-2.449294e-16,1.0,-2.449294e-16,1.0,-0.781831,0.62349,-0.707107,0.707107,24.766667
811508,9993687,0,2018-12-30 21:16:00,2018-12-30 21:38:46,2018-12-30 21:38:46,Closed,Case Resolved - Police Officer responded to re...,Parking Enforcement Dispatch Queue,Parking Enforcement,Blocking_Driveway_Cite_Only,...,2018,-2.449294e-16,1.0,-2.449294e-16,1.0,-0.781831,0.62349,-0.707107,0.707107,22.766667


In [18]:
# Train and test splitting

# Columns to exclude
exclude_cols = [
    'target', # Target variable
    'case_id',
    'opened', # Feature Eng
    'closed', # Feature Eng
    'updated',
    'status',
    'status_notes', # Needs NLP
    'request_details', # Needs NLP
    'address', # Needs NLP
#     'street', # Convert to 'category' type to get dummies
    'point',

    # New items
    'responsible_agency',
    'category', # Need to choose 'category' or 'request_type' NOT BOTH
#     'request_type', # Needs NLP
    'opened_year',
#     'opened_month_sin',
#     'opened_month_cos',
#     'opened_week_sin',
#     'opened_week_cos',
#     'opened_day_sin',
#     'opened_day_cos',
#     'opened_hour_sin',
#     'opened_hour_cos',
    'police_district',
    'supervisor_district',
    'latitude',
    'longitude',
]

# # Scale data using MinMax scaler
# # No need to standardize as all features are categorical (maybe scale lat/long....)
# scaler = MinMaxScaler()

# Predictor variables
X = df.drop(columns=exclude_cols, axis=0, inplace=False)

# Get dummies for categorical variables
X = pd.get_dummies(X, drop_first=True)

# Target variable
y = df['target']

# Split train and test
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2, 
                                                    random_state=2020, 
                                                    stratify=y,  # Stratify to keep same class ratios
                                                    shuffle=True # Shuffle data since it's ordered chronologically
                                                   )
X_train.head()

Unnamed: 0,opened_month_sin,opened_month_cos,opened_week_sin,opened_week_cos,opened_day_sin,opened_day_cos,opened_hour_sin,opened_hour_cos,resolution_time,request_type_Abandoned Vehicle - Car2door,...,neighborhood_Westwood Park,neighborhood_Yerba Buena Island,source_Integrated Agency,source_Mail,source_Mobile/Open311,source_Other Department,source_Phone,source_Twitter,source_Web,has_media_1
1105065,-0.5,-0.866025,-0.2393157,-0.970942,0.0,1.0,0.258819,-0.965926,1374.0,0,...,0,0,0,0,1,0,0,0,0,1
1242659,0.8660254,-0.5,0.9350162,-0.354605,-0.974928,-0.222521,-0.965926,-0.258819,13356.0,0,...,0,0,0,0,0,0,0,0,1,0
1223943,0.5,-0.866025,0.8229839,-0.568065,0.781831,0.62349,-0.866025,0.5,354.183333,0,...,0,0,0,0,1,0,0,0,0,0
1125416,1.224647e-16,-1.0,1.224647e-16,-1.0,0.781831,0.62349,-0.866025,0.5,5.266667,0,...,0,0,0,0,0,0,0,0,1,1
1137108,1.224647e-16,-1.0,0.1205367,-0.992709,0.974928,-0.222521,0.965926,-0.258819,97.0,0,...,0,0,0,0,0,0,1,0,0,0


In [33]:
# Pickle for later use
with open('../data/03_X.pkl', 'wb') as f:
    pickle.dump(X, f)
    f.close()

# # TEMP
# X_train.to_pickle('../data/X_train.pkl')
# X_test.to_pickle('../data/X_test.pkl')
# y_train.to_pickle('../data/y_train.pkl')
# y_test.to_pickle('../data/y_test.pkl')

# Feature Selection

In [20]:
def select_features(X_train, y_train, X_test):
    '''Returns X_train, X_test, and feature selection function'''
    fs = SelectKBest(score_func=f_classif, k='all')
    fs.fit(X_train, y_train)
#     X_train_fs = fs.transform(X_train)
#     X_test_fs = fs.transform(X_test)
#     return X_train_fs, X_test_fs, fs
    return fs

# Feature selection
# X_train_fs, X_test_fs, fs = select_features(X_train, y_train, X_test)
fs = select_features(X_train, y_train, X_test)

 1526 1584 1585 1639 1713 1748 1840 1915 1974 1993 2021 2058 2280 2293
 2312 2379 2384 2447 2500 2555 2574 2592 2783] are constant.
  f = msb / msw


In [21]:
# Feature scores
features_df = pd.DataFrame(data=[X_train.columns, fs.scores_.astype(int)]).transpose()
features_df.rename(columns={0: 'Feature', 1: 'ANOVA F-Value'}, inplace=True)
features_df.sort_values(by='ANOVA F-Value', ascending=False, inplace=True)
features_df.reset_index(drop=True, inplace=True)
features_df

Unnamed: 0,Feature,ANOVA F-Value
0,request_type_Abandoned Vehicles,39328
1,request_type_Bulky Items,20767
2,request_type_Abandoned Vehicle - Car4door,8708
3,request_type_Encampment Reports,6761
4,request_type_Parking_on_Sidewalk,4354
...,...,...
2785,street_AVENUE E,-9223372036854775808
2786,street_Adam Rodgers Park,-9223372036854775808
2787,street_HWY 101 TO I-80 RAMP,-9223372036854775808
2788,street_BLANCHE ST,-9223372036854775808


In [22]:
# Select features above threshold
threshold = 8
best_features_df = features_df[(features_df['ANOVA F-Value'] > threshold)]
best_features_df

Unnamed: 0,Feature,ANOVA F-Value
0,request_type_Abandoned Vehicles,39328
1,request_type_Bulky Items,20767
2,request_type_Abandoned Vehicle - Car4door,8708
3,request_type_Encampment Reports,6761
4,request_type_Parking_on_Sidewalk,4354
...,...,...
626,street_27TH AVE,9
627,street_BRADFORD ST,9
628,street_GENEBERN WAY,9
629,street_MARIN ST,9


In [23]:
# best_features_df.to_pickle('../data/best_features_df.pkl')
# best_features_df = pd.read_pickle('../data/best_features_df.pkl')

In [24]:
# Filter X_train & X_test with selected features
X_train = X_train.filter(items=best_features_df['Feature'])
X_test  = X_test.filter(items=best_features_df['Feature'])

# Clean column names
X_train.columns = X_train.columns.str.strip().str.lower().str.replace(
    ' ', '_').str.replace('(', '').str.replace(')', '')

X_test.columns = X_test.columns.str.strip().str.lower().str.replace(
    ' ', '_').str.replace('(', '').str.replace(')', '')

In [25]:
print('df\t', df.shape)
print('X_train\t', X_train.shape)
print('X_test\t', X_test.shape)
print('y_train\t', y_train.shape)
print('y_test\t', y_test.shape)

df	 (529769, 31)
X_train	 (423815, 631)
X_test	 (105954, 631)
y_train	 (423815,)
y_test	 (105954,)


# Class Balancing

In [26]:
# Target variable
target_count = df['target'].value_counts()

# Print class balance
print(f'Class 0: {target_count[0]}')
print(f'Class 1: {target_count[1]}')
print(f'Proportion: {round(target_count[0] / target_count[1], 2)} : 1')
print(f'Percentage of Majority Class: {round(target_count[0] / sum(target_count), 3)*100}')

Class 0: 418265
Class 1: 111504
Proportion: 3.75 : 1
Percentage of Majority Class: 79.0


## Oversampling

In [27]:
# # Define the oversampling method – SMOTE
# smote = SMOTE(random_state=2020)
# X_train_smote, y_train_smote = smote.fit_sample(X_train, y_train)

# # Summarize the new class distribution
# Counter(y_train_smote)

## Undersampling

In [28]:
# # Define the undersampling method – RandomUnderSampler
rndm_under = RandomUnderSampler(random_state=2020)

# Transform the dataset
X_train_under, y_train_under = rndm_under.fit_sample(X_train, y_train)

# Summarize the new class distribution
Counter(y_train_under)

Counter({0: 89203, 1: 89203})

In [29]:
# # Define the undersampling method – NearMiss
# # Selects the closest examples from the majority class for each minority class.
# undersample = NearMiss(version=3, n_neighbors_ver3=3)

# # Transform the dataset
# X_train_under, y_train_under = undersample.fit_resample(X_train, y_train)

# # Summarize the new class distribution
# Counter(y_train_under)

In [30]:
# Pickle dataframes
df.to_pickle('../data/df.pkl')
X_train_under.to_pickle('../data/03_X_train_under.pkl')
X_test.to_pickle('../data/03_X_test.pkl')
y_train_under.to_pickle('../data/03_y_train_under.pkl')
y_test.to_pickle('../data/03_y_test.pkl')

# # Transform to Dask dataframes
# X_train_under = dd.from_pandas(X_train_under, npartitions=16)
# X_test        = dd.from_pandas(X_test, npartitions=16)
# y_train_under = dd.from_pandas(y_train_under, npartitions=16)
# y_test        = dd.from_pandas(y_test, npartitions=16)