# 3.0 McNulty Model Development

In [1]:
import mcnultymod
import pandas as pd
import numpy as np
import warnings

from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB
# from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.model_selection import (train_test_split,
                                     cross_validate,
                                     cross_val_score,
                                     # cross_val_predict,
                                     learning_curve,
                                     StratifiedKFold,
                                     GridSearchCV,
                                     RandomizedSearchCV
                                     # KFold
                                    )

from datetime import datetime, timedelta, date
# import os
import pickle

import matplotlib.pyplot as plt
from mpl_toolkits.basemap import Basemap
import seaborn as sns
plt.style.use('seaborn')
%matplotlib inline

## 1. First Round of Features

In [176]:
auctions = pd.read_pickle('../data/auctionsclosed.pkl')
auctions_test = pd.read_pickle('../data/auctionsclosed_test.pkl')

In [193]:
# What's the right way to do this...?
auctions['ptype_num'] = 0
# Evidently not this:
# auctions.ptype_num[auctions.purchasertype == 'Investor'] = 1
# Supposedly this is, but it still throws an error message:
# auctions.loc[:, 'ptype_num'][auctions.purchasertype == 'Investor'] = 1
# THIS IS IT! NEVER FORGET:
auctions.loc[auctions.purchasertype == 'Investor', 'ptype_num'] = 1

In [195]:
auctions.head()

Unnamed: 0,address,parcelid,price,closingdate,salestatus,buyerstatus,purchasertype,program,councildistrict,neighborhood,latitude,longitude,location,ptype_num
243,6784 Asbury Park,22060958-9,7700.0,2014-12-19,Closed,Selected,Homebuyer,Auction,7,Warren Ave Community,42.341163,-83.205641,"location\n(42.341163, -83.205641)",0
819,14345 Grandville,22088763.,11400.0,2015-04-23,Closed,Selected,Investor,Auction,1,Westwood Park,42.391877,-83.231375,"location\n(42.391877, -83.231375)",1
1068,6754 Iowa,15004080-1,1000.0,2017-01-05,Closed,Selected,Homebuyer,Auction,3,Grant,42.424756,-83.032456,"location\n(42.424756, -83.032456)",0
674,2224 W Boston Blvd,10002831.,62800.0,2015-06-17,Closed,Selected,Investor,Auction,5,Boston Edison,42.380017,-83.106803,"location\n(42.380017, -83.106803)",1
1339,17650 Beland,17015265.,1100.0,2017-04-28,Closed,Selected,Homebuyer,Auction,3,Mount Olivet,42.424499,-83.008787,"location\n(42.424499, -83.008787)",0


In [4]:
auctions.sample(10)

Unnamed: 0,address,parcelid,price,closingdate,salestatus,buyerstatus,purchasertype,program,councildistrict,neighborhood,latitude,longitude,location,ptype_num
674,2224 W Boston Blvd,10002831.,62800.0,2015-06-17,Closed,Selected,Investor,Auction,5,Boston Edison,42.380017,-83.106803,"location\n(42.380017, -83.106803)",1
234,5249 Cadieux,21075285.,4000.0,2014-10-29,Closed,Selected,Investor,Auction,4,East English Village,42.408295,-82.931747,"location\n(42.408295, -82.931747)",1
282,19801 Prairie,16023096.,5500.0,2016-07-29,Closed,Selected,Homebuyer,Auction,2,Oak Grove,42.438371,-83.146317,"location\n(42.438371, -83.146317)",0
375,3530 Kensington,21072727.,38450.0,2016-02-09,Closed,Selected,Homebuyer,Auction,4,East English Village,42.394606,-82.928536,"location\n(42.394606, -82.928536)",0
897,6458 Brace,22080871.,4900.0,2015-09-04,Closed,Selected,Homebuyer,Auction,7,Warrendale,42.33832,-83.222668,"location\n(42.33832, -83.222668)",0
131,4884 Yorkshire,21073180.,1400.0,2015-01-22,Closed,Selected,Homebuyer,Auction,4,East English Village,42.404036,-82.933418,"location\n(42.404036, -82.933418)",0
118,6245 Stahelin,22084069.,7700.0,2017-04-28,Closed,Selected,Homebuyer,Auction,7,Warrendale,42.335424,-83.224351,"location\n(42.335424, -83.224351)",0
12,5291 Courville,21071681.,1100.0,2017-09-15,Closed,Selected,Homebuyer,Auction,4,Morningside,42.405489,-82.941161,"location\n(42.405489, -82.941161)",0
1341,3878 Burns,17006227.,1000.0,2017-06-21,Closed,Selected,Homebuyer,Auction,5,Pingree Park,42.371679,-83.00159,"location\n(42.371679, -83.00159)",0
928,9192 Ward,22024554-5,4400.0,2017-04-18,Closed,Selected,Homebuyer,Auction,7,Barton-McFarland,42.362309,-83.17238,"location\n(42.362309, -83.17238)",0


For the inital run, I'll use price (continuous), lat/lon (continuous) and council district (categorical). If I have time before MVP, I'll bin neighborhood in some way, though that would inject colinearity with council district, and I don't think it would be that useful anyway.

In [5]:
auctions_short = auctions.filter(['ptype_num', 'price', 'latitude', 'longitude', 'councildistrict'])
auctions_short.sample(5)

Unnamed: 0,ptype_num,price,latitude,longitude,councildistrict
420,0,7400.0,42.408571,-82.937187,4
1015,0,12100.0,42.414861,-82.937985,4
239,0,7250.0,42.368342,-83.264727,7
326,0,4200.0,42.410837,-83.151302,2
1171,0,10450.0,42.410899,-83.202905,1


In [6]:
auctions_short.corr()

Unnamed: 0,ptype_num,price,latitude,longitude,councildistrict
ptype_num,1.0,0.029036,-0.01654,-0.041226,0.013897
price,0.029036,1.0,0.000813,0.046877,0.018727
latitude,-0.01654,0.000813,1.0,0.362209,-0.768277
longitude,-0.041226,0.046877,0.362209,1.0,-0.04558
councildistrict,0.013897,0.018727,-0.768277,-0.04558,1.0


In [7]:
y = auctions_short.iloc[:, 0]
X = auctions_short.iloc[:, 1:]

Sets certain columns to categories, creates dummies, standardizes data:

In [8]:
X_std = mcnultymod.prep_X(['councildistrict'], X)

### The Baseline: Guessing Investor for Everything

In [9]:
mcnultymod.print_scores('Dummy (guessing Investor)', mcnultymod.the_dummy(y))


Dummy (guessing Investor)
*    Accuracy:   0.3078
*    Precision:  0.3078
*    Recall:     1.0
*    F1:         0.4707


### KNN

In [10]:
knn_scores = mcnultymod.try_some_ks(X_std, y, max_k=15)

mcnultymod.print_knn_scores(knn_scores)

K-value	Acc.	Pre.	Rec.	F1
1	0.6012	0.3544	0.3676	0.3605
2	0.6596	0.3731	0.1558	0.2197
3	0.6069	0.3346	0.2835	0.3067
4	0.6597	0.3612	0.1277	0.1868
5	0.6357	0.3694	0.2523	0.2983
6	0.6568	0.36	0.134	0.1939
7	0.6491	0.3868	0.215	0.2756
8	0.6596	0.3389	0.1028	0.1573
9	0.6501	0.3657	0.1713	0.2327
10	0.6606	0.3582	0.0997	0.1545
11	0.6549	0.3694	0.1464	0.2089
12	0.6645	0.3658	0.0903	0.1426
13	0.6597	0.3651	0.1184	0.177
14	0.674	0.3619	0.0685	0.1143
15	0.6673	0.3614	0.0872	0.1385


### Logistic Regresion, SVM

In [40]:
model_list = [
    LogisticRegression(class_weight='balanced', random_state=23),
    LinearSVC(class_weight='balanced', random_state=23),
    SVC(class_weight='balanced', random_state=23),
    GaussianNB()
]

name_list = [
    'Logistic Regression',
    'Linear Support Vector Classifier',
    'Support Vector Classifier with RBF Kernel',
    'Naive Bayes (Gaussian)'
]

model_scores = mcnultymod.crossval(X_std, y, model_list, name_list)


Logistic Regression
		TRAIN	TEST
*   Accuracy:	0.5458	0.5216
*   Precision:	0.3392	0.3157
*   Recall:	0.5016	0.4764
*   F1:		0.4037	0.3794

Linear Support Vector Classifier
		TRAIN	TEST
*   Accuracy:	0.5453	0.5235
*   Precision:	0.338	0.3184
*   Recall:	0.4985	0.4825
*   F1:		0.4017	0.383

Support Vector Classifier with RBF Kernel
		TRAIN	TEST
*   Accuracy:	0.5786	0.5293
*   Precision:	0.3792	0.3259
*   Recall:	0.56	0.501
*   F1:		0.45	0.3914

Naive Bayes (Gaussian)
		TRAIN	TEST
*   Accuracy:	0.6647	0.6453
*   Precision:	0.2802	0.2004
*   Recall:	0.1005	0.0745
*   F1:		0.1476	0.1067


Our dummy had an F1 score of 0.4569. We're not even close... need more data.

## 2. With A Few More Features

In [129]:
auctions_v2 = pd.read_pickle('../data/auctionsclosed_v2.pkl')

In [130]:
auctions_v2.head()

Unnamed: 0,address,parcelid,price,closingdate,salestatus,buyerstatus,purchasertype,program,councildistrict,neighborhood,latitude,longitude,location,ptype_num,n_bin,thr_0,tcount_0,p_bin
0,6784 Asbury Park,22060958-9,7700.0,2014-12-19,Closed,Selected,Homebuyer,Auction,7,Warren Ave Community,42.341163,-83.205641,"location\n(42.341163, -83.205641)",0,2,1,58,1
1,14345 Grandville,22088763.,11400.0,2015-04-23,Closed,Selected,Investor,Auction,1,Westwood Park,42.391877,-83.231375,"location\n(42.391877, -83.231375)",1,4,1,6,1
2,6754 Iowa,15004080-1,1000.0,2017-01-05,Closed,Selected,Homebuyer,Auction,3,Grant,42.424756,-83.032456,"location\n(42.424756, -83.032456)",0,0,1,1,0
3,2224 W Boston Blvd,10002831.,62800.0,2015-06-17,Closed,Selected,Investor,Auction,5,Boston Edison,42.380017,-83.106803,"location\n(42.380017, -83.106803)",1,4,0,28,2
4,17650 Beland,17015265.,1100.0,2017-04-28,Closed,Selected,Homebuyer,Auction,3,Mount Olivet,42.424499,-83.008787,"location\n(42.424499, -83.008787)",0,1,1,8,0


In [131]:
auctions_v2_short = (auctions_v2
                     .filter(['ptype_num',
                              'p_bin',
                              'n_bin',
                              'thr_0',
                              'tcount_0'
                             ]))

In [132]:
auctions_v2_short.corr()

Unnamed: 0,ptype_num,p_bin,n_bin,thr_0,tcount_0
ptype_num,1.0,0.053263,0.326055,0.036197,-0.041813
p_bin,0.053263,1.0,0.127289,0.027718,0.066608
n_bin,0.326055,0.127289,1.0,0.055193,-0.00716
thr_0,0.036197,0.027718,0.055193,1.0,0.416548
tcount_0,-0.041813,0.066608,-0.00716,0.416548,1.0


In [133]:
yv2 = auctions_v2_short.iloc[:, 0]
Xv2 = auctions_v2_short.iloc[:, 1:]

In [134]:
Xv2_std = mcnultymod.prep_X(Xv2.columns[:-1], Xv2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  X[col] = X[col].astype('category')


### LR, SVM, NB

In [135]:
model_scores = mcnultymod.crossval(Xv2_std, yv2, model_list, name_list)


Logistic Regression
		TRAIN	TEST
*   Accuracy:	0.6342	0.625
*   Precision:	0.4388	0.4293
*   Recall:	0.6752	0.6572
*   F1:		0.5319	0.5188

Linear Support Vector Classifier
		TRAIN	TEST
*   Accuracy:	0.6342	0.6222
*   Precision:	0.4391	0.4268
*   Recall:	0.6791	0.6602
*   F1:		0.5333	0.5181

Support Vector Classifier with RBF Kernel
		TRAIN	TEST
*   Accuracy:	0.6239	0.6049
*   Precision:	0.4338	0.4141
*   Recall:	0.7196	0.6882
*   F1:		0.5409	0.5169

Naive Bayes (Gaussian)
		TRAIN	TEST
*   Accuracy:	0.7037	0.7037
*   Precision:	1.0	1.0
*   Recall:	0.0374	0.0374
*   F1:		0.0721	0.072


With 10 folds instead of 5:

In [136]:
model_scores = mcnultymod.crossval(Xv2_std, yv2, model_list, name_list, folds=10)


Logistic Regression
		TRAIN	TEST
*   Accuracy:	0.6333	0.6259
*   Precision:	0.4381	0.4315
*   Recall:	0.6764	0.6632
*   F1:		0.5317	0.5218

Linear Support Vector Classifier
		TRAIN	TEST
*   Accuracy:	0.6327	0.6221
*   Precision:	0.438	0.4283
*   Recall:	0.6822	0.6694
*   F1:		0.5334	0.5213

Support Vector Classifier with RBF Kernel
		TRAIN	TEST
*   Accuracy:	0.6214	0.5962
*   Precision:	0.4316	0.4078
*   Recall:	0.7189	0.6819
*   F1:		0.539	0.5088

Naive Bayes (Gaussian)
		TRAIN	TEST
*   Accuracy:	0.7037	0.7037
*   Precision:	1.0	0.9
*   Recall:	0.0374	0.0374
*   F1:		0.0721	0.0715


And the dummy, for baseline comparison:

In [137]:
mcnultymod.print_scores('Dummy (guessing Investor)', mcnultymod.the_dummy(yv2))


Dummy (guessing Investor)
*    Accuracy:   0.3078
*    Precision:  0.3078
*    Recall:     1.0
*    F1:         0.4707


### RF

Since one can't do CV with Random Forest, here's a DIY tuning parameters based on training and OOB accuracy scores.

In [138]:
rf_scores = mcnultymod.rf_grid(Xv2_std, 
                               yv2, 
                               num_estimators=[15, 20, 50, 100, 500, 1000, 2000, 5000], 
                               max_depths=[3, 2]
                              )

Number of Trees: [15, 20, 50, 100, 500, 1000, 2000, 5000]
Max Depths: [3, 2]

Random Forest Scores
TREES	MAX D	TRAIN	OOB	DIFF
15	3	0.6299	0.5762	0.0537
20	3	0.652	0.6136	0.0384
50	3	0.6635	0.6299	0.0336
100	3	0.6711	0.6424	0.0288
500	3	0.6711	0.6424	0.0288
1000	3	0.6817	0.6453	0.0364
2000	3	0.6865	0.6405	0.046
5000	3	0.6779	0.6433	0.0345
15	2	0.6376	0.5944	0.0431
20	2	0.6731	0.6405	0.0326
50	2	0.65	0.6127	0.0374
100	2	0.651	0.6376	0.0134
500	2	0.6568	0.6424	0.0144
1000	2	0.6721	0.6539	0.0182
2000	2	0.6721	0.6529	0.0192
5000	2	0.6721	0.6587	0.0134

Maximum OOB: 1.0
Minimum difference between TRAIN and OOB of 0.0134 found with:
*   maxd_2 and trees_100
TRAIN SCORE LESS THAN MAX OOB SCORE, MODEL LIKELY UNDERFIT


#### Final round will include RF models with 5000 trees/max depth 2.

If time — make dictionary keys just the number of trees and number for max depth.

## 3. Grid Search on LR and SVC (RBF)

If time (or after presentation) — wrap this into a function.

In [139]:
# rand = RandomizedSearchCV(knn, param_dist, cv=10, scoring='accuracy', n_iter=10, random_state=42)

In [140]:
# LogisticRegression(class_weight='balanced')
# LinearSVC(class_weight='balanced')

### LR

In [141]:
lr_pgrid = {
    'penalty': ['l1', 'l2'],
    'C': [i / 10 for i in range(1,11)]
}

In [142]:
# lr_grid = RandomizedSearchCV(
#     LogisticRegression(class_weight='balanced'),
#     lr_pgrid,
#     cv=10,
#     scoring='f1'
#     n_iter=10,
#     random_state=23
# )

Because there are only 20 combinations to investigate, normal GridSearchCV will suffice.

In [143]:
lr_grid = GridSearchCV(
    LogisticRegression(class_weight='balanced', random_state=23),
    lr_pgrid,
    cv=10,
    scoring='f1'
)

In [144]:
lr_grid.fit(Xv2_std, yv2)

GridSearchCV(cv=10, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=23,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'penalty': ['l1', 'l2'], 'C': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='f1', verbose=0)

In [145]:
lr_grid.grid_scores_



[mean: 0.51107, std: 0.04931, params: {'C': 0.1, 'penalty': 'l1'},
 mean: 0.53632, std: 0.05332, params: {'C': 0.1, 'penalty': 'l2'},
 mean: 0.52625, std: 0.05181, params: {'C': 0.2, 'penalty': 'l1'},
 mean: 0.52910, std: 0.05272, params: {'C': 0.2, 'penalty': 'l2'},
 mean: 0.53277, std: 0.05459, params: {'C': 0.3, 'penalty': 'l1'},
 mean: 0.53140, std: 0.04817, params: {'C': 0.3, 'penalty': 'l2'},
 mean: 0.52740, std: 0.05365, params: {'C': 0.4, 'penalty': 'l1'},
 mean: 0.52627, std: 0.04651, params: {'C': 0.4, 'penalty': 'l2'},
 mean: 0.52459, std: 0.05070, params: {'C': 0.5, 'penalty': 'l1'},
 mean: 0.52632, std: 0.05077, params: {'C': 0.5, 'penalty': 'l2'},
 mean: 0.52674, std: 0.05061, params: {'C': 0.6, 'penalty': 'l1'},
 mean: 0.52612, std: 0.04795, params: {'C': 0.6, 'penalty': 'l2'},
 mean: 0.52584, std: 0.04823, params: {'C': 0.7, 'penalty': 'l1'},
 mean: 0.52350, std: 0.04996, params: {'C': 0.7, 'penalty': 'l2'},
 mean: 0.52647, std: 0.04824, params: {'C': 0.8, 'penalty': 'l

In [146]:
print(lr_grid.best_score_)
print(lr_grid.best_params_)
print(lr_grid.best_estimator_)

0.536324540935
{'C': 0.1, 'penalty': 'l2'}
LogisticRegression(C=0.1, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=23,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)


### SVC Linear

In [147]:
svc_pgrid = {
    # 'penalty': ['l1', 'l2'],
    # 'loss': ['hinge', 'squared_hinge'],
    'C': [i / 10 for i in range(1,11)]
}

In [148]:
# svc_grid = RandomizedSearchCV(
#     LinearSVC(class_weight='balanced', random_state=23),
#     svc_pgrid,
#     cv=10,
#     scoring='f1',
#     n_iter=20,
#     random_state=23
# )

In [149]:
svc_grid = GridSearchCV(
    LinearSVC(class_weight='balanced', random_state=23),
    svc_pgrid,
    cv=10,
    scoring='f1'
)

In [150]:
svc_grid.fit(Xv2_std, yv2)

GridSearchCV(cv=10, error_score='raise',
       estimator=LinearSVC(C=1.0, class_weight='balanced', dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=23, tol=0.0001,
     verbose=0),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='f1', verbose=0)

In [151]:
svc_grid.grid_scores_



[mean: 0.52125, std: 0.04750, params: {'C': 0.1},
 mean: 0.52155, std: 0.04798, params: {'C': 0.2},
 mean: 0.52155, std: 0.04798, params: {'C': 0.3},
 mean: 0.52155, std: 0.04798, params: {'C': 0.4},
 mean: 0.52155, std: 0.04798, params: {'C': 0.5},
 mean: 0.52155, std: 0.04798, params: {'C': 0.6},
 mean: 0.52155, std: 0.04798, params: {'C': 0.7},
 mean: 0.52155, std: 0.04798, params: {'C': 0.8},
 mean: 0.52155, std: 0.04798, params: {'C': 0.9},
 mean: 0.52155, std: 0.04798, params: {'C': 1.0}]

In [152]:
print(svc_grid.best_score_)
print(svc_grid.best_params_)
print(svc_grid.best_estimator_)

0.521551423546
{'C': 0.2}
LinearSVC(C=0.2, class_weight='balanced', dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=23, tol=0.0001,
     verbose=0)
