In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC

## Load features and target

In [2]:
features = np.load('../data/businesses_train_features.npy')
target = np.load('../data/business_target.npy')

## Scale, train/test split

In [8]:
X_train, X_test, y_train, y_test = train_test_split(features, target)

ss = StandardScaler()

X_train = ss.fit_transform(X_train)

X_test = ss.transform(X_test)



## Baseline Accuracy

In [9]:
target.mean()

0.31229046971224506

## Logistic Regression

In [10]:
lr = GridSearchCV(LogisticRegression(), param_grid={'random_state': [32], 
                                                    'C': [1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100, 1000], 
                                                    'solver': ['saga'],
                                                    'penalty': ['l2'],
                                                    'n_jobs': [-1],
                                                    'verbose': [1]})

In [None]:
lr.fit(X_train, y_train)

In [12]:
lr.score(X_train, y_train), lr.score(X_test, y_test)

(0.82158141719403521, 0.82175349520813346)

In [18]:
lr.best_params_

{'C': 10,
 'n_jobs': -1,
 'penalty': 'l2',
 'random_state': 32,
 'solver': 'saga',
 'verbose': 1}

## Random Forest

In [16]:
rf = GridSearchCV(RandomForestClassifier(), param_grid={'random_state': [32],
                                                        'n_estimators': [10, 50, 100],
                                                        'min_samples_split': range(2, 4),
                                                        'min_samples_leaf': range(8, 12),
                                                        'n_jobs': [-1],
                                                        'verbose':[-1]})

In [None]:
%%time
rf.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Done   4 out of  10 | elapsed:   32.9s remaining:   49.3s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   59.1s finished
[Parallel(n_jobs=8)]: Done   4 out of  10 | elapsed:    0.2s remaining:    0.2s
[Parallel(n_jobs=8)]: Done  10 out of  10 | elapsed:    0.3s finished
[Parallel(n_jobs=8)]: Done   4 out of  10 | elapsed:    0.3s remaining:    0.5s
[Parallel(n_jobs=8)]: Done  10 out of  10 | elapsed:    0.7s finished
[Parallel(n_jobs=-1)]: Done   4 out of  10 | elapsed:   28.4s remaining:   42.5s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   55.0s finished
[Parallel(n_jobs=8)]: Done   4 out of  10 | elapsed:    0.2s remaining:    0.2s
[Parallel(n_jobs=8)]: Done  10 out of  10 | elapsed:    0.3s finished
[Parallel(n_jobs=8)]: Done   4 out of  10 | elapsed:    0.3s remaining:    0.5s
[Parallel(n_jobs=8)]: Done  10 out of  10 | elapsed:    0.7s finished
[Parallel(n_jobs=-1)]: Done   4 out of  10 | elapsed:   28.0s remaining:   42.1s
[Parallel(n_job

[Parallel(n_jobs=8)]: Done  50 out of  50 | elapsed:    1.0s finished
[Parallel(n_jobs=8)]: Done  50 out of  50 | elapsed:    2.1s finished
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  5.5min finished
[Parallel(n_jobs=8)]: Done  56 tasks      | elapsed:    1.1s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    2.1s finished
[Parallel(n_jobs=8)]: Done  56 tasks      | elapsed:    2.6s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    4.4s finished
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  6.6min finished
[Parallel(n_jobs=8)]: Done  56 tasks      | elapsed:    1.7s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    3.7s finished
[Parallel(n_jobs=8)]: Done  56 tasks      | elapsed:   10.6s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:   21.9s finished
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed: 14.1min
[Paralle

In [None]:
rf.score(X_train, y_train), rf.score(X_test, y_test)