In [130]:
%matplotlib inline
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

import sys
sys.path.append('C:/Users/EvanMi/003_Shelter_Outcomes')

with open("helper_functions.py") as f:
    code = compile(f.read(), "helper_functions.py", 'exec')
    exec(code)

In [131]:
rng = np.random.RandomState(0)

print ('Reading in train data..')
train = pd.read_csv('train.csv')
train['type'] = 'train'

print ('Reading in test data..')
test = pd.read_csv('test.csv')
test['type'] = 'test'
test['OutcomeSubtype'] = ''
test['OutcomeType'] = ''

df = data_import(train, test)

print ('Running data preparation for train dataset')
X_train, y_train, le_train, X_train_cols = prep_data(df, 'train')

print ('Running data preparation for test dataset')
X_test, y_test, le_test, X_test_cols = prep_data(df, 'test')

col_check(X_train_cols, X_test_cols)

Reading in train data..
Reading in test data..
Running feature extraction process..
Dropping unused variables..
Running data preparation for train dataset
Encoding labels of the outcome variable..
Using one hot encoding for predictor variables..
Running data preparation for test dataset
Encoding labels of the outcome variable..
Using one hot encoding for predictor variables..
Columns are the same!!


In [7]:
from sklearn.svm import SVC

linear_svm = SVC(kernel="linear", C=0.025, probability = True)
rbf_svm = SVC(gamma=2, C=1, probability = True)

print ('Fitting linear SVM..')
linear_svm.fit(X_train, y_train)

print ('Fitting SVM with RBF kernel..')
rbf_svm.fit(X_train, y_train)

Fitting linear SVM..
Fitting SVM with RBF kernel..


SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=2, kernel='rbf',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [92]:
predict_output(rbf_svm, 'rbf_svm')

Predicting outcomes..
Correct number of rows
Saving to CSV..
Done!


In [115]:
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=20)

param_grid = {"max_depth": [3, None],
              "max_features": [1, 3, 10],
              "min_samples_split": [1, 3, 10],
              "min_samples_leaf": [1, 3, 10],
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

# run grid search
grid_search = GridSearchCV(rf, param_grid=param_grid, n_jobs=-1)

grid_search.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'criterion': ['gini', 'entropy'], 'min_samples_leaf': [1, 3, 10], 'max_features': [1, 3, 10], 'bootstrap': [True, False], 'max_depth': [3, None], 'min_samples_split': [1, 3, 10]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [111]:
predict_output(grid_search, 'random_forest')

Predicting outcomes..
Correct number of rows
Saving to CSV..
Done!


In [None]:
C_range = np.logspace(-2, 10, 13)
gamma_range = np.logspace(-9, 3, 13)
param_grid = dict(gamma=gamma_range, C=C_range)
cv = StratifiedShuffleSplit(y_train, n_iter=5, test_size=0.2, random_state=42)
grid = GridSearchCV(SVC(), param_grid=param_grid, cv=cv)

grid.fit(X_train, y_train)