<img src="http://imgur.com/1ZcRyrc.png" style="float: left; margin: 20px; height: 55px">

# GridSearch and Pipelines Practice

In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.linear_model import LogisticRegression

### Load in the dataset from `data/basketball_data.csv`

In [2]:
df = pd.read_csv('data/basketball_data.csv')
df_old = df

In [3]:
df.columns

Index(['Season', 'GameId', 'GameDate', 'GameTime', 'HostName', 'GuestName',
       'total_score', 'total_line', 'game_line', 'Host_HostRank',
       'Host_GameRank', 'Guest_GuestRank', 'Guest_GameRank', 'host_win_count',
       'host_lose_count', 'guest_win_count', 'guest_lose_count', 'game_behind',
       'winner', 'loser', 'host_place_streak', 'guest_place_streak',
       'hq1_avg10', 'hq2_avg10', 'hq3_avg10', 'hq4_avg10', 'hPace_avg10',
       'heFG%_avg10', 'hTOV%_avg10', 'hORB%_avg10', 'hFT/FGA_avg10',
       'hORtg_avg10', 'hFG_avg10', 'hFGA_avg10', 'hFG%_avg10', 'h3P_avg10',
       'h3PA_avg10', 'h3P%_avg10', 'hFT_avg10', 'hFTA_avg10', 'hFT%_avg10',
       'hORB_avg10', 'hDRB_avg10', 'hTRB_avg10', 'hAST_avg10', 'hSTL_avg10',
       'hBLK_avg10', 'hTOV_avg10', 'hPF_avg10', 'hPTS_avg10', 'hTS%_avg10',
       'h3PAR_avg10', 'hFTr_avg10', 'hDRB%_avg10', 'hTRB%_avg10',
       'hAST%_avg10', 'hSTL%_avg10', 'hBLK%_avg10', 'hDRtg_avg10', 'gq1_avg10',
       'gq2_avg10', 'gq3_avg10', 'gq

Add a target column called `host_wins` that is 1 if the host won the game and 0 if the visiting team won the game (*Hint:* use the `HostName` and `winner` columns).

In [4]:
def host_wins(row):
    if row['HostName'] == row['winner']:
        row['host_wins'] = 1
    else:
        row['host_wins'] = 0
    return row

df = df.apply(host_wins, axis=1)

### Set up training & testing sets

- The test set will be the 2016 season
- Train is all other seasons

In [5]:
train_set = df.loc[df['Season'] != 2016,:]
test_set = df.loc[df['Season'] == 2016, :]

### Set up X_train, X_test, y_train, and y_test

`X` will be all columns except the following:
```python
['GameId','GameDate','GameTime','HostName', 'GuestName','total_score','total_line','game_line', 'winner','loser','host_wins','Season']
```

`y` will be whether the home team was the winner (`host_wins`)

In [6]:
X_train = train_set.drop(columns=['GameId','GameDate','GameTime','HostName', 'GuestName','total_score',
                          'total_line','game_line', 'winner','loser','host_wins','Season'])
X_test = test_set.drop(columns=['GameId','GameDate','GameTime','HostName', 'GuestName','total_score',
                        'total_line','game_line', 'winner','loser','host_wins','Season'])
y_train = train_set['host_wins']
y_test = test_set['host_wins']

print(type(X_train), type(X_test), type(y_train), type(y_test))

<class 'pandas.core.frame.DataFrame'> <class 'pandas.core.frame.DataFrame'> <class 'pandas.core.series.Series'> <class 'pandas.core.series.Series'>


### Calculate baseline accuracy

In [7]:
y_test.value_counts(normalize=True)

1    0.603046
0    0.396954
Name: host_wins, dtype: float64

### Create instances of StandardScaler and KNeighborsClassifier to be incorporated into the Pipeline

In [19]:
ss = StandardScaler()
knn = KNeighborsClassifier()
logreg = LogisticRegression()

### Create two pipelines, one for a standalone model, and the other for GridSearchCV

In [9]:
pipe_1 = Pipeline([
    ('ss', ss),
    ('knn', knn)
])

pipe_2 = Pipeline([
    ('ss', ss),
    ('knn', knn)
])

In [10]:
pipe_2_params = {
    'ss__with_mean': [True, False],
    'ss__with_std': [True, False],
    'knn__n_neighbors': [i for i in range(1, 11)],
}

In [12]:
pipe_1.fit(X_train, y_train)
pipe_1.score(X_test, y_test)

0.5786802030456852

### GridSearchCV

In [13]:
pipe_2_gridsearch = GridSearchCV(estimator=pipe_2,
                                 param_grid=pipe_2_params,
                                 cv=5,
                                 verbose=1
                                )

In [15]:
pipe_2_gridsearch.fit(X_train, y_train)
pipe_2_gridsearch.score(X_test, y_test)

Fitting 5 folds for each of 40 candidates, totalling 200 fits


0.601015228426396

In [16]:
pipe_2_gridsearch.best_score_

0.6083386073905688

In [17]:
pipe_2_gridsearch.best_params_

{'knn__n_neighbors': 9, 'ss__with_mean': True, 'ss__with_std': True}

In [18]:
pipe_2_gridsearch.best_estimator_

Pipeline(steps=[('ss', StandardScaler()),
                ('knn', KNeighborsClassifier(n_neighbors=9))])

### Challenge: 

1. Create a pipeline with `StandardScaler` and `LogisticRegression`
    - For LogisticRegression, set solver to liblinear
2. Run your pipeline through `GridSearchCV`, testing the following parameters:
    - penalty
    - C
    
How does it score on the train and test sets? What are the best parameters?

In [23]:
pipe_3 = Pipeline([
    ('ss', ss),
    ('logreg', logreg)
])
pipe_3.get_params()

{'memory': None,
 'steps': [('ss', StandardScaler()), ('logreg', LogisticRegression())],
 'verbose': False,
 'ss': StandardScaler(),
 'logreg': LogisticRegression(),
 'ss__copy': True,
 'ss__with_mean': True,
 'ss__with_std': True,
 'logreg__C': 1.0,
 'logreg__class_weight': None,
 'logreg__dual': False,
 'logreg__fit_intercept': True,
 'logreg__intercept_scaling': 1,
 'logreg__l1_ratio': None,
 'logreg__max_iter': 100,
 'logreg__multi_class': 'auto',
 'logreg__n_jobs': None,
 'logreg__penalty': 'l2',
 'logreg__random_state': None,
 'logreg__solver': 'lbfgs',
 'logreg__tol': 0.0001,
 'logreg__verbose': 0,
 'logreg__warm_start': False}

In [36]:
pipe_3_params = {
    'ss__with_mean': [True, False],
    'ss__with_std': [True, False],
    'logreg__C': [i for i in range(1, 11)],
#     'logreg__l1_ratio': [0, 0.25, 0.5, 0.75, 1.0],
    'logreg__solver': ['liblinear'],
}

pipe_3_gridsearch = GridSearchCV(estimator=pipe_3,
                                 param_grid=pipe_3_params,
                                 cv=5,
                                 verbose=1
                                )

In [37]:
pipe_3_gridsearch.fit(X_train, y_train)

Fitting 5 folds for each of 40 candidates, totalling 200 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('ss', StandardScaler()),
                                       ('logreg', LogisticRegression())]),
             param_grid={'logreg__C': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                         'logreg__solver': ['liblinear'],
                         'ss__with_mean': [True, False],
                         'ss__with_std': [True, False]},
             verbose=1)

In [44]:
pipe_3_gridsearch.score(X_test, y_test)

0.6923857868020304

In [38]:
pipe_3_gridsearch.best_score_

0.652534776487607

In [42]:
pipe_3_gridsearch.best_params_

{'logreg__C': 2,
 'logreg__solver': 'liblinear',
 'ss__with_mean': False,
 'ss__with_std': True}

In [43]:
pipe_3_gridsearch.best_estimator_

Pipeline(steps=[('ss', StandardScaler(with_mean=False)),
                ('logreg', LogisticRegression(C=2, solver='liblinear'))])