<img src="http://imgur.com/1ZcRyrc.png" style="float: left; margin: 20px; height: 55px">

# GridSearch and Pipelines Practice

In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

### Load in the dataset from `../data/basketball_data.csv`

In [2]:
# Read the game-level basketball data. The path is relative, so the notebook
# must be run from a directory that sits next to `data/`.
data_path = '../data/basketball_data.csv'
bball = pd.read_csv(data_path)
# NOTE(review): the preview below shows an `Unnamed: 0` header. If that is a
# real column (a saved index) rather than a display artifact, it is not in the
# drop list used to build X later and would be fed to the models — confirm.
bball.head()

Unnamed: 0,Season,GameId,GameDate,GameTime,HostName,GuestName,total_score,total_line,game_line,Host_HostRank,...,gPTS_avg10,gTS%_avg10,g3PAR_avg10,gFTr_avg10,gDRB%_avg10,gTRB%_avg10,gAST%_avg10,gSTL%_avg10,gBLK%_avg10,gDRtg_avg10
0,2013,201212090LAL,2012-12-09,6:30 pm,Los Angeles Lakers,Utah Jazz,227.0,207.5,7.5,13,...,99.0,0.5206,0.223,0.2981,69.22,50.05,61.57,8.63,10.31,110.87
1,2013,201212100PHI,2012-12-10,7:00 pm,Philadelphia 76ers,Detroit Pistons,201.0,186.5,5.5,13,...,90.3,0.5077,0.2144,0.3095,71.46,49.48,59.83,6.48,9.46,107.91
2,2013,201212100HOU,2012-12-10,7:00 pm,Houston Rockets,San Antonio Spurs,240.0,212.0,-7.0,12,...,108.0,0.5915,0.2743,0.2518,74.26,50.99,61.82,8.3,6.85,101.41
3,2013,201212110BRK,2012-12-11,7:00 pm,Brooklyn Nets,New York Knicks,197.0,195.5,-3.5,12,...,100.3,0.5473,0.3595,0.2544,74.23,47.88,52.07,9.31,7.64,109.24
4,2013,201212110DET,2012-12-11,7:30 pm,Detroit Pistons,Denver Nuggets,195.0,203.5,-4.5,11,...,101.1,0.5605,0.2173,0.3177,68.45,50.4,56.33,7.67,7.83,114.86


Add a target column called `host_wins` that is 1 if the host won the game and 0 if the visiting team won the game (*Hint:* use the `HostName` and `winner` columns).

In [3]:
# Target column: 1 when the host team is the recorded winner, otherwise 0.
host_won = bball['HostName'].eq(bball['winner'])
bball['host_wins'] = host_won.astype(int)

### Set up training & testing sets

- The test set will be the 2016 season
- Train is all other seasons

In [4]:
# Training rows: every season except the held-out 2016 season.
train = bball.loc[bball['Season'].ne(2016)]

In [5]:
# Held-out rows: the 2016 season only.
test = bball.loc[bball['Season'].eq(2016)]

### Set up X_train, X_test, y_train, and y_test

`X` will be all columns except the following:
```python
['GameId','GameDate','GameTime','HostName', 'GuestName','total_score','total_line','game_line', 'winner','loser','host_wins','Season']
```

`y` will be whether the home team was the winner (`host_wins`)

In [6]:
# Columns excluded from the feature matrix (includes the target itself).
# Defined once so the train and test feature sets cannot drift apart.
non_feature_cols = ['GameId','GameDate','GameTime','HostName', 'GuestName','total_score','total_line','game_line', 'winner','loser','host_wins','Season']

# Features: everything that is left after removing the columns above.
X_train = train.drop(columns=non_feature_cols)
X_test = test.drop(columns=non_feature_cols)

# Target: 1 if the host team won, 0 otherwise.
y_train = train['host_wins']
y_test = test['host_wins']

### Calculate baseline accuracy

In [7]:
# Share of each class in the 2016 test set. The larger share is the accuracy
# obtained by always predicting that class, i.e. the baseline to beat.
y_test.value_counts(normalize = True)

1    0.603046
0    0.396954
Name: host_wins, dtype: float64

### Create instances of StandardScaler and KNeighborsClassifier to be incorporated into the Pipeline

In [8]:
# Default-configured scaler and KNN classifier, used as pipeline steps below.
ss, knn = StandardScaler(), KNeighborsClassifier()

### Create two pipelines, one for a standalone model, and the other for GridSearchCV

In [9]:
# Standalone pipeline: standardize the features, then classify with KNN.
pipe = Pipeline([
    ('ss', ss),
    ('knn', knn)
])

# Pipeline handed to GridSearchCV. It is built from its own estimator
# instances: Pipeline keeps its steps by reference (they are not cloned), so
# reusing `ss` / `knn` here would make both pipelines share the same objects,
# and fitting or re-parameterizing one would silently change the other.
pipe_gs = Pipeline([
    ('ss', StandardScaler()),
    ('knn', KNeighborsClassifier())
])

In [10]:
# Search space for the KNN pipeline. Keys follow `<step name>__<parameter>`.
# 2 * 2 * 2 * 2 * 3 = 48 parameter combinations in total.
pipe_gs_params = {
    # StandardScaler: toggle centering and scaling independently.
    'ss__with_mean': [True, False],
    'ss__with_std': [True, False],
    # KNN: Minkowski power (1 = Manhattan, 2 = Euclidean), vote weighting, k.
    'knn__p': [1, 2],
    'knn__weights': ['uniform', 'distance'],
    'knn__n_neighbors': [3, 5, 10],
}

In [11]:
# Fit the un-tuned pipeline (scaler + KNN with default settings) on the
# training seasons.
pipe.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('ss',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('knn',
                 KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                      metric='minkowski', metric_params=None,
                                      n_jobs=None, n_neighbors=5, p=2,
                                      weights='uniform'))],
         verbose=False)

In [12]:
# Accuracy of the un-tuned pipeline on the held-out 2016 season.
pipe.score(X_test, y_test)

0.5786802030456852

### GridSearchCV

In [13]:
# 5-fold cross-validated grid search over the KNN pipeline's search space;
# verbose=1 prints fit progress.
gs = GridSearchCV(
    estimator=pipe_gs,
    param_grid=pipe_gs_params,
    cv=5,
    verbose=1,
)

In [14]:
# Run the search on the training seasons: 48 candidates x 5 folds = 240 fits.
# The trailing semicolon suppresses the fitted object's repr.
gs.fit(X_train, y_train);

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 240 out of 240 | elapsed:   30.4s finished


In [15]:
# Mean cross-validated accuracy of the best parameter combination
# (computed on folds of the training data only).
gs.best_score_

0.6090548028363665

In [16]:
# Parameter combination that achieved the best mean cross-validated score.
gs.best_params_

{'knn__n_neighbors': 10,
 'knn__p': 1,
 'knn__weights': 'uniform',
 'ss__with_mean': True,
 'ss__with_std': False}

In [17]:
# Accuracy of the best pipeline found by the search on the held-out 2016 season.
gs.score(X_test, y_test)

0.6609137055837564

### Challenge: 

1. Create a pipeline with `StandardScaler` and `LogisticRegression`
    - For LogisticRegression, set solver to liblinear
2. Run your pipeline through `GridSearchCV`, testing the following parameters:
    - penalty
    - C
    
How does it score on the train and test sets? What are the best parameters?

In [18]:
# Challenge pipeline: standardize the features, then fit a logistic regression.
# liblinear supports both the L1 and L2 penalties searched below. It uses
# `random_state` when shuffling the data, so the seed is pinned to keep
# Restart & Run All results reproducible.
pipe_lr = Pipeline([
    ('ss', StandardScaler()),
    ('lr', LogisticRegression(solver='liblinear', random_state=42))
])

In [19]:
# Search space for the logistic-regression pipeline: penalty type and C
# (inverse regularization strength). 2 * 3 = 6 combinations.
pipe_lr_params = {
    'lr__penalty': ['l1', 'l2'],
    'lr__C': [1.0, 0.5, 0.1],
}

In [20]:
# 5-fold cross-validated grid search over the logistic-regression pipeline;
# verbose=1 prints fit progress.
gs_lr = GridSearchCV(
    estimator=pipe_lr,
    param_grid=pipe_lr_params,
    cv=5,
    verbose=1,
)

In [21]:
# Run the search on the training seasons: 6 candidates x 5 folds = 30 fits.
gs_lr.fit(X_train, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    3.7s finished


GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('ss',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('lr',
                                        LogisticRegression(C=1.0,
                                                           class_weight=None,
                                                           dual=False,
                                                           fit_intercept=True,
                                                           intercept_scaling=1,
                                                           l1_ratio=None,
                                                           max_iter=100,
                                                           multi_class='auto',
                  

In [22]:
# Penalty / C combination with the best mean cross-validated score.
gs_lr.best_params_

{'lr__C': 0.1, 'lr__penalty': 'l2'}

In [23]:
# Mean cross-validated accuracy of the best combination. This is a CV estimate
# from the training data, not the accuracy on the full training set.
gs_lr.best_score_

0.65468142541622

In [24]:
# Accuracy of the best logistic-regression pipeline on the held-out 2016 season.
gs_lr.score(X_test, y_test)

0.7025380710659899