# Summary

This notebook covers: 

* how to set up grid-search parameter tuning for the whole pipeline, not only the regression step,
* how to download data directly from Kaggle and load it into a notebook,
* how to display grid-search results as a DataFrame.

# Set Environment

In [1]:
import os
from getpass import getpass
from zipfile import ZipFile

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import pandas as pd

from sklearn import datasets

from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

# Get Data

For the API to work, you need to:

1. Run `pip install kaggle`.
2. Create a Kaggle user account.
3. Go to My Account → API → Create API Token.
4. Open the JSON file and supply its Kaggle username and key in the cell below.

In [2]:
os.environ['KAGGLE_USERNAME'] = "charlydethibault" # username from the json file
os.environ['KAGGLE_KEY'] = "XXXXXXXXXXXX" # key from the json file
!kaggle competitions download -c titanic # api copied from kaggle

Downloading titanic.zip to /Users/charlesd
  0%|                                               | 0.00/34.1k [00:00<?, ?B/s]
100%|███████████████████████████████████████| 34.1k/34.1k [00:00<00:00, 974kB/s]


In [3]:
# Name of the archive fetched by the Kaggle download cell.
file_name = "titanic.zip"

# Open the archive read-only. The handle is named `archive` rather than `zip`
# so the built-in zip() is not shadowed for the rest of the kernel session.
with ZipFile(file_name, 'r') as archive:
    # List the archive's members (name, modification time, size).
    archive.printdir()

    # Unpack every member into the current working directory.
    print('Extracting all the files now...')
    archive.extractall()
    print('Done!')

File Name                                             Modified             Size
gender_submission.csv                          2019-12-11 02:17:12         3258
test.csv                                       2019-12-11 02:17:12        28629
train.csv                                      2019-12-11 02:17:12        61194
Extracting all the files now...
Done!


In [4]:
# Load the training set and split it into target and candidate features.
# "Fare" is selected here, but the column transformer below only consumes
# "Sex" and "Name".
df = pd.read_csv("train.csv")
y = df["Survived"]
X = df[["Sex", "Name", "Fare"]]

# Individual estimators: one-hot encoding for the categorical column,
# bag-of-words counts for the free-text column, and the classifier itself.
ohe = OneHotEncoder()
vect = CountVectorizer()
clf = LogisticRegression(random_state=1, solver='liblinear')

# Route each column to its transformer, then chain preprocessing and model
# into a single estimator.
ct = make_column_transformer(
    (ohe, ['Sex']),
    (vect, 'Name'),
)
pipe = make_pipeline(ct, clf)

In [5]:
# Echo each object to inspect its configuration. All five reprs are displayed
# (not just the last) because InteractiveShell.ast_node_interactivity was set
# to "all" in the setup cell.
ohe
vect
ct
clf
pipe

OneHotEncoder(categorical_features=None, categories=None,
       dtype=<class 'numpy.float64'>, handle_unknown='error',
       n_values=None, sparse=True)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
         transformer_weights=None,
         transformers=[('onehotencoder', OneHotEncoder(categorical_features=None, categories=None,
       dtype=<class 'numpy.float64'>, handle_unknown='error',
       n_values=None, sparse=True), ['Sex']), ('countvectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtyp..._accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None), 'Name')])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=1, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False)

Pipeline(memory=None,
     steps=[('columntransformer', ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
         transformer_weights=None,
         transformers=[('onehotencoder', OneHotEncoder(categorical_features=None, categories=None,
       dtype=<class 'numpy.float64'>, handle_unknown='error',
    ...nalty='l2', random_state=1, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False))])

# Cross Validate Entire pipeline

In [6]:
# Score the whole pipeline (preprocessing + model) with 5-fold cross-validation,
# then report the accuracy averaged over the folds.
fold_accuracy = cross_val_score(pipe, X, y, scoring='accuracy', cv=5)
fold_accuracy.mean()

0.8024646897088326

# Find optimal tuning parameters for entire pipeline

In [7]:
# Values to search. Keys use the "<step>__<parameter>" convention; the step
# names are the lower-cased class names assigned by make_pipeline and
# make_column_transformer (see the pipeline repr above).
params = {
    'columntransformer__countvectorizer__min_df': [1, 2],
    'logisticregression__C': [0.1, 1, 10],
    # Lowercase letter "l" followed by a digit (L1 / L2), not the numbers 11 / 12.
    'logisticregression__penalty': ['l1', 'l2'],
}

# Exhaustive search over every combination above, scoring each with 5-fold
# cross-validated accuracy. GridSearchCV is imported in the setup cell.
grid = GridSearchCV(pipe, params, cv=5, scoring='accuracy')
grid.fit(X, y)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('columntransformer', ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
         transformer_weights=None,
         transformers=[('onehotencoder', OneHotEncoder(categorical_features=None, categories=None,
       dtype=<class 'numpy.float64'>, handle_unknown='error',
    ...nalty='l2', random_state=1, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'columntransformer__countvectorizer__min_df': [1, 2], 'logisticregression__C': [0.1, 1, 10], 'logisticregression__penalty': ['l1', 'l2']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

# Results

In [8]:
# Mean cross-validated accuracy of the best parameter combination found by the grid search.
grid.best_score_

0.8249158249158249

In [9]:
# Parameter combination that achieved the best score above.
grid.best_params_

{'columntransformer__countvectorizer__min_df': 1,
 'logisticregression__C': 10,
 'logisticregression__penalty': 'l1'}

In [14]:
# One row per parameter combination tried by the grid search...
param_table = pd.DataFrame(grid.cv_results_["params"])
# ...paired with the mean test accuracy that combination achieved across folds.
accuracy_column = pd.DataFrame(grid.cv_results_["mean_test_score"], columns=["Accuracy"])
pd.concat([param_table, accuracy_column], axis=1)


Unnamed: 0,columntransformer__countvectorizer__min_df,logisticregression__C,logisticregression__penalty,Accuracy
0,1,0.1,l1,0.786756
1,1,0.1,l2,0.786756
2,1,1.0,l1,0.820426
3,1,1.0,l2,0.802469
4,1,10.0,l1,0.824916
5,1,10.0,l2,0.814815
6,2,0.1,l1,0.786756
7,2,0.1,l2,0.786756
8,2,1.0,l1,0.820426
9,2,1.0,l2,0.810325
