In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [2]:
def log_progress(sequence, every=None, size=None, name='Items'):
    """Yield the items of ``sequence`` while displaying a progress widget.

    A label and a progress bar are displayed via ``IPython.display.display``
    and refreshed as items are consumed.

    Parameters
    ----------
    sequence : iterable
        Items to iterate over. If it has no ``len()`` and ``size`` is not
        given, the total is unknown and only a running count is shown.
    every : int, optional
        Refresh the widgets on every ``every``-th item (the first item always
        triggers a refresh). Defaults to 1 for totals up to 200, otherwise
        ``int(size / 200)``. Required when the total is unknown. Values below
        1 are treated as 1.
    size : int, optional
        Total number of items; taken from ``len(sequence)`` when omitted.
    name : str, optional
        Prefix used in the label text.

    Yields
    ------
    The items of ``sequence``, unchanged and in order.

    Raises
    ------
    AssertionError
        If the total is unknown and ``every`` was not supplied.
    """
    from ipywidgets import IntProgress, HTML, VBox
    from IPython.display import display

    is_iterator = False
    if size is None:
        try:
            size = len(sequence)
        except TypeError:
            # No len(): the total is unknown.
            is_iterator = True
    if size is not None:
        if every is None:
            if size <= 200:
                every = 1
            else:
                every = int(size / 200)     # every 0.5%
    else:
        assert every is not None, 'sequence is iterator, set every'

    # `every` is used as a modulus below; 0 would raise ZeroDivisionError on
    # the second item, so anything below 1 falls back to "refresh every item".
    if every is not None and every < 1:
        every = 1

    if is_iterator:
        # Unknown total: show a full bar in 'info' style instead of a fraction.
        progress = IntProgress(min=0, max=1, value=1)
        progress.bar_style = 'info'
    else:
        progress = IntProgress(min=0, max=size, value=0)
    label = HTML()
    box = VBox(children=[label, progress])
    display(box)

    index = 0
    try:
        for index, record in enumerate(sequence, 1):
            if index == 1 or index % every == 0:
                if is_iterator:
                    label.value = '{name}: {index} / ?'.format(
                        name=name,
                        index=index
                    )
                else:
                    progress.value = index
                    label.value = u'{name}: {index} / {size}'.format(
                        name=name,
                        index=index,
                        size=size
                    )
            yield record
    except BaseException:
        # Deliberately broad: any failure or interruption marks the bar as
        # failed before propagating. This includes GeneratorExit, so a
        # consumer that stops iterating early also leaves the bar in 'danger'.
        progress.bar_style = 'danger'
        raise
    else:
        progress.bar_style = 'success'
        progress.value = index
        # index is still 0 when the sequence was empty; show '?' in that case.
        label.value = "{name}: {index}".format(
            name=name,
            index=str(index or '?')
        )

### Load pre-processed data

In [3]:
# Read the pre-split feature matrices (header row present) ...
X_train, X_test = (
    pd.read_csv('./readied_data_Q2/X_train.csv'),
    pd.read_csv('./readied_data_Q2/X_test.csv'),
)
# ... and the matching targets, whose files carry no header row.
y_train, y_test = (
    pd.read_csv('./readied_data_Q2/y_train.csv', header=None),
    pd.read_csv('./readied_data_Q2/y_test.csv', header=None),
)

In [4]:
# Remove the 'Unnamed: 0' column (presumably an index saved alongside the
# features — confirm against the code that wrote the CSVs).
# The result is rebound with errors='ignore' instead of dropping in place, so
# re-running this cell is a no-op rather than a KeyError.
X_train = X_train.drop(columns=['Unnamed: 0'], errors='ignore')
X_test = X_test.drop(columns=['Unnamed: 0'], errors='ignore')

In [5]:
# The target files are read with header=None, so their columns are labelled
# 0, 1, ... and the first column is label 0 (presumably a saved index —
# confirm against the code that wrote the CSVs).
# Dropping by label with errors='ignore' keeps the cell idempotent: the
# previous in-place "drop whatever is first" removed one more column on every
# re-run, silently discarding the target itself.
y_train = y_train.drop(columns=[0], errors='ignore')
y_test = y_test.drop(columns=[0], errors='ignore')

In [6]:
# Give the single remaining column of each target frame a readable name.
y_train.columns, y_test.columns = ['target'], ['target']

In [7]:
# Turn each one-column target frame into a Series built from 'target'.
y_train, y_test = (
    pd.Series(target_frame['target']) for target_frame in (y_train, y_test)
)

### Perform Feature Selection

In [8]:
from cLiML.model_prep import feature_select_logistic_reg

In [9]:
#df_feature = feature_select_logistic_reg(X_train, y_train, cv = 5)

In [10]:
#df_feature.head()

In [11]:
#df_feature.to_csv('./readied_data_Q2/df_feature.csv')

### Logistic Regression

In [12]:
from sklearn.linear_model import LogisticRegression

# Baseline logistic regression with default hyper-parameters.
# The solver is pinned to 'liblinear', the default recorded in the estimator
# repr printed by the grid-search cell further down. Newer scikit-learn
# releases default to 'lbfgs', which would change the fitted model and scores.
lr = LogisticRegression(solver='liblinear')
lr.fit(X_train, y_train);

In [13]:
# Mean accuracy of the baseline model on the training split.
lr.score(X=X_train, y=y_train)

0.9963119072708114

In [14]:
# Mean accuracy of the baseline model on the held-out test split.
lr.score(X=X_test, y=y_test)

0.8445378151260504

### Logistic Regression with tuning

In [15]:
from sklearn.model_selection import GridSearchCV

In [19]:
# Preview of a log-spaced grid: 10 values from 1e-3 to 1e3.
np.logspace(start=-3, stop=3, num=10)

array([1.00000000e-03, 4.64158883e-03, 2.15443469e-02, 1.00000000e-01,
       4.64158883e-01, 2.15443469e+00, 1.00000000e+01, 4.64158883e+01,
       2.15443469e+02, 1.00000000e+03])

In [20]:
# Search space: both penalty types crossed with 20 log-spaced values of C
# between 1e-3 and 1e3 (40 candidates, each scored with 10-fold CV).
params = {'penalty': ['l1', 'l2'],
          'C': np.logspace(-3, 3, 20)}

# The grid includes 'l1', which the 'lbfgs' solver (the default in newer
# scikit-learn releases) does not support. Pin 'liblinear', the solver shown
# in the recorded output, so every candidate can actually be fitted.
lrt = LogisticRegression(solver='liblinear')
lrt_gs = GridSearchCV(lrt, params, n_jobs=-1, cv=10)
# Left as the last expression so the fitted search object is displayed.
lrt_gs.fit(X_train, y_train)

GridSearchCV(cv=10, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'penalty': ['l1', 'l2'], 'C': array([1.00000e-03, 2.06914e-03, 4.28133e-03, 8.85867e-03, 1.83298e-02,
       3.79269e-02, 7.84760e-02, 1.62378e-01, 3.35982e-01, 6.95193e-01,
       1.43845e+00, 2.97635e+00, 6.15848e+00, 1.27427e+01, 2.63665e+01,
       5.45559e+01, 1.12884e+02, 2.33572e+02, 4.83293e+02, 1.00000e+03])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [21]:
# Parameter combination that achieved the best cross-validated score.
lrt_gs.best_params_

{'C': 0.0379269019073225, 'penalty': 'l2'}

In [22]:
# Score of the refit best estimator on the training split.
lrt_gs.score(X=X_train, y=y_train)

0.9857744994731296

In [23]:
# Score of the refit best estimator on the held-out test split.
lrt_gs.score(X=X_test, y=y_test)

0.8739495798319328