In [1]:
import pandas as pd 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, Lasso
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest , RFECV
from sklearn.pipeline import Pipeline
import numpy as np

In [2]:
titanic = pd.read_csv('../data/train_titanic.csv')

In [3]:
X_train, X_test, y_train, y_test =\
train_test_split(titanic.drop(columns=['PassengerId', 'Fare']),\
                titanic['Fare'], random_state=78701)

In [4]:
X_train.head(2)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Cabin,Embarked
299,1,1,"Baxter, Mrs. James (Helene DeLaudeniere Chaput)",female,50.0,0,1,PC 17558,B58 B60,C
427,1,2,"Phillips, Miss. Kate Florence (""Mrs Kate Louis...",female,19.0,0,0,250655,,S


In [5]:
### A function to clean our data. 
def data_cleaner(df):
    df_out = df.copy()
    df_out['Name'] = df_out['Name'].map(lambda x: x.split('.')[0].split(',')[-1])
    df_out['Sex'] = df_out['Sex'].map(lambda x: 1 if x == 'male' else 0)
    df_out['Age'] = df_out['Age'].fillna(df_out['Age'].mean())
    df_out = df_out.loc[:,['Name', 'Sex', 'Age', 'Pclass', 'Survived']]
    return pd.get_dummies(df_out, columns=['Name'], drop_first=True)

### Fit our model to training data

In [6]:
X_tr_model = data_cleaner(X_train)
## scale training data - use a dataframe for ease of use 
ss = StandardScaler()

X_train_sc = pd.DataFrame(ss.fit_transform(X_tr_model),\
                          columns= X_tr_model.columns)
## instantiate model 
lasso = Lasso()

## fit model 
lasso.fit(X_train_sc, y_train)

Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)

### Make predictions on test data....what's happening here?


In [7]:
### use function 
X_te_model = data_cleaner(X_test)

### scale test data 
ss.transform(X_te_model)

ValueError: operands could not be broadcast together with shapes (223,14) (16,) (223,14) 

In [8]:
## columns only in train 
only_tr_cols = list(set(X_tr_model.columns) - set(X_te_model.columns))
## columns only in test 
only_te_cols = list(set(X_te_model.columns) - set(X_tr_model.columns))

###  Gridsearch 

In [9]:
X_te_model = X_te_model.drop(columns=only_te_cols)

In [10]:
only_tr_cols

['Name_ Lady', 'Name_ Sir', 'Name_ Mme', 'Name_ Don', 'Name_ Dr', 'Name_ Col']

In [11]:
### add columns of zeros to test dataframe 
for col in only_tr_cols:
    X_te_model[col] = 0 

In [12]:
### align columns 
X_te_model = X_te_model.loc[:, X_tr_model.columns]

In [13]:
#ss.transform(X_te_model)

In [14]:
X_test_sc = pd.DataFrame(ss.transform(X_te_model), 
             columns=X_te_model.columns)

In [15]:
lasso.score(X_test_sc, y_test)

0.23510898090164267

### Pipeline Gridsearch

(if time allows)

In [16]:
Lasso()

params = {
    'alpha':[-20,-10,-1,1,10,20], 
    'max_iter':[1e6]
}

gs = GridSearchCV(Lasso(), param_grid=params, cv=5)


In [17]:
gs.fit(X_train_sc, y_train)

  positive)


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True,
                             max_iter=1000, normalize=False, positive=False,
                             precompute=False, random_state=None,
                             selection='cyclic', tol=0.0001, warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'alpha': [-20, -10, -1, 1, 10, 20],
                         'max_iter': [1000000.0]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [18]:
gs.best_params_

{'alpha': 1, 'max_iter': 1000000.0}

In [19]:
gs.score(X_test_sc, y_test)

0.23510898090164267

In [20]:
gs.best_estimator_.score(X_test_sc, y_test)

0.23510898090164267

In [21]:
### pipeline with gridsearch and feature selection 

pipe = Pipeline([
    ('ss', StandardScaler()), 
    ('feat_select', RFECV(Lasso())),   
    ('model', Lasso())
])

pipe_params = {
   # 'feat_select__estimator':[Lasso()], 
    'feat_select__cv':[5], 
    'feat_select__min_features_to_select':[5,12],
    'model__alpha':np.logspace(start=-1, stop=2, num=10)
}

In [22]:
gs_pipe = GridSearchCV(pipe, pipe_params, cv=5)

In [23]:
gs_pipe.fit(X_train_sc, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('ss',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('feat_select',
                                        RFECV(cv='warn',
                                              estimator=Lasso(alpha=1.0,
                                                              copy_X=True,
                                                              fit_intercept=True,
                                                              max_iter=1000,
                                                              normalize=False,
                                                              positive=False,
                                                              precompute

In [24]:
gs_pipe.best_params_

{'feat_select__cv': 5,
 'feat_select__min_features_to_select': 12,
 'model__alpha': 0.46415888336127786}