In [42]:
import pandas as pd 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, Lasso
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest , RFECV
from sklearn.pipeline import Pipeline
import numpy as np

In [43]:
titanic = pd.read_csv('../data/train_titanic.csv')

In [44]:
X_train, X_test, y_train, y_test =\
train_test_split(titanic.drop(columns=['PassengerId', 'Fare']),\
                titanic['Fare'], random_state=78701)
# trying to predict 'Fare' based on the other columns in the data frame

In [45]:
X_train.head(2) 

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Cabin,Embarked
299,1,1,"Baxter, Mrs. James (Helene DeLaudeniere Chaput)",female,50.0,0,1,PC 17558,B58 B60,C
427,1,2,"Phillips, Miss. Kate Florence (""Mrs Kate Louis...",female,19.0,0,0,250655,,S


In [46]:
### A function to clean our data. 

def data_cleaner(df):
    df_out = df.copy()
    df_out['Name'] = df_out['Name'].map(lambda x: x.split('.')[0].split(',')[-1])
    df_out['Sex'] = df_out['Sex'].map(lambda x: 1 if x == 'male' else 0)
    df_out['Age'] = df_out['Age'].fillna(df_out['Age'].mean())
    df_out = df_out.loc[:,['Name', 'Sex', 'Age', 'Pclass', 'Survived']]
    return pd.get_dummies(df_out, columns=['Name'], drop_first=True)

In [47]:
data_cleaner(titanic)

Unnamed: 0,Sex,Age,Pclass,Survived,Name_ Col,Name_ Don,Name_ Dr,Name_ Jonkheer,Name_ Lady,Name_ Major,Name_ Master,Name_ Miss,Name_ Mlle,Name_ Mme,Name_ Mr,Name_ Mrs,Name_ Ms,Name_ Rev,Name_ Sir,Name_ the Countess
0,1,22.000000,3,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
1,0,38.000000,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
2,0,26.000000,3,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
3,0,35.000000,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
4,1,35.000000,3,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,1,27.000000,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
887,0,19.000000,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
888,0,29.699118,3,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
889,1,26.000000,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0


### Fit our model to training data

In [48]:
X_tr_model = data_cleaner(X_train) 

ss = StandardScaler() 
# scale training data - use a dataframe for ease of use 
X_train_sc = pd.DataFrame(ss.fit_transform(X_tr_model), columns=X_tr_model.columns)
# We don't need to put that in a DataFrame but it's easier to see
## instantiate model 
lasso = Lasso()
## fit model 
lasso.fit(X_train_sc, y_train)

Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)

### Make predictions on test data....what's happening here?


# DUMMY STUFF AFTER DATA IS CLEAN AND AFTER WE TRAIN TEST SPLIT

In [49]:
### use function 
X_te_model = data_cleaner(X_test)
### scale test data 
ss.transform(X_te_model) # Standard Scaler is looking for 16 columns because that's...
# how many are in X_train

ValueError: operands could not be broadcast together with shapes (223,14) (16,) (223,14) 

In [24]:
# Use set
only_tr_columns = list(set(X_tr_model.columns) - set(X_te_model.columns))
only_te_columns = list(set(X_te_model.columns) - set(X_tr_model.columns))


## columns only in train 

## columns only in test 


In [25]:
only_te_columns

['Name_ Major', 'Name_ the Countess', 'Name_ Ms', 'Name_ Jonkheer']

In [26]:
only_tr_columns

['Name_ Col', 'Name_ Dr', 'Name_ Lady', 'Name_ Sir', 'Name_ Mme', 'Name_ Don']

In [27]:
X_te_model  # Pretend this is the dummied neighborhoods/ames data what have you

Unnamed: 0,Sex,Age,Pclass,Survived,Name_ Jonkheer,Name_ Major,Name_ Master,Name_ Miss,Name_ Mlle,Name_ Mr,Name_ Mrs,Name_ Ms,Name_ Rev,Name_ the Countess
37,1,21.000000,3,0,0,0,0,0,0,1,0,0,0,0
84,0,17.000000,2,1,0,0,0,1,0,0,0,0,0,0
367,0,30.216437,3,1,0,0,0,0,0,0,1,0,0,0
333,1,16.000000,3,0,0,0,0,0,0,1,0,0,0,0
357,0,38.000000,2,0,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
741,1,36.000000,1,0,0,0,0,0,0,1,0,0,0,0
875,0,15.000000,3,1,0,0,0,1,0,0,0,0,0,0
117,1,29.000000,2,0,0,0,0,0,0,1,0,0,0,0
825,1,30.216437,3,0,0,0,0,0,0,1,0,0,0,0


In [28]:
# we can delete the ones that do not show up in training column

In [29]:
X_te_model = X_te_model.drop(columns=only_te_columns)

In [30]:
# add coumns of zeros to test data frame
for col in only_tr_columns:
    X_te_model[col] = 0

In [31]:
X_te_model.head(5)

Unnamed: 0,Sex,Age,Pclass,Survived,Name_ Master,Name_ Miss,Name_ Mlle,Name_ Mr,Name_ Mrs,Name_ Rev,Name_ Col,Name_ Dr,Name_ Lady,Name_ Sir,Name_ Mme,Name_ Don
37,1,21.0,3,0,0,0,0,1,0,0,0,0,0,0,0,0
84,0,17.0,2,1,0,1,0,0,0,0,0,0,0,0,0,0
367,0,30.216437,3,1,0,0,0,0,1,0,0,0,0,0,0,0
333,1,16.0,3,0,0,0,0,1,0,0,0,0,0,0,0,0
357,0,38.0,2,0,0,1,0,0,0,0,0,0,0,0,0,0


In [32]:
X_tr_model.head(5)

Unnamed: 0,Sex,Age,Pclass,Survived,Name_ Col,Name_ Don,Name_ Dr,Name_ Lady,Name_ Master,Name_ Miss,Name_ Mlle,Name_ Mme,Name_ Mr,Name_ Mrs,Name_ Rev,Name_ Sir
299,0,50.0,1,1,0,0,0,0,0,0,0,0,0,1,0,0
427,0,19.0,2,1,0,0,0,0,0,1,0,0,0,0,0,0
452,1,30.0,1,0,0,0,0,0,0,0,0,0,1,0,0,0
14,0,14.0,3,0,0,0,0,0,0,1,0,0,0,0,0,0
614,1,35.0,3,0,0,0,0,0,0,0,0,0,1,0,0,0


In [33]:
# This is one way to align the columns in the data frames. Maybe use this for my project dummies
X_te_model = X_te_model.loc[:, X_tr_model.columns]

In [34]:
X_test_sc = pd.DataFrame(ss.transform(X_te_model), columns=X_te_model.columns)

In [35]:
X_test_sc

Unnamed: 0,Sex,Age,Pclass,Survived,Name_ Col,Name_ Don,Name_ Dr,Name_ Lady,Name_ Master,Name_ Miss,Name_ Mlle,Name_ Mme,Name_ Mr,Name_ Mrs,Name_ Rev,Name_ Sir
0,0.739119,-0.652595,0.836554,-0.773360,-0.03872,-0.03872,-0.102908,-0.03872,-0.220603,-0.503273,-0.03872,-0.03872,0.852114,-0.412162,-0.077615,-0.03872
1,-1.352963,-0.958532,-0.360058,1.293058,-0.03872,-0.03872,-0.102908,-0.03872,-0.220603,1.986995,-0.03872,-0.03872,-1.173552,-0.412162,-0.077615,-0.03872
2,-1.352963,0.052316,0.836554,1.293058,-0.03872,-0.03872,-0.102908,-0.03872,-0.220603,-0.503273,-0.03872,-0.03872,-1.173552,2.426231,-0.077615,-0.03872
3,0.739119,-1.035016,0.836554,-0.773360,-0.03872,-0.03872,-0.102908,-0.03872,-0.220603,-0.503273,-0.03872,-0.03872,0.852114,-0.412162,-0.077615,-0.03872
4,-1.352963,0.647635,-0.360058,-0.773360,-0.03872,-0.03872,-0.102908,-0.03872,-0.220603,1.986995,-0.03872,-0.03872,-1.173552,-0.412162,-0.077615,-0.03872
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
218,0.739119,0.494667,-1.556670,-0.773360,-0.03872,-0.03872,-0.102908,-0.03872,-0.220603,-0.503273,-0.03872,-0.03872,0.852114,-0.412162,-0.077615,-0.03872
219,-1.352963,-1.111500,0.836554,1.293058,-0.03872,-0.03872,-0.102908,-0.03872,-0.220603,1.986995,-0.03872,-0.03872,-1.173552,-0.412162,-0.077615,-0.03872
220,0.739119,-0.040722,-0.360058,-0.773360,-0.03872,-0.03872,-0.102908,-0.03872,-0.220603,-0.503273,-0.03872,-0.03872,0.852114,-0.412162,-0.077615,-0.03872
221,0.739119,0.052316,0.836554,-0.773360,-0.03872,-0.03872,-0.102908,-0.03872,-0.220603,-0.503273,-0.03872,-0.03872,0.852114,-0.412162,-0.077615,-0.03872


In [36]:
lasso.score(X_test_sc,y_test)

0.23510898090164267

###  Gridsearch 

### Pipeline Gridsearch

(if time allows)

In [37]:
Lasso()

params = {          # keys of our dictionaryies need rto be the arguments of Lasso above.
    'alpha' : [-20, -10, -1, 1, 10, 20], # this list could be linspace
    'max_iter' : [1e6]
}

grid_search = GridSearchCV(Lasso(), param_grid=params, cv=5)


In [38]:
grid_search.fit(X_train_sc, y_train)

  positive)


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True,
                             max_iter=1000, normalize=False, positive=False,
                             precompute=False, random_state=None,
                             selection='cyclic', tol=0.0001, warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'alpha': [-20, -10, -1, 1, 10, 20],
                         'max_iter': [1000000.0]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [39]:
grid_search.best_params_

{'alpha': 1, 'max_iter': 1000000.0}

In [40]:
grid_search.score(X_test_sc, y_test)

0.23510898090164267

In [41]:
grid_search.best_estimator_

Lasso(alpha=1, copy_X=True, fit_intercept=True, max_iter=1000000.0,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)