# Logistic Regression Using Sklearn (81% valid acc, 75% leaderboard)

In [162]:
import numpy as np
import pandas as pd
%matplotlib inline

## Data

In [80]:
# read the train and test tables from the data folder
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/train.csv', '../data/test.csv')
)

In [100]:
# keep the passenger ids of each table so the combined frame can be split again later
train_idx, test_idx = train['PassengerId'], test['PassengerId']

In [101]:
# create features
# Stack train and test so both receive identical encoding and scaling.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0;
# ignore_index avoids duplicate row labels in the combined frame.
df = pd.concat([train, test], ignore_index=True)

# One-hot encode the categorical columns. The prefixes already end in '_', and
# get_dummies adds its own separator on top of that.
cat_cols = ['Pclass', 'Sex', 'Embarked']
df = pd.get_dummies(df, prefix=[x + '_' for x in cat_cols], columns=cat_cols)

# Rescale the numeric columns (constants presumably chosen to bring values
# to a comparable range -- TODO confirm).
df['Age'] = df['Age'] / 85.
df['Fare'] = np.log(df['Fare'] + 10.0)

# Deck letter = first character of the cabin string. Missing cabins are NaN,
# and str(NaN) is 'nan', so they end up in their own 'n' category.
df['cabin_feature'] = [str(x)[0] for x in df['Cabin']]
df = pd.get_dummies(df, prefix='cabin_', columns=['cabin_feature'])

df['SibSp'] = df['SibSp'] / 9.
df['Parch'] = df['Parch'] / 9.
df = df.drop(columns=['Name', 'Ticket', 'Cabin'])

# Mean-impute missing feature values. The target is skipped on purpose:
# test rows have no label, and filling them with the mean survival rate
# would fabricate one.
df = df.apply(
    lambda col: col if col.name == 'Survived' else col.fillna(col.mean()),
    axis=0,
)

In [102]:
# recover the train and test rows from the combined frame via their passenger ids
train_features = df.loc[df['PassengerId'].isin(train_idx), :]
test_features = df.loc[df['PassengerId'].isin(test_idx), :]

In [106]:
from sklearn.model_selection import train_test_split

In [112]:
# hold out 30% of the labelled rows as a validation set (fixed seed for repeatability)
X = train_features.drop(['PassengerId', 'Survived'], axis='columns')
y = train_features['Survived']
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=69,
)
# the test rows lose the same id and target columns
X_test = test_features.drop(['PassengerId', 'Survived'], axis='columns')

## Model

In [109]:
from sklearn import linear_model

In [113]:
# a simple logistic regression classifier
# The solver is pinned to 'liblinear', the one shown in the recorded output of
# this cell; newer scikit-learn releases default to a different solver, which
# would make the scores below drift from the recorded ones.
model = linear_model.LogisticRegression(solver='liblinear')
model.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [143]:
# hard class predictions for every split
train_pred, val_pred, test_pred = (
    model.predict(split) for split in (X_train, X_val, X_test)
)

# class probabilities for every split
train_pred2, val_pred2, test_pred2 = (
    model.predict_proba(split) for split in (X_train, X_val, X_test)
)

## Scoring

In [145]:
from sklearn.metrics import accuracy_score, log_loss

In [146]:
# share of correctly predicted labels on the train and validation splits
train_score, val_score = (
    accuracy_score(actual, predicted)
    for actual, predicted in ((y_train, train_pred), (y_val, val_pred))
)
print('Train acc: {:.3f}, Valid acc: {:.3f}'.format(train_score, val_score))

Train acc: 0.817, Valid acc: 0.817


In [161]:
# log loss, scored on the second predict_proba column of each split
train_score, val_score = (
    log_loss(y_train, train_pred2[:, 1]),
    log_loss(y_val, val_pred2[:, 1]),
)
print('Train loss: {:.3f}, Valid loss: {:.3f}'.format(train_score, val_score))

Train loss: 0.439, Valid loss: 0.427


## Submission

In [142]:
import os
from IPython.display import FileLink

sub_path = "../submissions/sklearn_logreg1.csv"
# Create the output folder if it is missing, so the notebook runs end to end
# on a fresh checkout instead of failing inside to_csv.
os.makedirs(os.path.dirname(sub_path), exist_ok=True)

# One row per test passenger: the id and the predicted label cast to int.
submission = pd.DataFrame.from_dict({'PassengerId': test_idx, 'Survived': test_pred.astype(int)})
submission.to_csv(sub_path, index=False)
FileLink(sub_path)

Leaderboard score: 0.75 (compared with a validation accuracy of 0.817).