In [69]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split


In [71]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

In [72]:
embark_label = train["Embarked"].mode()[0]
train["Cabin"] = train["Cabin"].replace(np.NAN,  "Unknown")
train["Embarked"] = train["Embarked"].replace(np.NAN, embark_label)
train["Age"] = train["Age"].replace(np.NAN, train["Age"].median())
test["Cabin"] = test["Cabin"].replace(np.NAN, "Unknown")
test["Age"] = test["Age"].replace(np.NAN, test["Age"].median())
test["Fare"] = test["Fare"].replace(np.NAN, test["Fare"].mean())

In [73]:
cabin_labels = sorted(set(list(train["Cabin"].unique()) + list(test["Cabin"].unique())))
print(cabin_labels[:30])

['A10', 'A11', 'A14', 'A16', 'A18', 'A19', 'A20', 'A21', 'A23', 'A24', 'A26', 'A29', 'A31', 'A32', 'A34', 'A36', 'A5', 'A6', 'A7', 'A9', 'B10', 'B101', 'B102', 'B11', 'B18', 'B19', 'B20', 'B22', 'B24', 'B26']


In [74]:
train["Cabin_type"] = train["Cabin"].apply(lambda cabin: cabin[0])
test["Cabin_type"] = test["Cabin"].apply(lambda cabin: cabin[0])

In [75]:
train["Cabin_type"].head()

0    U
1    C
2    U
3    C
4    U
Name: Cabin_type, dtype: object

In [76]:
train["family_member_size"] = 1 + train["SibSp"] + train["Parch"]
test["family_member_size"] = 1 + test["SibSp"] + test["Parch"]

In [77]:
categorical_features = ["Sex", "Cabin_type", "Embarked"]
categorical_label_dictionary = dict()
for feature in categorical_features:
    unique_labels = sorted(set(list(train[feature].unique()) + list(test[feature].unique())))
    for data in [train, test]:
        categorical_label_dictionary[feature] = unique_labels
        data[feature + "_value"] = data[feature].apply(lambda item: unique_labels.index(item))

In [79]:
train_test = pd.concat([train, test])

In [80]:
for feature in ["Sex", "Cabin_type", "Embarked"]:
    items = pd.get_dummies(train_test[feature + "_value"])
    labels = categorical_label_dictionary[feature]
    items.columns = [feature + "_" + labels[column] for column in list(items.columns)]
    train_test[items.columns] = items
    train_test.pop(feature + "_value")

In [81]:
for column in ["Name", "Sex", "Ticket", "Cabin", "Cabin_type", "Embarked"]:
    train_test.pop(column)

In [82]:
train_features = train_test.iloc[0: len(train)]
test_features = train_test.iloc[len(train):]

In [83]:
_ = train_features.pop("PassengerId")
_ = test_features.pop("Survived")

In [85]:
validation_split = 0.2

In [86]:
train_targets = train_features.pop("Survived")
train_features, validation_features, train_targets, validation_targets = model_selection.train_test_split(train_features, train_targets, test_size=validation_split, random_state=np.random.randint(1, 1000))
print(train_features.shape, validation_features.shape, train_targets.shape, validation_targets.shape)

(712, 20) (179, 20) (712,) (179,)


In [87]:
logitistc_related_columns = list(train.corr()[train.corr()["Survived"].abs() > 0.2].index)
logitistc_related_columns.remove("Survived")
logitistc_related_columns

['Pclass', 'Fare', 'Sex_value', 'Cabin_type_value']

In [110]:
from sklearn.linear_model import LogisticRegression

logit_train_features, logit_val_features = train_test_split(train[logitistc_related_columns +  ["Survived"]], test_size=0.2, random_state=48)
logit_train_targets = logit_train_features.pop("Survived")
logit_val_targets = logit_val_features.pop("Survived")
logit = LogisticRegression(solver='newton-cg')
logit.fit(logit_train_features, logit_train_targets)
score = logit.score(logit_val_features, logit_val_targets)

In [126]:
inputs = {'Pclass': [3], 'Fare': [70.3], 'Sex_value': [1], 'Cabin_type_value': [8]}
test_model = pd.DataFrame.from_dict(inputs)
test_model

Unnamed: 0,Pclass,Fare,Sex_value,Cabin_type_value
0,3,70.3,1,8


In [128]:
predictions = logit.predict(test_model)
print(predictions)

[0]


In [118]:
import pickle

model_filename = 'logit.sav'
pickle.dump(logit, open(model_filename,'wb'))
print('Model is saved into to disk successfully Using Pickle')


Model is saved into to disk successfully Using Pickle


In [125]:
print(test_model.loc[1])

Pclass               3.0
Fare                50.0
Sex_value            0.0
Cabin_type_value     8.0
Name: 1, dtype: float64


In [129]:
pickle.load(open(model_filename, 'rb'))

inputs = {'Pclass': [3], 'Fare': [80], 'Sex_value': [0], 'Cabin_type_value': [8]}
test_model = pd.DataFrame.from_dict(inputs)

predictions = logit.predict(test_model)
print(predictions)

[1]
