In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split


In [2]:
# Read the two CSV files that every later cell works from.
TRAIN_CSV = 'data/train.csv'
TEST_CSV = 'data/test.csv'

train = pd.read_csv(TRAIN_CSV)
test = pd.read_csv(TEST_CSV)

In [3]:
# Impute missing values, one column at a time.
#
# `Series.fillna` is used instead of `replace(np.NAN, ...)`: it is the
# purpose-built pandas call for this job and it does not depend on the
# upper-case `np.NAN` alias, which is not exposed by NumPy 2.x.

# Most frequent embarkation label in the training split; used as the filler.
embark_label = train["Embarked"].mode()[0]

# Training split: placeholder cabin, modal port, median age.
train["Cabin"] = train["Cabin"].fillna("Unknown")
train["Embarked"] = train["Embarked"].fillna(embark_label)
train["Age"] = train["Age"].fillna(train["Age"].median())

# Test split: placeholder cabin, its own median age and its own mean fare.
test["Cabin"] = test["Cabin"].fillna("Unknown")
test["Age"] = test["Age"].fillna(test["Age"].median())
test["Fare"] = test["Fare"].fillna(test["Fare"].mean())

In [4]:
# Collect every distinct cabin string that occurs in either split, sorted,
# and print the first 30 as a quick look at the label format.
all_cabins = pd.concat([train["Cabin"], test["Cabin"]])
cabin_labels = sorted(all_cabins.unique())
print(cabin_labels[:30])

['A10', 'A11', 'A14', 'A16', 'A18', 'A19', 'A20', 'A21', 'A23', 'A24', 'A26', 'A29', 'A31', 'A32', 'A34', 'A36', 'A5', 'A6', 'A7', 'A9', 'B10', 'B101', 'B102', 'B11', 'B18', 'B19', 'B20', 'B22', 'B24', 'B26']


In [5]:
# Reduce each cabin string to its first character to get a coarse category
# (the "Unknown" placeholder therefore becomes "U").
for frame in (train, test):
    frame["Cabin_type"] = frame["Cabin"].str[0]

In [6]:
# Preview the first five derived cabin categories.
train["Cabin_type"].head(n=5)

0    U
1    C
2    U
3    C
4    U
Name: Cabin_type, dtype: object

In [7]:
# Family size per row: the SibSp and Parch counts plus one for the passenger.
for frame in (train, test):
    frame["family_member_size"] = frame["SibSp"] + frame["Parch"] + 1

In [8]:
# Integer-encode each categorical column into a companion "<feature>_value"
# column on both splits, and remember the label order used for each feature
# in `categorical_label_dictionary` (feature name -> sorted list of labels).
categorical_features = ["Sex", "Cabin_type", "Embarked"]
categorical_label_dictionary = dict()
for feature in categorical_features:
    # Sorted union of the labels found in either split, so train and test
    # share a single coding.
    unique_labels = sorted(set(train[feature].unique()) | set(test[feature].unique()))
    # Stored once per feature (it does not depend on the split being encoded).
    categorical_label_dictionary[feature] = unique_labels
    # label -> position lookup built once; replaces a linear list.index()
    # scan for every row.
    label_to_code = {label: code for code, label in enumerate(unique_labels)}
    for data in [train, test]:
        data[feature + "_value"] = data[feature].map(label_to_code)

In [9]:
# Stack the two splits row-wise (train rows first) into one combined frame.
train_test = pd.concat([train, test], axis=0)

In [10]:
# Expand each integer-coded categorical column of the combined frame into
# one indicator column per label, named "<feature>_<label>", then discard
# the integer-coded column.
for feature in ["Sex", "Cabin_type", "Embarked"]:
    code_column = feature + "_value"
    labels = categorical_label_dictionary[feature]
    # dtype is pinned because the get_dummies default differs across pandas
    # versions (uint8 before 2.0, bool from 2.0); uint8 keeps 0/1 indicators
    # on every version.
    items = pd.get_dummies(train_test[code_column], dtype=np.uint8)
    # get_dummies names its columns after the integer codes; translate each
    # code back to its label.
    items.columns = [feature + "_" + labels[code] for code in items.columns]
    train_test[items.columns] = items
    del train_test[code_column]

In [11]:
# Remove the free-text columns and the categorical columns that were
# expanded into indicator columns above.
train_test = train_test.drop(
    columns=["Name", "Sex", "Ticket", "Cabin", "Cabin_type", "Embarked"]
)

In [12]:
# Undo the row-wise stacking: the first len(train) rows are the training
# split, the remainder is the test split.
n_train_rows = len(train)
train_features = train_test.iloc[:n_train_rows]
test_features = train_test.iloc[n_train_rows:]

In [13]:
# Drop the identifier from the training features and the "Survived" column
# from the test features.
# `del` removes the column without producing a value, so nothing has to be
# parked in `_`; assigning to `_` shadows IPython's last-output variable and
# stops it from being updated for the rest of the session.
del train_features["PassengerId"]
del test_features["Survived"]

In [14]:
# Fraction of rows held out for validation; passed as `test_size` to
# train_test_split in the next cell.
validation_split = 0.2

In [15]:
# Separate the label column, then split features and labels into training
# and validation parts.
#
# The seed is fixed so that the partition (and every number derived from it)
# is the same on each Restart & Run All; the original drew a fresh random
# seed on every execution. 48 is the seed already used for the
# logistic-regression split later in this notebook.
split_seed = 48

train_targets = train_features.pop("Survived")
train_features, validation_features, train_targets, validation_targets = train_test_split(
    train_features,
    train_targets,
    test_size=validation_split,
    random_state=split_seed,
)
print(train_features.shape, validation_features.shape, train_targets.shape, validation_targets.shape)

(712, 20) (179, 20) (712,) (179,)


In [16]:
# Preview the first five rows of the training feature matrix.
train_features.head(n=5)

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,family_member_size,Sex_female,Sex_male,Cabin_type_A,Cabin_type_B,Cabin_type_C,Cabin_type_D,Cabin_type_E,Cabin_type_F,Cabin_type_G,Cabin_type_T,Cabin_type_U,Embarked_C,Embarked_Q,Embarked_S
847,3,35.0,0,0,7.8958,1,0,1,0,0,0,0,0,0,0,0,1,1,0,0
198,3,28.0,0,0,7.75,1,1,0,0,0,0,0,0,0,0,0,1,0,1,0
856,1,45.0,1,1,164.8667,3,1,0,0,0,0,0,0,0,0,0,1,0,0,1
654,3,18.0,0,0,6.75,1,1,0,0,0,0,0,0,0,0,0,1,0,1,0
675,3,18.0,0,0,7.775,1,0,1,0,0,0,0,0,0,0,0,1,0,0,1


In [17]:
# Select the columns of `train` whose absolute correlation with "Survived"
# exceeds 0.2.
#
# The frame is restricted to numeric columns first: `train` still holds
# string columns, and DataFrame.corr() raises on those in pandas >= 2.0
# instead of silently skipping them. The correlation matrix is also computed
# once rather than twice.
numeric_train = train.select_dtypes(include="number")
survival_corr = numeric_train.corr()["Survived"]
correlated_columns = list(survival_corr[survival_corr.abs() > 0.2].index)
# "Survived" correlates perfectly with itself; it is the target, not a feature.
correlated_columns.remove("Survived")
correlated_columns

['Pclass', 'Fare', 'Sex_value', 'Cabin_type_value']

In [23]:
from sklearn.linear_model import LogisticRegression

# Take the selected predictor columns and the label from `train`, then split
# both together (20% validation, seed 48).
logit_inputs = train[correlated_columns]
logit_labels = train["Survived"]
(
    logit_train_features,
    logit_val_features,
    logit_train_targets,
    logit_val_targets,
) = train_test_split(logit_inputs, logit_labels, test_size=0.2, random_state=48)

# Fit the classifier on the training part and report its mean accuracy on
# the validation part.
logit = LogisticRegression(solver='newton-cg').fit(logit_train_features, logit_train_targets)
score = logit.score(logit_val_features, logit_val_targets)
score

0.770949720670391

In [19]:
# One hand-written row carrying the four predictor columns the model was
# fitted on, wrapped in a single-row DataFrame.
inputs = {
    'Pclass': [3],
    'Fare': [70.3],
    'Sex_value': [0],
    'Cabin_type_value': [5],
}
test_model_data = pd.DataFrame(inputs)
test_model_data

Unnamed: 0,Pclass,Fare,Sex_value,Cabin_type_value
0,3,70.3,0,5


In [20]:
# Run the fitted model on the hand-written row and print the predicted labels.
prediction = logit.predict(X=test_model_data)
print(prediction)

[1]


In [21]:
import pickle

# Serialize the fitted model to disk.
model_filename = 'logit.pkl'
# The context manager flushes and closes the file even if pickling fails;
# the original left the handle returned by open() unclosed.
with open(model_filename, 'wb') as model_file:
    pickle.dump(logit, model_file)
print('Model is saved into to disk successfully using Pickle')

Model is saved into to disk successfully using Pickle


In [22]:
# Reload the model written by the previous cell and check that it still
# predicts.
# NOTE: pickle.load can execute arbitrary code contained in the file; only
# load pickles that come from a trusted source.
# The context manager closes the handle, which the original left open.
with open(model_filename, 'rb') as model_file:
    pkl_model = pickle.load(model_file)

# A second hand-written row with the same four predictor columns.
inputs = {'Pclass': [3], 'Fare': [80], 'Sex_value': [0], 'Cabin_type_value': [8]}
test_model_data = pd.DataFrame.from_dict(inputs)

predictions = pkl_model.predict(test_model_data)
print(predictions)

[1]
