In [13]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Kaggle specific
# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

In [14]:
"""
# We read in the data and do some feature engineering and
# also some exploratory data analysis
"""
train_data_df = pd.read_csv('./data/train.csv', index_col=False)

# Missing values are intentionally NOT filled in this cell.
# `DataFrame.fillna` returns a new object, so a bare
# `train_data_df.select_dtypes(...).fillna(...)` statement leaves
# `train_data_df` untouched; such no-op calls were removed from here.
# Gaps are handled further down the notebook instead:
#   * numeric gaps in Age/Fare are mean-imputed by the SimpleImputer
#     inside the column transformer;
#   * missing Cabin values are mapped to 'UNKNOWN' by match_cabin_to_deck.

# Confirm the shape of our dataset is correct
train_data_df.shape

(891, 12)

In [15]:
"""
# We'll now engineer a new feature called Deck (which gets the deck of each cabin)
# First convert the ['Cabin'] feature to string and then match it to a deck
"""
# Cast Cabin through pandas' string dtype and then to plain `str` objects so
# that every entry can be searched with string operations. Missing entries
# presumably end up as the literal '<NA>' that match_cabin_to_deck looks for.
train_data_df['Cabin'] = (
    train_data_df['Cabin']
    .astype(pd.StringDtype())
    .astype(str)
)

# Deck letters to look for inside each cabin string, in search order.
decks = list('ABCDEFGT')

def match_cabin_to_deck(cabin: str, decks):
    """Return the first label in ``decks`` that occurs inside ``cabin``.

    Parameters
    ----------
    cabin : str
        Cabin value already converted to a string. A value containing the
        literal ``'<NA>'`` is treated as missing.
    decks : iterable of str
        Candidate deck labels, tried in the order given.

    Returns
    -------
    str or float
        ``'UNKNOWN'`` when ``cabin`` contains ``'<NA>'``; otherwise the first
        label of ``decks`` found in ``cabin``; ``np.nan`` when none matches.
    """
    # The missing-value test does not depend on the deck being tried, so do it
    # once up front. It must also run before any deck test because '<NA>'
    # itself contains the letter 'A', and it must hold even if `decks` is empty.
    if '<NA>' in cabin:
        return 'UNKNOWN'

    # NOTE(review): labels are tried in `decks` order, not by position in the
    # cabin string, so e.g. 'F E69' matches 'E' when 'E' precedes 'F' in
    # `decks` -- confirm this is the intended rule for multi-letter cabins.
    for deck in decks:
        if deck in cabin:
            return deck
    return np.nan

# Derive the Deck column by matching every cabin string against `decks`.
train_data_df['Deck'] = train_data_df['Cabin'].apply(match_cabin_to_deck, args=(decks,))

In [16]:
"""
# Next, let's select the features that will make our predictions easier
# So far [Pclass, Age, Sex, Fare] seem like good features to
# determine the likelihood of survival
"""
# Keep the passenger identifier, the four candidate predictors and the
# Survived label; every other column is left behind in train_data_df.
new_train_df = train_data_df.loc[
    :,
    ['PassengerId', 'Pclass', 'Age', 'Sex', 'Fare', 'Survived'],
]

# Author's observation (the supporting analysis is not shown in this cell):
# after engineering the new 'Deck' feature, and regardless of whether the
# deck was known or unknown, female passengers show a very high rate of
# survival.

# Confirm shape of our dataframe
new_train_df.shape

(891, 6)

In [17]:
"""
# Next, we import the test data and do the same feature engineering
# and data analysis as the training data
"""
test_data_df = pd.read_csv('./data/test.csv', index_col=False)

# Rows with missing values are intentionally kept, and nothing is filled here.
# `DataFrame.dropna` / `DataFrame.fillna` return new objects, so bare calls
# whose result is not assigned leave `test_data_df` untouched; such no-op
# calls were removed from this cell. Dropping rows for real would also remove
# passengers from the prediction output built at the end of the notebook.
# Gaps are handled further down instead: Age/Fare are mean-imputed by the
# SimpleImputer in the column transformer, and missing Cabin values are
# mapped to 'UNKNOWN' by match_cabin_to_deck.

# Confirm the shape of our dataset is correct
test_data_df.shape

(418, 11)

In [18]:
"""
# We'll now engineer a new feature called Deck (which gets the deck of each cabin)
# First convert the ['Cabin'] feature to string and then match it to a deck
"""
# Same Cabin -> string cast as applied to the training frame.
test_data_df['Cabin'] = (
    test_data_df['Cabin']
    .astype(pd.StringDtype())
    .astype(str)
)

# Derive the Deck column by matching every cabin string against `decks`.
test_data_df['Deck'] = test_data_df['Cabin'].apply(match_cabin_to_deck, args=(decks,))

# Keep the passenger identifier and the four candidate predictors
# (the test frame is selected without a Survived column).
new_test_df = test_data_df.loc[
    :,
    ['PassengerId', 'Pclass', 'Age', 'Sex', 'Fare'],
]

# Show the shapes of the training and test selections side by side
(new_train_df.shape, new_test_df.shape)

((891, 6), (418, 5))

In [19]:
"""
# We will now prepare our datasets for our classifier
# using `sklearn` and more
"""
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import cross_val_score, KFold, RepeatedKFold
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.compose import make_column_transformer
import torch

In [20]:
# Select features by column name rather than by position, using one shared
# list for both frames. Independent positional slices (`1:5` on train, `1:`
# on test) silently misalign if either frame gains or reorders a column.
FEATURE_COLUMNS = ['Pclass', 'Age', 'Sex', 'Fare']

# Split train data into X(features) and y(labels)
X_train = new_train_df[FEATURE_COLUMNS]
y_train = new_train_df['Survived']

# Get test data X(features)
X_test = new_test_df[FEATURE_COLUMNS]

X_train.columns, X_test.columns

(Index(['Pclass', 'Age', 'Sex', 'Fare'], dtype='object'),
 Index(['Pclass', 'Age', 'Sex', 'Fare'], dtype='object'))

In [21]:
"""
# Next, we encode our features and labels for the training data
# and also the test data
"""
# One-hot encode Sex, mean-impute missing Fare/Age values, and pass the
# remaining columns through unchanged.
column_trans = make_column_transformer(
    (OneHotEncoder(categories='auto'), ['Sex']),
    (SimpleImputer(missing_values=np.nan, strategy="mean"), ['Fare', 'Age']),
    remainder='passthrough'
)

# Fit the encoder categories and imputation means on the training data only...
X_train = column_trans.fit_transform(X_train)
# ...and reuse that fitted transformer on the test data. Calling
# fit_transform here would re-learn the means/categories from the test set,
# so train and test would be encoded with different parameters.
X_test = column_trans.transform(X_test)

label_enc = LabelEncoder()
y_train = label_enc.fit_transform(y_train)

In [22]:
"""
# Next, we convert our dataframes to Tensor using PyTorch
"""
# Wrap the encoded features and labels as float32 tensors
# (torch.float is an alias of torch.float32).
# Training features and labels:
X_train = torch.tensor(X_train, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.float32)

# Test features:
X_test = torch.tensor(X_test, dtype=torch.float32)

In [23]:
"""
# Next, we instantiate our model and train it on 
# our training data (tensor)
"""
# Tuning Parameters
N_NEIGHBORS = 20
# Number of cross-validation folds. It has the same value the fold count had
# when it was borrowed from N_NEIGHBORS, but is a separate constant so tuning
# the KNN neighbourhood size no longer changes the validation scheme.
N_SPLITS = 20

knn_clf = KNeighborsClassifier(n_neighbors=N_NEIGHBORS, weights="distance", algorithm="kd_tree", leaf_size=25)

# Logistic Regression
log_clf = LogisticRegression()

# Fit model with training data
knn_clf.fit(X_train, y_train)
log_clf.fit(X_train, y_train)

# KFold for cross validation scoring
cv = KFold(n_splits=N_SPLITS, random_state=1, shuffle=True)
cv_ = RepeatedKFold(n_splits=N_SPLITS, n_repeats=10, random_state=1)

# Scoring and Accuracy
# `scoring` is measured on the very data the model was fitted on, so it is an
# optimistic figure; `accuracy` is the cross-validated estimate.
scoring = knn_clf.score(X_train, y_train)
accuracy = cross_val_score(knn_clf, X_train, y_train, cv=cv_, scoring='accuracy', n_jobs=-1).mean()

# LogRes Scoring & Accuracy
# Cross-validate with the same splitter as KNN so the two accuracies are
# comparable, instead of reporting a hard-coded 0.
log_scoring = log_clf.score(X_train, y_train)
log_accuracy = cross_val_score(log_clf, X_train, y_train, cv=cv_, scoring='accuracy', n_jobs=-1).mean()

print(f"KNN -> Score: {scoring}, Accuracy: {accuracy}")
print(f"Logistic Regression -> Score: {log_scoring}, Accuracy: {log_accuracy}")

: 

: 

In [None]:
"""
# Make prediction using test data
"""
y_pred = knn_clf.predict(X_test)

# There must be exactly one prediction per test passenger.
passenger_ids = new_test_df['PassengerId']
assert len(passenger_ids) == len(y_pred), (
    f"expected {len(passenger_ids)} predictions, got {len(y_pred)}"
)

# Create new dataframe with PassengerId, y_pred as Survived.
# Both columns are passed as plain arrays so they are paired by position;
# pairing a freshly built Series with the PassengerId Series would instead
# align on index labels and could introduce NaNs if the indexes ever differ.
output = pd.DataFrame(
    {
        'PassengerId': passenger_ids.to_numpy(),
        'Survived': np.asarray(y_pred).astype(int),
    }
)

# output.to_csv('submission.csv', index=False)