In [638]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV


In [639]:
# Load the training split into a DataFrame.
train_df = pd.read_csv(filepath_or_buffer="../data/train.csv")
# Optional sanity check: train_df.describe(include="all")

In [640]:
# Load the test split into a DataFrame.
test_df = pd.read_csv(filepath_or_buffer="../data/test.csv")
# Optional sanity check: test_df.describe(include="all")

In [641]:
# Missing Age values will be replaced with the average Age of the passenger's Sex.

# Mean Age per Sex, sorted by Sex so that female sits at index 0 and male at
# index 1 (the imputation step looks the rows up by those index labels).
train_grp = (
    train_df[["Sex", "Age"]]
    .groupby("Sex", as_index=False)
    .mean()
    .sort_values("Sex")
)
train_grp

Unnamed: 0,Sex,Age
0,female,27.915709
1,male,30.726645


In [642]:
# Fill every missing training Age with the mean Age of that passenger's Sex.

# Rows 0 and 1 of train_grp hold the female and male averages respectively.
female_avg_age, male_avg_age = train_grp.loc[[0, 1], "Age"]

# The mask is computed once: filling one sex never touches rows of the other.
age_missing = train_df["Age"].isna()
for sex, avg_age in (("male", male_avg_age), ("female", female_avg_age)):
    train_df.loc[(train_df["Sex"] == sex) & age_missing, "Age"] = avg_age


In [643]:
# Apply the same Age replacement to the test data.
#
# The fill values are the per-sex means computed from the TRAINING data
# (train_grp), not means recomputed from the test set: preprocessing statistics
# should come from the training data only, so the test set is transformed
# exactly like the data the model is fitted on.

# Per-sex mean Age of the test set, kept only for inspection / comparison with
# the training means; it is deliberately not used as the fill value.
test_grp = test_df[["Sex", "Age"]].groupby(["Sex"], as_index=False).mean().sort_values(by=["Sex"])

# Look the training averages up by Sex label rather than by row position.
train_age_by_sex = train_grp.set_index("Sex")["Age"]
female_avg_age = train_age_by_sex["female"]
male_avg_age = train_age_by_sex["male"]

test_age_missing = test_df["Age"].isna()
test_df.loc[test_age_missing & (test_df["Sex"] == "male"), "Age"] = male_avg_age
test_df.loc[test_age_missing & (test_df["Sex"] == "female"), "Age"] = female_avg_age

In [644]:
# Cabin is removed from both sets because, per the original analysis, more than
# 3/4 of its values are missing; Name is removed as irrelevant for the analysis.
unused_columns = ["Cabin", "Name"]
train_df = train_df.drop(columns=unused_columns)
test_df = test_df.drop(columns=unused_columns)

# Display the test rows that still contain at least one missing value.
rows_with_nan = test_df.isna().any(axis=1)
test_df[rows_with_nan]


Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked


In [645]:
# Replace the Sex column with one indicator column per category, for both sets.
train_sex_dummies = pd.get_dummies(train_df["Sex"])
train_df = train_df.drop(columns=["Sex"]).join(train_sex_dummies)

test_sex_dummies = pd.get_dummies(test_df["Sex"])
test_df = test_df.drop(columns=["Sex"]).join(test_sex_dummies)

In [646]:
# Only a few Embarked values are missing, and only in the training data.
# They are filled with the most frequent Embarked value (the mode).
embarked_mode = train_df["Embarked"].mode().iloc[0]
train_df["Embarked"] = train_df["Embarked"].fillna(embarked_mode)

In [647]:
# Replace the Embarked column with one indicator column per category, for both sets.
train_embarked_dummies = pd.get_dummies(train_df["Embarked"])
train_df = train_df.drop(columns=["Embarked"]).join(train_embarked_dummies)

test_embarked_dummies = pd.get_dummies(test_df["Embarked"])
test_df = test_df.drop(columns=["Embarked"]).join(test_embarked_dummies)



In [648]:
# Keep only the last whitespace-separated token of each ticket (its number part).
train_df["Ticket"] = train_df["Ticket"].map(lambda ticket: ticket.split()[-1])

# Remove the only rows whose ticket has no number (the token is "LINE").
is_line_ticket = train_df["Ticket"] == "LINE"
line_rows = train_df.loc[is_line_ticket].index
train_df = train_df.drop(index=line_rows)

# Convert the remaining ticket numbers to integers.
train_df["Ticket"] = train_df["Ticket"].astype("int")

In [649]:
# Apply the same ticket-number extraction to the test data.
#
# Unlike the training set, no test row may be removed: each test passenger needs
# a prediction in the result file. Tickets without a numeric part (such as
# "LINE") are therefore kept and encoded as 0 instead of being dropped.
test_ticket_tokens = test_df["Ticket"].apply(lambda x: x.split()[-1])
test_df["Ticket"] = (
    pd.to_numeric(test_ticket_tokens, errors="coerce")  # non-numeric -> NaN
    .fillna(0)
    .astype("int")
)


In [650]:
# Build the modelling frames without PassengerId, which is not important for
# training the model. train_df / test_df themselves keep the column.
train_data = train_df.drop("PassengerId", axis="columns")
test_data = test_df.drop("PassengerId", axis="columns")


In [651]:
# Baseline logistic-regression model.

# Target is the Survived column; features are every other column.
y = train_data["Survived"]
x = train_data.drop(columns=["Survived"])

lr = LogisticRegression(tol=0.01)
lr.fit(x, y)

# Score on the same data the model was fitted on (training score).
lr.score(x, y)

0.7113866967305524

In [652]:
# Predict Survived for the test passengers.
#
# The indicator columns were generated separately for train and test, so the
# test features are re-indexed to exactly the columns (and column order) the
# model was fitted on: an indicator column absent from the test set is added as
# all zeros, and a column the model never saw is left out.
x_test = test_data.reindex(columns=x.columns, fill_value=0)
y_test = lr.predict(x_test)

# test_data was derived from test_df without changing its rows, so the
# predictions line up row-for-row with test_df.
test_df["Survived"] = y_test

In [653]:
# Write the PassengerId / Survived pairs to result.csv, without the index.
output_columns = ["PassengerId", "Survived"]
result_data = test_df[output_columns]
result_data.to_csv("result.csv", index=False)
