In [19]:
import pandas as pd
import matplotlib.pyplot as plt

In [20]:
# Load the Titanic training set from the working directory.
training_data = pd.read_csv("train_titanic.csv")

# Summary statistics of Age, printed to help choose an imputation value
# for its missing entries.
age_column = training_data['Age']
print("MedianAge: ", age_column.median())
print("STDAge: ", age_column.std())
print("MeanAge: ", age_column.mean())

# Column dtypes and non-null counts show which columns contain missing values.
training_data.info()

MedianAge:  28.0
STDAge:  14.526497332334044
MeanAge:  29.69911764705882
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [21]:
#Replacing NaN Age with median age
# The filled column is assigned back rather than calling
# fillna(..., inplace=True) on training_data['Age']: that form mutates an
# intermediate object, emits a FutureWarning on pandas 2.x, and leaves the
# DataFrame unchanged once Copy-on-Write is enabled.
median_age = training_data['Age'].median()
training_data['Age'] = training_data['Age'].fillna(median_age)
training_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            891 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [22]:
# Name, Ticket and Cabin are treated as irrelevant for the model and removed.
training_data = training_data.drop(columns=['Name', 'Ticket', 'Cabin'])
# Count the remaining missing values per column.
training_data.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
Embarked       2
dtype: int64

In [23]:
# Frequency of each embarkation port (not rendered: only the last
# expression of a cell is displayed).
training_data["Embarked"].value_counts()
# Replace NaN Embarked with the most frequently occurring value - S
training_data["Embarked"] = training_data["Embarked"].fillna(value="S")
training_data.head(n=2)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,male,22.0,1,0,7.25,S
1,2,1,1,female,38.0,1,0,71.2833,C


In [24]:
# Per-column count of missing values, to confirm none are left.
training_data.isnull().sum(axis=0)

PassengerId    0
Survived       0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
Embarked       0
dtype: int64

In [25]:
# Encode the categorical columns as indicator (dummy) columns.
# drop_first=True omits the first level of each variable, so Sex yields a
# single `male` column and Embarked yields `Q` and `S`.
Sex, Embarked = (
    pd.get_dummies(training_data[column_name], drop_first=True)
    for column_name in ('Sex', 'Embarked')
)
# Append the indicator columns next to the original columns.
training_data = pd.concat([training_data, Sex, Embarked], axis=1)
training_data.head(n=2)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,male,Q,S
0,1,0,3,male,22.0,1,0,7.25,S,1,0,1
1,2,1,1,female,38.0,1,0,71.2833,C,0,0,0


In [26]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Target (dependent) variable: the Survived label.
training_data_Y = training_data['Survived']
# Features (independent variables): everything except the target and the raw
# categorical columns, which are already represented by their dummy columns.
# NOTE(review): PassengerId is still part of the feature set - confirm that
# using an identifier as a predictor is intended.
training_data_X = training_data.drop(columns=['Sex', 'Embarked', 'Survived'])

In [27]:
# Build a logistic-regression classifier with default hyper-parameters and
# fit it on the training features / labels.
model = LogisticRegression()
model.fit(X=training_data_X, y=training_data_Y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [28]:
# Load the Titanic test set from the working directory and preview it.
test_data = pd.read_csv(filepath_or_buffer='test_titanic.csv')
test_data.head(n=2)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S


In [29]:
# Remove the same columns that were dropped from the training data.
test_data = test_data.drop(columns=['Name', 'Ticket', 'Cabin'])
test_data.head(n=2)

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,892,3,male,34.5,0,0,7.8292,Q
1,893,3,female,47.0,1,0,7.0,S


In [30]:
#Get dummies for categorical variables in test dataset as well
# Missing Embarked values are filled with 'S' before encoding; otherwise a
# NaN port gets all-zero dummies and is silently treated as the omitted level.
# All levels are encoded and the columns the model was trained on
# (male, Q, S) are then selected explicitly, so the feature layout does not
# depend on which categories happen to occur in the test file
# (drop_first=True drops whichever level sorts first among those present).
Sex = pd.get_dummies(test_data.Sex).reindex(columns=['male'], fill_value=0)
Embarked = (
    pd.get_dummies(test_data.Embarked.fillna('S'))
    .reindex(columns=['Q', 'S'], fill_value=0)
)
test_data = pd.concat([test_data, Sex, Embarked], axis=1)
test_data.head(2)

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,male,Q,S
0,892,3,male,34.5,0,0,7.8292,Q,1,1,0
1,893,3,female,47.0,1,0,7.0,S,0,0,1


In [31]:
# Fill missing numeric values with the median of the same test-set column.
# NOTE(review): these medians are computed on the test data itself, not on
# the training data - confirm this is intended.
for numeric_column in ('Age', 'Fare'):
    column_median = test_data[numeric_column].median()
    test_data[numeric_column] = test_data[numeric_column].fillna(column_median)
# This only updates the raw Embarked column; the Q/S dummy columns were
# created in an earlier cell and are not recomputed here.
test_data['Embarked'] = test_data['Embarked'].fillna('S')

In [32]:
# Feature matrix for prediction: drop the raw categorical columns, which are
# represented by their dummy columns.
test_data_X = test_data.drop(columns=['Sex', 'Embarked'])

In [33]:
# Predict the Survived label for every row of the test feature matrix.
prediction = model.predict(X=test_data_X)

In [34]:
# Pair each test passenger's id with the predicted survival label.
my_submission = pd.DataFrame(
    data={
        'PassengerId': test_data_X['PassengerId'],
        'Survived': prediction,
    }
)


In [37]:
# Write the submission with only the PassengerId and Survived columns.
# index=False stops pandas from writing the DataFrame's row index as an
# extra, unnamed first column.
# NOTE(review): the output file name has no ".csv" extension - confirm that
# is what the consumer of this file expects.
my_submission.to_csv("Titanic_Survival_Submission-LR", index=False)