In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as plt

# Import the data
train_db = pd.read_csv("data/titanic_train.csv")
test_db = pd.read_csv("data/titanic_test.csv")
gender_db = pd.read_csv("data/gender_submission.csv")

train_db = train_db.dropna()
test_db = test_db.dropna()
gender_db = gender_db.dropna()

In [2]:
# Check the data
train_db.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
10,11,1,3,"Sandstrom, Miss. Marguerite Rut",female,4.0,1,1,PP 9549,16.7,G6,S
11,12,1,1,"Bonnell, Miss. Elizabeth",female,58.0,0,0,113783,26.55,C103,S


In [3]:
test_db.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
12,904,1,"Snyder, Mrs. John Pillsbury (Nelle Stevenson)",female,23.0,1,0,21228,82.2667,B45,S
14,906,1,"Chaffee, Mrs. Herbert Fuller (Carrie Constance...",female,47.0,1,0,W.E.P. 5734,61.175,E31,S
24,916,1,"Ryerson, Mrs. Arthur Larned (Emily Maria Borie)",female,48.0,1,3,PC 17608,262.375,B57 B59 B63 B66,C
26,918,1,"Ostby, Miss. Helene Ragnhild",female,22.0,0,1,113509,61.9792,B36,C
28,920,1,"Brady, Mr. John Bertram",male,41.0,0,0,113054,30.5,A21,S


In [4]:
gender_db.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [None]:
# Add some histograms here

In [5]:
train_db.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [5]:
# Convert male and female to integers
train_db.Sex.replace(['male', 'female'], [0,1], inplace=True)
test_db.Sex.replace(['male', 'female'], [0,1], inplace=True)

In [6]:
train_db.Sex.head(), test_db.Sex.head()

(1     1
 3     1
 6     0
 10    1
 11    1
 Name: Sex, dtype: int64,
 12    1
 14    1
 24    1
 26    1
 28    0
 Name: Sex, dtype: int64)

In [7]:
train_db.corr()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
PassengerId,1.0,0.148495,-0.089136,0.025205,0.030933,-0.083488,-0.051454,0.02974
Survived,0.148495,1.0,-0.034542,0.532418,-0.254085,0.106346,0.023582,0.134241
Pclass,-0.089136,-0.034542,1.0,0.046181,-0.306514,-0.103592,0.047496,-0.315235
Sex,0.025205,0.532418,0.046181,1.0,-0.184969,0.104291,0.089581,0.130433
Age,0.030933,-0.254085,-0.306514,-0.184969,1.0,-0.156162,-0.271271,-0.092424
SibSp,-0.083488,0.106346,-0.103592,0.104291,-0.156162,1.0,0.255346,0.286433
Parch,-0.051454,0.023582,0.047496,0.089581,-0.271271,0.255346,1.0,0.38974
Fare,0.02974,0.134241,-0.315235,0.130433,-0.092424,0.286433,0.38974,1.0


In [8]:
# Set features
selected_features = train_db[['Sex', 'Age']]
test_features = test_db[['Sex', 'Age']]

In [9]:
# Select the y-value from the training data and reshape and flatten
y = train_db.Survived

y = y.values.reshape(-1, 1)
y = y.flatten()

In [10]:
# Scale the data
from sklearn.preprocessing import MinMaxScaler

X_minmax = MinMaxScaler().fit(selected_features)


X_train_minmax = X_minmax.transform(selected_features)
X_test_minmax = X_minmax.transform(test_features)

In [11]:
from sklearn.linear_model import LogisticRegression
model1 = LogisticRegression(max_iter=6000)
model1
model1.fit(selected_features, y)

print(f"Training Data Score: {model1.score(selected_features, y)}")
# print(f"Testing Data Score: {model1.score(test_features, y)}")

Training Data Score: 0.7704918032786885


In [12]:
# Test the data

predictions = model1.predict(X_test_minmax)
print(f"First 10 Predictions:   {predictions[:10]}")

pd.DataFrame({"Prediction": predictions}).reset_index(drop=True)

First 10 Predictions:   [1 1 1 1 1 1 1 1 1 1]


Unnamed: 0,Prediction
0,1
1,1
2,1
3,1
4,1
...,...
82,1
83,1
84,1
85,1
