In [1]:
# Import Dependencies
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [2]:
# Load the Titanic passenger records from CSV and preview the first five rows
csv_path = "data/Titanic-Dataset.csv"
titanic_df = pd.read_csv(csv_path)
titanic_df.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Show the dtype pandas inferred for each column; the object-typed columns
# are non-numeric and cannot be passed to a model without encoding.
titanic_df.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [4]:
# Remove columns that are not used as model features
titanic_df = titanic_df.drop(columns=["PassengerId", "Name", "Ticket", "Cabin"])

# Count rows that exactly repeat an earlier row, report the count, then drop them.
# NOTE(review): the check runs after the identifying columns were removed, so
# distinct passengers who share every remaining value are counted as
# duplicates too -- confirm this is intended.
n_duplicates = titanic_df.duplicated().sum()
print(f"{n_duplicates} duplicated values dropped.")
# Rebind instead of inplace=True so the cell has no hidden mutation
titanic_df = titanic_df.drop_duplicates()

111 duplicated values dropped.


In [5]:
# Non-null entries per column. Any column whose count is below the largest
# count shown has missing values in the remaining rows.
# (Duplicates were already removed in the previous cell, so no second
# drop_duplicates() is needed here.)
titanic_df.count()

Survived    780
Pclass      780
Sex         780
Age         676
SibSp       780
Parch       780
Fare        780
Embarked    778
dtype: int64

In [6]:
# Some columns have missing entries. As a first pass, keep only the rows
# that are fully populated.
titanic_df = titanic_df.dropna(axis="index", how="any")
rows_remaining = titanic_df.shape[0]
print(f"There are {rows_remaining} rows left")

There are 674 rows left


## Machine Learning Model
#### Question
Can we create a machine learning model to predict which passengers will survive?

#### Features
Passenger Class (Pclass), Sex, Age, Number of Siblings/Spouses Aboard (SibSp), Number of Parents/Children Aboard (Parch), Fare, and Port of Embarkation (Embarked)

#### Target
Whether the passenger survived the sinking (Survived)

In [7]:
# Target: the Survived column
y = titanic_df["Survived"]

# Features: every remaining column except the target. get_dummies one-hot
# encodes the non-numeric columns so the whole matrix is numeric.
feature_columns = titanic_df.drop(columns="Survived")
X = pd.get_dummies(feature_columns)

In [8]:
# Hold out a random 25% of the rows for testing and train on the other 75%.
# 0.25 is scikit-learn's default test fraction, written out here for clarity;
# the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [9]:
# Build and fit a scikit-learn logistic regression classifier on the training split.
# max_iter is set to 500 rather than left at the library default.
# NOTE(review): per the scikit-learn docs, random_state is only used by the
# sag / saga / liblinear solvers -- confirm whether it is needed with lbfgs.
logreg_params = {"solver": "lbfgs", "max_iter": 500, "random_state": 42}
classifier = LogisticRegression(**logreg_params).fit(X_train, y_train)
classifier

LogisticRegression(max_iter=500, random_state=42)

In [10]:
# Predict the target for the held-out test rows
y_pred = classifier.predict(X_test)

# Put predictions next to the true labels on a fresh 0..n-1 index
# and preview the first ten rows.
results = pd.DataFrame({"Predictions": y_pred, "Actual": y_test.to_numpy()})
results.head(10)

Unnamed: 0,Predictions,Actual
0,0,1
1,1,1
2,0,0
3,0,0
4,1,1
5,0,0
6,0,0
7,1,1
8,0,0
9,0,0


In [11]:
# Fraction of test rows whose predicted label matches the actual label
test_accuracy = accuracy_score(y_test, y_pred)
print(test_accuracy)

0.8579881656804734
