In [1]:
# Import Dependencies
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [2]:
# Read in Titanic Dataset to Pandas DataFrame
titanic_df = pd.read_csv("Data/Titanic-Dataset2.csv")
titanic_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


### Data Preprocessing

In [3]:
titanic_df["Cabin"].value_counts()

C23 C25 C27        6
G6                 5
B57 B59 B63 B66    5
C22 C26            4
F33                4
                  ..
A14                1
E63                1
E12                1
E38                1
C105               1
Name: Cabin, Length: 186, dtype: int64

In [4]:
# Display size of dataframe
titanic_df.shape

(1309, 12)

In [5]:
# Drop Passenger ID and Name Columns
titanic_df.drop(["PassengerId", "Name"], axis=1, inplace=True)

In [6]:
#Check datatypes
titanic_df.dtypes

Survived      int64
Pclass        int64
Sex          object
Age         float64
SibSp         int64
Parch         int64
Ticket       object
Fare        float64
Cabin        object
Embarked     object
dtype: object

In [7]:
# Check unique values
titanic_df.nunique()

Survived      2
Pclass        3
Sex           2
Age          98
SibSp         7
Parch         8
Ticket      929
Fare        281
Cabin       186
Embarked      3
dtype: int64

In [8]:
# Number of unique tickets
titanic_df["Ticket"].value_counts()

CA. 2343        11
CA 2144          8
1601             8
PC 17608         7
S.O.C. 14879     7
                ..
113792           1
36209            1
323592           1
315089           1
359309           1
Name: Ticket, Length: 929, dtype: int64

In [9]:
# Drop ticket column
titanic_df.drop(["Ticket"], axis=1, inplace=True)

In [10]:
# Check null values
titanic_df.isna().sum()

Survived       0
Pclass         0
Sex            0
Age          263
SibSp          0
Parch          0
Fare           1
Cabin       1014
Embarked       2
dtype: int64

In [11]:
# Fill null values for Embarked, Fare, and Cabin
titanic_df["Embarked"].fillna("S", inplace=True)
titanic_df["Fare"].fillna(0, inplace=True)
titanic_df["Cabin"].fillna("U", inplace=True)
titanic_df.interpolate(method="linear", inplace=True)
titanic_df.head(20)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,male,22.0,1,0,7.25,U,S
1,1,1,female,38.0,1,0,71.2833,C85,C
2,1,3,female,26.0,0,0,7.925,U,S
3,1,1,female,35.0,1,0,53.1,C123,S
4,0,3,male,35.0,0,0,8.05,U,S
5,0,3,male,44.5,0,0,8.4583,U,Q
6,0,1,male,54.0,0,0,51.8625,E46,S
7,0,3,male,2.0,3,1,21.075,U,S
8,1,3,female,27.0,0,2,11.1333,U,S
9,1,2,female,14.0,1,0,30.0708,U,C


In [12]:
# Confirm no null values
titanic_df.isna().sum()

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Cabin       0
Embarked    0
dtype: int64

In [13]:
titanic_df.nunique()

Survived      2
Pclass        3
Sex           2
Age         185
SibSp         7
Parch         8
Fare        281
Cabin       187
Embarked      3
dtype: int64

In [14]:
# Create two functions to engineer new rows from Cabin

# Pass cabin information and return the deck
def get_deck(cabin):
    # Return First Character of Cabin 
    return cabin[0]

# Pass cabin information and return which side of boat cabin is located
def get_location(cabin):
    # Create string list of integers 0 through 9
    nums = str(list(range(0,10)))
    # If last character of Cabin is not a number, return neither
    if cabin[-1] not in nums:
        return "Neither"
    else:
        # Convert string number into integer to perform math on
        end_num = int(cabin[-1])
        # Even number ending cabins are Port(Left) side and odd are Starboard(Right) side
        if end_num%2 == 0:
            return "Port"
        else:
            return "Starboard"

In [15]:
# Engineer two new columns from Cabin. Deck and Cabin_Location.
titanic_df["Deck"] = titanic_df["Cabin"].apply(get_deck)
titanic_df["Cabin_Location"] = titanic_df["Cabin"].apply(get_location)
titanic_df.head(20)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Deck,Cabin_Location
0,0,3,male,22.0,1,0,7.25,U,S,U,Neither
1,1,1,female,38.0,1,0,71.2833,C85,C,C,Starboard
2,1,3,female,26.0,0,0,7.925,U,S,U,Neither
3,1,1,female,35.0,1,0,53.1,C123,S,C,Starboard
4,0,3,male,35.0,0,0,8.05,U,S,U,Neither
5,0,3,male,44.5,0,0,8.4583,U,Q,U,Neither
6,0,1,male,54.0,0,0,51.8625,E46,S,E,Port
7,0,3,male,2.0,3,1,21.075,U,S,U,Neither
8,1,3,female,27.0,0,2,11.1333,U,S,U,Neither
9,1,2,female,14.0,1,0,30.0708,U,C,U,Neither


In [16]:
# Drop cabin as Deck and Cabin_Location give same information
titanic_df.drop(["Cabin"], axis=1, inplace=True)
titanic_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Deck,Cabin_Location
0,0,3,male,22.0,1,0,7.25,S,U,Neither
1,1,1,female,38.0,1,0,71.2833,C,C,Starboard
2,1,3,female,26.0,0,0,7.925,S,U,Neither
3,1,1,female,35.0,1,0,53.1,S,C,Starboard
4,0,3,male,35.0,0,0,8.05,S,U,Neither


In [17]:
# Check number of unique values
titanic_df.nunique()

Survived            2
Pclass              3
Sex                 2
Age               185
SibSp               7
Parch               8
Fare              281
Embarked            3
Deck                9
Cabin_Location      3
dtype: int64

In [18]:
# Check number of duplicate values
titanic_df.duplicated().sum()

87

In [19]:
# Use OneHotEncoder to transform non-numeric columns into numeric
# Reset titanic dataframe index to match with encode merge
titanic_df.reset_index(drop=True, inplace=True)

# Generate categorical variable list
titanic_cat = titanic_df.dtypes[titanic_df.dtypes == "object"].index.tolist()

# Create OneHotEncoder Instance
enc = OneHotEncoder(sparse=False)
# Fit and transform OneHotEncoder using categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(titanic_df[titanic_cat]))
# Add the encoded variable names to dataframe
encode_df.columns = enc.get_feature_names_out(titanic_cat)

# Merge encoded features and drop originals
merged_titanic_df = titanic_df.merge(encode_df, left_index=True, right_index=True).drop(labels=titanic_cat, axis=1)
merged_titanic_df.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,...,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T,Deck_U,Cabin_Location_Neither,Cabin_Location_Port,Cabin_Location_Starboard
0,0,3,22.0,1,0,7.25,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
1,1,1,38.0,1,0,71.2833,1.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,1,3,26.0,0,0,7.925,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
3,1,1,35.0,1,0,53.1,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0,3,35.0,0,0,8.05,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0


In [20]:
# Confirm merge worked as intended
merged_titanic_df.shape

(1309, 23)

In [21]:
# Split data into features and target
X = merged_titanic_df.drop("Survived", axis=1)
y = merged_titanic_df["Survived"]

# Split preprocessed data into training and testing datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [22]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

### Machine Learning Model

In [24]:
# Create Logistic Regression Classifier From Sci Kit Learn
classifier = LogisticRegression(solver="lbfgs",
                                max_iter=750,
                                random_state=42)
classifier.fit(X_train, y_train)

LogisticRegression(max_iter=750, random_state=42)

In [25]:
# Create Predictions Using the Logistic Regression Classifier
y_pred = classifier.predict(X_test)
results = pd.DataFrame({"Predictions": y_pred, "Actual": y_test}).reset_index(drop=True)
results.head(10)

Unnamed: 0,Predictions,Actual
0,0,0
1,0,0
2,0,0
3,0,0
4,1,1
5,1,0
6,1,1
7,1,1
8,1,1
9,0,1


In [26]:
# Display Accuracy Score of First Model
print(f"Accuracy Score: {accuracy_score(y_test, y_pred)*100:.2f}%")

Accuracy Score: 84.76%
