In [1]:
# Import Dependencies
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [2]:
# Read the Titanic dataset CSV (path is relative to the working directory) into a pandas DataFrame
titanic_df = pd.read_csv("Data/Titanic-Dataset1.csv")
# Preview the first rows to sanity-check the columns that were loaded
titanic_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Cabin,Embarked,Fare
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,,S,7.25
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,C85,C,71.2833
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,,S,7.925
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,C123,S,53.1
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,,S,8.05


### Data Preprocessing

In [3]:
# Display the DataFrame dimensions as (rows, columns)
titanic_df.shape

(891, 12)

In [4]:
# Drop the PassengerId and Name columns.
# Rebinding (instead of inplace=True) together with errors="ignore" keeps this
# cell idempotent: re-running it after the columns are already gone is a no-op
# rather than a KeyError.
titanic_df = titanic_df.drop(columns=["PassengerId", "Name"], errors="ignore")

In [5]:
# Check each column's data type; object-dtype columns hold non-numeric values
titanic_df.dtypes

Survived      int64
Pclass        int64
Sex          object
Age         float64
SibSp         int64
Parch         int64
Ticket       object
Cabin        object
Embarked     object
Fare        float64
dtype: object

In [6]:
# Count the distinct values present in each column
titanic_df.nunique()

Survived      2
Pclass        3
Sex           2
Age          88
SibSp         7
Parch         7
Ticket      681
Cabin       147
Embarked      3
Fare        248
dtype: int64

In [7]:
# Show how many rows share each Ticket value (most frequent tickets first)
titanic_df["Ticket"].value_counts()

347082      7
CA. 2343    7
1601        7
3101295     6
CA 2144     6
           ..
9234        1
19988       1
2693        1
PC 17612    1
370376      1
Name: Ticket, Length: 681, dtype: int64

In [8]:
# Count the missing (NaN) values in each column
titanic_df.isna().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Ticket        0
Cabin       687
Embarked      2
Fare          0
dtype: int64

In [9]:
# Drop the Ticket and Cabin columns.
# Rebinding (instead of inplace=True) together with errors="ignore" keeps this
# cell idempotent: re-running it after the columns are already gone is a no-op
# rather than a KeyError.
titanic_df = titanic_df.drop(columns=["Ticket", "Cabin"], errors="ignore")

In [10]:
# Remove every row containing a missing value, then remove exact duplicate rows
# NOTE(review): rows are compared on the columns still present at this point, so
# rows that differ only in already-dropped columns count as duplicates — confirm intended.
titanic_df = titanic_df.dropna().drop_duplicates()

# Report the size of the cleaned DataFrame
print(f"There are {titanic_df.shape[0]} rows and {titanic_df.shape[1]} columns.")

There are 674 rows and 8 columns.


In [12]:
# One-hot encode the non-numeric (object-dtype) columns and swap them in for the originals.

# Reset the index to 0..n-1 so it lines up row-for-row with the encoded frame,
# which gets a fresh default index, in the index-based merge below.
titanic_df = titanic_df.reset_index(drop=True)

# Names of the categorical (object-dtype) columns
titanic_cat = titanic_df.dtypes[titanic_df.dtypes == "object"].index.tolist()

# Create the encoder with its default sparse output and densify explicitly.
# The `sparse=False` keyword was deprecated in scikit-learn 1.2 (renamed to
# `sparse_output`) and removed in 1.4; `.toarray()` avoids depending on either name.
enc = OneHotEncoder()
encoded_values = enc.fit_transform(titanic_df[titanic_cat]).toarray()

# Wrap the encoded array in a DataFrame labelled with the generated feature names
encode_df = pd.DataFrame(encoded_values, columns=enc.get_feature_names_out(titanic_cat))

# Merge encoded features onto the original rows by index and drop the original categorical columns
merged_titanic_df = titanic_df.merge(encode_df, left_index=True, right_index=True).drop(labels=titanic_cat, axis=1)
merged_titanic_df.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,0,3,22.0,1,0,7.25,0.0,1.0,0.0,0.0,1.0
1,1,1,38.0,1,0,71.2833,1.0,0.0,1.0,0.0,0.0
2,1,3,26.0,0,0,7.925,1.0,0.0,0.0,0.0,1.0
3,1,1,35.0,1,0,53.1,1.0,0.0,0.0,0.0,1.0
4,0,3,35.0,0,0,8.05,0.0,1.0,0.0,0.0,1.0


In [13]:
# Confirm merge worked as intended
merged_titanic_df.shape

(674, 11)

In [14]:
# Separate the target column ("Survived") from the feature columns
y = merged_titanic_df["Survived"]
X = merged_titanic_df.drop(columns=["Survived"])

# Partition into training and testing sets; the fixed seed makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [15]:
# Standardize the features using statistics learned from the training split only
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the fitted scaler to both the training and testing features
X_train_scaled, X_test_scaled = (X_scaler.transform(split) for split in (X_train, X_test))

### Machine Learning Model

In [16]:
# Build a scikit-learn logistic regression classifier (L-BFGS solver, fixed seed)
classifier = LogisticRegression(random_state=42, max_iter=500, solver="lbfgs")

# Train on the training split.
# NOTE(review): this fits on the unscaled X_train, not X_train_scaled — confirm intended;
# whichever representation is used here must also be used when predicting.
classifier.fit(X_train, y_train)

LogisticRegression(max_iter=500, random_state=42)

In [17]:
# Predict survival for the held-out test features
y_pred = classifier.predict(X_test)

# Put predictions next to the true labels; passing plain arrays gives the frame
# a fresh 0..n-1 index directly
results = pd.DataFrame({"Predictions": y_pred, "Actual": y_test.to_numpy()})
results.head(10)

Unnamed: 0,Predictions,Actual
0,0,1
1,1,1
2,0,0
3,0,0
4,1,1
5,0,0
6,0,0
7,1,1
8,0,0
9,0,0


In [18]:
# Display the first model's accuracy on the test split as a percentage with two decimals
print(f"Accuracy Score: {accuracy_score(y_test, y_pred)*100:.2f}%")

Accuracy Score: 85.80%
