In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


In [17]:
# Load the passenger data
passengers = pd.read_csv("train.csv")


In [18]:
# Update sex column to numerical
passengers['Sex'] = passengers['Sex'].map(lambda x: 1 if x == 'female' else 0 )
# passengers['Sex']=passengers['Sex'].map({'female':1,'male':0})

# Fill the nan values in the age column
passengers['Age'].fillna(value=passengers['Age'].mean()  ,inplace=True)

# Create a first class column
passengers['FirstClass'] = passengers['Pclass'].apply(lambda x : 1 if x == 1 else 0)

# Create a second class column
passengers['SecondClass'] = passengers['Pclass'].apply(lambda x : 1 if x == 2 else 0)
print(passengers.head(10))

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   
5            6         0       3   
6            7         0       1   
7            8         0       3   
8            9         1       3   
9           10         1       2   

                                                Name  Sex        Age  SibSp  \
0                            Braund, Mr. Owen Harris    0  22.000000      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...    1  38.000000      1   
2                             Heikkinen, Miss. Laina    1  26.000000      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)    1  35.000000      1   
4                           Allen, Mr. William Henry    0  35.000000      0   
5                                   Moran, Mr. James    0  29.699118      0   
6                            McCarthy, Mr. Timothy

In [19]:
# Select the desired features
features = passengers[['Sex','Age','FirstClass','SecondClass']]
survival = passengers['Survived']
# Perform train, test, split
features_train,features_test,survival_train,survival_test = train_test_split(features,survival,test_size = 0.3)

In [20]:
# Scale the feature data so it has mean = 0 and standard deviation = 1
scaler = StandardScaler()
features_train = scaler.fit_transform(features_train)
features_test = scaler.transform(features_test)

In [21]:
# Create and train the model
model = LogisticRegression()
model.fit(features_train,survival_train)

LogisticRegression()

In [22]:
# Score the model on the train data
print(model.score(features_train,survival_train))

# Score the model on the test data
print(model.score(features_test,survival_test))

0.7624398073836276
0.8283582089552238


In [23]:
# Analyze the coefficients
print(model.coef_)
print(list(zip(['Sex','Age','FirstClass','SecondClass'],model.coef_[0])))

[[ 1.14375935 -0.31121426  0.93804488  0.42906038]]
[('Sex', 1.14375935222717), ('Age', -0.31121426105395344), ('FirstClass', 0.9380448785771832), ('SecondClass', 0.4290603787743715)]


In [24]:
# Sample passenger features
Jack = np.array([0.0,20.0,0.0,0.0])
Rose = np.array([1.0,17.0,1.0,0.0])
You = np.array([0.0,30.0,1.0,0.0])

# Combine passenger arrays
sample_passengers = np.array([Jack , Rose, You])

# Scale the sample passenger features
print(model.predict(sample_passengers))
print(model.predict_proba(sample_passengers))

[0 0 0]
[[9.98902563e-01 1.09743721e-03]
 [9.78080870e-01 2.19191302e-02]
 [9.99875088e-01 1.24911854e-04]]
