### Kaggle Titanic Logistic Regression Practice
##### Kaggle offers some practice datasets for people to try to predict whether a passenger will survive on the Titanic. Our submission correctly predicted whether a passenger would surive 77% of the time.
##### Link to problem on Kaggle: https://www.kaggle.com/c/titanic
##### Leaned on a tutorial from a YouTube video: https://www.youtube.com/watch?v=pUSi5xexT4Q&t=917s

##### Import LIbraries 

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

##### Read in csv files

In [2]:
test = pd.read_csv('test.csv')
train = pd.read_csv('train.csv')
gender = pd.read_csv('gender_submission.csv')

##### Data Preview

In [3]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


##### FInd N/A values. Will need to impute

In [5]:
print('dataset size: '+ str(len(train)))

train.isna().sum()

dataset size: 891


PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

### There are three columns that have null values: Age, Cabin, and Embarked.
##### Let's impute values into each of these colums - starting with Age

In [6]:
def age_impute(data):
    
    # impute missing Age data with the column's median:
    data['Age'].fillna(data['Age'].median(), inplace = True)

##### Next, we'll impute the Embarked field with a dummy value. Embarked is the location from which a passenger originally embarked from. Let's fill in NULLs with a 'U' for Unknown

In [7]:
def embarked_inpute(data):
    
    data['Embarked'].fillna('U', inplace = True)

##### 687 of the ~800ish records for the cabin column are NULL. I think we're better off dropping that column than trying to impute.
##### We'll also drop some other columns like Ticket # and Passenger Name

In [8]:
train.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [9]:
columns = ['Survived', 'Pclass', 'Sex', 'Age', 'SibSp',
       'Parch', 'Fare', 'Embarked']

In [10]:
train = train[columns]

#### In this version, we're going to create dummy variables rather than imputing with numbers

In [26]:
embarked_inpute(train)
age_impute(train)

KeyError: 'Embarked'

In [27]:
# This function will convert categorical values to flags!
train = pd.get_dummies(train)

##### Looking really good! Categorical fields are removed, and NULL values are imputed

In [28]:
train.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Embarked_U
0,0,3,22.0,1,0,7.25,0,1,0,0,1,0
1,1,1,38.0,1,0,71.2833,1,0,1,0,0,0
2,1,3,26.0,0,0,7.925,1,0,0,0,1,0
3,1,1,35.0,1,0,53.1,1,0,0,0,1,0
4,0,3,35.0,0,0,8.05,0,1,0,0,1,0


##### Begin training the model!

In [14]:
# Read in libraries

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [48]:
from sklearn.linear_model import LogisticRegression

T = 'intervention'
y = train['Survived']
X = train.drop('Survived', axis = 1)

# Create logistic regression model we will then apply to the test dataset
ps_model = LogisticRegression(random_state = 0, max_iter = 1000).fit(X, y)

In [49]:
data_ps

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Embarked_U,propensity_score
0,3,34.5,0,0,7.8292,0,1,0,1,0,0,0.105429
1,3,47.0,1,0,7.0000,1,0,0,0,1,0,0.350586
2,2,62.0,0,0,9.6875,0,1,0,1,0,0,0.107009
3,3,27.0,0,0,8.6625,0,1,0,0,1,0,0.100637
4,3,22.0,1,1,12.2875,1,0,0,0,1,0,0.567691
...,...,...,...,...,...,...,...,...,...,...,...,...
413,3,27.0,0,0,8.0500,0,1,0,0,1,0,0.100519
414,1,39.0,0,0,108.9000,1,0,1,0,0,0,0.941352
415,3,38.5,0,0,7.2500,0,1,0,0,1,0,0.066842
416,3,27.0,0,0,8.0500,0,1,0,0,1,0,0.100519


In [17]:
predictions = log_reg_fit.predict(X)

#### Clean test df:

In [50]:
test.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Embarked_U
0,3,34.5,0,0,7.8292,0,1,0,1,0,0
1,3,47.0,1,0,7.0,1,0,0,0,1,0
2,2,62.0,0,0,9.6875,0,1,0,1,0,0
3,3,27.0,0,0,8.6625,0,1,0,0,1,0
4,3,22.0,1,1,12.2875,1,0,0,0,1,0


In [19]:
test = test[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare','Embarked']]

In [20]:
test.isna().sum()

Pclass       0
Sex          0
Age         86
SibSp        0
Parch        0
Fare         1
Embarked     0
dtype: int64

In [21]:
#Impute missing Values for test df
test['Fare'].fillna(test['Fare'].median(), inplace = True);

# Run the same functions we used earlier:
age_impute(test)
embarked_inpute(test)
#column_mapping(test)

In [22]:
test = pd.get_dummies(test)

In [23]:
test.isna().sum()

Pclass        0
Age           0
SibSp         0
Parch         0
Fare          0
Sex_female    0
Sex_male      0
Embarked_C    0
Embarked_Q    0
Embarked_S    0
dtype: int64

In [24]:
# Since there are no NULL Embarked columns in the test dataset, 
# we'll have to code in a Embarked_U column of 0s to match the train df
test['Embarked_U'] = 0

##### Test DF is looking good

In [25]:
test.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Embarked_U
0,3,34.5,0,0,7.8292,0,1,0,1,0,0
1,3,47.0,1,0,7.0,1,0,0,0,1,0
2,2,62.0,0,0,9.6875,0,1,0,1,0,0
3,3,27.0,0,0,8.6625,0,1,0,0,1,0
4,3,22.0,1,1,12.2875,1,0,0,0,1,0


##### Apply model fit to our test df to see whether we think each person will survive:

In [53]:
# Apply model to the test dataset
test_survival_prop = test.assign(survival_likelihood=ps_model.predict_proba(test)[:, 1])

In [54]:
test_survival_prop

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Embarked_U,survival_likelihood
0,3,34.5,0,0,7.8292,0,1,0,1,0,0,0.105429
1,3,47.0,1,0,7.0000,1,0,0,0,1,0,0.350586
2,2,62.0,0,0,9.6875,0,1,0,1,0,0,0.107009
3,3,27.0,0,0,8.6625,0,1,0,0,1,0,0.100637
4,3,22.0,1,1,12.2875,1,0,0,0,1,0,0.567691
...,...,...,...,...,...,...,...,...,...,...,...,...
413,3,27.0,0,0,8.0500,0,1,0,0,1,0,0.100519
414,1,39.0,0,0,108.9000,1,0,1,0,0,0,0.941352
415,3,38.5,0,0,7.2500,0,1,0,0,1,0,0.066842
416,3,27.0,0,0,8.0500,0,1,0,0,1,0,0.100519


##### Explore data visually to check outcomes

In [56]:
import altair as alt

In [57]:
test_survival_prop.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Embarked_U,survival_likelihood
0,3,34.5,0,0,7.8292,0,1,0,1,0,0,0.105429
1,3,47.0,1,0,7.0,1,0,0,0,1,0,0.350586
2,2,62.0,0,0,9.6875,0,1,0,1,0,0,0.107009
3,3,27.0,0,0,8.6625,0,1,0,0,1,0,0.100637
4,3,22.0,1,1,12.2875,1,0,0,0,1,0,0.567691


In [64]:
alt.Chart(test_survival_prop).transform_fold(
    ['1', '2', '3'], as_ = ['class', 'survival_pct']
).mark_area(
    opacity = 0.4,
    interpolate = 'step'
).encode(
    alt.X('survival_likelihood:Q', bin = alt.Bin(maxbins=7), title = 'Predicted Survival Rate'),
    alt.Y('count()', stack = None, scale = alt.Scale(domain=[0,200]), title = 'Passengers'),
    alt.Color('class:N')
).properties(title = "Harbaugh vs the Greats: Season Win Total", width = 375)

In [None]:
alt.Chart(pivot).transform_fold(
    #['Bo Schembechler', 'Brady Hoke', 'Gary Moeller', 'Jim Harbaugh', 'Lloyd Carr'],
    ['Bo Schembechler', 'Jim Harbaugh', 'Lloyd Carr'],
    as_=['Coach', 'wins_normalized']
).mark_area(
    opacity=0.35,
    interpolate='step'
).encode(
    alt.X('wins_normalized:Q',bin=alt.Bin(maxbins=7), title = 'Season Win Total'),
    alt.Y('count()', stack=None, scale=alt.Scale(domain=[0, 8]), title = 'Number of Season'),
    alt.Color('Coach:N')
).properties(title = "Harbaugh vs the Greats: Season Win Total", width = 375)

##### Create a submission csv with two columns: PassengerID and whether we think that person will survive:

In [26]:
test_2 = pd.read_csv('test.csv')

In [27]:
PassengerId = test_2['PassengerId']

In [28]:
df = pd.concat([PassengerId, submission_preds], axis = 1)

In [29]:
df.rename(columns = {0: 'Survived'}, inplace = True)

In [30]:
df.to_csv('Titanic_Predictions_2.csv', index = False)

##### When uploaded to kaggle, this df correctly predicted whether the person would die 76.794% of the time. As of 5/5/2022, this puts me in 9,432nd place out of around 15,000 participants 