# Module 8 - Logistic Regression

In [1]:
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [2]:
# Relative path to the Titanic passenger workbook
location = "datasets/titanic.xls"

# Read the spreadsheet into a DataFrame and preview its first rows
df = pd.read_excel(io=location)
df.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


#### Explore through visualized data

In [None]:
# Number of passengers in each 'survived' class (0 = did not survive, 1 = survived)
sns.countplot(data=df, x='survived')

In [None]:
# Survival rate by sex (bar height is the mean of 'survived' per group).
# x and y are passed by keyword: seaborn 0.12+ makes them keyword-only, so the
# positional form barplot('sex', 'survived', data=df) raises a TypeError there.
sns.barplot(x='sex', y='survived', data=df)

In [None]:
# Survival rate by passenger class (bar height is the mean of 'survived' per class).
# x and y are passed by keyword: seaborn 0.12+ makes them keyword-only, so the
# positional form barplot('pclass', 'survived', data=df) raises a TypeError there.
sns.barplot(x='pclass', y='survived', data=df)

In [None]:
#plot age density by survival status
plt.figure(figsize=(10,6))

# fill=True replaces the deprecated shade=True keyword of kdeplot.
# Rows are selected with a single .loc call instead of chained indexing.
ax = sns.kdeplot(df.loc[df['survived'] == 1, 'age'], #passengers that survived
                 color="darkturquoise",
                 fill=True,
                 label='Survived')

sns.kdeplot(df.loc[df['survived'] == 0, 'age'], #passengers that did not survive
            color="lightcoral",
            fill=True,
            label='Died',
            ax=ax) #draw on the same axes as the first curve

# Each curve carries its own label, so the legend does not depend on draw order
ax.legend()
ax.set_title("Density Plot of Age for Survived vs Deceased Population")

ax.set(xlabel='Age')

#### Handle missing values

In [None]:
# Per-column tally of missing entries
df.isna().sum()

Let's clean up 'age' and 'embarked', since both are used as model features below.

In [None]:
# Subset of passengers whose age was not recorded
missing_age = df[df['age'].isna()]
missing_age.head()

In [None]:
# Keep the index labels of the age-less rows so the imputed values can be inspected later
mals = missing_age.index.tolist()

In [None]:
# Mean passenger age for every (survived, sex, pclass) combination
df.groupby(by=['survived', 'sex', 'pclass'])['age'].mean()

In [None]:
# Fill missing ages with the mean age of the passenger's (survived, sex, pclass) group.
# The result is assigned back to the column rather than calling
# df['age'].fillna(..., inplace=True): that chained in-place call acts on an
# intermediate Series, emits a FutureWarning on recent pandas 2.x and leaves df
# unchanged once Copy-on-Write is enabled.
# NOTE(review): grouping on 'survived' uses the target to impute a feature, which
# leaks label information into the model inputs - consider grouping on
# ['sex', 'pclass'] only. Behaviour is left as in the original here.
group_mean_age = df.groupby(['survived', 'sex', 'pclass'])['age'].transform('mean')
df['age'] = df['age'].fillna(group_mean_age)

In [None]:
#verify filled missing values
# mals holds index *labels*, so select with .loc (label-based) rather than
# .iloc (position-based); the two only coincide while df has a default 0..n-1 index.
df.loc[mals].head()

In [None]:
# Re-check the per-column missing counts; 'age' should now report zero
df.isna().sum()

In [None]:
# Rows with no recorded embarkation port
embark = df[df['embarked'].isna()]
embark

In [None]:
# Keep the index labels of those rows so the fill can be checked afterwards
embarkls = embark.index.tolist()

In [None]:
# Frequency of each embarkation port (missing entries excluded); with only two
# gaps, the most common port is used as the fill value
df['embarked'].value_counts(dropna=True)

In [None]:
# Fill the missing embarkation ports with 'S'.
# The result is assigned back to the column rather than calling
# df['embarked'].fillna('S', inplace=True): that chained in-place call acts on an
# intermediate Series, emits a FutureWarning on recent pandas 2.x and leaves df
# unchanged once Copy-on-Write is enabled.
df['embarked'] = df['embarked'].fillna('S')

In [None]:
#check that they're filled
# embarkls holds index *labels*, so select with .loc (label-based) rather than
# .iloc (position-based); the two only coincide while df has a default 0..n-1 index.
df.loc[embarkls]

In [None]:
# Missing-value counts per column after both fills
df.isna().sum()

Get rid of columns that we don't want to use in the model

In [None]:
# Working copy of df without the columns that are not used as model inputs
modeldf = df.drop(
    columns=['name', 'ticket', 'fare', 'cabin', 'boat', 'body', 'home.dest']
)

In [None]:
#columns left in our dataframe after dropping the unused ones
modeldf.columns

Create dummy variables for categorical values

In [None]:
# One-hot encode passenger class and embarkation port.
# get_dummies replaces each listed column with its indicator columns.
categorical_cols = ['pclass', 'embarked']
modeldf = pd.get_dummies(modeldf, columns=categorical_cols)
modeldf.head()

In [None]:
# Encode sex numerically: female -> 0, male -> 1
sex_codes = {'female': 0, 'male': 1}
modeldf['sex'] = modeldf['sex'].map(sex_codes)
modeldf.head()

In [None]:
# Collapse 'sibsp' and 'parch' into a single family-size feature,
# then remove the two source columns
modeldf['family_num'] = modeldf['sibsp'].add(modeldf['parch'])
modeldf = modeldf.drop(columns=['sibsp', 'parch'])
modeldf.head()

In [None]:
# TravelAlone is 0 when family_num is above zero and 1 otherwise
has_family = modeldf['family_num'].gt(0)
modeldf['TravelAlone'] = np.where(has_family, 0, 1)
modeldf.head()

## Logistic Regression

#### Split data into train and test

In [None]:
# Target variable: the 'survived' column selected from modeldf
# (a plain column selection - no explicit copy is made)
y = modeldf.loc[:, 'survived']

In [None]:
# Feature matrix: every column of modeldf except the target
X = modeldf.drop(columns=['survived'])

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable
splits = train_test_split(X, y, test_size=0.2, random_state=15)
X_train, X_test, y_train, y_test = splits

In [None]:
# Fit a logistic regression classifier with default settings on the training split;
# fit() returns the estimator itself, which is then displayed
LogReg = LogisticRegression().fit(X_train, y_train)
LogReg

In [None]:
# Accuracy on the training split: share of training rows predicted correctly
accuracy_score(y_train, LogReg.predict(X_train))

In [None]:
# Predicted class labels for the held-out test rows
y_pred = LogReg.predict(X=X_test)

In [None]:
# Cross-tabulate actual vs. predicted labels on the test split:
# rows are the true classes, columns the predicted classes
predicted_labels = ['Predicted Not Survival', 'Predicted Survival']
actual_labels = ['True Not Survival', 'True Survival']

cm = pd.DataFrame(
    data=confusion_matrix(y_test, y_pred),
    index=actual_labels,
    columns=predicted_labels,
)

cm

In [None]:
# Accuracy on the held-out test split: share of test rows predicted correctly
accuracy_score(y_test, LogReg.predict(X_test))

In [None]:
# Per-class precision / recall / F1 on the test split.
# The original analysis noted, from the precision column, that the model is
# better at predicting passengers that do not survive.
report = classification_report(y_test, y_pred)
print(report)

Sources:

https://nbviewer.jupyter.org/github/BigDataGal/Data-Mania-Demos/blob/master/Logistic%20Regression%20Demo.ipynb

https://mashimo.wordpress.com/2018/03/31/logistic-regression-using-sklearn/

https://www.kaggle.com/mnassrib/titanic-logistic-regression-with-python/notebook

https://datascienceplus.com/would-you-survive-the-titanic-getting-started-in-python/

https://towardsdatascience.com/predicting-the-survival-of-titanic-passengers-30870ccc7e8