# Kaggle Titanic Challenge
#In this challenge, we ask you to complete the analysis of what sorts of people were likely to survive. In particular, we ask you to apply the tools of machine learning to predict which passengers survived the tragedy.
This will include:
- exploring the data
- feature engineering
- feature selection
- modeling
- testing

In [5]:
import pandas as pd

# Read the three competition CSVs in one pass (same files, same order as before):
# test.csv, train.csv, and gender_submission.csv.
test, train, gender = (
    pd.read_csv(csv_name)
    for csv_name in ('test.csv', 'train.csv', 'gender_submission.csv')
)

# Preview the first rows of the test set.
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [6]:
# Preview the first five rows of the training set (it includes the Survived column).
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [7]:
# Preview gender_submission.csv: it holds only PassengerId and Survived columns.
gender.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


Data dictionary
- Survived: 0 = no, 1 = yes
- Pclass: ticket class (1 = 1st, 2 = 2nd, 3 = 3rd)
- SibSp: number of siblings or spouses aboard the Titanic
- Parch: number of parents or children aboard the Titanic
- Ticket: ticket number
- Cabin: cabin number
- Embarked: port where the passenger embarked (C = Cherbourg, Q = Queenstown, S = Southampton)

In [8]:
train.shape
# (rows, columns): 891 passengers and 12 columns, counting PassengerId and the Survived label

(891, 12)

In [9]:
test.shape
# 418 passengers and 11 columns: one fewer than train because there is no Survived column

(418, 11)

In [11]:
train.info()
# Lists each column's dtype and non-null count, which shows where values are missing
# (Age, Cabin, and Embarked here) so we can decide whether to drop or fill the NaN values.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [12]:
# Same dtype / non-null summary for the test set (Age, Fare, and Cabin have gaps).
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [13]:
train.isnull().sum()
# Number of missing values in each column: isnull() flags missing cells, sum() counts them per column.

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [14]:
# Number of missing values in each column of the test set.
test.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()  # setting seaborn default for plots

#barchart for categorical features
- Pclass
- Sex
- SibSp(num of siblings and spouses)
- Parch(num of parents and children)
- Embarked
- Cabin

In [None]:
def bar_chart(feature):
    """Draw a stacked bar chart comparing survivors and non-survivors for one feature.

    The chart has two bars, 'Survived' and 'Dead'. Each bar is stacked by the
    distinct values of ``feature``, so the segment heights are the number of
    passengers with that value in each group.

    Args:
        feature: Name of a column in the module-level ``train`` DataFrame
            (for example 'Pclass' or 'Embarked').

    Returns:
        None. The plot is drawn as a side effect.
    """
    # Count how often each value of the feature occurs among survivors ...
    survived = train[train['Survived'] == 1][feature].value_counts()
    # ... and among passengers who did not survive.
    dead = train[train['Survived'] == 0][feature].value_counts()
    # One row per group, one column per feature value.
    # Fix: the constructor is pd.DataFrame; the original pd.DatFrame raised AttributeError.
    df = pd.DataFrame([survived, dead])
    df.index = ['Survived', 'Dead']  # label the two rows (bars)
    df.plot(kind='bar', stacked=True, figsize=(10, 5))
    

In [None]:
# Survived vs. dead counts, stacked by ticket class.
bar_chart('Pclass')

In [None]:
# Survived vs. dead counts, stacked by number of siblings/spouses aboard.
bar_chart('SibSp')

In [None]:
# Survived vs. dead counts, stacked by number of parents/children aboard.
bar_chart('Parch')

In [None]:
# Survived vs. dead counts, stacked by port of embarkation.
bar_chart('Embarked')

In [None]:
# Keep train and test together in one list (the frames are NOT merged) so the
# same feature engineering can be applied to both in a single loop.
train_test_data = [train, test]

# Add a 'Title' column with the honorific taken from each Name (e.g. "Mr", "Mrs"):
# the regex captures the run of letters that follows a space and ends with '.'.
# Fixes: the loop variable was misspelled 'datset', so the body's 'dataset' was
# undefined (NameError); the pattern is now a raw string so '\.' is not treated
# as an invalid string escape.
for dataset in train_test_data:
    dataset['Title'] = dataset['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

In [None]:
# Frequency of each extracted title in the training set's new 'Title' column
train['Title'].value_counts()

In [None]:
# Frequency of each extracted title in the test set's 'Title' column
test['Title'].value_counts()

#Here we'll be doing some title mapping. We'll be assigning a number value to each title.
- Mr: 0
- Miss: 1
- Mrs: 2
- Others (Master, Dr, Rev, Col, etc.): 3

In [None]:
# Numeric codes for the extracted titles: Mr = 0, Miss = 1, Mrs = 2, and every
# other listed title (Master, Dr, Rev, ...) shares the "Others" code 3.
# Fixes: "Mile" was a typo for the French title "Mlle", and "Major" was missing,
# so passengers carrying those titles were left as NaN by the mapping.
title_mapping = {
    "Mr": 0, "Miss": 1, "Mrs": 2,
    "Master": 3, "Dr": 3, "Rev": 3, "Col": 3, "Major": 3, "Mlle": 3,
    "Countess": 3, "Ms": 3, "Lady": 3, "Jonkheer": 3, "Don": 3, "Dona": 3,
    "Mme": 3, "Capt": 3, "Sir": 3,
}
# Replace the title strings with their codes in both frames.
# Series.map turns any title that is not a key of the dict into NaN.
for dataset in train_test_data:
    dataset['Title'] = dataset['Title'].map(title_mapping)