# Titanic (Winsor)
This is an intro to the Kaggle "Titanic" dataset. Content here is taken in large part from
https://www.kaggle.com/startupsci/titanic-data-science-solutions/data
https://www.kaggle.com/c/titanic/data

In [97]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

In [103]:
# get the data: read the Kaggle Titanic training and test CSVs from the
# local data/titanic/ directory (paths are relative to the notebook's working directory)
train_df = pd.read_csv('data/titanic/train.csv')
test_df = pd.read_csv('data/titanic/test.csv')
# keep both frames in one list - presumably so later cells can apply the same
# transformation to train and test in a single loop (TODO confirm against later cells)
combine = [train_df, test_df]
# NOTE(review): `foo` is not referenced by any other cell visible here - looks like
# leftover scratch; confirm nothing later depends on it before removing
foo = np.array([[2,3,4],[4,5,6]])

# Get familiar with the dataset

In [113]:
# show the first few instances (up to the first 20 rows of the training set)
train_df.head(20)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [39]:
# show the attribute types: per-column dtype and non-null count,
# which also reveals which columns have missing values
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [110]:
# describe the numeric attributes: count, mean, std, min, quartiles and max per column
train_df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [114]:
# describe the categorical attributes: include=['O'] restricts the summary to
# object-dtype (string) columns, reporting count, unique, top (most frequent value) and freq
train_df.describe(include=['O'])

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
count,891,891,891,204,889
unique,891,2,681,147,3
top,"Andrews, Mr. Thomas Jr",male,CA. 2343,G6,S
freq,1,577,7,4,644


In [115]:
# check for nulls - number of missing values per column; this should not be a
# surprise, since it is the complement of the non-null counts reported by info()
train_df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

# Start collecting information about the data
### We will update this as we go

Survived
    This is the class
    int64 - but limited to "0" (died) and "1" (survived)
    
Passenger ID
    This is an index - probably redundant to that created by pandas
    int64
    probably not much use BUT could adjacent passenger IDs mean family/related/traveling together?

PClass
    the passenger class
    type = int64
    probably important because it indicates what level of the ship they were on
    from the website:
        pclass: A proxy for socio-economic status (SES)
        1st = Upper
        2nd = Middle
        3rd = Lower

Name
    this is the passenger's name
    type = "object" which is a text string - categorical with no immediate grouping BUT
    there may be information here based on surname indicating family relationships or
    titles "(Dr)" indicating more important distinction

Sex
    The gender of the passenger
    This is shown as an "object" (text) so it needs to be converted into categorical or discrete int
    Expect this to be important (women and children first)

Age
    The age of the passenger
    type = "float" (for young children age may be fractional)
    missing data here - 714/891 present
    Expect this to be important.
    probably want to group this into buckets (turn into categorical)
    from the website:
        Age is fractional if less than 1. If the age is estimated, it is in the form of xx.5

SibSp
    Sibling / spouse relationship
    type = int (values from 0 to 8???)
    from the website:
        sibsp: The dataset defines family relations in this way...
        Sibling = brother, sister, stepbrother, stepsister
        Spouse = husband, wife (mistresses and fiancés were ignored)
    -> need to investigate: expected 1..6, observed 0..8
       
Parch
    Parent / child relationship
    from the website:
        parch: The dataset defines family relations in this way...
        Parent = mother, father
        Child = daughter, son, stepdaughter, stepson
        Some children travelled only with a nanny, therefore parch=0 for them.
    -> need to investigate: expected values 1..6, observed 0..6
    -> attribute creation opportunity = passengers age < 10 with parch=0 (child with nanny)
    -> find the passengers who are nannies for children.. passenger age < 10, adjacent Passenger IDs? Female, sharing a room (same Class)
    -> investigate family relationships - use surname from the Name attribute to identify families. Compare this to what is in the SibSp and Parch attributes

Ticket
    type = "object" - a string with encoded data
    -> need to extract this into fields/sub-attributes
    From website:
        C = Cherbourg, Q = Queenstown, S = Southampton
    -> otherwise ... ?
    
     ticket = Ticket number; fare = Passenger fare; cabin = Cabin number; embarked = Port of Embarkation (C = Cherbourg, Q = Queenstown, S = Southampton)
     
Fare
    type = float (why not int?  this can probably be converted to int)
    not sure there is more information than Cabin
    
Cabin
    type = "Object" i.e. a string
    contains important information - (I'm presuming) deck and location front-to-back
    many missing values
    need to decode this into sub-attributes (deck + even/odd or increasing values)
    many missing values... is it possible to combine this with Fare to get a "cabin deck" and front/back for all passengers?
    
Embarked
    type = "object" (a string); 2 missing values (889/891 present)

# Organize into tasks

difficulty

# This is markdown
1. foo
2. bar
5. blah
3. lala


* something
* else
* alltogether


| Tables        | Are           | Cool  |
| ------------- |:-------------:| -----:|
| col 3 is      | right-aligned | $1600 |
| col 2 is      | centered      |   $12 |
| zebra stripes | are neat      |    $1 |



Markdown | Less | Pretty
--- | --- | ---
*Still* | `renders` | **nicely**
1 | 2 | 3

# Preliminary analysis by correlation...
note this only includes attributes that are (currently) numeric

In [62]:
# Pairwise correlation between the numeric attributes only.
# The text columns (Name, Sex, Ticket, Cabin, Embarked) are dropped explicitly:
# older pandas dropped them silently inside corr(), but pandas >= 2.0 raises on
# non-numeric data instead. select_dtypes keeps the cell working on both.
train_df.select_dtypes(include=['number']).corr()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
PassengerId,1.0,-0.005007,-0.035144,0.036847,-0.057527,-0.001652,0.012658
Survived,-0.005007,1.0,-0.338481,-0.077221,-0.035322,0.081629,0.257307
Pclass,-0.035144,-0.338481,1.0,-0.369226,0.083081,0.018443,-0.5495
Age,0.036847,-0.077221,-0.369226,1.0,-0.308247,-0.189119,0.096067
SibSp,-0.057527,-0.035322,0.083081,-0.308247,1.0,0.414838,0.159651
Parch,-0.001652,0.081629,0.018443,-0.189119,0.414838,1.0,0.216225
Fare,0.012658,0.257307,-0.5495,0.096067,0.159651,0.216225,1.0


In [70]:
# Mean of Survived (0/1) per passenger class, i.e. the survival rate by Pclass.
# as_index=False keeps Pclass as a regular column of the result.
train_df[['Pclass', 'Survived']].groupby(['Pclass'], as_index=False).mean()

Unnamed: 0,Pclass,Survived
0,1,0.62963
1,2,0.472826
2,3,0.242363


In [71]:
# Mean of Survived (0/1) per passenger class, i.e. the survival rate by Pclass.
# as_index=True makes Pclass the index of the result instead of a column.
train_df[['Pclass', 'Survived']].groupby(['Pclass'], as_index=True).mean()


Unnamed: 0_level_0,Survived
Pclass,Unnamed: 1_level_1
1,0.62963
2,0.472826
3,0.242363


In [72]:
# Mean of Survived (0/1) per passenger class, i.e. the survival rate by Pclass.
# as_index is left at its default here; the result is indexed by Pclass,
# the same as passing as_index=True.
train_df[['Pclass', 'Survived']].groupby(['Pclass']).mean()


Unnamed: 0_level_0,Survived
Pclass,Unnamed: 1_level_1
1,0.62963
2,0.472826
3,0.242363


In [90]:
# Toy data for experimenting with the summary methods used above:
# three short columns, one of which ('p2') has a missing last value.
data = {
    'p1': [4, 5, 6, 7, 8],
    'p2': [4, 5, 6, 5, None],
    'Age': [20, 21, 22, 23, 24],
}

# Turn the dict of lists into a DataFrame (one column per key).
df = pd.DataFrame(data)

# Display the frame as the cell output.
df

Unnamed: 0,Age,p1,p2
0,20,4,4.0
1,21,5,5.0
2,22,6,6.0
3,23,7,5.0
4,24,8,


In [93]:
# dtype and non-null count per column of the toy frame; the None in p2
# shows up as one fewer non-null value and a float64 dtype
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 3 columns):
Age    5 non-null int64
p1     5 non-null int64
p2     4 non-null float64
dtypes: float64(1), int64(2)
memory usage: 200.0 bytes


In [92]:
# summary statistics of the toy frame; the count for p2 excludes its missing value
df.describe()

Unnamed: 0,Age,p1,p2
count,5.0,5.0,4.0
mean,22.0,6.0,5.0
std,1.581139,1.581139,0.816497
min,20.0,4.0,4.0
25%,21.0,5.0,4.75
50%,22.0,6.0,5.0
75%,23.0,7.0,5.25
max,24.0,8.0,6.0


In [94]:
# include='all' asks for every column regardless of dtype; all three toy columns
# are numeric, so the output matches the plain describe() call
df.describe(include='all')

Unnamed: 0,Age,p1,p2
count,5.0,5.0,4.0
mean,22.0,6.0,5.0
std,1.581139,1.581139,0.816497
min,20.0,4.0,4.0
25%,21.0,5.0,4.75
50%,22.0,6.0,5.0
75%,23.0,7.0,5.25
max,24.0,8.0,6.0


In [91]:
# pairwise correlation of the toy columns; Age and p1 both increase by 1 per row,
# so they correlate perfectly (1.0)
df.corr()

Unnamed: 0,Age,p1,p2
Age,1.0,1.0,0.632456
p1,1.0,1.0,0.632456
p2,0.632456,0.632456,1.0
