## 1. Set up the notebook

In [120]:
import pandas as pd
import numpy as np
%matplotlib inline 
import matplotlib.pyplot as plt
import string

### Titanic Data information


survival Survival	0 = No, 1 = Yes\
pclass	Ticket class	1 = 1st, 2 = 2nd, 3 = 3rd
sex	Sex	
Age	Age in years	
sibsp	# of siblings / spouses aboard the Titanic	
parch	# of parents / children aboard the Titanic	
ticket	Ticket number	
fare	Passenger fare	
cabin	Cabin number	
embarked	Port of Embarkation	C = Cherbourg, Q = Queenstown, S = Southampton

Variable Notes

pclass: A proxy for socio-economic status (SES)
1st = Upper
2nd = Middle
3rd = Lower

age: Age is fractional if less than 1. If the age is estimated, is it in the form of xx.5

sibsp: The dataset defines family relations in this way...
Sibling = brother, sister, stepbrother, stepsister
Spouse = husband, wife (mistresses and fiancés were ignored)

parch: The dataset defines family relations in this way...
Parent = mother, father
Child = daughter, son, stepdaughter, stepson
Some children travelled only with a nanny, therefore parch=0 for them.

## 2. Let's put the data into a dataframe

In [121]:
passengers = pd.read_csv('train.csv')

## 3. Overview of the data and how it looks

In [122]:
passengers.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [123]:
passengers.tail()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


In [124]:
passengers.info()
passenger_rows = len(passengers)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


## 4. Filtering and how we can take views of the data

### Using loc, iloc and index ( locate, index locate and index)

#### dataframe.loc[row,column]

In [125]:
#passengers.loc[:,['Age']] #all rows and only the age column
#passengers.loc[:,[['Age','Sex']] #all rows and only the multiple columns
#passengers.loc[:8,['Age']] #age column and first 8 rows
#passengers.loc[4:8,['Age']] # age column and rows 4 to 8 inclusive
#passengers.loc[4:8,'Name':'Cabin'] # columns between Name and Cabin inclusive and rows 4 to 8 inclusive



#### dataframe.iloc[row_index,column_index]

In [126]:
passengers.iloc[1:3,0:3] #two rows with index 1 and 2, and column with index 0,1,2 note its not inclusive



Unnamed: 0,PassengerId,Survived,Pclass
1,2,1,1
2,3,1,3


#### dataframe.index[ ]

In [127]:
#passengers[passengers.index < 5] #all index rows below 5
#passengers[passengers.Age > 70] #all passengers over 60 years old


## 5. Summary Statistics for numerical values

In [128]:
passengers.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


## 6. Adding Columns

In [129]:
# Surnames seems a sensible one to add
passengers['Surname'] = passengers.Name.apply(lambda x: x.split(',')[0])

# Family Size
passengers['Family_Size'] = passengers.SibSp + passengers.Parch

In [130]:
"""
#Passenger Titles, 2 methods
titles = ["Mr", "Mrs", "Ms", "Miss",
          "Doctor", "Dr", "Master", "Col",
          "Rev", "Major", "Countess","Jonkheer",
          "Don", "Capt", "Mme", "Mlle"]

pat = '.*({}).*'.format('|'.join(titles)) # This creates the regex statment from the title list
passengers['Title'] = passengers['Name'].str.replace(pat, r'\1')

#Had to google this, much harder than it seemed. Not fully sure what regex is doing

#Also a substring function from 
# https://triangleinequality.wordpress.com/2013/09/08/basic-feature-engineering-with-the-titanic-data/
"""


'\n#Passenger Titles, 2 methods\ntitles = ["Mr", "Mrs", "Ms", "Miss",\n          "Doctor", "Dr", "Master", "Col",\n          "Rev", "Major", "Countess","Jonkheer",\n          "Don", "Capt", "Mme", "Mlle"]\n\npat = \'.*({}).*\'.format(\'|\'.join(titles)) # This creates the regex statment from the title list\npassengers[\'Title\'] = passengers[\'Name\'].str.replace(pat, r\'\x01\')\n\n#Had to google this, much harder than it seemed. Not fully sure what regex is doing\n\n#Also a substring function from \n# https://triangleinequality.wordpress.com/2013/09/08/basic-feature-engineering-with-the-titanic-data/\n'

In [134]:
# https://triangleinequality.wordpress.com/2013/09/08/basic-feature-engineering-with-the-titanic-data/
def substrings_in_string(big_string, substrings):
    """Looks for each substring in larger string and returns if found"""
    for substring in substrings:
        if big_string.find(substring) != -1:
            return substring
    else:
        return "Missing"
 



In [135]:
#Add passenger titles
passengers['Title']=passengers['Name'].apply(lambda x: substrings_in_string(x, titles))

In [138]:
# Fill Nans in Cabin will string since filled values are string
passengers.Cabin = passengers.Cabin.fillna('Unknown')

In [139]:
passengers.head(20)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Surname,Family_Size,Title
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,Unknown,S,Braund,1,Mr
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Cumings,1,Mr
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,Unknown,S,Heikkinen,0,Miss
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Futrelle,1,Mr
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,Unknown,S,Allen,0,Mr
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,Unknown,Q,Moran,0,Mr
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S,McCarthy,0,Mr
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,Unknown,S,Palsson,4,Master
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,Unknown,S,Johnson,2,Mr
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,Unknown,C,Nasser,1,Mr
