## Imports

In [36]:
import pandas as pd
import numpy as np

## Settings and parameters

In [37]:
# settings for pandas
pd.set_option("display.max.columns",None) # alle kolommen tonen
pd.set_option("display.max.rows",500)    # eerste 500 rijen tonen
pd.set_option("display.precision", 2)     # precisie van de kolommen aanpassen
pd.set_option('display.float_format', lambda x: '{:.3f}'.format(x)) # floats output tot 3 decimalen
pd.set_option('display.max_colwidth', None)

# settings for extra passengers

add_extra_passengers = True

## Load data


In [56]:
datapath = '../data/'
filename = 'train.csv' 
df = pd.read_csv(datapath + filename)

### VARIABLE DESCRIPTIONS  
Pclass Passenger Class (1 = 1st; 2 = 2nd; 3 = 3rd)   
survival Survival (0 = No; 1 = Yes)  
name Name   
sex Sex  
age Age  
sibsp Number of Siblings/Spouses Aboard  
parch Number of Parents/Children Aboard  
ticket Ticket Number  
fare Passenger Fare (British pound)  
cabin Cabin  
embarked Port of Embarkation (C = Cherbourg; Q = Queenstown; S = Southampton)  
boat Lifeboat  
body Body Identification Number  
home.dest Home/Destination  

### SPECIAL NOTES
Port of Embarkation (C = Cherbourg; Q = Queenstown; S = Southampton)  
Pclass is a proxy for socio-economic status (SES)  
1st ~ Upper; 2nd ~ Middle; 3rd ~ Lower  
Age is in Years; Fractional if Age less than One (1)  
If the Age is estimated, it is in the form xx.5  
Fare is in Pre-1970 British Pounds ()  
Conversion Factors: 1 = 12s = 240d and 1s = 20d    
#### With respect to the family relation variables (i.e. sibsp and parch) some relations were ignored. The following are the definitions used for sibsp and parch.
Sibling: Brother, Sister, Stepbrother, or Stepsister of Passenger Aboard Titanic  
Spouse: Husband or Wife of Passenger Aboard Titanic (Mistresses and Fiances Ignored)  
Parent: Mother or Father of Passenger Aboard Titanic  
Child: Son, Daughter, Stepson, or Stepdaughter of Passenger Aboard Titanic  
Other family relatives excluded from this study include cousins, nephews/nieces,  
aunts/uncles, and in-laws. Some children travelled only with a nanny, therefore parch=0  
for them. As well, some travelled with very close friends or neighbors in a village,  
however, the definitions do not support such relations.  

In [39]:
rename_dict = {
"PassengerId": "Passagier_Id",
"Survived": "Overleefd",
"Pclass": "Ticket_klasse",
"Name": "Naam",
"Sex": "Geslacht",
"Age": "Leeftijd",
"SibSp": "Aantal_overige_familieleden",
"Parch": "Aantal_kinderen",
"Ticket": "Ticket_nummer",
"Fare": "Ticket_prijs",
"Cabin": "Cabine_nummer",
"Embarked": "Opstapplaats"
}

In [40]:
df = df.rename(columns=rename_dict)

df = df[['Passagier_Id',
 'Naam',
 'Geslacht',
 'Leeftijd',
 'Opstapplaats',
 'Aantal_kinderen',
 'Aantal_overige_familieleden',
 'Ticket_nummer',
 'Ticket_klasse',
 'Ticket_prijs',
 'Cabine_nummer',
 'Overleefd']]

df['Workshop_passagier'] = 0

In [41]:
df.info() #kolommen waar nog iets mee moet: leeftijd, opstapplaats & cabine nummer

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 13 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Passagier_Id                 891 non-null    int64  
 1   Naam                         891 non-null    object 
 2   Geslacht                     891 non-null    object 
 3   Leeftijd                     714 non-null    float64
 4   Opstapplaats                 889 non-null    object 
 5   Aantal_kinderen              891 non-null    int64  
 6   Aantal_overige_familieleden  891 non-null    int64  
 7   Ticket_nummer                891 non-null    object 
 8   Ticket_klasse                891 non-null    int64  
 9   Ticket_prijs                 891 non-null    float64
 10  Cabine_nummer                204 non-null    object 
 11  Overleefd                    891 non-null    int64  
 12  Workshop_passagier           891 non-null    int64  
dtypes: float64(2), int64

In [47]:
df.tail(3)

Unnamed: 0,Passagier_Id,Naam,Geslacht,Leeftijd,Opstapplaats,Aantal_kinderen,Aantal_overige_familieleden,Ticket_nummer,Ticket_klasse,Ticket_prijs,Cabine_nummer,Overleefd,Workshop_passagier
888,889,"Johnston, Miss. Catherine Helen ""Carrie""",female,,S,2,1,W./C. 6607,3,23.45,,0,0
889,890,"Behr, Mr. Karl Howell",male,26.0,C,0,0,111369,1,30.0,C148,1,0
890,891,"Dooley, Mr. Patrick",male,32.0,Q,0,0,370376,3,7.75,,0,0
891,891,Maikel Jonkers (Kapitein),Man,40.0,Southampton,2,3,,1,100.0,,0,1
892,892,Luuk Jans (Matroos),Man,30.0,Southampton,0,4,,2,300.0,,0,1


In [42]:
Naam_1 = "Maikel Jonkers (Kapitein)" # string
Geslacht_1 = "Man" #Man/vrouw
Leeftijd_1 = 40 # int ?
Opstapplaats_1 = "Southampton" # Southampton(Engeland), Queenstown(Ierland),  Cherbourg (Frankrijk)
Aantal_kinderen_1 = 2 # int
Aantal_overige_familieleden_1 = 3 # int
Ticket_klasse_1 = 1 #1,2,3
Ticket_prijs_1 = 100 # int

In [43]:
Naam_2 = "Luuk Jans (Matroos)" # string
Geslacht_2 = "Man" #Man/vrouw
Leeftijd_2 = 30 # int ?
Opstapplaats_2 = "Southampton" # Southampton(Engeland), Queenstown(Ierland),  Cherbourg (Frankrijk)
Aantal_kinderen_2 = 0 # int
Aantal_overige_familieleden_2 = 4 # int
Ticket_klasse_2 = 2 #1,2,3
Ticket_prijs_2 = 300 # int

In [53]:
def voeg_passagier_toe(passagier_nummer):
    if add_extra_passengers == True:
        nieuwe_index = int(df.tail(1).index.item()) + 1
        passagier_nummer = str(passagier_nummer)
        df.loc[nieuwe_index] = [
                    int(nieuwe_index) + 1, 
                    eval("Naam_"+passagier_nummer), 
                    eval("Geslacht_"+passagier_nummer),
                    eval("Leeftijd_"+passagier_nummer),
                    eval("Opstapplaats_"+passagier_nummer),
                    eval("Aantal_kinderen_"+passagier_nummer),
                    eval("Aantal_overige_familieleden_"+passagier_nummer),
                    "NaN",  #Ticket_nummer
                    eval("Ticket_klasse_"+passagier_nummer),
                    eval("Ticket_prijs_"+passagier_nummer),
                    "NaN",  #Cabine_nummer
                    "0",    #Overleefd
                    1       #Workshop_passagier
                    ]
        print(eval("Naam_"+passagier_nummer) + " is toegevoegd aan de passagiers lijst.")

In [54]:
# als deze cel 2x gedraaid wordt dan worden de personen nog een keer toegevoegd. iets inbouwen zodat het niet kan (zoals index vastzetten?)?
voeg_passagier_toe(1)
voeg_passagier_toe(2)

Maikel Jonkers (Kapitein) is toegevoegd aan de passagiers lijst.
Luuk Jans (Matroos) is toegevoegd aan de passagiers lijst.


In [55]:
df.tail(3)

Unnamed: 0,Passagier_Id,Naam,Geslacht,Leeftijd,Opstapplaats,Aantal_kinderen,Aantal_overige_familieleden,Ticket_nummer,Ticket_klasse,Ticket_prijs,Cabine_nummer,Overleefd,Workshop_passagier
892,892,Luuk Jans (Matroos),Man,30.0,Southampton,0,4,,2,300.0,,0,1
893,894,Maikel Jonkers (Kapitein),Man,40.0,Southampton,2,3,,1,100.0,,0,1
894,895,Luuk Jans (Matroos),Man,30.0,Southampton,0,4,,2,300.0,,0,1


In [49]:
def df_replace_column_values(df,column,old_value,new_value):
    df.loc[df[column] == old_value, column] = new_value

In [50]:
df_replace_column_values(df=df,column="Opstapplaats",old_value="C",new_value="Cherbourg")
df_replace_column_values(df=df,column="Opstapplaats",old_value="Q",new_value="Queenstown")
df_replace_column_values(df=df,column="Opstapplaats",old_value="S",new_value="Southampton")

df_replace_column_values(df=df,column="Geslacht",old_value="male",new_value="Man")
df_replace_column_values(df=df,column="Geslacht",old_value="female",new_value="Vrouw")

In [51]:
df

Unnamed: 0,Passagier_Id,Naam,Geslacht,Leeftijd,Opstapplaats,Aantal_kinderen,Aantal_overige_familieleden,Ticket_nummer,Ticket_klasse,Ticket_prijs,Cabine_nummer,Overleefd,Workshop_passagier
0,1,"Braund, Mr. Owen Harris",Man,22.000,Southampton,0,1,A/5 21171,3,7.250,,0,0
1,2,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",Vrouw,38.000,Cherbourg,0,1,PC 17599,1,71.283,C85,1,0
2,3,"Heikkinen, Miss. Laina",Vrouw,26.000,Southampton,0,0,STON/O2. 3101282,3,7.925,,1,0
3,4,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",Vrouw,35.000,Southampton,0,1,113803,1,53.100,C123,1,0
4,5,"Allen, Mr. William Henry",Man,35.000,Southampton,0,0,373450,3,8.050,,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
888,889,"Johnston, Miss. Catherine Helen ""Carrie""",Vrouw,,Southampton,2,1,W./C. 6607,3,23.450,,0,0
889,890,"Behr, Mr. Karl Howell",Man,26.000,Cherbourg,0,0,111369,1,30.000,C148,1,0
890,891,"Dooley, Mr. Patrick",Man,32.000,Queenstown,0,0,370376,3,7.750,,0,0
891,891,Maikel Jonkers (Kapitein),Man,40.000,Southampton,2,3,,1,100.000,,0,1


In [52]:
print(df["Leeftijd"].unique())  
print(df["Geslacht"].unique())
print(df["Opstapplaats"].unique())
print(df["Aantal_kinderen"].unique())
print(df["Aantal_overige_familieleden"].unique())
print(df["Ticket_klasse"].unique())

[22.   38.   26.   35.     nan 54.    2.   27.   14.    4.   58.   20.
 39.   55.   31.   34.   15.   28.    8.   19.   40.   66.   42.   21.
 18.    3.    7.   49.   29.   65.   28.5   5.   11.   45.   17.   32.
 16.   25.    0.83 30.   33.   23.   24.   46.   59.   71.   37.   47.
 14.5  70.5  32.5  12.    9.   36.5  51.   55.5  40.5  44.    1.   61.
 56.   50.   36.   45.5  20.5  62.   41.   52.   63.   23.5   0.92 43.
 60.   10.   64.   13.   48.    0.75 53.   57.   80.   70.   24.5   6.
  0.67 30.5   0.42 34.5  74.  ]
['Man' 'Vrouw']
['Southampton' 'Cherbourg' 'Queenstown' nan]
[0 1 2 5 3 4 6]
[1 0 3 4 2 5 8]
[3 1 2]


 ![Alt text](../images/southampton.jpg) 
 ![Alt text](../images/queenstown.jpg)
 ![Alt text](../images/southampton.jpg)