In [1]:
# library imports
import pandas as pd
import numpy as np

In [339]:
# autoreload module
%load_ext autoreload
% autoreload 2

In [342]:
# import cleaner script
import cleaning_helper

In [2]:
dataframe = pd.read_csv("data/train.csv")

In [10]:
dataframe.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

## Expected problems
    1. Cabin has NaN values. No need to fix since concluding from the Tableau analysis we know that this is not important
    2. Age has NaN values. But this is important for modelling the data. So we need to figure out a way to fill these values without disturbing the data.
    3. Embarked has two missing values. We can simply fill these with the most frequent values.

In [22]:
dataframe.drop(labels=['Cabin', 'Ticket', 'PassengerId'], axis=1, inplace=True)

In [32]:
dataframe[dataframe['Embarked'].isnull()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked
61,62,1,1,"Icard, Miss. Amelie",female,38.0,0,0,80.0,
829,830,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,80.0,


In [39]:
dataframe.loc[61, "Embarked"] = 'S'

In [41]:
dataframe.loc[829, "Embarked"] = 'S'

In [46]:
dataframe.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Fare             0
Embarked         0
dtype: int64

## Working with the Age field
    1. Problem 1 : Want to convert the continuous to a nominal variable.
    2. Problem 2 : Remove the NaN values by replacing the values from roughly the same distribution.

In [81]:
# 3 is Nan
new_series.value_counts()

3     177
0     177
2     169
1     118
9      70
5      54
6      46
4      45
7      24
8       9
10      2
dtype: int64

In [84]:
l_ = [0] * 177 + [2] * 169 + [1] * 118 + [9] * 70 + [5] * 54 + [6] * 46 + [4] * 45 + [7] * 24 + [8] * 9 + [10] * 2

2

In [279]:
a_ = []
for _ in range(177) : a_.append(np.random.choice(l_, replace=False))
pd.Series(a_).value_counts()

0    52
2    36
1    28
9    16
6    14
5    14
7     9
4     8
dtype: int64

In [302]:
dataframe[dataframe['Age'].isnull()].index[0]

180

In [303]:
def fix_age():
    for e in dataframe[dataframe['Age'].isnull()].index:
        dataframe.loc[e, 'Age'] = np.random.choice(l_, replace=False)

In [305]:
fix_age()

In [306]:
dataframe.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
Embarked       0
dtype: int64

In [307]:
dataframe.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,S


In [310]:
def discretize_field(series):
    u_values = series.unique()
    h_map = {}
    counter = 0
    for u in u_values:
        if u not in h_map:
            h_map[u] = counter
            counter += 1
    new_list = list()
    for each in dataframe.Embarked:
        new_list.append(h_map[each]) 
    return pd.Series(new_list)
    
    
ans = discretize_field(dataframe['Embarked'])
dataframe['dis_embarked'] = discretize_field(dataframe.Embarked)
dataframe.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,dis_embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C,1
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,S,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,S,0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,S,0


## working with the Name column.

In [318]:
dataframe.Name.str.extract("([A-Za-z]*\.)").unique()

  """Entry point for launching an IPython kernel.


array(['Mr.', 'Mrs.', 'Miss.', 'Master.', 'Don.', 'Rev.', 'Dr.', 'Mme.',
       'Ms.', 'Major.', 'Lady.', 'Sir.', 'Mlle.', 'Col.', 'Capt.',
       'Countess.', 'Jonkheer.'], dtype=object)

In [324]:
def working_with_names(df):
    ex_series = df.Name.str.extract("([A-Za-z]*\.)")
    unique_values = ex_series.unique()
    h_map = {}
    counter = 0
    for u in unique_values:
        if u not in h_map:
            h_map[u] = counter
            counter += 1
    new_list = list()
    for each in ex_series:
        new_list.append(h_map[each]) 
    return pd.Series(new_list) 

dataframe['dis_name'] = working_with_names(dataframe)

  


In [325]:
dataframe.dis_name.value_counts()

0     517
2     182
1     125
3      40
6       7
5       6
13      2
12      2
9       2
4       1
16      1
7       1
15      1
10      1
11      1
14      1
8       1
Name: dis_name, dtype: int64

## working with the Sex 

In [327]:
def working_with_names(df):
    ex_series = df.Sex
    unique_values = ex_series.unique()
    h_map = {}
    counter = 0
    for u in unique_values:
        if u not in h_map:
            h_map[u] = counter
            counter += 1
    new_list = list()
    for each in ex_series:
        new_list.append(h_map[each]) 
    return pd.Series(new_list) 

dataframe['dis_sex'] = working_with_names(dataframe) 

In [332]:
dataframe.Age = dataframe.Age.astype(int)

In [334]:
dataframe.drop(['Name','Sex'], axis=1, inplace=True)
dataframe.head()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Embarked,dis_embarked,dis_name,dis_sex
0,1,0,3,22,1,0,7.25,S,0,0,0
1,2,1,1,38,1,0,71.2833,C,1,1,1
2,3,1,3,26,0,0,7.925,S,0,2,1
3,4,1,1,35,1,0,53.1,S,0,1,1
4,5,0,3,35,0,0,8.05,S,0,0,0


In [336]:
dataframe.drop(['PassengerId'], axis=1, inplace=True)

ValueError: labels ['PassengerId'] not contained in axis

In [337]:
dataframe.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Embarked,dis_embarked,dis_name,dis_sex
0,0,3,22,1,0,7.25,S,0,0,0
1,1,1,38,1,0,71.2833,C,1,1,1
2,1,3,26,0,0,7.925,S,0,2,1
3,1,1,35,1,0,53.1,S,0,1,1
4,0,3,35,0,0,8.05,S,0,0,0
