# Cleaning Titanic
Cleaning the titanic dataset to make it usable for machine learning.

In [1]:
import pandas as pd
import csv

Splitting into train and test sets.

In [2]:
train = pd.read_csv("train.csv", sep=",", header=0, index_col=0)
test = pd.read_csv("test.csv", sep=",", header=0, index_col=0)

In [3]:
data = train.append(test, sort=True)
data

Unnamed: 0_level_0,Age,Cabin,Embarked,Fare,Name,Parch,Pclass,Sex,SibSp,Survived,Ticket
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,22.0,,S,7.2500,"Braund, Mr. Owen Harris",0,3,male,1,0.0,A/5 21171
2,38.0,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,1,female,1,1.0,PC 17599
3,26.0,,S,7.9250,"Heikkinen, Miss. Laina",0,3,female,0,1.0,STON/O2. 3101282
4,35.0,C123,S,53.1000,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,1,female,1,1.0,113803
5,35.0,,S,8.0500,"Allen, Mr. William Henry",0,3,male,0,0.0,373450
...,...,...,...,...,...,...,...,...,...,...,...
1305,,,S,8.0500,"Spector, Mr. Woolf",0,3,male,0,,A.5. 3236
1306,39.0,C105,C,108.9000,"Oliva y Ocana, Dona. Fermina",0,1,female,0,,PC 17758
1307,38.5,,S,7.2500,"Saether, Mr. Simon Sivertsen",0,3,male,0,,SOTON/O.Q. 3101262
1308,,,S,8.0500,"Ware, Mr. Frederick",0,3,male,0,,359309


In [4]:
print("Data structure:")
print("***************")
print(data.columns)
print(data.dtypes)
print("\nExample:")
print("**********")
print(data.head())
print("\nStatistics:")
print("*************")
print(data.describe())
print("Correlations:")
print(data.corr())
print("*************")
print("Columns with <10 categories:")

for item in data.columns:
    categorical = pd.Categorical(data[item])
    if len(categorical.categories) > 9:
        continue
    print(item, " ", pd.Categorical(data[item]))

Data structure:
***************
Index(['Age', 'Cabin', 'Embarked', 'Fare', 'Name', 'Parch', 'Pclass', 'Sex',
       'SibSp', 'Survived', 'Ticket'],
      dtype='object')
Age         float64
Cabin        object
Embarked     object
Fare        float64
Name         object
Parch         int64
Pclass        int64
Sex          object
SibSp         int64
Survived    float64
Ticket       object
dtype: object

Example:
**********
              Age Cabin Embarked     Fare  \
PassengerId                                 
1            22.0   NaN        S   7.2500   
2            38.0   C85        C  71.2833   
3            26.0   NaN        S   7.9250   
4            35.0  C123        S  53.1000   
5            35.0   NaN        S   8.0500   

                                                          Name  Parch  Pclass  \
PassengerId                                                                     
1                                      Braund, Mr. Owen Harris      0       3   
2            Cum

Handling NA values:

In [5]:
data.Age.fillna(value=data.Age.mean(), inplace=True)
data.Fare.fillna(value=data.Fare.mean(), inplace=True)
data.Embarked.fillna(value=data.Embarked.value_counts().idxmax(), inplace=True)
data.Survived.fillna(value=-1, inplace=True)

Extract title:

In [6]:
titles = pd.DataFrame(data.apply(lambda x: x.Name.split(", ")
                                 [1].split(".")[0], axis=1), columns = ["Title"])
print(pd.Categorical(titles.Title))
data = data.join(titles)

[Mr, Mrs, Miss, Mrs, Mr, ..., Mr, Dona, Mr, Mr, Master]
Length: 1309
Categories (18, object): [Capt, Col, Don, Dona, ..., Ms, Rev, Sir, the Countess]


Calculating family size:

In [7]:
fsiz = pd.DataFrame(data.apply(lambda x: x.SibSp+x.Parch, axis=1), columns=["FSize"])
data = data.join(fsiz)

Dropping useless columns (columns that will not be needed for producing an actual model). Parch and Sibsp columns are not needed because we are using the sum (see: fsiz)

In [8]:
data.drop('Name', axis=1, inplace=True)
data.drop('Cabin', axis=1, inplace=True)
data.drop('Ticket', axis=1, inplace=True)

data.drop('Parch', axis=1, inplace=True)
data.drop('SibSp', axis=1, inplace=True)

In [10]:
for col in data.select_dtypes(exclude=["number"]).columns:
    data[col] = data[col].astype('category')
    data[col] = data[col].cat.codes
    
train = data[data['Survived'] != -1]
train.to_csv("train-clean.csv")

test = data[data['Survived'] == -1]
test.drop('Survived', axis=1, inplace=True)
test.to_csv("test-clean.csv")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [14]:
data.to_csv("data-clean.csv")