# Basic cleaning

In [159]:
import pandas as pd
import numpy as np

from typing import List

from sklearn.impute import SimpleImputer

In [160]:
# Load the raw train and test splits from the local data directory.
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in ('data/train.csv', 'data/test.csv')
)

In [161]:
# Normalize the headers of both frames to lower case so later cells can
# refer to columns without worrying about capitalization.
for frame in (train_df, test_df):
    frame.columns = [label.lower() for label in frame.columns]

### Extract data from the `name` column

In [171]:
def extract_first_last_name(names):
    """Split each full name into the text before the comma and the text after the title.

    Each entry is expected to look like ``"<head>, <title>. <rest>"``,
    for example ``"Braund, Mr. Owen Harris"``.

    Parameters
    ----------
    names : iterable of str
        Full names. Every entry must contain at least one ``'.'``.

    Returns
    -------
    tuple of (list of str, list of str)
        ``(first_name, last_names)``. ``first_name`` holds the text before the
        first comma; ``last_names`` holds everything after the first period.
        Both are stripped of surrounding whitespace.

    Raises
    ------
    IndexError
        If an entry contains no ``'.'``.

    Notes
    -----
    NOTE(review): for names written as "Surname, Title. Given names" the
    ``first_name`` list ends up holding the surname. The names are kept as-is
    so existing callers and column names keep working.
    """
    first_name = []
    last_names = []
    for name in names:
        # maxsplit=1: only the first period (the one that ends the title)
        # separates the two halves, so later periods such as initials
        # ("Elizabeth L. Barrett") stay inside the second half instead of
        # truncating it.
        pieces = name.split('.', 1)
        first_name.append(pieces[0].split(',')[0].strip())
        last_names.append(pieces[1].strip())
    return (first_name, last_names)


def extract_title(names):
    """Return the lower-cased title of each name, pooling rare titles as 'uncommon'.

    For every name, the second ', '-delimited piece is taken and cut at its
    first '.'; the result is lower-cased. Titles found in the rare-title list
    below are replaced by the single label 'uncommon'.

    Parameters
    ----------
    names : iterable of str
        Full names such as ``"Braund, Mr. Owen Harris"``.

    Returns
    -------
    list of str
        One title (or 'uncommon') per input name, in input order.
    """
    rare_titles = 'Rev,Dr,Col,Major,Mlle,Ms,Sir,Capt,Mme,Jonkheer,Lady,the Countess,Don,Dona'.lower().split(',')

    grouped = []
    for full_name in names:
        after_comma = full_name.rsplit(', ')[1]
        title = after_comma.rsplit('.')[0].lower()
        # Collapse rarely-seen titles into one shared bucket.
        if title in rare_titles:
            grouped.append('uncommon')
        else:
            grouped.append(title)
    return grouped


def binarize_sex(gender):
    """Encode each entry as 1 when it equals 'male' and as 0 otherwise.

    Parameters
    ----------
    gender : iterable
        Values to encode; anything other than the exact string 'male' maps to 0.

    Returns
    -------
    list of int
        One 0/1 flag per input value, in input order.
    """
    flags = []
    for value in gender:
        if value == 'male':
            flags.append(1)
        else:
            flags.append(0)
    return flags


def preprocessing(df):
    """Return a cleaned copy of ``df`` with name-derived features and a numeric sex column.

    The input frame is deep-copied first, so the caller's frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the columns ``name`` and ``sex``.

    Returns
    -------
    pandas.DataFrame
        A new frame in which:

        * ``first_name`` and ``last_names`` hold the two name parts returned
          by ``extract_first_last_name``,
        * ``title`` holds the title returned by ``extract_title``,
        * ``sex`` is replaced by the 1/0 encoding from ``binarize_sex``,
        * the original ``name`` column is removed.
    """
    df = df.copy(deep=True)

    # Bracket indexing is used throughout: attribute-style access (df.name,
    # df.sex = ...) can collide with DataFrame attributes and is discouraged
    # for assignment.

    # Separate the name into its two parts
    df['first_name'], df['last_names'] = extract_first_last_name(df['name'])

    # Get title from name
    df['title'] = extract_title(df['name'])

    # Convert [male,female] -> [1,0]
    df['sex'] = binarize_sex(df['sex'])

    # Drop the raw name column now that its information has been extracted;
    # return the new frame rather than mutating with inplace=True.
    return df.drop(columns='name')
    
    

train_cleaned = preprocessing(train_df)
test_cleaned = preprocessing(test_df)
print(test_cleaned.dtypes)

# Fill missing values by assigning the filled column back to the frame.
# Calling fillna(..., inplace=True) on a column pulled out of the frame is
# chained assignment: it acts on an intermediate Series and, under pandas
# Copy-on-Write, leaves the DataFrame itself unchanged.
# - embarked (train): most frequent value
# - fare (test): mean of the test-set fares
train_cleaned['embarked'] = train_cleaned['embarked'].fillna(train_cleaned['embarked'].mode()[0])
test_cleaned['fare'] = test_cleaned['fare'].fillna(test_cleaned['fare'].mean())

passengerid      int64
pclass           int64
sex              int64
age            float64
sibsp            int64
parch            int64
ticket          object
fare           float64
cabin           object
embarked        object
first_name      object
last_names      object
title           object
dtype: object


In [173]:
# Persist both cleaned frames as CSV, without writing the index column.
cleaned_outputs = {
    'data/train_cleaned.csv': train_cleaned,
    'data/test_cleaned.csv': test_cleaned,
}
for csv_path, cleaned_frame in cleaned_outputs.items():
    cleaned_frame.to_csv(csv_path, index=False)