In [3]:
import os
import pandas as pd
import pickle
from numpy import savetxt
from collections import defaultdict
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [30]:
# Project
# Root of the project checkout. Set the NEW_TITAN_WORKDIR environment variable
# to run the notebook on another machine; otherwise the original local path is used.
workdir = os.environ.get('NEW_TITAN_WORKDIR', '/home/walter/Documents/personal_projects/new-titan')
exp_prefix = 'notebooks/experiments/exp_03'  # experiment folder, relative to workdir
data_prefix = 'data'                         # data folder, relative to workdir
chk_prefix = 'checkpoint'                    # checkpoint folder, relative to the experiment folder

# Params
target = 'Survived'
features = ['Sex', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
# NOTE(review): `idx` is not referenced in the visible cells. The Kaggle Titanic
# CSV spells this column 'PassengerId' -- confirm the casing before using `idx`
# for a column lookup.
idx = 'Passengerid'

# Paths
# Raw training CSV.
data_train_path = os.path.join(workdir, data_prefix, 'raw/train.csv')

# Pickle holding the imputation statistics inside the experiment checkpoint folder.
dict_path = os.path.join(workdir, exp_prefix, chk_prefix, 'train_dict.pkl')


In [127]:
def split_for_eval(df, test_size):
    """Split `df` into train/test features, targets and row labels.

    The feature and target columns come from the module-level `features` and
    `target` settings; the row label is the first column of `df`. The split is
    reproducible because `random_state` is fixed at 100.

    Args:
        df: DataFrame holding the feature columns, the target column and, as
            its first column, the row label.
        test_size: Forwarded to `train_test_split` as `test_size`.

    Returns:
        Tuple `(X_train, y_train, label_train, X_test, y_test, label_test)`.
    """
    # Split the label column in the same call as X and y so the three stay
    # aligned by row position. Looking labels up afterwards with
    # `df.iloc[X_train.index, 0]` would treat index *labels* as *positions*,
    # which is only correct when `df` has a default 0..n-1 index.
    X_train, X_test, y_train, y_test, label_train, label_test = train_test_split(
        df[features],
        df[target],
        df.iloc[:, 0],
        test_size=test_size,
        random_state=100,
    )
    return X_train, y_train, label_train, X_test, y_test, label_test

def save_obj(obj, path):
    """Pickle `obj` to `path`, creating the parent directory tree if needed.

    Args:
        obj: Any picklable object.
        path: Destination file path; an existing file is overwritten.
    """
    parent = os.path.dirname(path)
    # `parent` is empty when `path` is a bare filename; nothing to create then.
    if parent:
        # makedirs (unlike mkdir) also creates missing intermediate folders and
        # does not fail when the folder already exists.
        os.makedirs(parent, exist_ok=True)

    # The context manager closes the file even if pickling raises.
    with open(path, 'wb') as file:
        pickle.dump(obj, file)

def load_obj(path):
    """Load and return the object pickled at `path`.

    Args:
        path: Path of a pickle file.

    Returns:
        The unpickled object.
    """
    # SECURITY: unpickling can execute arbitrary code; only load files that
    # this project wrote itself.
    # The context manager closes the file even if unpickling raises.
    with open(path, 'rb') as file:
        return pickle.load(file)

def train_imputers(df, path):
    """Compute the imputation statistics from `df` and pickle them to `path`.

    The saved mapping has three keys: 'embarked_mode' (first mode of the
    'Embarked' column), 'age_mean' and 'fare_mean' (means of 'Age' and 'Fare').

    Args:
        df: DataFrame with 'Embarked', 'Age' and 'Fare' columns.
        path: Destination pickle file, written through `save_obj`.
    """
    # A plain dict under a descriptive name: the earlier local called `dict`
    # shadowed the builtin, and a defaultdict without a default factory
    # behaves like a regular dict anyway.
    imputers = {
        'embarked_mode': df['Embarked'].mode().values[0],
        'age_mean': df['Age'].mean(),
        'fare_mean': df['Fare'].mean(),
    }
    save_obj(imputers, path)
    print('> Trained imputers.')

def apply_imputers(df, path):
    """Return a copy of `df` with missing 'Age', 'Embarked' and 'Fare' filled in.

    The fill values are read from the pickle at `path` (keys 'age_mean',
    'embarked_mode' and 'fare_mean').

    Args:
        df: DataFrame with 'Age', 'Embarked' and 'Fare' columns; left unmodified.
        path: Pickle file holding the imputation statistics.

    Returns:
        A new DataFrame with the three columns imputed.
    """
    imputers = load_obj(path)
    # Work on a copy: assigning through an alias would modify the caller's
    # frame and raise SettingWithCopyWarning when that frame is a slice.
    data = df.copy()
    data['Age'] = data['Age'].fillna(imputers['age_mean'])
    data['Embarked'] = data['Embarked'].fillna(imputers['embarked_mode'])
    data['Fare'] = data['Fare'].fillna(imputers['fare_mean'])
    print('> Applied imputers.')
    return data

def _make_one_hot_encoder():
    """Build the dense-output OneHotEncoder shared by the column encoders.

    scikit-learn renamed the `sparse` keyword to `sparse_output` and later
    removed the old name, so the new keyword is tried first and the old one is
    used only when the installed version does not know `sparse_output`.
    """
    options = {'handle_unknown': 'infrequent_if_exist'}
    try:
        return OneHotEncoder(sparse_output=False, **options)
    except TypeError:
        # Older scikit-learn: `sparse_output` is not an accepted keyword.
        return OneHotEncoder(sparse=False, **options)


def train_pclass_encoder(df, path):
    """Fit a one-hot encoder on the 'Pclass' column and pickle it to `path`.

    Args:
        df: DataFrame with a 'Pclass' column.
        path: Destination pickle file, written through `save_obj`.
    """
    enc = _make_one_hot_encoder()
    enc.fit(df[['Pclass']])
    save_obj(enc, path)


def train_embarked_encoder(df, path):
    """Fit a one-hot encoder on the 'Embarked' column and pickle it to `path`.

    Args:
        df: DataFrame with an 'Embarked' column.
        path: Destination pickle file, written through `save_obj`.
    """
    enc = _make_one_hot_encoder()
    enc.fit(df[['Embarked']])
    save_obj(enc, path)

def train_sex_encoder(df, path):
    """Fit a LabelEncoder on the 'Sex' column and pickle it to `path`.

    Args:
        df: DataFrame with a 'Sex' column.
        path: Destination pickle file, written through `save_obj`.
    """
    # `fit` returns the encoder itself, so fitting and naming can share a line.
    fitted_encoder = LabelEncoder().fit(df['Sex'])
    save_obj(fitted_encoder, path)

def train_encoders(df, path):
    """Fit and save the Pclass, Embarked and Sex encoders under folder `path`.

    Args:
        df: DataFrame passed to each encoder trainer.
        path: Folder receiving 'pclass_encoder.pkl', 'embarked_encoder.pkl'
            and 'sex_encoder.pkl'.
    """
    # (trainer, output file name) pairs, run in this order.
    trainers = (
        (train_pclass_encoder, 'pclass_encoder.pkl'),
        (train_embarked_encoder, 'embarked_encoder.pkl'),
        (train_sex_encoder, 'sex_encoder.pkl'),
    )
    for trainer, filename in trainers:
        trainer(df, os.path.join(path, filename))
    print('> Trained encoders.')

def apply_sex_encoder(df, path):
    """Return a copy of `df` whose 'Sex' column is replaced by its encoded values.

    Args:
        df: DataFrame with a 'Sex' column; left unmodified.
        path: Pickle file holding the fitted encoder.

    Returns:
        A new DataFrame with the 'Sex' column transformed by the encoder.
    """
    enc = load_obj(path)
    # Copy first so the caller's frame keeps its original 'Sex' values.
    data = df.copy()
    data['Sex'] = enc.transform(data['Sex'])
    return data

def apply_pclass_encoder(df, path):
    """Return a copy of `df` with 'Pclass' replaced by one-hot columns.

    One column named 'Pclass_<category>' is appended per category stored in
    the fitted encoder, and the original 'Pclass' column is dropped.

    Args:
        df: DataFrame with a 'Pclass' column; left unmodified.
        path: Pickle file holding the fitted encoder.

    Returns:
        A new DataFrame with the encoded columns and without 'Pclass'.
    """
    enc = load_obj(path)
    new_cols = ['Pclass_' + str(c) for c in enc.categories_[0]]
    # Copy first: adding columns through an alias would also add them to the
    # caller's frame.
    data = df.copy()
    data[new_cols] = enc.transform(data[['Pclass']])
    return data.drop(['Pclass'], axis=1)

def apply_embarked_encoder(df, path):
    """Return a copy of `df` with 'Embarked' replaced by one-hot columns.

    One column named 'Embarked_<category>' is appended per category stored in
    the fitted encoder, and the original 'Embarked' column is dropped.

    Args:
        df: DataFrame with an 'Embarked' column; left unmodified.
        path: Pickle file holding the fitted encoder.

    Returns:
        A new DataFrame with the encoded columns and without 'Embarked'.
    """
    enc = load_obj(path)
    new_cols = ['Embarked_' + str(c) for c in enc.categories_[0]]
    # Copy first: adding columns through an alias would also add them to the
    # caller's frame.
    data = df.copy()
    data[new_cols] = enc.transform(data[['Embarked']])
    return data.drop(['Embarked'], axis=1)

def apply_encoders(df, path):
    """Run the saved Pclass, Embarked and Sex encoders over `df`, in that order.

    Args:
        df: DataFrame handed to the first encoder step.
        path: Folder containing 'pclass_encoder.pkl', 'embarked_encoder.pkl'
            and 'sex_encoder.pkl'.

    Returns:
        The frame returned by the last encoder step.
    """
    # (step, encoder file name) pairs; each step receives the previous output.
    pipeline = (
        (apply_pclass_encoder, 'pclass_encoder.pkl'),
        (apply_embarked_encoder, 'embarked_encoder.pkl'),
        (apply_sex_encoder, 'sex_encoder.pkl'),
    )
    encoded = df
    for step, filename in pipeline:
        encoded = step(encoded, os.path.join(path, filename))
    print('> Applied encoders.')
    return encoded

def process_train(X_train, path):
    """Fit the imputers and encoders on `X_train`, apply them, return the values.

    Args:
        X_train: Training feature frame.
        path: Folder where the imputer statistics ('train_dict.pkl') and the
            encoder pickles are written and then read back.

    Returns:
        The `.values` of the imputed and encoded frame.
    """
    imputer_file = os.path.join(path, 'train_dict.pkl')

    # Imputation: learn the statistics, then fill the gaps.
    train_imputers(X_train, imputer_file)
    imputed = apply_imputers(X_train, imputer_file)

    # Encoding: the encoders are fitted on the already-imputed frame.
    train_encoders(imputed, path)
    return apply_encoders(imputed, path).values

def apply_transform(df, path):
    """Apply the previously saved imputers and encoders to `df`.

    Args:
        df: Feature frame to transform.
        path: Folder containing 'train_dict.pkl' and the encoder pickles.

    Returns:
        The `.values` of the imputed and encoded frame.
    """
    imputed = apply_imputers(df, os.path.join(path, 'train_dict.pkl'))
    return apply_encoders(imputed, path).values

def save_train_files(path, arrays=None):
    """Write each dataset as a comma-delimited '<name>.csv' file under `path`.

    Args:
        path: Output folder; created (with parents) when it does not exist.
        arrays: Optional mapping of file stem to array-like. When omitted, the
            notebook globals `X_train_processed`, `y_train`, `label_train`,
            `X_test_processed`, `y_test` and `label_test` are written as
            'X_train', 'y_train', 'label_train', 'X_test', 'y_test' and
            'label_test' respectively.
    """
    if arrays is None:
        # Default: the six datasets produced by the notebook's driver cell.
        arrays = {
            'X_train': X_train_processed,
            'y_train': y_train,
            'label_train': label_train,
            'X_test': X_test_processed,
            'y_test': y_test,
            'label_test': label_test,
        }

    # savetxt does not create folders; an empty `path` means the current folder.
    if path:
        os.makedirs(path, exist_ok=True)

    for stem, values in arrays.items():
        savetxt(os.path.join(path, stem + '.csv'), values, delimiter=',')

In [128]:
# Load the raw training CSV (the first row holds the column names).
data = pd.read_csv(data_train_path, header=0)

# Hold out 10% of the rows for evaluation.
X_train, y_train, label_train, X_test, y_test, label_test = split_for_eval(data, 0.1)

# Fit the imputers/encoders on the training split, then reuse the saved
# artefacts from the checkpoint folder to transform the held-out split.
checkpoint_dir = os.path.join(workdir, exp_prefix, chk_prefix)
X_train_processed = process_train(X_train, checkpoint_dir)
X_test_processed = apply_transform(X_test, checkpoint_dir)

# Write the processed splits as CSV files.
processed_dir = os.path.join(workdir, data_prefix, 'processed')
save_train_files(processed_dir)

> Trained imputers.
> Applied imputers.
> Trained encoders.
> Applied encoders.
> Applied imputers.
> Applied encoders.
