In [1]:
import pandas as pd
import pickle
from numpy import savetxt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

# Params
target = 'Survived'
features = ['Sex', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
idx = 'PassengerId'

# Paths 
data_train_path = "/home/walter/Documents/personal_projects/new-titan/data/raw/train.csv"
data_predict_path = '/home/walter/Documents/personal_projects/new-titan/data/raw/test.csv'
submit_path = '/home/walter/Documents/personal_projects/new-titan/notebooks/experiments/exp_02/submit/submit.csv'


train_dict_path = "/home/walter/Documents/personal_projects/new-titan/notebooks/experiments/exp_02/checkpoint/train_dict.pkl"
pclass_encoder_path = "/home/walter/Documents/personal_projects/new-titan/notebooks/experiments/exp_02/checkpoint/pclass_encoder.pkl"
embarked_encoder_path = "/home/walter/Documents/personal_projects/new-titan/notebooks/experiments/exp_02/checkpoint/embarked_encoder.pkl"
sex_encoder_path = "/home/walter/Documents/personal_projects/new-titan/notebooks/experiments/exp_02/checkpoint/sex_encoder.pkl"

tunned_model_path = '/home/walter/Documents/personal_projects/new-titan/notebooks/experiments/exp_02/artifacts/tunned_model.pkl'
trained_model_path = '/home/walter/Documents/personal_projects/new-titan/notebooks/experiments/exp_02/artifacts/model.pkl'

In [2]:
def divide_for_train(df, test_size):
    X_train, X_test, y_train, y_test = train_test_split(df[features], df[target], test_size=test_size, random_state=100)
    label_train = df.iloc[X_train.index, 0]
    label_test = df.iloc[X_test.index, 0]

    return X_train, y_train, label_train, X_test, y_test, label_test

def train_pclass_encoder(df, path):
    enc = OneHotEncoder(sparse=False, handle_unknown='infrequent_if_exist')
    enc.fit(df[['Pclass']])

    file = open(path, 'wb')
    pickle.dump(enc, file)
    file.close()

def train_embarked_encoder(df, path):
    enc = OneHotEncoder(sparse=False, handle_unknown='infrequent_if_exist')
    enc.fit(df[['Embarked']])

    file = open(path, 'wb')
    pickle.dump(enc, file)
    file.close()

def train_sex_encoder(df, path):
    enc = LabelEncoder()
    enc.fit(df['Sex'])

    file = open(path, 'wb')
    pickle.dump(enc, file)
    file.close()

def apply_sex_encoder(df, path):
    file = open(path, 'rb')
    enc = pickle.load(file)
    file.close()

    out = enc.transform(df['Sex'])
    df['Sex'] = out
    return df

def apply_pclass_encoder(df, path):
    file = open(path, 'rb')
    enc = pickle.load(file)
    file.close()

    new_cols = ['Pclass_' + str(c) for c in enc.categories_[0]]
    df[new_cols] = enc.transform(df[['Pclass']])
    df = df.drop(['Pclass'], axis=1)
    return df

def apply_embarked_encoder(df, path):
    file = open(path, 'rb')
    enc = pickle.load(file)
    file.close()

    new_cols = ['Embarked_' + str(c) for c in enc.categories_[0]]
    df[new_cols] = enc.transform(df[['Embarked']])
    df = df.drop(['Embarked'], axis=1)
    return df

def save_train_dict(obj, path):
    file = open(path, 'wb')
    pickle.dump(obj, file)
    file.close()

def load_train_dict(path):
    file = open(path, 'rb')
    dict = pickle.load(file)
    file.close()
    return dict

def load_model(path):
    pkl_file = open(path, 'rb')
    selected_model = pickle.load(pkl_file)
    pkl_file.close()

    return selected_model

In [3]:
# load
data_predict = pd.read_csv(data_predict_path)
model = load_model(trained_model_path)

In [4]:
# preprocessing
X_test = data_predict[features]
label_test = data_predict[idx]

train_dict = load_train_dict(train_dict_path)
X_test['Age'] = X_test['Age'].fillna(train_dict['age_mean'])
X_test['Embarked'] = X_test['Embarked'].fillna(train_dict['embarked_mode'])
X_test['Fare'] = X_test['Fare'].fillna(train_dict['fare_mean'])
X_test = apply_pclass_encoder(X_test,pclass_encoder_path)
X_test = apply_sex_encoder(X_test, sex_encoder_path)
X_test = apply_embarked_encoder(X_test, embarked_encoder_path)
y_predict = model.predict(X_test)

# build df
df_prediction = pd.DataFrame({'PassengerId': label_test, 'Survived': y_predict})
df_prediction.to_csv(submit_path, header=True, index=None)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['Age'] = X_test['Age'].fillna(train_dict['age_mean'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['Embarked'] = X_test['Embarked'].fillna(train_dict['embarked_mode'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['Fare'] = X_test['Fare'].fillna(train_dict['fare_mean'])
A va