In [17]:
import pandas as pd
from dotenv import load_dotenv
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing	import OneHotEncoder
from datetime import datetime
import pickle
import optuna
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, make_scorer
from sklearn.tree import DecisionTreeClassifier
import json

load_dotenv()
os.chdir(os.getenv("ROOT_DIR"))

from src import utils

In [18]:
# --- PARAMS --- #
prep_id = 'prep_04'
model_id = 'model_04'
data_to_predict_path = os.getenv("DF_NEW_DATA_PATH")

print(data_to_predict_path)

data/raw/test.csv


In [19]:
# --- LOAD DATA --- #
features = ['Pclass', 'Sex', 'Age', 'Name']
target = ['Survived']

df_to_predict = pd.read_csv(data_to_predict_path)
x_predict = df_to_predict[features]
x_predict.head(2)

Unnamed: 0,Pclass,Sex,Age,Name
0,3,male,34.5,"Kelly, Mr. James"
1,3,female,47.0,"Wilkes, Mrs. James (Ellen Needs)"


In [20]:
# --- PREPROCESSING --- #

# sex
with open(os.path.join(os.getenv("ARTIFACTS_PATH"), prep_id, 'encoder_sex_model.pkl'), 'rb') as f:
          encoder_sex_model = pickle.load(f)

x_predict[encoder_sex_model.get_feature_names_out(['Sex'])] = encoder_sex_model.transform(x_predict[['Sex']])
x_predict.drop('Sex', axis=1, inplace=True)

# pclass
with open(os.path.join(os.getenv("ARTIFACTS_PATH"), prep_id, 'encoder_pclass_model.pkl'), 'rb') as f:
		  encoder_pclass_model = pickle.load(f)
                  
x_predict[encoder_pclass_model.get_feature_names_out(['Pclass'])] = encoder_pclass_model.transform(x_predict[['Pclass']])
x_predict.drop('Pclass', axis=1, inplace=True)

# Age
with open(os.path.join(os.getenv("ARTIFACTS_PATH"), prep_id, 'imputer_age_mean.pkl'), 'rb') as f:
		imputer_age_mean = pickle.load(f)

x_predict['Age'] = imputer_age_mean.transform(x_predict[['Age']])

# Name
regex_title = r',\s*(\w*)'
x_predict['Title'] = x_predict['Name'].str.extract(regex_title)

title_mapping = {
    'Mme' : 'Mrs',
    'Mlle': 'Miss',
    'Ms' : 'Miss',
    'Don' : 'Sir',
    'Jonkheer':'Sir',
    'Major':'Military',
    'Capt': 'Military',
    'the' : 'Mrs',
    'Lady': 'Mrs',
    'Col' : 'Military'
}

x_predict['Title'] = x_predict['Title'].replace(title_mapping)

# transform
artifact_path = os.path.join(os.getenv('ARTIFACTS_PATH'), utils.get_nb_name())
with open(os.path.join(os.getenv("ARTIFACTS_PATH"), prep_id, 'encoder_title_model.pkl'), 'rb') as f:
	encoder_title_model = pickle.load(f)
      
x_predict[encoder_title_model.get_feature_names_out(['Title'])] = encoder_title_model.transform(x_predict['Title'].to_numpy().reshape(-1, 1))
x_predict.drop('Title', axis=1, inplace=True)
x_predict.drop('Name', axis=1, inplace=True)


x_predict.head(2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_predict[encoder_sex_model.get_feature_names_out(['Sex'])] = encoder_sex_model.transform(x_predict[['Sex']])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_predict[encoder_sex_model.get_feature_names_out(['Sex'])] = encoder_sex_model.transform(x_predict[['Sex']])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_predict.drop('Sex', axis=1, inplace=True

Unnamed: 0,Age,Sex_female,Sex_male,Pclass_1,Pclass_2,Pclass_3,Title_Dr,Title_Master,Title_Military,Title_Miss,Title_Mr,Title_Mrs,Title_Rev,Title_Sir
0,34.5,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,47.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [21]:
# --- PREDICT --- #
model_path = os.path.join(os.getenv("ARTIFACTS_PATH"), model_id, 'model', 'model.pkl')
with open(model_path, 'rb') as f:
    model = pickle.load(f)

y_predict = model.predict(x_predict)

res = df_to_predict[['PassengerId']]
res['Survived'] = y_predict

pred_path = os.path.join(os.getenv("ARTIFACTS_PATH"), utils.get_nb_name(), 'submission.csv')
os.makedirs(os.path.dirname(pred_path), exist_ok=True)
res.to_csv(pred_path, index=False)
print(f'Predictions saved at {pred_path}')

res.head()

Predictions saved at artifacts/model_04b_predict/submission.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res['Survived'] = y_predict


Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


In [22]:
# --- SUBMIT --- #
import kaggle
from kaggle.api.kaggle_api_extended import KaggleApi
api = KaggleApi()
api.authenticate()

In [23]:
#! kaggle competitions submit -c titanic -f {pred_path} -m "submission model04b_predict"
# public score: 0.78468. Improvement!

100%|██████████████████████████████████████| 2.77k/2.77k [00:01<00:00, 1.85kB/s]
Successfully submitted to Titanic - Machine Learning from Disaster

In [24]:
results = {
    'model_id': model_id,
    'prep_id': prep_id,
	'public_score': 0.78468,
	'date': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
	'comment': 'Improvement from model03b_predict'
}

with open(os.path.join(os.getenv("ARTIFACTS_PATH"), utils.get_nb_name(), 'results.json'), 'w') as f:
	json.dump(results, f)

print(f'Results saved at {os.path.join(os.getenv("ARTIFACTS_PATH"), utils.get_nb_name(), "results.json")}')

Results saved at artifacts/model_04b_predict/results.json
