In [1]:
import logging
import pickle
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from settings import settings

from data import process_data
from model import (
    train_model,
    get_model_metrics,
    get_inference,
    get_slices,
    get_confusion_matrix,
)

In [2]:
logging.basicConfig(
    filename="pipeline.log",
    level=logging.INFO,
    filemode="a",
    format="%(name)s - %(levelname)s - %(message)s",
)


In [4]:
data = pd.read_csv(settings["data_path"])
target = settings["target_variable"]
cat_features = settings["cat_features"]
model_save_path = settings["model_path"]
model_pkl = settings["pkl_files"]["model"]
encoder_pkl = settings["pkl_files"]["encoder"]
labelizer_pkl = settings["pkl_files"]["labelizer"]
slices_save_path = settings["slices_save_path"]

In [5]:
train, test = train_test_split(
    data, test_size=0.20, random_state=42, stratify=data[target]
)

In [7]:
X_train, y_train, encoder, lb = process_data(
    train, categorical_features=cat_features, label=target, training=True
)



In [8]:
X_test, y_test, encoder, lb = process_data(
    test,
    categorical_features=cat_features,
    label="salary",
    training=False,
    encoder=encoder,
    lb=lb,
)

In [9]:
if os.path.isfile(os.path.join(model_save_path, model_pkl)):
    # Load model
    model = pickle.load(open(os.path.join(model_save_path, model_pkl), "rb"))
    # Load encoder
    encoder = pickle.load(open(os.path.join(model_save_path, encoder_pkl), "rb"))
    # Load labalizer
    lb = pickle.load(open(os.path.join(model_save_path, labelizer_pkl), "rb"))
# If not, train and save a model.
else:
    model = train_model(X_train, y_train)
    # save model  to disk in ./model folder
    pickle.dump(model, open(os.path.join(model_save_path, model_pkl), "wb"))
    pickle.dump(encoder, open(os.path.join(model_save_path, encoder_pkl), "wb"))
    pickle.dump(lb, open(os.path.join(model_save_path, labelizer_pkl), "wb"))
    logging.info("Model saved to disk: %d", model_save_path)

--- Logging error ---
Traceback (most recent call last):
  File "C:\Users\gchri\anaconda3\envs\udacity\lib\logging\__init__.py", line 1085, in emit
    msg = self.format(record)
  File "C:\Users\gchri\anaconda3\envs\udacity\lib\logging\__init__.py", line 929, in format
    return fmt.format(record)
  File "C:\Users\gchri\anaconda3\envs\udacity\lib\logging\__init__.py", line 668, in format
    record.message = record.getMessage()
  File "C:\Users\gchri\anaconda3\envs\udacity\lib\logging\__init__.py", line 373, in getMessage
    msg = msg % self.args
TypeError: %d format: a number is required, not dict
Call stack:


BEST PARAMS:  {'bootstrap': False, 'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 100}


  File "C:\Users\gchri\anaconda3\envs\udacity\lib\runpy.py", line 194, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "C:\Users\gchri\anaconda3\envs\udacity\lib\runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "C:\Users\gchri\anaconda3\envs\udacity\lib\site-packages\ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "C:\Users\gchri\anaconda3\envs\udacity\lib\site-packages\traitlets\config\application.py", line 1043, in launch_instance
    app.start()
  File "C:\Users\gchri\anaconda3\envs\udacity\lib\site-packages\ipykernel\kernelapp.py", line 725, in start
    self.io_loop.start()
  File "C:\Users\gchri\anaconda3\envs\udacity\lib\site-packages\tornado\platform\asyncio.py", line 195, in start
    self.asyncio_loop.run_forever()
  File "C:\Users\gchri\anaconda3\envs\udacity\lib\asyncio\base_events.py", line 570, in run_forever
    self._run_once()
  File "C:\Users\gchri\anaconda3\envs\udacity\lib\asyncio\base

In [10]:
# Getting the predictions
y_pred = get_inference(model, X_test)
# Evaluating the model
precision, recall, fbeta = get_model_metrics(y_test, y_pred)
logging.info("precision: %d, recall: %d, fbeta: %d", precision, recall, fbeta)
# Calculating confusion matrix
cm = get_confusion_matrix(y_test, y_pred)
logging.info("confusion matrix: %d", cm)

--- Logging error ---
Traceback (most recent call last):
  File "C:\Users\gchri\anaconda3\envs\udacity\lib\logging\__init__.py", line 1085, in emit
    msg = self.format(record)
  File "C:\Users\gchri\anaconda3\envs\udacity\lib\logging\__init__.py", line 929, in format
    return fmt.format(record)
  File "C:\Users\gchri\anaconda3\envs\udacity\lib\logging\__init__.py", line 668, in format
    record.message = record.getMessage()
  File "C:\Users\gchri\anaconda3\envs\udacity\lib\logging\__init__.py", line 373, in getMessage
    msg = msg % self.args
TypeError: %d format: a number is required, not numpy.ndarray
Call stack:
  File "C:\Users\gchri\anaconda3\envs\udacity\lib\runpy.py", line 194, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "C:\Users\gchri\anaconda3\envs\udacity\lib\runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "C:\Users\gchri\anaconda3\envs\udacity\lib\site-packages\ipykernel_launcher.py", line 17, in <module>
    app.la

In [11]:
precision

0.7801857585139319

In [12]:
recall

0.6428571428571429

In [13]:
fbeta

0.7048951048951049

In [14]:
# Evaluating the slices of all categorical features
# and saving the results in a csv file and in pipeline log
for feature in cat_features:
    perf_df = get_slices(test, feature, y_test, y_pred)
    perf_df.to_csv(slices_save_path, mode="a", index=False)
    logging.info("Performance on slice %d", feature)
    logging.info(perf_df)

--- Logging error ---
Traceback (most recent call last):
  File "C:\Users\gchri\anaconda3\envs\udacity\lib\logging\__init__.py", line 1085, in emit
    msg = self.format(record)
  File "C:\Users\gchri\anaconda3\envs\udacity\lib\logging\__init__.py", line 929, in format
    return fmt.format(record)
  File "C:\Users\gchri\anaconda3\envs\udacity\lib\logging\__init__.py", line 668, in format
    record.message = record.getMessage()
  File "C:\Users\gchri\anaconda3\envs\udacity\lib\logging\__init__.py", line 373, in getMessage
    msg = msg % self.args
TypeError: %d format: a number is required, not str
Call stack:
  File "C:\Users\gchri\anaconda3\envs\udacity\lib\runpy.py", line 194, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "C:\Users\gchri\anaconda3\envs\udacity\lib\runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "C:\Users\gchri\anaconda3\envs\udacity\lib\site-packages\ipykernel_launcher.py", line 17, in <module>
    app.launch_new_i

--- Logging error ---
Traceback (most recent call last):
  File "C:\Users\gchri\anaconda3\envs\udacity\lib\logging\__init__.py", line 1085, in emit
    msg = self.format(record)
  File "C:\Users\gchri\anaconda3\envs\udacity\lib\logging\__init__.py", line 929, in format
    return fmt.format(record)
  File "C:\Users\gchri\anaconda3\envs\udacity\lib\logging\__init__.py", line 668, in format
    record.message = record.getMessage()
  File "C:\Users\gchri\anaconda3\envs\udacity\lib\logging\__init__.py", line 373, in getMessage
    msg = msg % self.args
TypeError: %d format: a number is required, not str
Call stack:
  File "C:\Users\gchri\anaconda3\envs\udacity\lib\runpy.py", line 194, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "C:\Users\gchri\anaconda3\envs\udacity\lib\runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "C:\Users\gchri\anaconda3\envs\udacity\lib\site-packages\ipykernel_launcher.py", line 17, in <module>
    app.launch_new_i

--- Logging error ---
Traceback (most recent call last):
  File "C:\Users\gchri\anaconda3\envs\udacity\lib\logging\__init__.py", line 1085, in emit
    msg = self.format(record)
  File "C:\Users\gchri\anaconda3\envs\udacity\lib\logging\__init__.py", line 929, in format
    return fmt.format(record)
  File "C:\Users\gchri\anaconda3\envs\udacity\lib\logging\__init__.py", line 668, in format
    record.message = record.getMessage()
  File "C:\Users\gchri\anaconda3\envs\udacity\lib\logging\__init__.py", line 373, in getMessage
    msg = msg % self.args
TypeError: %d format: a number is required, not str
Call stack:
  File "C:\Users\gchri\anaconda3\envs\udacity\lib\runpy.py", line 194, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "C:\Users\gchri\anaconda3\envs\udacity\lib\runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "C:\Users\gchri\anaconda3\envs\udacity\lib\site-packages\ipykernel_launcher.py", line 17, in <module>
    app.launch_new_i

In [15]:
perf_df

Unnamed: 0,feature,feature_value,n_samples,precision,recall,fbeta
0,United-States,native-country,5835,0.787724,0.640777,0.706692
1,Thailand,native-country,7,1.0,1.0,1.0
2,China,native-country,14,0.75,0.75,0.75
3,Nicaragua,native-country,11,0.0,0.0,0.0
4,Mexico,native-country,111,0.5,0.333333,0.4
5,Canada,native-country,28,0.25,0.2,0.222222
6,South,native-country,12,1.0,0.166667,0.285714
7,?,native-country,125,0.787879,0.8125,0.8
8,India,native-country,18,0.7,1.0,0.823529
9,Italy,native-country,13,1.0,0.75,0.857143
