## Data Split: Current and Future data

References:
- https://github.com/andrewwlong/diabetes_readmission/blob/master/diabetes_project.ipynb
- https://github.com/iterative/course-ds-base/blob/step-1-organize-ml-project_SOLUTION/notebooks/step-1-organize-ml-project.ipynb

In [6]:
%load_ext autoreload
%autoreload 2

## Download Data

In [84]:
%%writefile ../src/stages/download_data.py

import os
import wget
import yaml
import argparse

from src.utils.logs import get_logger

def download_data(config_path):
    
    with open(config_path) as conf_file:
        config = yaml.safe_load(conf_file)

    logger = get_logger('DOWNLOAD DATA', log_level=config['base']['log_level'])

    url = config['data_load']['source']
        
    logger.info(f'Downloading from {url}')
    local_folder = config['data_load']['local_folder']
    local_filename = os.path.join(local_folder, config['data_load']['local_name'])


    # Create the folder if it does not exist
    os.makedirs(local_folder, exist_ok=True)
    
    
    # Download the file
    wget.download(url, local_filename)
    logger.info(f"Downloaded {local_filename}")


if __name__ == '__main__':

    args_parser = argparse.ArgumentParser()
    args_parser.add_argument('--config', dest='config', required=True)
    args = args_parser.parse_args()

    download_data(config_path=args.config)

Overwriting ../src/stages/download_data.py


## Load Data

In [101]:
%%writefile ../src/stages/load_data.py

import argparse
import pandas as pd
import yaml
import os

from src.utils.utils import calc_prevalence
from src.utils.logs import get_logger

def load_data(config_path):

    with open(config_path) as conf_file:
        config = yaml.safe_load(conf_file)

    logger = get_logger('LOAD DATA', log_level=config['base']['log_level'])

    local_folder = config['data_load']['local_folder']
    local_filename = os.path.join(local_folder, config['data_load']['local_name'])

    # load the csv file
    df = pd.read_csv(local_filename)
    #Here we will label if a patient is likely to be re-admitted within 30 days of discharge.
    df['OUTPUT_LABEL'] = (df.readmitted == '<30').astype('int')
    logger.info('Prevalence:%.3f'%calc_prevalence(df['OUTPUT_LABEL'].values))

    # shuffle the samples
    df = df.sample(n = len(df), random_state = 42)
    df = df.reset_index(drop = True)
    
    # Save 30% of the data as future data 
    df_future=df.sample(frac=0.30,random_state=42)
    df_current = df.drop(df_future.index)
    
    logger.info('Split size current: %.3f'%(len(df_current)/len(df)))
    logger.info('Split size future: %.3f'%(len(df_future)/len(df)))

    logger.info('Prevalence current:%.3f'%calc_prevalence(df_current['OUTPUT_LABEL'].values))
    logger.info('Prevalence future:%.3f'%calc_prevalence(df_future['OUTPUT_LABEL'].values))
    
    # save raw data
    
    df_current.to_csv(config['data_load']['dataset_csv'], index=False)
    df_future.to_csv(config['data_load']['future_csv'], index=False)
    


if __name__ == '__main__':

    args_parser = argparse.ArgumentParser()
    args_parser.add_argument('--config', dest='config', required=True)
    args = args_parser.parse_args()

    load_data(config_path=args.config)

Overwriting ../src/stages/load_data.py


## Append new data for continuous learning

In [103]:
%%writefile ../src/stages/append_new_data.py

import wget
import pandas as pd
import argparse
import yaml
import shutil

from src.utils.logs import get_logger

def append_new_data(config_path):

    with open(config_path) as conf_file:
        config = yaml.safe_load(conf_file)

    logger = get_logger('APPEND NEW DATA', log_level=config['base']['log_level'])

    if not config['base']['continuous_learning']:
        logger.info("Continuous learning NOT enabled")
        # just rename the output
        
        shutil.copy(config['data_load']['dataset_csv'], config['data_load']['appended_dataset_csv'])
        return

    logger.info("continuous learning")
    new_data_url = config['data_load']['source_new']
    local_filename = config['data_load']['new_data']
    
    
    # Download the file
    logger.info(f'Downloading from {new_data_url}')
    wget.download(new_data_url, local_filename)
    logger.info(f"Downloaded {local_filename}")

    ## Append to the current data
    current_csv_name = config['data_load']['dataset_csv']
    df = pd.read_csv(current_csv_name)
    logger.info(f"Shape before adding new data {df.shape}")
    
    new_df = pd.read_csv(config['data_load']['new_data'])
    logger.info(f"Shape of new data {new_df.shape}")

    ## combine the two dataframe and save it

    result_df = pd.concat([df, new_df], ignore_index=True)
    
    logger.info(f"Shape after adding new data {result_df.shape}")

    current_csv_name = config['data_load']['appended_dataset_csv']
    result_df.to_csv(current_csv_name, index=False)
    
    
if __name__ == '__main__':

    args_parser = argparse.ArgumentParser()
    args_parser.add_argument('--config', dest='config', required=True)
    args = args_parser.parse_args()

    append_new_data(config_path=args.config)


    

Overwriting ../src/stages/append_new_data.py


## Features engineering

In [18]:
!mkdir ..\src\stages

In [36]:
!mkdir ..\data\processed

In [92]:
%%writefile ../src/stages/featurize.py

import argparse
import pandas as pd
from typing import Text
import yaml
import numpy as np
from sklearn.preprocessing import OneHotEncoder
import joblib
import os

from src.utils.logs import get_logger

def featurize(config_path, train=False):

    with open(config_path) as conf_file:
        config = yaml.safe_load(conf_file)

    logger = get_logger('FEATURE ENG', log_level=config['base']['log_level'])

    # Create the folder if it does not exist
    os.makedirs(config['featurize']['folder_processed'], exist_ok=True)
    

    #df = pd.read_csv(path)
    if train:
        logger.info('Using training data, fittin encoder')
        df = pd.read_csv(config['data_load']['dataset_csv'])
    else:
        logger.info('new data, use previously fitted encoder')
        df = pd.read_csv(config['data_load']['inference_dataset_csv'])
    

    # replace ? with nan
    df = df.replace('?',np.nan)
    
    cols_num = ['time_in_hospital','num_lab_procedures', 'num_procedures', 'num_medications',
           'number_outpatient', 'number_emergency', 'number_inpatient','number_diagnoses']

    cols_cat = ['race', 'gender', 
           'max_glu_serum', 'A1Cresult',
           'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
           'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
           'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
           'tolazamide', 'insulin',
           'glyburide-metformin', 'glipizide-metformin',
           'glimepiride-pioglitazone', 'metformin-rosiglitazone',
           'metformin-pioglitazone', 'change', 'diabetesMed','payer_code']
    
    

    # handle missing values
    logger.info('Handling missing values')
    df['race'] = df['race'].fillna('UNK')
    df['payer_code'] = df['payer_code'].fillna('UNK')
    df['medical_specialty'] = df['medical_specialty'].fillna('UNK')

    df['max_glu_serum'] = df['max_glu_serum'].fillna('UNK')
    df['A1Cresult'] = df['A1Cresult'].fillna('UNK')

    # Bucket Medical Speciality
    top_10 = ['UNK','InternalMedicine','Emergency/Trauma',\
              'Family/GeneralPractice', 'Cardiology','Surgery-General' ,\
              'Nephrology','Orthopedics',\
              'Orthopedics-Reconstructive','Radiologist']

    # make a new column with duplicated data
    df['med_spec'] = df['medical_specialty'].copy()

    # replace all specialties not in top 10 with 'Other' category
    df.loc[~df.med_spec.isin(top_10),'med_spec'] = 'Other'


    # categorical numeric data into string type, to use with get_dummies
    cols_cat_num = ['admission_type_id', 'discharge_disposition_id', 'admission_source_id']
    df[cols_cat_num] = df[cols_cat_num].astype('str')


    cols_to_encode = cols_cat + cols_cat_num + ['med_spec']
    encoder_path = config['featurize']['encoder_path']

    if train:
        # Initialize OneHotEncoder with handle_unknown set to 'ignore'
        encoder = OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')

        # Fit the encoder on your data
        logger.info('fitting encoder')
        encoder.fit(df[cols_to_encode])

        # Serialize the encoder to a file
        joblib.dump(encoder, encoder_path)
        logger.info('Encoder object serialized')
    
    else:
        # Deserialize the encoder from the file
        logger.info('Loadign encoder object')
        encoder = joblib.load(encoder_path)

    # Use the encoder to transform new data
    new_encoded_array = encoder.transform(df[cols_to_encode])
    new_encoded_df = pd.DataFrame(new_encoded_array, columns=encoder.get_feature_names_out(cols_to_encode))

    df = pd.concat([df,new_encoded_df], axis = 1)

    cols_all_cat = list(new_encoded_df.columns)

    age_id = {'[0-10)':0, 
              '[10-20)':10, 
              '[20-30)':20, 
              '[30-40)':30, 
              '[40-50)':40, 
              '[50-60)':50,
              '[60-70)':60, 
              '[70-80)':70, 
              '[80-90)':80, 
              '[90-100)':90}
    df['age_group'] = df.age.replace(age_id)

    df['has_weight'] = df.weight.notnull().astype('int')
    cols_extra = ['age_group','has_weight']


    logger.info(f'Total number of features: {len(cols_num + cols_all_cat + cols_extra)}')
    logger.info(f'Numerical Features:  {len(cols_num)}')
    logger.info(f'Categorical Features: {len(cols_all_cat)}')
    logger.info(f'Extra features: {len(cols_extra)}')
    logger.info(f'Data shape:{df.shape}')
    
    col2use = cols_num + cols_all_cat + cols_extra
    featured_dataset = df[col2use + ['OUTPUT_LABEL']]
    features_path = config['featurize']['features_path']
    #df_data.to_csv("../data/processed/featured.csv")
    featured_dataset.to_csv(features_path, index=False)

if __name__ == '__main__':

    args_parser = argparse.ArgumentParser()
    args_parser.add_argument('--config', dest='config', required=True)
    args_parser.add_argument('--train', action='store_true', help='Fit and transform encoder')
    args = args_parser.parse_args()

    featurize(config_path=args.config, train=args.train)


Overwriting ../src/stages/featurize.py


In [23]:
from src.stages.featurize import featurize

#featurize('../params.yaml', fit_encoder=True)


In [28]:
!cd ..

In [31]:
!dir

 Volume in drive D has no label.
 Volume Serial Number is A6CB-3452

 Directory of D:\ik_capstone_v2\notebooks

30-07-2024  08:16    <DIR>          .
30-07-2024  08:16    <DIR>          ..
30-07-2024  07:42    <DIR>          .ipynb_checkpoints
30-07-2024  08:16            32,678 step-0-prototype.ipynb
               1 File(s)         32,678 bytes
               3 Dir(s)  273,763,733,504 bytes free


In [29]:
!python src\stages\featurize.py --config ..\params.yaml

C:\Users\shint\AppData\Local\Programs\Python\Python310\python.exe: can't open file 'D:\\ik_capstone_v2\\notebooks\\src\\stages\\featurize.py': [Errno 2] No such file or directory


## Building Training/Validation/Test Samples

### Handling imbalance

Typically, it is better to balance the data in some way to give the positives more weight. There are 3 strategies that are typically utilized: - sub-sample the more dominant class: use a random subset of the negatives - over-sample the imbalanced class: use the same positive samples multiple times - create synthetic positive data

Usually, you will want to use the latter two methods if you only have a handful of positive cases. Since we have a few thousand positive cases, let's use the sub-sample approach. Here, we will create a balanced training data set that has 50% positive and 50% negative. You can also play with this ratio to see if you can get an improvement.

In [89]:
%%writefile ../src/stages/data_split.py

import argparse
import yaml
import pandas as pd
from src.utils.utils import calc_prevalence
from sklearn.preprocessing import StandardScaler
import pickle

from src.utils.logs import get_logger

def data_split(config_path):

    with open(config_path) as conf_file:
        config = yaml.safe_load(conf_file)

    logger = get_logger('DATA SPLIT', log_level=config['base']['log_level'])

    df_data = pd.read_csv(config['featurize']['features_path'])


    # Save 30% of the data as validation and test data 
    df_valid_test=df_data.sample(frac=0.30,random_state=42)
    logger.info('Split size: %.3f'%(len(df_valid_test)/len(df_data)))

    df_test = df_valid_test.sample(frac = 0.5, random_state = 42)
    df_valid = df_valid_test.drop(df_test.index)

    # use the rest of the data as training data
    df_train_all=df_data.drop(df_valid_test.index)


    # Fit the scaler using all training data
    scaler  = StandardScaler()
    #X_train_all = df_train_all[].values.astype('float32')
    target_column=config['featurize']['target_column']
    
    X_train_all = df_train_all.drop(target_column, axis=1).values.astype('float32')
    scaler.fit(X_train_all)
    scalerfile = config['data_split']['scaler_path']
    pickle.dump(scaler, open(scalerfile, 'wb'))

    logger.info('Test prevalence(n = %d):%.3f'%(len(df_test),calc_prevalence(df_test.OUTPUT_LABEL.values)))
    logger.info('Valid prevalence(n = %d):%.3f'%(len(df_valid),calc_prevalence(df_valid.OUTPUT_LABEL.values)))
    logger.info('Train all prevalence(n = %d):%.3f'%(len(df_train_all), calc_prevalence(df_train_all.OUTPUT_LABEL.values)))
    
    # Handling imbalance
    # use the sub-sample approach. Here, we will create a balanced training data set that has 50% positive and 50% negative.
    # You can also play with this ratio to see if you can get an improvement.
    
    # split the training data into positive and negative
    rows_pos = df_train_all.OUTPUT_LABEL == 1
    df_train_pos = df_train_all.loc[rows_pos]
    df_train_neg = df_train_all.loc[~rows_pos]

    # merge the balanced data
    df_train = pd.concat([df_train_pos, df_train_neg.sample(n = len(df_train_pos), random_state = 42)],axis = 0)

    # shuffle the order of training samples 
    df_train = df_train.sample(n = len(df_train), random_state = 42).reset_index(drop = True)

    logger.info('Train balanced prevalence(n = %d):%.3f'%(len(df_train), calc_prevalence(df_train.OUTPUT_LABEL.values)))


    
    train_csv_path = config['data_split']['trainset_path']
    validset_path = config['data_split']['validset_path']
    testset_path = config['data_split']['testset_path']
    train_unbalanced_path = config['data_split']['train_unbalanced_path']
    
    df_train_all.to_csv(train_unbalanced_path, index=False)
    df_train.to_csv(train_csv_path, index=False)
    df_valid.to_csv(validset_path, index=False)
    df_test.to_csv(testset_path, index=False)

if __name__ == '__main__':

    args_parser = argparse.ArgumentParser()
    args_parser.add_argument('--config', dest='config', required=True)
    args = args_parser.parse_args()

    data_split(config_path=args.config)

Overwriting ../src/stages/data_split.py


In [28]:
%load_ext autoreload
%autoreload 2

from data_split import data_split

data_split("../data/processed/featured.csv")


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Split size: 0.300
Test prevalence(n = 10686):0.110
Valid prevalence(n = 10685):0.114
Train all prevalence(n = 49865):0.113
Train balanced prevalence(n = 11222):0.500


## Train

In [40]:
!mkdir ..\src\train

In [44]:
!mkdir ..\models

In [77]:
%%writefile ../src/train/train.py

import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import f1_score, make_scorer
from typing import Dict, Text
import pickle

from src.utils.logs import get_logger


class UnsupportedClassifier(Exception):

    def __init__(self, estimator_name):

        self.msg = f'Unsupported estimator {estimator_name}'
        super().__init__(self.msg)


def get_supported_estimator() -> Dict:
    """
    Returns:
        Dict: supported classifiers
    """

    return {
        'logreg': LogisticRegression,
        'svm': SVC,
        'knn': KNeighborsClassifier
    }


def train(df: pd.DataFrame, target_column: Text,
          estimator_name: Text, scaler_path: Text, param_grid: Dict,  cv: int):
    """Train model.
    Args:
        df {pandas.DataFrame}: dataset
        target_column {Text}: target column name
        estimator_name {Text}: estimator name
        param_grid {Dict}: grid parameters
        cv {int}: cross-validation value
    Returns:
        trained model
    """

    estimators = get_supported_estimator()

    if estimator_name not in estimators.keys():
        raise UnsupportedClassifier(estimator_name)

    estimator = estimators[estimator_name]()
    f1_scorer = make_scorer(f1_score, average='weighted')
    clf = GridSearchCV(estimator=estimator,
                       param_grid=param_grid,
                       cv=cv,
                       verbose=1,
                       scoring=f1_scorer)
    # Get X and Y
    y_train = df.loc[:, target_column].values.astype('int32')
    X_train = df.drop(target_column, axis=1).values.astype('float32')


    # load the scaler
    scaler = pickle.load(open(scaler_path, 'rb'))

    X_train_tf = scaler.transform(X_train)
    
    clf.fit(X_train, y_train)

    return clf


Overwriting ../src/train/train.py


In [90]:
%%writefile ../src/stages/train.py


import argparse
import joblib

import pandas as pd
from typing import Text
import yaml
import os

from src.train.train import train
from src.utils.logs import get_logger




def train_model(config_path: Text) -> None:


    with open(config_path) as conf_file:
        config = yaml.safe_load(conf_file)

    logger = get_logger('TRAIN', log_level=config['base']['log_level'])

    estimator_name = config['train']['estimator_name']

    train_df = pd.read_csv(config['data_split']['trainset_path'])



    model = train(
        df=train_df,
        target_column=config['featurize']['target_column'],
        estimator_name=estimator_name,
        scaler_path=config['data_split']['scaler_path'],
        param_grid=config['train']['estimators'][estimator_name]['param_grid'],
        cv=config['train']['cv']
    )
    
    logger.info(f'Best score: {model.best_score_}')

    
    # Create the folder if it does not exist
    os.makedirs('models', exist_ok=True)
    logger.info('Save model')
    models_path = config['train']['model_path']
    joblib.dump(model, models_path)


if __name__ == '__main__':

    args_parser = argparse.ArgumentParser()
    args_parser.add_argument('--config', dest='config', required=True)
    args = args_parser.parse_args()

    train_model(config_path=args.config)



Overwriting ../src/stages/train.py


In [35]:
!python ..\src\stages\train.py

Traceback (most recent call last):
  File "D:\IK_CAPSTONE_PROJECT\src\stages\train.py", line 8, in <module>
    from src.train.train import train
ModuleNotFoundError: No module named 'src'


## Evaluate

In [49]:
!mkdir ..\src\report

In [52]:
!mkdir ..\reports

In [50]:
%%writefile ../src/report/visualize.py
import itertools
import matplotlib.colors
import matplotlib.pyplot as plt
import numpy as np
from typing import List, Text


def plot_confusion_matrix(cm: np.array,
                          target_names: List[Text],
                          title: Text = 'Confusion matrix',
                          cmap: matplotlib.colors.LinearSegmentedColormap = None,
                          normalize: bool = True):
    """
    given a sklearn confusion matrix (cm), make a nice plot

    Arguments
    ---------
    cm:           confusion matrix from sklearn.metrics.confusion_matrix

    target_names: given classification classes such as [0, 1, 2]
                  the class names, for example: ['high', 'medium', 'low']

    title:        the text to display at the top of the matrix

    cmap:         the gradient of the values displayed from matplotlib.pyplot.cm
                  see http://matplotlib.org/examples/color/colormaps_reference.html
                  plt.get_cmap('jet') or plt.cm.Blues

    normalize:    If False, plot the raw numbers
                  If True, plot the proportions

    Usage
    -----
    plot_confusion_matrix(cm           = cm,                  # confusion matrix created by
                                                              # sklearn.metrics.confusion_matrix
                          normalize    = True,                # show proportions
                          target_names = y_labels_vals,       # list of names of the classes
                          title        = best_estimator_name) # title of graph

    Citiation
    ---------
    http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html

    """

    accuracy = np.trace(cm) / float(np.sum(cm))
    misclass = 1 - accuracy

    if cmap is None:
        cmap = plt.get_cmap('Blues')

    plt.figure(figsize=(8, 6))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    if target_names is not None:
        tick_marks = np.arange(len(target_names))
        plt.xticks(tick_marks, target_names, rotation=45)
        plt.yticks(tick_marks, target_names)

    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

    thresh = cm.max() / 1.5 if normalize else cm.max() / 2
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        if normalize:
            plt.text(j, i, "{:0.4f}".format(cm[i, j]),
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")
        else:
            plt.text(j, i, "{:,}".format(cm[i, j]),
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label\naccuracy={:0.4f}; misclass={:0.4f}'.format(accuracy, misclass))

    return plt.gcf()

Writing ../src/report/visualize.py


In [91]:
%%writefile ../src/stages/evaluate.py
import argparse
import joblib
import json
import pandas as pd
from pathlib import Path
from sklearn.metrics import confusion_matrix, f1_score
from typing import Text, Dict
import yaml
import os

from src.report.visualize import plot_confusion_matrix
from src.utils.logs import get_logger

def convert_to_labels(indexes, labels):
    result = []
    for i in indexes:
        result.append(labels[i])
    return result

def write_confusion_matrix_data(y_true, predicted, labels, filename):
    assert len(predicted) == len(y_true)
    predicted_labels = convert_to_labels(predicted, labels)
    true_labels = convert_to_labels(y_true, labels)
    cf = pd.DataFrame(list(zip(true_labels, predicted_labels)), columns=["y_true", "predicted"])
    cf.to_csv(filename, index=False)

def evaluate_model(config_path: Text) -> None:
    """Evaluate model.
    Args:
        config_path {Text}: path to config
    """

    with open(config_path) as conf_file:
        config = yaml.safe_load(conf_file)

    logger = get_logger('EVALUATE', log_level=config['base']['log_level'])

    logger.info('Load model')
    model_path = config['train']['model_path']
    model = joblib.load(model_path)

    logger.info('Load test dataset')
    test_df = pd.read_csv(config['data_split']['testset_path'])

    logger.info('Evaluate (build report)')
    target_column=config['featurize']['target_column']
    y_test = test_df.loc[:, target_column].values
    X_test = test_df.drop(target_column, axis=1).values

    prediction = model.predict(X_test)
    f1 = f1_score(y_true=y_test, y_pred=prediction, average='macro')

    labels = ['Yes', 'No']
    cm = confusion_matrix(prediction, y_test)
    report = {
        'f1': f1,
        'cm': cm,
        'actual': y_test,
        'predicted': prediction
    }

    logger.info('Save metrics')
    # save f1 metrics file
    reports_folder = Path(config['evaluate']['reports_dir'])
    
    # Create the folder if it does not exist
    os.makedirs(reports_folder, exist_ok=True)
    
    metrics_path = reports_folder / config['evaluate']['metrics_file']

    json.dump(
        obj={'f1_score': report['f1']},
        fp=open(metrics_path, 'w')
    )

    logger.info(f'F1 metrics file saved to : {metrics_path}')

    logger.info('Save confusion matrix')
    # save confusion_matrix.png
    plt = plot_confusion_matrix(cm=report['cm'],
                                target_names=['Yes','No'],
                                normalize=False)
    confusion_matrix_png_path = reports_folder / config['evaluate']['confusion_matrix_image']
    plt.savefig(confusion_matrix_png_path)
    logger.info(f'Confusion matrix saved to : {confusion_matrix_png_path}')

    confusion_matrix_data_path = reports_folder / config['evaluate']['confusion_matrix_data']
    write_confusion_matrix_data(y_test, prediction, labels=labels, filename=confusion_matrix_data_path)
    logger.info(f'Confusion matrix data saved to : {confusion_matrix_data_path}')


if __name__ == '__main__':

    args_parser = argparse.ArgumentParser()
    args_parser.add_argument('--config', dest='config', required=True)
    args = args_parser.parse_args()

    evaluate_model(config_path=args.config)

Overwriting ../src/stages/evaluate.py


In [31]:
%load_ext autoreload
%autoreload 2

from train import train

train()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Training All shapes: (49865, 145)
Training shapes: (11222, 145) (11222,)
Validation shapes: (10685, 145) (10685,)


In [36]:
!where python

C:\Users\shint\.pyenv\pyenv-win\versions\3.9.0\python.exe
D:\IK_CAPSTONE_PROJECT\medi-venv\Scripts\python.exe
C:\Users\shint\AppData\Local\Programs\Python\Python310\python.exe
C:\Users\shint\AppData\Local\Microsoft\WindowsApps\python.exe
