In [1]:
import os

In [2]:
# Show the kernel's working directory before changing it (here: the notebooks/ folder).
%pwd

'/home/ubuntu/learning/mlops/pacmann/lazada-id-reviews/notebooks'

In [3]:
# Change to the project's main directory so the relative paths used below
# (config/, artifacts/, ...) resolve from there.
# Guarded on the directory name: os.chdir("../") is relative, so re-running this
# cell unguarded would climb one more level each time and break later cells.
if os.path.basename(os.getcwd()) == "notebooks":
    os.chdir("../")

In [4]:
# Confirm the working directory is now the project's main directory.
%pwd

'/home/ubuntu/learning/mlops/pacmann/lazada-id-reviews'

### Training Config

This code will be applied in `src/LazadaIDReviews/entity/config_entity.py`.

In [5]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class TrainingConfig:
    """Immutable settings for the training step.

    Instances are built by ``ConfigurationManager.get_training_config`` and
    read by ``Training``.
    """
    # Directory of the training step; created before the config is built.
    root_dir: Path
    # File holding the training inputs (loaded with joblib in the debug cell).
    input_train_path: Path
    # File holding the training targets, loaded with joblib as ``y_train``.
    output_train_path: Path
    # File holding the vectorized training features, loaded with joblib.
    vectorized_train_path: Path
    # Destination of the fitted model written with ``joblib.dump``.
    model_path: Path
    # Passed to ``LogisticRegression`` as ``max_iter``.
    params_max_iter: int
    # Passed to ``LogisticRegression`` as ``solver``.
    params_solver: str
    # Passed to ``LogisticRegression`` as ``n_jobs``.
    params_n_jobs: int

### Training Config Manager

This code will be applied in `src/LazadaIDReviews/config/configurations.py`.

In [6]:
from LazadaIDReviews.constants import CONFIG_FILE_PATH, PARAMS_FILE_PATH
from LazadaIDReviews.utils.common import read_yaml, create_directories

In [7]:
class ConfigurationManager:
    """Read the project's YAML files and build config entities from them."""

    def __init__(self,
                 config_filepath = CONFIG_FILE_PATH,
                 params_filepath = PARAMS_FILE_PATH):
        """Load both YAML files and create the artifacts root directory.

        Args:
            config_filepath: path of the pipeline configuration YAML.
            params_filepath: path of the parameters YAML (MAX_ITER, SOLVER, N_JOBS).
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])

    def get_training_config(self) -> TrainingConfig:
        """Read the training settings and store them as a config entity.

        Creates the training step's root directory, then collects the paths
        from the ``dump_data``, ``vectorize_data`` and ``train_model`` sections
        and the model parameters from the parameters file.

        Returns:
            config: TrainingConfig type
        """
        data_dump_config = self.config.dump_data
        vectorize_config = self.config.vectorize_data
        train_config = self.config.train_model
        train_params = self.params

        create_directories([train_config.root_dir])

        config = TrainingConfig(
            # Wrapped in Path like every other path field, so the value matches
            # the `root_dir: Path` annotation of TrainingConfig.
            root_dir=Path(train_config.root_dir),
            input_train_path=Path(data_dump_config.input_train_path),
            output_train_path=Path(data_dump_config.output_train_path),
            vectorized_train_path=Path(vectorize_config.vectorized_train_path),
            model_path=Path(train_config.model_path),
            params_max_iter=train_params.MAX_ITER,
            params_solver=train_params.SOLVER,
            params_n_jobs=train_params.N_JOBS
        )

        return config

### Perform Training

This code goes in `src/LazadaIDReviews/components/training.py`.

For the initial run we use logistic regression as a baseline; later on we could try:
+ another model
+ another data enrichment technique
+ another model tweaking

In [8]:
import joblib

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

from LazadaIDReviews import logger

class Training:
    """Training step: fit a model on the vectorized training data and save it."""

    def __init__(self, config: TrainingConfig):
        """Keep the settings used by the training methods.

        Args:
            config: paths and model parameters for the training step.
        """
        self.config = config

    def logistic_regression(self) -> None:
        """Train a logistic regression model and dump it to disk.

        Loads the vectorized training features and the training targets with
        joblib, fits ``LogisticRegression`` with the configured solver,
        max_iter and n_jobs, and writes the fitted model to
        ``config.model_path``. Logs a warning when the solver used up all of
        its allowed iterations.
        """
        logger.info(f"Load vectorized data train from {self.config.vectorized_train_path}.")
        X_train_vec = joblib.load(self.config.vectorized_train_path)

        logger.info(f"Load data train output from {self.config.output_train_path}.")
        y_train = joblib.load(self.config.output_train_path)

        logger.info("Train the model.")
        model = LogisticRegression(
            solver=self.config.params_solver,
            max_iter=self.config.params_max_iter,
            n_jobs=self.config.params_n_jobs
        )

        model.fit(X_train_vec, y_train)

        # n_iter_ holds the number of iterations actually run. Reaching max_iter
        # means the solver was stopped by the limit; scikit-learn reports that as
        # a Python warning only, so repeat it through the project logger.
        if int(model.n_iter_.max()) >= self.config.params_max_iter:
            logger.warning(
                f"Solver reached max_iter={self.config.params_max_iter}; "
                "the model may not have converged."
            )

        logger.info("Dump the model.")
        joblib.dump(model, self.config.model_path)

### Training the Model

This code goes in `src/LazadaIDReviews/pipeline/step_03_training.py`.

In [9]:
# Run the training step end to end: read the configuration, build the
# Training component, fit the model and dump it.
try:
    config = ConfigurationManager()
    training_config = config.get_training_config()
    training = Training(config=training_config)
    training.logistic_regression()
except Exception as e:
    # logger.exception logs at ERROR level like logger.error, and also records
    # the traceback so the failing call can be located from the log alone.
    logger.exception(e)
    # Bare raise re-raises the active exception unchanged.
    raise

[2024-09-11 04:32:36,155: INFO: common: yaml file: config/config.yaml loaded successfully]
[2024-09-11 04:32:36,157: INFO: common: yaml file: metrics/params.yaml loaded successfully]
[2024-09-11 04:32:36,158: INFO: common: created directory at: artifacts]
[2024-09-11 04:32:36,158: INFO: common: created directory at: artifacts/models]
[2024-09-11 04:32:36,159: INFO: 994031705: Load vectorized data train from artifacts/preprocessing/X_train_vec.pkl.]
[2024-09-11 04:32:36,169: INFO: 994031705: Load data train output from artifacts/data/y_train.pkl.]
[2024-09-11 04:32:36,420: INFO: 994031705: Train the model.]
[2024-09-11 04:32:40,227: INFO: 994031705: Dump the model.]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


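**Debug**: Predict on the training data and inspect the results as a sanity check. These are training-set scores, so they are optimistic and do not measure performance on unseen data.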

In [10]:
import pandas as pd

# Reload the persisted model and the training artifacts from disk.
model = joblib.load(training_config.model_path)
y_train = joblib.load(training_config.output_train_path)
X_train_vec = joblib.load(training_config.vectorized_train_path)
X_train = joblib.load(training_config.input_train_path)

# Predict on the vectorized training features; the result reuses X_train's index.
y_pred = pd.Series(
    data=model.predict(X_train_vec),
    index=X_train.index,
)
y_pred

0        1
1        5
2        5
3        5
4        5
        ..
82950    4
82951    4
82952    4
82953    4
82954    4
Length: 82955, dtype: int64

In [11]:
# The report is already formatted text, so print it directly.
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           1       0.96      0.97      0.96     16591
           2       0.97      0.98      0.98     16591
           3       0.92      0.91      0.92     16591
           4       0.84      0.79      0.81     16591
           5       0.76      0.79      0.77     16591

    accuracy                           0.89     82955
   macro avg       0.89      0.89      0.89     82955
weighted avg       0.89      0.89      0.89     82955

