In [1]:
import os

In [2]:
%pwd

'/home/ubuntu/learning/mlops/pacmann/lazada-id-reviews/notebooks'

In [3]:
# Run the rest of the notebook from the project root so that relative paths
# (".env", "config/...", "artifacts/...") resolve from there.
# Guarded so that re-running this cell does not climb one directory further up.
if os.path.basename(os.getcwd()) == "notebooks":
    os.chdir("../")

In [4]:
# Load KEY=VALUE pairs from the project's .env file into the process environment.
with open('.env') as f:
    for raw_line in f:
        line = raw_line.strip()
        # Skip blank lines, comments, and anything that is not an assignment;
        # these would otherwise break the key/value unpacking.
        if not line or line.startswith('#') or '=' not in line:
            continue
        # Split on the first "=" only: values (e.g. URIs with query strings)
        # may legitimately contain "=" themselves.
        key, value = line.split('=', 1)
        os.environ[key.strip()] = value.strip()

In [5]:
%pwd

'/home/ubuntu/learning/mlops/pacmann/lazada-id-reviews'

### Predict Config

This code will be applied in `src/LazadaIDReviews/entity/config_entity.py`.

In [6]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class PredictionConfig:
    """Immutable settings consumed by the prediction step.

    The annotations are documentation only; dataclasses do not enforce or
    convert the types of the values passed in.
    """
    # Directory created for the prediction step; also used as the download
    # destination for the vectorizer artifact.
    root_dir: Path
    # MLflow tracking server URI (read from the MLFLOW_TRACKING_URI env var).
    mlflow_tracking_uri: str
    # Name of the registered model in MLflow.
    mlflow_model_name: str
    # Alias of the deployed model version (read from the
    # MLFLOW_DEPLOY_MODEL_ALIAS env var). This is a plain string, not a path.
    mlflow_deploy_model_alias: str
    # Path of the vectorizer artifact inside the MLflow run.
    mlflow_vectorizer_model_path: Path

    # for development (debug)
    # Dumped test inputs / labels, loaded with joblib in the debug cells.
    input_test_path: Path
    output_test_path: Path

### Predict Config Manager

This code will be applied in `src/LazadaIDReviews/config/configurations.py`.

In [7]:
from LazadaIDReviews.constants import CONFIG_FILE_PATH, PARAMS_FILE_PATH
from LazadaIDReviews.utils.common import read_yaml, create_directories

In [8]:
class ConfigurationManager:
    """Reads the project's YAML files and builds config entities from them."""

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
    ):
        """Load both YAML files and make sure the artifacts root exists.

        Args:
            config_filepath: path of the main configuration YAML file
            params_filepath: path of the parameters YAML file
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_prediction_config(self) -> PredictionConfig:
        """Assemble the prediction settings into a ``PredictionConfig``.

        Values come from the ``predict`` and ``dump_data`` sections of the
        config file, plus the ``MLFLOW_TRACKING_URI`` and
        ``MLFLOW_DEPLOY_MODEL_ALIAS`` environment variables. The prediction
        root directory is created as a side effect.

        Raises:
            KeyError: when one of the two environment variables is not set

        Returns:
            config: PredictionConfig type
        """
        predict_section = self.config.predict
        # for development (debug)
        dump_section = self.config.dump_data

        create_directories([predict_section.root_dir])

        settings = dict(
            root_dir=predict_section.root_dir,
            mlflow_tracking_uri=os.environ["MLFLOW_TRACKING_URI"],
            mlflow_model_name=predict_section.mlflow_model_name,
            mlflow_deploy_model_alias=os.environ["MLFLOW_DEPLOY_MODEL_ALIAS"],
            mlflow_vectorizer_model_path=predict_section.mlflow_vectorizer_model_path,
            # for development (debug)
            input_test_path=dump_section.input_test_path,
            output_test_path=dump_section.output_test_path,
        )
        return PredictionConfig(**settings)

In [9]:
from mlflow.artifacts import download_artifacts
from mlflow import MlflowClient
from mlflow import pyfunc

import joblib

---

**Debug**: Step-by-step walkthrough of making a prediction with MLflow in the notebook.

In [10]:
config = ConfigurationManager()
predict_config = config.get_prediction_config()

[2024-09-11 06:54:05,898: INFO: common: yaml file: config/config.yaml loaded successfully]
[2024-09-11 06:54:05,899: INFO: common: yaml file: metrics/params.yaml loaded successfully]
[2024-09-11 06:54:05,901: INFO: common: created directory at: artifacts]
[2024-09-11 06:54:05,902: INFO: common: created directory at: artifacts/predict]


Select the deployed model from MLflow.

In [11]:
client = MlflowClient(tracking_uri=predict_config.mlflow_tracking_uri)
selected_model = client.get_model_version_by_alias(
    predict_config.mlflow_model_name, 
    predict_config.mlflow_deploy_model_alias
)

selected_model.source

'mlflow-artifacts:/1/1f046f7844764a9a907682ce09cca37e/artifacts/models'

In [12]:
loaded_model = pyfunc.load_model(model_uri=selected_model.source)
loaded_model

  from .autonotebook import tqdm as notebook_tqdm
Downloading artifacts: 100%|██████████| 6/6 [00:00<00:00, 116.54it/s]


mlflow.pyfunc.loaded_model:
  artifact_path: models
  flavor: mlflow.sklearn
  run_id: 1f046f7844764a9a907682ce09cca37e

Get the model `run_id`.

In [13]:
selected_run_id = selected_model.run_id
selected_run_id

'1f046f7844764a9a907682ce09cca37e'

Download the vectorizer (one of the MLflow artifacts) from MLflow.

In [14]:
download_artifacts(
    run_id=selected_run_id,
    artifact_path=predict_config.mlflow_vectorizer_model_path,
    dst_path=predict_config.root_dir
)

Downloading artifacts: 100%|██████████| 1/1 [00:00<00:00, 27.56it/s]


'/home/ubuntu/learning/mlops/pacmann/lazada-id-reviews/artifacts/predict/vectorizer/vectorizer.pkl'

Load the downloaded vectorizer.

In [15]:
root_dir = predict_config.root_dir
mlflow_vectorizer_model_path = predict_config.mlflow_vectorizer_model_path
vectorizer_model_path = f"{root_dir}/{mlflow_vectorizer_model_path}"
vectorizer = joblib.load(vectorizer_model_path)
vectorizer

Load the test data to use as prediction input and for evaluation.

In [16]:
X_test = joblib.load(predict_config.input_test_path)
y_test = joblib.load(predict_config.output_test_path)

In [17]:
X_test.head()

89695                       brg mulus,kiriman jg cepat, thx
124921    Terima kasih Adata Store dan Lazada, barang su...
186835    Trimaksih Lazada barang nya sangat bagus dan m...
113968    barang sampai juga agak lama pengirimanya,, Al...
71564              bagus, dapet bonus kabel strap lumayan 😁
Name: reviewContent, dtype: object

In [18]:
X_test.shape

(85624,)

Prepare the request body, mimicking the payload of the HTTP request.

In [19]:
request_body = {
    "reviewContent": X_test.to_list()
}

In [20]:
request_body['reviewContent'][:10]

['brg mulus,kiriman jg cepat, thx',
 'Terima kasih Adata Store dan Lazada, barang sudah sampai, mantab',
 'Trimaksih Lazada barang nya sangat bagus dan memuas kan ..  Waktu pesen kemarin blm ada free besi ny .  Skrng udh ada .  Heee heee ..',
 'barang sampai juga agak lama pengirimanya,, Alhamdulillah barang gak ada yg cacat,,',
 'bagus, dapet bonus kabel strap lumayan 😁',
 'Mohon tulis harga dengan benar',
 'mantab jiwa,,sory barang ga sempat dipoto',
 'Recomended banget lur...',
 'Terima kasih Lazada pengiriman cepat dan packingnya rapi, barang sesuai dgn pesanan saya...tp belum saya coba aktifkan',
 'Saya kira karena liburan bakal lama sampainya, ternyata tidak,  Barang asli setelah dibuka,sama dengan foto   Thanks lazada, memang effortless banget']

Vectorize the test data.

In [21]:
X_test_vec = vectorizer.transform(request_body['reviewContent'])
X_test_vec

<85624x13619 sparse matrix of type '<class 'numpy.float64'>'
	with 982972 stored elements in Compressed Sparse Row format>

Make prediction.

In [22]:
y_predict = loaded_model.predict(X_test_vec).tolist()

In [23]:
len(y_predict)

85624

In [24]:
y_predict[:10]

[5, 5, 5, 4, 3, 1, 5, 5, 5, 3]

In [25]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_predict))

              precision    recall  f1-score   support

           1       0.57      0.79      0.66      5485
           2       0.31      0.47      0.37      1745
           3       0.25      0.46      0.33      3486
           4       0.25      0.45      0.33      8470
           5       0.92      0.75      0.82     66438

    accuracy                           0.70     85624
   macro avg       0.46      0.58      0.50     85624
weighted avg       0.79      0.70      0.74     85624



---

### Make Prediction

This code goes in `src/LazadaIDReviews/components/predict.py`.

In [26]:
from LazadaIDReviews import logger

class Predict:
    """Runs predictions with the model version deployed in the MLflow registry."""

    def __init__(self, config: PredictionConfig):
        self.config = config

    def run(self, data: list) -> list:
        """predict the data with the deployed MLflow model

        The deployed model version is resolved through its alias, the
        vectorizer logged in the same run is downloaded and applied to the
        data, and the vectorized data is passed to the loaded model.

        Args:
            data (list): input data to predict

        Raises:
            Exception: any error raised while resolving the deployed model,
                downloading or loading the vectorizer, transforming the data,
                or loading the model and predicting is logged with its
                traceback and re-raised unchanged.

        Returns:
            y_predict: list type, one prediction per input item
                (an empty list when ``data`` is an empty list/tuple)
        """
        # Nothing to predict: skip the MLflow round-trips instead of failing
        # later inside the vectorizer/model on zero samples.
        if isinstance(data, (list, tuple)) and len(data) == 0:
            logger.info("No input data received, returning an empty prediction list.")
            return []

        try:
            logger.info("Set MLflow Client.")
            client = MlflowClient(tracking_uri=self.config.mlflow_tracking_uri)

            logger.info("Select the deployed model from MLflow.")
            selected_model = client.get_model_version_by_alias(
                self.config.mlflow_model_name,
                self.config.mlflow_deploy_model_alias
            )

            logger.info("Get the deployed model run id.")
            selected_run_id = selected_model.run_id
        except Exception as client_error:
            logger.exception(client_error)
            raise

        try:
            logger.info("Downloading vectorizer from MLflow's artifacts.")
            # download_artifacts returns the local path of the downloaded
            # artifact, so the path does not have to be rebuilt by hand.
            # The tracking URI is passed explicitly so the download targets
            # the same server as the client above.
            vectorizer_model_path = download_artifacts(
                run_id=selected_run_id,
                artifact_path=self.config.mlflow_vectorizer_model_path,
                dst_path=self.config.root_dir,
                tracking_uri=self.config.mlflow_tracking_uri
            )
        except Exception as download_error:
            logger.exception(download_error)
            raise

        try:
            logger.info("Load the vectorizer model.")
            vectorizer = joblib.load(vectorizer_model_path)

            logger.info("Transform the data.")
            X_test_vec = vectorizer.transform(data)
        except Exception as load_error:
            logger.exception(load_error)
            raise

        try:
            logger.info("Predict the data.")
            # NOTE(review): load_model is not given the tracking URI explicitly;
            # presumably it relies on the MLFLOW_TRACKING_URI environment
            # variable to resolve the model source - confirm.
            loaded_model = pyfunc.load_model(model_uri=selected_model.source)
            y_predict = loaded_model.predict(X_test_vec).tolist()
        except Exception as predict_error:
            logger.exception(predict_error)
            raise

        return y_predict

### Predict the Data

**Debug**: test the `Predict` object and its `run` method.

In [27]:
config = ConfigurationManager()
predict_config = config.get_prediction_config()

[2024-09-11 06:54:07,703: INFO: common: yaml file: config/config.yaml loaded successfully]
[2024-09-11 06:54:07,705: INFO: common: yaml file: metrics/params.yaml loaded successfully]
[2024-09-11 06:54:07,706: INFO: common: created directory at: artifacts]
[2024-09-11 06:54:07,707: INFO: common: created directory at: artifacts/predict]


In [28]:
X_test = joblib.load(predict_config.input_test_path)

In [29]:
# Build the request payload from the dumped test inputs.
# NOTE(review): the step-by-step walkthrough earlier in this notebook uses the
# key "reviewContent" (singular) for the same payload - confirm which key the
# API in app.py expects.
request_body = {
    "reviewContents": X_test.to_list()
}

In [30]:
data = request_body["reviewContents"]
data[:10]

['brg mulus,kiriman jg cepat, thx',
 'Terima kasih Adata Store dan Lazada, barang sudah sampai, mantab',
 'Trimaksih Lazada barang nya sangat bagus dan memuas kan ..  Waktu pesen kemarin blm ada free besi ny .  Skrng udh ada .  Heee heee ..',
 'barang sampai juga agak lama pengirimanya,, Alhamdulillah barang gak ada yg cacat,,',
 'bagus, dapet bonus kabel strap lumayan 😁',
 'Mohon tulis harga dengan benar',
 'mantab jiwa,,sory barang ga sempat dipoto',
 'Recomended banget lur...',
 'Terima kasih Lazada pengiriman cepat dan packingnya rapi, barang sesuai dgn pesanan saya...tp belum saya coba aktifkan',
 'Saya kira karena liburan bakal lama sampainya, ternyata tidak,  Barang asli setelah dibuka,sama dengan foto   Thanks lazada, memang effortless banget']

This code goes in `app.py`.

In [31]:
# End-to-end flow: build the prediction config, then predict `data`.
try:
    config = ConfigurationManager()
    predict_config = config.get_prediction_config()
    predict = Predict(config=predict_config)
    result = predict.run(data)
except Exception as e:
    # Log the error together with its traceback, then re-raise the original
    # exception unchanged.
    logger.exception(e)
    raise

[2024-09-11 06:54:07,872: INFO: common: yaml file: config/config.yaml loaded successfully]
[2024-09-11 06:54:07,874: INFO: common: yaml file: metrics/params.yaml loaded successfully]
[2024-09-11 06:54:07,875: INFO: common: created directory at: artifacts]
[2024-09-11 06:54:07,876: INFO: common: created directory at: artifacts/predict]
[2024-09-11 06:54:07,877: INFO: 539828281: Set MLflow Client.]
[2024-09-11 06:54:07,878: INFO: 539828281: Select the deployed model from MLflow.]
[2024-09-11 06:54:07,889: INFO: 539828281: Get the deployed model run id.]
[2024-09-11 06:54:07,889: INFO: 539828281: Downloading vectorizer from MLflow's artifacts.]


Downloading artifacts: 100%|██████████| 1/1 [00:00<00:00, 23.94it/s]

[2024-09-11 06:54:07,952: INFO: 539828281: Load the vectorizer model.]
[2024-09-11 06:54:07,983: INFO: 539828281: Transform the data.]





[2024-09-11 06:54:08,785: INFO: 539828281: Predict the data.]


Downloading artifacts: 100%|██████████| 6/6 [00:00<00:00, 93.49it/s] 


In [32]:
len(result)

85624

In [33]:
result[:10]

[5, 5, 5, 4, 3, 1, 5, 5, 5, 3]