In [3]:
import os

In [4]:
# Show the kernel's current working directory (IPython line magic).
%pwd

'/home/jafarid/code/yogaposes/research'

In [5]:
os.chdir('../')
%pwd

'/home/jafarid/code/yogaposes'

In [6]:
from dataclasses import dataclass
from pathlib import Path

In [8]:
@dataclass(frozen=True)
class TrainingConfig:
    """Immutable bundle of paths and hyper-parameters consumed by the training stage.

    Field names (including the spellings ``traning_data`` and ``params_epoches``)
    are part of the interface used by the code that builds and reads this config.
    """
    # Directory associated with the training stage.
    root_dir: Path 
    # File the training checkpoint is written to with torch.save and read back from.
    resnet_trained_model_path: Path
    # File the prepared base model is loaded from with torch.load.
    resnet_updated_base_model_path: Path
    # Dataset root; DATASET/TRAIN and DATASET/TEST are read from beneath it.
    traning_data: Path
    # NOTE(review): not read by any training code visible in this notebook.
    params_augmentation: bool
    # NOTE(review): not read by any training code visible in this notebook.
    params_image_size: list 
    # Batch size passed to both DataLoaders.
    params_batch_size: int 
    # Number of epochs run by ModelTrainer.train().
    params_epoches: int
    # Learning rate for the default Adam optimizer.
    params_learning_rate: float
    # Full parameter mapping, logged to MLflow via mlflow.log_params.
    all_params: dict
    # URI handed to MLflow when logging the run.
    mlflow_uri: str

In [9]:
from yogaposes.constants import *
from yogaposes.utils.common import read_yaml, create_directories
from dotenv import load_dotenv
# Load environment variables via python-dotenv before they are read below.
load_dotenv()

# Required MLflow settings: os.environ[...] raises KeyError if a variable is missing.
MLFLOW_TRACKING_URI = os.environ['MLFLOW_TRACKING_URI']
# NOTE(review): the username/password globals are not referenced by any other
# code visible in this notebook -- presumably MLflow reads them from the
# environment itself; confirm before removing. Avoid displaying them in outputs.
MLFLOW_TRACKING_USERNAME = os.environ['MLFLOW_TRACKING_USERNAME']
MLFLOW_TRACKING_PASSWORD = os.environ['MLFLOW_TRACKING_PASSWORD']

In [11]:
class configurationManager:
    """Read the project YAML files and build the stage configuration objects.

    Args:
        config_file_path: Path of the YAML file with artifact/stage locations.
        params_file_path: Path of the YAML file with hyper-parameters.

    Creating an instance also creates the ``artifacts_root`` directory named in
    the config file.
    """

    def __init__(self, config_file_path=CONFIG_FILE_PATH, params_file_path=PARAMS_FILE_PATH):
        self.config = read_yaml(config_file_path)
        self.params = read_yaml(params_file_path)

        create_directories([self.config.artifacts_root])

    def get_traning_config(self) -> TrainingConfig:
        """Assemble the ``TrainingConfig`` for the model-training stage.

        Creates the training stage's root directory as a side effect.

        Returns:
            TrainingConfig populated from the ``model_training``,
            ``prepare_base_model`` and ``data_ingestion`` config sections, the
            params file, and the module-level ``MLFLOW_TRACKING_URI``.
        """
        model_training = self.config.model_training
        prepare_base_model = self.config.prepare_base_model
        training_data = os.path.join(self.config.data_ingestion.root_dir, 'yoga-poses-dataset')

        create_directories([model_training.root_dir])

        return TrainingConfig(
            # Must be the stage directory created above, not the checkpoint
            # file path (which has its own field on the next line).
            root_dir=model_training.root_dir,
            resnet_trained_model_path=model_training.resnet_trained_model_path,
            resnet_updated_base_model_path=prepare_base_model.resnet_updated_base_model_path,
            traning_data=training_data,
            params_augmentation=self.params.AUGMENTATION,
            params_image_size=self.params.IMAGE_SIZE,
            params_batch_size=self.params.BATCH_SIZE,
            params_epoches=self.params.EPOCHS,
            params_learning_rate=self.params.LEARNING_RATE,
            all_params=self.params,
            mlflow_uri=MLFLOW_TRACKING_URI,
        )

In [12]:
import torch
import torch.optim as optim
from torch import nn
from torchvision.transforms import Compose, ToTensor, Normalize, Resize, CenterCrop
from torchvision.datasets import ImageFolder
from torch.utils.data import DataLoader
import numpy as np
import mlflow
from urllib.parse import urlparse
from PIL import ImageFile, Image

In [13]:
class ModelTrainer(object):
    """Train, checkpoint, run inference with, and log a classification model.

    The model is loaded from ``config.resnet_updated_base_model_path``; training
    and validation images are read from ``DATASET/TRAIN`` and ``DATASET/TEST``
    under ``config.traning_data``.

    Args:
        config: TrainingConfig with paths and hyper-parameters.
        loss_fn: Optional loss callable ``loss_fn(logits, targets)``;
            defaults to ``nn.CrossEntropyLoss(reduction='mean')``.
        optimizer: Optional optimizer already bound to the model's parameters;
            defaults to Adam with ``config.params_learning_rate``.
    """

    def __init__(self, config: "TrainingConfig", loss_fn=None, optimizer=None):
        self.config = config
        # Resolve the device first so load_model() can remap the saved tensors
        # onto it (a model saved on a GPU must still load on a CPU-only host).
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.model = self.load_model()
        self.loss_fn = loss_fn if loss_fn else nn.CrossEntropyLoss(reduction='mean')
        self.optimizer = optimizer if optimizer else optim.Adam(self.model.parameters(), lr=self.config.params_learning_rate)
        self.model.to(self.device)

        self.train_loader, self.val_loader = self.set_loaders()

        # Per-epoch history, persisted in the checkpoint.
        self.losses = []
        self.val_losses = []
        self.accuracy = []
        self.val_accuracy = []
        self.total_epoches = 0

        self.train_step_fn = self._make_train_step_fn()
        self.val_step_fn = self._make_val_step_fn()

    def load_model(self):
        """Load the prepared base model object from disk onto ``self.device``."""
        # NOTE(review): torch.load unpickles the file, so only load trusted
        # artifacts. PyTorch 2.6+ defaults to weights_only=True, which rejects
        # a fully pickled module -- confirm the installed version.
        return torch.load(self.config.resnet_updated_base_model_path, map_location=self.device)

    def set_seed(self, seed=42):
        """Seed torch and numpy and request deterministic cuDNN behavior."""
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
        torch.manual_seed(seed)
        np.random.seed(seed)

    @staticmethod
    def _build_transform():
        """Return the preprocessing pipeline shared by training and inference."""
        # ImageNet channel statistics.
        normalizer = Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        return Compose([Resize(256), CenterCrop(224), ToTensor(), normalizer])

    def set_loaders(self):
        """Build the training and validation DataLoaders.

        Returns:
            Tuple ``(train_loader, val_loader)``; only the training loader shuffles.
        """
        # Allow loading of truncated images
        ImageFile.LOAD_TRUNCATED_IMAGES = True

        composer = self._build_transform()

        train_data = ImageFolder(root=os.path.join(self.config.traning_data, 'DATASET/TRAIN'), transform=composer)
        val_data = ImageFolder(root=os.path.join(self.config.traning_data, 'DATASET/TEST'), transform=composer)

        train_loader = DataLoader(train_data, batch_size=self.config.params_batch_size, shuffle=True)
        val_loader = DataLoader(val_data, batch_size=self.config.params_batch_size)

        return train_loader, val_loader

    # Higher-order function: builds the per-batch step once, without knowing
    # x and y beforehand; the closure reads model/loss/optimizer from self.
    def _make_train_step_fn(self):
        """Return a function that runs one optimization step on a batch.

        The returned callable takes ``(x, y)`` and returns ``(loss, accuracy)``
        as Python floats for that batch.
        """
        def perform_train_step_fn(x, y):
            # set the train mode
            self.model.train()

            # step 1: compute model output
            yhat = self.model(x)

            # step 2: compute the loss
            loss = self.loss_fn(yhat, y)

            # step 2': compute accuracy from the arg-max class per sample
            predicted = torch.argmax(yhat, 1)
            total_correct = (predicted == y).sum().item()
            acc = total_correct / y.shape[0]

            # step 3: compute the gradient
            loss.backward()

            # step 4: update parameters, then clear gradients for the next batch
            self.optimizer.step()
            self.optimizer.zero_grad()

            # step 5: return the loss and accuracy
            return loss.item(), acc
        return perform_train_step_fn

    def _make_val_step_fn(self):
        """Return a function that evaluates one batch without updating weights.

        The returned callable takes ``(x, y)`` and returns ``(loss, accuracy)``.
        It does not disable autograd itself; the caller is expected to wrap it
        in ``torch.no_grad()``.
        """
        def perform_val_step_fn(x, y):
            # set the model in val mode
            self.model.eval()

            # step 1: compute the prediction
            yhat = self.model(x)

            # step 2: compute the loss
            loss = self.loss_fn(yhat, y)

            # step 2': compute accuracy
            predicted = torch.argmax(yhat, 1)
            total_correct = (predicted == y).sum().item()
            acc = total_correct / y.shape[0]

            return loss.item(), acc
        return perform_val_step_fn

    def _mini_batch(self, validation=False):
        """Run one full pass over the training or validation loader.

        Args:
            validation: Use the validation loader/step when True, otherwise
                the training loader/step.

        Returns:
            ``(loss, accuracy)`` for the epoch, or ``None`` when the selected
            loader is ``None``. Accuracy is weighted by batch size so a smaller
            final batch does not skew it. Loss stays the plain mean of the batch
            losses because a caller-supplied ``loss_fn`` need not be a
            per-sample mean.
        """
        if validation:
            data_loader = self.val_loader
            step_fn = self.val_step_fn
        else:
            data_loader = self.train_loader
            step_fn = self.train_step_fn

        if data_loader is None:
            return None

        mini_batch_losses = []
        mini_batch_accs = []
        mini_batch_sizes = []
        for x_batch, y_batch in data_loader:
            x_batch = x_batch.to(self.device)
            y_batch = y_batch.to(self.device)

            mini_batch_loss, mini_batch_acc = step_fn(x_batch, y_batch)

            mini_batch_losses.append(mini_batch_loss)
            mini_batch_accs.append(mini_batch_acc)
            mini_batch_sizes.append(y_batch.shape[0])

        loss = np.mean(mini_batch_losses)
        if sum(mini_batch_sizes) > 0:
            acc = np.average(mini_batch_accs, weights=mini_batch_sizes)
        else:
            # Empty loader: keep the NaN result np.mean gives for no data.
            acc = np.mean(mini_batch_accs)
        return loss, acc

    def train(self, seed=42):
        """Train for ``config.params_epoches`` epochs, then save a checkpoint.

        Args:
            seed: Seed applied through ``set_seed`` before the first epoch.
        """
        self.set_seed(seed)

        for epoch in range(self.config.params_epoches):
            self.total_epoches += 1

            # perform training on mini batches within 1 epoch
            loss, acc = self._mini_batch(validation=False)
            self.losses.append(loss)
            self.accuracy.append(acc)

            # validation pass: no gradients needed
            with torch.no_grad():
                val_loss, val_acc = self._mini_batch(validation=True)
                self.val_losses.append(val_loss)
                self.val_accuracy.append(val_acc)

            print(f'\nEpoch: {epoch+1} \tTraining Loss: {loss:.4f} \tValidation Loss: {val_loss:.4f}')
            print(f'\t\tTraining Accuracy: {100 * acc:.2f}%\t Validation Accuracy: {100 * val_acc:.2f}%')

        self.save_checkpoint()

    def save_checkpoint(self):
        """Write model/optimizer state and metric history to the checkpoint file."""
        checkpoint = {'epoch': self.total_epoches,
                      'model_state_dict': self.model.state_dict(),
                      'optimizer_state_dict': self.optimizer.state_dict(),
                      'loss': self.losses,
                      'accuracy': self.accuracy,
                      'val_loss': self.val_losses,
                      'val_accuracy': self.val_accuracy
                      }
        torch.save(checkpoint, self.config.resnet_trained_model_path)

    def load_checkpoint(self):
        """Restore model/optimizer state and metric history from the checkpoint file.

        Leaves the model in train mode so training can be resumed.
        """
        # map_location lets a checkpoint written on a GPU load on a CPU-only host.
        checkpoint = torch.load(self.config.resnet_trained_model_path, map_location=self.device)
        self.model.load_state_dict(checkpoint["model_state_dict"])
        self.optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
        self.total_epoches = checkpoint["epoch"]
        self.losses = checkpoint["loss"]
        self.accuracy = checkpoint['accuracy']
        self.val_accuracy = checkpoint['val_accuracy']
        self.val_losses = checkpoint["val_loss"]
        self.model.train()  # always use train mode for resuming training

    def _preprocess_image(self, filename):
        """Load one image file and return it as a normalized batch of size 1."""
        # Allow loading of truncated images
        ImageFile.LOAD_TRUNCATED_IMAGES = True

        # Force 3 channels: grayscale/RGBA/palette files would otherwise not
        # match the 3-channel Normalize statistics. The context manager closes
        # the file handle; convert() returns an independent in-memory image.
        with Image.open(filename) as image:
            rgb_image = image.convert('RGB')

        return self._build_transform()(rgb_image).unsqueeze(0)

    def predict(self, x_filename):
        """Classify a single image file with the checkpointed weights.

        Args:
            x_filename: Path of the image to classify.

        Returns:
            The label string mapped from the arg-max output index.
        """
        self.load_checkpoint()
        self.model.eval()
        try:
            x = self._preprocess_image(x_filename)
            x_tensor = torch.as_tensor(x).float()
            # Inference only: skip building the autograd graph.
            with torch.no_grad():
                y_hat_tensor = self.model(x_tensor.to(self.device))
        finally:
            # set it back to the train mode, even if inference failed
            self.model.train()

        # NOTE(review): index order presumably mirrors the sorted class folders
        # of the training set -- verify against the dataset before changing.
        labels = {0:'downdog', 1: 'godess', 2:'plank', 3:'tree', 4:'warrior2'}
        prediction = int(np.argmax(y_hat_tensor.detach().cpu().numpy()))

        return labels[prediction]

    def log_into_mlflow(self):
        """Log parameters, epoch-averaged metrics and the model to MLflow.

        The model is also registered as ``ResNet18Model`` unless the tracking
        store is a local file store.
        """
        # Point tracking and registry at the configured server so the scheme
        # check below inspects the same URI the run is logged to.
        mlflow.set_tracking_uri(self.config.mlflow_uri)
        mlflow.set_registry_uri(self.config.mlflow_uri)
        tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme

        with mlflow.start_run():
            mlflow.log_params(self.config.all_params)
            # Each metric is the mean over all recorded epochs.
            mlflow.log_metrics({'train_loss': np.mean(self.losses),'val_loss': np.mean(self.val_losses), 'train_accuracy': np.mean(self.accuracy), 'val_accuracy': np.mean(self.val_accuracy)})

            # Model registry does not work with file store
            if tracking_url_type_store != "file":
                # Register the model; see
                # https://mlflow.org/docs/latest/model-registry.html#api-workflow
                mlflow.pytorch.log_model(self.model, "model", registered_model_name="ResNet18Model")
            else:
                mlflow.pytorch.log_model(self.model, "model")
        
            

In [49]:
# Full training run: build the config, train the model, then log it to MLflow.
# No try/except here: catching an exception only to re-raise it adds nothing,
# and errors should surface with their original traceback.
config = configurationManager()
training_config = config.get_traning_config()
training = ModelTrainer(config=training_config)
training.train()
training.log_into_mlflow()

[2024-03-05 02:03:18,404: INFO: common: yaml file: config/config.yaml loaded successfully]
[2024-03-05 02:03:18,406: INFO: common: yaml file: params.yaml loaded successfully]
[2024-03-05 02:03:18,407: INFO: common: created directory at: artifacts]
[2024-03-05 02:03:18,407: INFO: common: created directory at: artifacts/training]





Epoch: 1 	Training Loss: 8.5584 	Validation Loss: 4.1259
		Training Accuracy: 53.55%	 Validation Accuracy: 79.24%

Epoch: 2 	Training Loss: 3.5759 	Validation Loss: 1.1918
		Training Accuracy: 70.33%	 Validation Accuracy: 89.17%

Epoch: 3 	Training Loss: 2.8397 	Validation Loss: 4.4006
		Training Accuracy: 76.90%	 Validation Accuracy: 75.00%

Epoch: 4 	Training Loss: 2.3950 	Validation Loss: 1.7030
		Training Accuracy: 79.78%	 Validation Accuracy: 88.12%

Epoch: 5 	Training Loss: 2.6101 	Validation Loss: 1.0900
		Training Accuracy: 79.40%	 Validation Accuracy: 90.97%


Registered model 'ResNet18Model' already exists. Creating a new version of this model...
2024/03/05 02:06:05 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: ResNet18Model, version 3
Created version '3' of model 'ResNet18Model'.


In [None]:
# Training run. ModelTrainer.__init__ accepts only `config`, `loss_fn` and
# `optimizer`; any other keyword argument raises TypeError, so none is passed.
# No try/except here: catching an exception only to re-raise it adds nothing.
config = configurationManager()
training_config = config.get_traning_config()
training = ModelTrainer(config=training_config)
training.train()
training.log_into_mlflow()

In [15]:
# Inference smoke test: predict() reloads the saved checkpoint and classifies
# one image from the ingested dataset.
# No try/except here: catching an exception only to re-raise it adds nothing.
config_inference = configurationManager()
inference_config = config_inference.get_traning_config()
inference = ModelTrainer(config=inference_config)
c = inference.predict('artifacts/data_ingestion/yoga-poses-dataset/DATASET/TRAIN/plank/00000128.jpg')
print(c)

[2024-03-05 19:20:36,074: INFO: common: yaml file: config/config.yaml loaded successfully]
[2024-03-05 19:20:36,076: INFO: common: yaml file: params.yaml loaded successfully]
[2024-03-05 19:20:36,077: INFO: common: created directory at: artifacts]
[2024-03-05 19:20:36,077: INFO: common: created directory at: artifacts/training]
plank
