In [8]:
'''
define set of models to try run
define the corresponding hyperparameter
tune the hyperparameters
retrieve the best model
train the best model
test the best model'''

'\ndefine set of models to try run\ndefine the corresponding hyperparameter\ntune the hyperparameters\nretrieve the best model\ntrain the best model\ntest the best model'

In [9]:
from sklearn.linear_model import LogisticRegression,LinearRegression
from sklearn.ensemble import RandomForestClassifier,AdaBoostRegressor,GradientBoostingRegressor,RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
from dataclasses import dataclass
from pathlib import Path
from sklearn.model_selection import train_test_split
from src.mlproject.utils.common import read_yaml_keys,read_yaml,create_directories
from src.mlproject.constants import CONFIG_PATH_YAML,SCHEMA_PATH_YAML,PARAMS_PATH_YAML
from src.mlproject.entity.config_entity import DataTransformationConfig
import os
#hyperparameter tuning.py


In [10]:
@dataclass
class ModelTrainerConfig:
   train_csv:Path# what we need is train csv, where was it stored
   test_csv:Path
   model_file: Path#what we need is model file,where will we store it?
   schema_data:Path
   report_file: Path#we will also need a report of all the models performcance,where will we store it?
   

In [11]:
class configurationManager:

   def __init__(self,config_filepath=CONFIG_PATH_YAML,schema_path=SCHEMA_PATH_YAML,params_path=PARAMS_PATH_YAML) :#inner folder,pathto csv,datastore
        self.config_data=read_yaml(config_filepath)
        self.params=read_yaml(params_path)
        print("Loaded params:", self.params)
        self.schema=read_yaml(schema_path)
        #Retrieve the parent folder 
        self.parent_folder =self.config_data.get('parent_folder', '')
         #Retrieve the artifcats root 
        self.artifacts_root =self.parent_folder.get('artifacts_root', '')
        #and create the folder 'artifacts_root'
        create_directories([self.artifacts_root])

         #retrieve the model trainer
        self.model_trainer=self.config_data.get('model_trainer','')
         #get the model trainer inner folder
        
        create_directories([self.model_trainer.get('model_file_innerfolder','')])
        #join the file
        self.model_file_path=os.path.join(self.model_trainer.get('model_file_innerfolder',''),self.model_trainer.get('model_file_path',''))
        self.report_file_path=os.path.join(self.model_trainer.get('model_file_innerfolder',''),self.model_trainer.get('report_file_path',''))
   
   
   def get_model_trainer_config(self)-> ModelTrainerConfig:
       get_model_trainer=ModelTrainerConfig(
           train_csv=self.model_trainer.get('train_csv',''), #this is where it was stores
           test_csv=self.model_trainer.get('test_csv',''),#i was using self.config.get instead of model trainer,so geting wrror cannot eopen empty string'''
           model_file=self.model_file_path,#this is where it will be stored
           schema_data=self.schema,
           report_file=self.report_file_path
           
       )

       return get_model_trainer

    
      

      


    





In [12]:
class ModelEntity:
    models={'RandomForest':RandomForestRegressor(),
                    "DecisionTree": DecisionTreeRegressor(),
                    "GradientBoosting": GradientBoostingRegressor(),
                    "LinearRegression": LinearRegression(),
                   
                    #"CatBoosting Regressor": CatBoostRegressor(verbose=False),#
                    "AdaBoost Regressor": AdaBoostRegressor()
                    }
    

In [13]:
import pandas as pd
from src.mlproject.utils.common import evaluate_score,save_model
import mlflow
from mlflow import sklearn
from sklearn.model_selection import RandomizedSearchCV
class ModelTrainerService:
    def __init__(self,config_instance:ModelTrainerConfig,models_entity:ModelEntity) :
        self.config_instance=config_instance
        self.model_train=config_instance.train_csv
        self.model_test=config_instance.test_csv
        self.model_pickle=config_instance.model_file
        self.report_file=config_instance.report_file
        
        self.model_schema=self.config_instance.schema_data
        
        #self.model_schema=self.config_instance.schema_data,# why is it that if we put a comma it becomes a tuple
        
        self.models_dict=models_entity.models

        configurationmanagerinstance=configurationManager()
        self.parameters=configurationmanagerinstance.params
        self.hyperparameter=self.parameters.get('hyperparameters',{})
        print("Loaded hyperparam:", self.hyperparameter)
        
    def splitting(self):
        cat_col,num_cal,target_col=read_yaml_keys(self.model_schema)
        train_df=pd.read_csv(self.config_instance.train_csv,header='infer',delimiter=',')
        #train_df=pd.read_csv(filepath_or_buffer=self.model_train)
        test_df=pd.read_csv(self.model_test)

        x_train=train_df.drop(target_col,axis=1)
        y_train=train_df[target_col]

        x_test=test_df.drop(target_col,axis=1)
        y_test=test_df[target_col]

        return x_test,y_test,x_train,y_train
    
    
    def get_best_model(self):
        report={}
        x_test,y_test,x_train,y_train=self.splitting()
        '''for i in range(len(self.models_dict)):
           print('..........................................')
           model_name =list(self.models_dict.keys())[i]
           print(model_name)
           model_instance =list(self.models_dict.values())[i]
           
           parameters=self.hyperparameter.get(list(self.models_dict.keys())[i],{})
           print(parameters)'''

           
        for model_name,model_instance in self.models_dict.items():
            print('..........................................')
           # Iterate over models and hyperparameters.
            parameters=self.hyperparameter.get(model_name,{})# i forgot to use get to retrieve the model name such that is resulted into category name and attributes but i wanted attributes only,the 
            
            if parameters:#check if parameters is not empty
                     rs=RandomizedSearchCV(estimator=model_instance,param_distributions=parameters,cv=5)
                     rs.fit(x_train,y_train)
                     predict=rs.predict(x_test)
                     mae,mse,score=evaluate_score(y_test,predicted_value=predict)
                     #tuning the parameters manually below has resulted in error in passing the hyperparameters,using Gridsearch cv takes forever
                     '''for key, value in parameters.items():
                     model_instance.set_params(**{key: value})
                     #model_instance.set_params(**parameters)
                     model_instance.fit(x_train,y_train)
                     y_pred=model_instance.predict(x_test)
                     score=r2_score(y_test,y_pred)'''
                #write to the report file created in the self.report attribute by opening it in append mode and saving the results stored in the variable
                     try:
                        with open(self.report_file, 'a') as file:
                            file.write(f"{model_name}: {score} \t {mae}\t {mse} \n")
                        print(f"Score for {model_name} written to report file.")
                     except Exception as e:
                        print(f"Error writing to report file: {e}")

                     try:
                        with mlflow.start_run(run_name='tired_3') as run:
                                mlflow.log_metric('r2_score', score)
                                mlflow.log_metric('mae', mae)
                                mlflow.log_metric('mse', mse)
                                mlflow.log_params(rs.best_params_)
                                mlflow.sklearn.log_model(rs.best_estimator_, "model_" + model_name)
                                # Print the run id of each model but its not a must
                                print(f"Run ID: {run.info.run_id}")
                               # runid=run.info.run_id
                        print(f"Model {model_name} logged to mlflow.")
                     except Exception as e:
                            print(f"Error logging to mlflow: {e}")

        best_run_id='f95b9d08d6404162815bb3c19fce79f1'
        
        best_run=mlflow.get_run(best_run_id)#i was using search_runs so it resulted into empty runs.
        if  best_run:
            best_hyperparameters= best_run.data.params#used to get the hyperparameters
            best_model=AdaBoostRegressor(**best_hyperparameters)#assigns the hyperparameters to the model
            save_model(model=best_model,path=self.model_pickle)
        
        else:
             print(f"No runs found for run_id: {best_run_id}")
        
        
    

        
                    
                
            

    
    

    
    
    
           


          



           

           
        

            
            

        
        
        

In [14]:
class ModelTrainerPipeline:
    def __init__(self) -> None:
        pass
    def main(self):
        config_manager=configurationManager()
        model_config=config_manager.get_model_trainer_config()
        model_entity=ModelEntity()
        
        model_service=ModelTrainerService(model_config,model_entity)
        model_service.get_best_model()

if __name__=='__main__':
    ModelTrainerPipeline().main()



Loaded params: {'hyperparameters': {'DecisionTree': {'criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'], 'splitter': ['best', 'random'], 'max_features': ['sqrt', 'log2']}, 'GradientBoosting': {'loss': ['squared_error', 'huber', 'absolute_error', 'quantile'], 'learning_rate': [0.1, 0.01, 0.05, 0.001], 'subsample': [0.6, 0.7, 0.75, 0.8, 0.85, 0.9], 'criterion': ['squared_error', 'friedman_mse'], 'max_features': ['auto', 'sqrt', 'log2'], 'n_estimators': [8, 16, 32, 64, 128, 256]}, 'Linear Regression': {}, 'XGBRegressor': {'learning_rate': [0.1, 0.01, 0.05, 0.001], 'n_estimators': [8, 16, 32, 64, 128, 256]}, 'AdaBoost Regressor': {'learning_rate': [0.1, 0.01, 0.5, 0.001], 'loss': ['linear', 'square', 'exponential'], 'n_estimators': [8, 16, 32, 64, 128, 256]}}}
Loaded params: {'hyperparameters': {'DecisionTree': {'criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'], 'splitter': ['best', 'random'], 'max_features': ['sqrt', 'log2']}, 'GradientBo

Score for DecisionTree written to report file.
Run ID: f1ad8170127d4cecb48166434a2fad48
Model DecisionTree logged to mlflow.
..........................................


15 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\User\anaconda3\envs\studenv\Lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\User\anaconda3\envs\studenv\Lib\site-packages\sklearn\base.py", line 1145, in wrapper
    estimator._validate_params()
  File "c:\Users\User\anaconda3\envs\studenv\Lib\site-packages\sklearn\base.py", line 638, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\User\anaconda3\envs\studenv\Lib\site-packages\sklearn\utils\_param_validation.py", line 96, in validate_

Score for GradientBoosting written to report file.
Run ID: 3f9adf4d8bb84ebcb4b1d34a45cb9f22
Model GradientBoosting logged to mlflow.
..........................................
..........................................
Score for AdaBoost Regressor written to report file.
Run ID: 6f18551d8513475282bbfa2e45c1d63c
Model AdaBoost Regressor logged to mlflow.
Saved model
