In [1]:
'''
define set of models to try run
define the corresponding hyperparameter
tune the hyperparameters
retrieve the best model
train the best model
test the best model'''

'\ndefine set of models to try run\ndefine the corresponding hyperparameter\ntune the hyperparameters\nretrieve the best model\ntrain the best model\ntest the best model'

In [2]:
from sklearn.linear_model import LogisticRegression,LinearRegression
from sklearn.ensemble import RandomForestClassifier,AdaBoostRegressor,GradientBoostingRegressor,RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
from dataclasses import dataclass
from pathlib import Path
from sklearn.model_selection import train_test_split
from src.mlproject.utils.common import read_yaml_keys,read_yaml,create_directories
from src.mlproject.constants import CONFIG_PATH_YAML,SCHEMA_PATH_YAML
from src.mlproject.entity.config_entity import DataTransformationConfig
import os
#hyperparameter tuning.py


In [3]:
@dataclass
class ModelTrainerConfig:
   train_csv:Path# what we need is train csv, where was it stored
   test_csv:Path
   model_file: Path#what we need is model file,where will we store it?
   schema_data:Path
   report_file: Path#we will also need a report of all the models performcance,where will we store it?
   

In [4]:
class configurationManager:

   def __init__(self,config_filepath=CONFIG_PATH_YAML,schema_path=SCHEMA_PATH_YAML) :#inner folder,pathto csv,datastore
        self.config_data=read_yaml(config_filepath)
       
        self.schema=read_yaml(schema_path)
        #Retrieve the parent folder 
        self.parent_folder =self.config_data.get('parent_folder', '')
         #Retrieve the artifcats root 
        self.artifacts_root =self.parent_folder.get('artifacts_root', '')
        #and create the folder 'artifacts_root'
        create_directories([self.artifacts_root])

         #retrieve the model trainer
        self.model_trainer=self.config_data.get('model_trainer','')
         #get the model trainer inner folder
        
        create_directories([self.model_trainer.get('model_file_innerfolder','')])
        #join the file
        self.model_file_path=os.path.join(self.model_trainer.get('model_file_innerfolder',''),self.model_trainer.get('model_file_path',''))
        self.report_file_path=os.path.join(self.model_trainer.get('model_file_innerfolder',''),self.model_trainer.get('report_file_path',''))
   
   
   def get_model_trainer_config(self)-> ModelTrainerConfig:
       get_model_trainer=ModelTrainerConfig(
           train_csv=self.model_trainer.get('train_csv',''), #this is where it was stores
           test_csv=self.model_trainer.get('test_csv',''),#i was using self.config.get instead of model trainer,so geting wrror cannot eopen empty string'''
           model_file=self.model_file_path,#this is where it will be stored
           schema_data=self.schema,
           report_file=self.report_file_path
           
       )

       return get_model_trainer

    
      

      


    




In [5]:




class ModelEntity:
    models={'RandomForest':RandomForestRegressor(),
                    "DecisionTree": DecisionTreeRegressor(),
                    "GradientBoosting": GradientBoostingRegressor(),
                    "LinearRegression": LinearRegression(),
                   
                    #"CatBoosting Regressor": CatBoostRegressor(verbose=False),#
                    "AdaBoost Regressor": AdaBoostRegressor()
                    }
    
    hyperparameters={
                'DecisionTree':{
                    'criterion':['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
                    'splitter':['best','random'],
                    'max_features':['sqrt','log2'] },

                'RandomForest':{
                    'criterion':['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
                    'max_features':['sqrt','log2',None],
                    'n_estimators': [16,32,64,128,256]},

                'GradientBoosting':{
                    'loss':['squared_error', 'huber', 'absolute_error', 'quantile'],
                    'learning_rate':[.1,.01,.05,.001],
                    'subsample':[0.6,0.7,0.75,0.8,0.85,0.9],
                    'criterion':['squared_error', 'friedman_mse'],
                    'max_features':['auto','sqrt','log2'],
                    'n_estimators': [8,16,32,64,128,256]},

                    "Linear Regression":{},

                    "XGBRegressor":{
                        'learning_rate':[.1,.01,.05,.001],
                        'n_estimators': [8,16,32,64,128,256] },

                   ''' "CatBoosting Regressor":{
                        'depth': [6,8,10],
                        'learning_rate': [0.01, 0.05, 0.1],
                        'iterations': [30, 50, 100]},'''

                    "AdaBoost Regressor":{
                        'learning_rate':[.1,.01,0.5,.001],
                        # 'loss':['linear','square','exponential'],
                        'n_estimators': [8,16,32,64,128,256]}
                }
            


       





In [6]:
import pandas as pd
from sklearn.metrics import r2_score
import mlflow
from mlflow import sklearn
class ModelTrainerService:
    def __init__(self,config_instance:ModelTrainerConfig,models_entity:ModelEntity) :
        self.config_instance=config_instance

        self.model_schema=self.config_instance.schema_data
        
        #self.model_schema=self.config_instance.schema_data,# why is it that if we put a comma it becomes a tuple
        self.models_entity=models_entity
    
        self.model_train=config_instance.train_csv
        self.model_test=config_instance.test_csv
        self.model_pickle=config_instance.model_file
        self.report_file=config_instance.report_file
        
        self.models_entity=models_entity
        self.models=models_entity.models
        self.hyperparameter=models_entity.hyperparameters
        

    def splitting(self):
        cat_col,num_cal,target_col=read_yaml_keys(self.model_schema)
        train_df=pd.read_csv(self.config_instance.train_csv,header='infer',delimiter=',')
        #train_df=pd.read_csv(filepath_or_buffer=self.model_train)
        test_df=pd.read_csv(self.model_test)

        x_train=train_df.drop(target_col,axis=1)
        y_train=train_df[target_col]

        x_test=test_df.drop(target_col,axis=1)
        y_test=test_df[target_col]

        return x_test,y_test,x_train,y_train
    
    

    
    
    def model_init(self):
        x_test,y_test,x_train,y_train=self.splitting()

        for i in range(len(list(self.models))):
           model=list(self.models.values())[i]
           hyperparameter=self.hyperparameter[list(self.models.keys())[i]]

           gs=GridSearchCV(estimator=model,param_grid=hyperparameter,cv=5)
           gs.fit(X=x_train,y=y_train)
           #since i have not specified the scoring in the gs instance, i shall explicity perform it here
           y_pred=gs.predict(X=x_test)
           
           score=r2_score(y_test,y_pred)
           
           with mlflow.start_run():
               mlflow.log_param('model_name',model)
               mlflow.log_params(gs.best_params_)
               mlflow.log_metric('r2_score',score)
               mlflow.sklearn.log_model(gs.best_estimator_,model)
               

           #instead of writing all models to a report and finding the overall best score to get best mdoel,i can do this in mlflow ui
           
           #model.set_params(**gs.best_params_)#setting those hyperparameters to the model and fitting the model. 
           #model.fit(x_train,y_train)
           ''' 
           model.set_params(**gs.best_params_)
           model.fit(x_train,y_train)
           predicted_y_test=model.predict(x_test)
           score=r2_score(y_test,predicted_y_test)

           report={}
        
           report[list(self.models.keys())[i]]=score

           with open(self.report_file,'w+') as file:
               file.write(str(report))
        return report
    
    def find_overall_best_model_from_report(self):
            report=self.model_init()
            best_r2_score=max(sorted(report.values()))
            best_model_name=list(report.keys())[list(report.values()).index(best_r2_score)]
            
            model=self.models[best_model_name]

            x_test,y_test,_,_=self.splitting()
            
            predicted=model.predict(x_test)
            r2_score=r2_score(predicted,y_test)
        
            return r2_score,best_model_name'''
           


          



           

           
        

            
            

        
        
        

In [7]:
class ModelTrainerPipeline:
    def __init__(self) -> None:
        pass
    def main(self):
        config_manager=configurationManager()
        model_config=config_manager.get_model_trainer_config()
        model_entity=ModelEntity()
        
        model_service=ModelTrainerService(model_config,model_entity)
        report=model_service.model_init()

if __name__=='__main__':
    ModelTrainerPipeline().main()



RepresenterError: ('cannot represent an object', RandomForestRegressor())