In [111]:
from CementStrength.entity import TrainingConfig
from CementStrength import logger
from CementStrength.utils import read_yaml,create_directories,save_model,load_model
import shutil
import pandas as pd
from kneed import KneeLocator
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import  RandomForestRegressor
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os



class Training:
    def __init__(self,config: TrainingConfig):
        self.config = config
         
        create_directories(self.config.preprocessed_data_dir)
        create_directories(self.config.Models_dir)
        create_directories(self.config.images)

    def copy_training_file(self):
        create_directories(self.config.training_dir)
        shutil.copy(self.config.source_dir,self.config.local_training_file)
        logger.info("file copied")

    def preprocessing(self):
        df = pd.read_csv(self.config.local_training_file)
        df = df.drop(columns='Unnamed: 0')
        if df.isnull().values.any():
            for column in df.columns:
                if df[column].dtype == 'int64'or df[column].dtype == 'float64':
                    df[column] = df[column].fillna(df[column].mean())
                else:
                    df[column] = df[column].fillna(df[column].mode())
        # removing outliers
        Q1 = df.quantile(.25)
        Q3 = df.quantile(.75)
        IQR = Q3-Q1
        df = df[~((df<(Q1-1.5*IQR)) | (df>(Q3+1.5*IQR))).any(axis=1)]
        for column in df.columns:
            df[column] += 1
            df[column] = np.log(df[column])
        fig= plt.figure(figsize=(10,10))
        sns.boxplot(df)
        plt.xticks(rotation=90)
        plt.title("Data after outlier correction")
        plt.savefig(self.config.images+'/'+'afer_outlier.png', format="png")
        plt.close(fig=fig)
        fig= plt.figure(figsize=(8,8))
        sns.heatmap(df.corr(),annot=True,cmap="Spectral_r")
        plt.savefig(self.config.images+'/'+'heatmap.png', format="png")
        plt.close(fig=fig)

        return df.to_csv(self.config.preprocessed_data,index=False)
                
    def clustering(self):
        df = pd.read_csv(self.config.preprocessed_data)
        X = df.drop(columns='Concrete_compressive _strength')
        y = df['Concrete_compressive _strength']
        #X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)
        #X_valid,X_test,y_valid,y_test = train_test_split(X_test,y_test,test_size=0.5,random_state=42)

        scaler = StandardScaler()
        X_scaled = pd.DataFrame(scaler.fit_transform(X),columns=X.columns,index=X.index)
        
        
        inertia = []
        K = range(1,11)
        for k in K:
            km = KMeans(n_clusters=k)
            km = km.fit(X_scaled.values)
            inertia.append(km.inertia_)
        fig = plt.plot(K, inertia, 'bx-')
        plt.xlabel('k')
        plt.ylabel('Sum_of_squared_distances')
        plt.title('Elbow Method For Optimal k')
        plt.savefig(self.config.images+'/'+'elbow_method_K.png')
        plt.close()
        
        kl = KneeLocator(range(1, 11), inertia, curve="convex", direction="decreasing")
        no_of_clusters = kl.elbow
        cluster_model = KMeans(n_clusters=no_of_clusters,init="k-means++",random_state=42).fit(X_scaled.values)
        save_model(cluster_model,self.config.Models_dir,'cluster_model')

        clusters_train = cluster_model.predict(X_scaled.values)
        #clusters_valid = cluster_model.predict(X_valid_scaled.values)
        #clusters_test = cluster_model.predict(X_test_scaled.values)
        X_scaled['cluster'] = pd.Series(clusters_train, index=X_scaled.index) #cluster column added
        dfs=[X_scaled[X_scaled['cluster']==i] for i in range(no_of_clusters)]
        y_cluster = [y.loc[dfs[i].index] for i in range(len(dfs))]
        new_dfs = []
        for i in range(len(dfs)):
            dfs[i]['Concrete Compressive strength'] = y_cluster[i] 
            new_dfs.append(dfs[i])
        for i in range(len(new_dfs)):
            new_dfs[i].to_csv(self.config.preprocessed_data_dir+'/'+'cluster'+str(i)+'.csv',index=False)

    def regressor(self):
        
        models = []
        for i,cluster in enumerate(os.listdir(self.config.preprocessed_data_dir)):
            df = pd.read_csv(os.path.join(self.config.preprocessed_data_dir,cluster))
            df = df.drop(columns='cluster')
            X = df.drop(columns='Concrete Compressive strength')
            y = df['Concrete Compressive strength']
            X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)
            rm = RandomForestRegressor(random_state=42)
            params = { 'n_estimators': [50,100,150],'criterion':['squared_error','absolute_error','friedman_mse','poisson']}
            grid = GridSearchCV(estimator=rm,param_grid=params)
            grid.fit(X_train, y_train)
            random_model = RandomForestRegressor(criterion=grid.best_params_['criterion'],n_estimators=grid.best_params_['n_estimators']).fit(X_train, y_train)
            save_model(random_model,self.config.Models_dir,'random_model_cluster'+str(i))
            models.append(random_model)



In [112]:
from dataclasses import dataclass
from pathlib import Path
@dataclass(frozen=True)
class TrainingConfig:
    source_dir: Path
    training_dir: Path
    local_training_file: Path
    preprocessed_data_dir: Path
    preprocessed_data: Path
    Models_dir: Path
    images: Path

In [113]:
from CementStrength.utils import read_yaml,create_directories
from CementStrength.entity import DataIngestionConfig,DataValidationConfig,DbOperationsConfig
from CementStrength.constants import *





class ConfigurationManager:
    
      def __init__(self, 
                config_file= CONFIG_FILE_PATH,
                param_file = PARAMS_FILE_PATH):

            self.config_file = read_yaml(config_file)
            self.param_file = read_yaml(param_file)
            
            
      
      def get_data_ingestion_config(self) -> DataIngestionConfig:
            create_directories(self.config_file.data_ingestion.root_dir)  
            create_directories(self.config_file.data_validation.good_dir)
            create_directories(self.config_file.data_validation.bad_dir)
            data_ingestion_config = DataIngestionConfig(
            root_dir = self.config_file.data_ingestion.root_dir,
            source_URL = self.config_file.data_ingestion.source_URL,
            local_data_file = self.config_file.data_ingestion.local_data_file,
            unzip_dir = self.config_file.data_ingestion.unzip_dir
                        
                  )
            return data_ingestion_config
      
      def get_data_validation_config(self) -> DataValidationConfig:
            data_validation_config = DataValidationConfig(
                source_dir = self.config_file.data_validation.source_dir,   
                good_dir =  self.config_file.data_validation.good_dir,
                bad_dir = self.config_file.data_validation.bad_dir
                        
                  )
            return data_validation_config
      
      def get_db_operations_config(self) -> DbOperationsConfig:
            db_operations_config = DbOperationsConfig(
                  source_dir = self.config_file.db_operations.source_dir,
                  db_dir= self.config_file.db_operations.db_dir,
                  db_name= self.config_file.db_operations.db_name,
                  training_dir = self.config_file.db_operations.training_dir,
                  training_file= self.config_file.db_operations.training_file             
                  )
            return db_operations_config

      def get_training_config(self) -> TrainingConfig:
                       
            training_config = TrainingConfig(
                  source_dir = self.config_file.training.source_dir,
                  training_dir = self.config_file.training.training_dir,
                  local_training_file = self.config_file.training.local_training_file,
                  preprocessed_data_dir = self.config_file.training.preprocessed_data_dir,
                  preprocessed_data= self.config_file.training.preprocessed_data,
                  Models_dir= self.config_file.training.Models_dir,
                  images= self.config_file.training.images
                           
                  )
            print("Debug - Training Config:", training_config)
            return training_config

In [114]:
train_config = ConfigurationManager()
configs= train_config.get_training_config()

[2023-11-16 15:21:37,645: INFO: common: yaml file: config\config.yml loaded successfully]
[2023-11-16 15:21:37,654: INFO: common: yaml file: params.yml loaded successfully]
Debug - Training Config: TrainingConfig(source_dir='db_operations/training/input.csv', training_dir='Training', local_training_file='Training/training.csv', preprocessed_data_dir='Training/preprocessed', preprocessed_data='Training/preprocessed_data.csv', Models_dir='Training/models', images='Training/images')


In [115]:
train = Training(config=configs)
train.copy_training_file()
train.preprocessing()
train.clustering()
train.regressor()

[2023-11-16 15:21:38,435: INFO: common: directory created]
[2023-11-16 15:21:38,437: INFO: common: directory created]
[2023-11-16 15:21:38,439: INFO: common: directory created]
[2023-11-16 15:21:38,441: INFO: common: directory created]
[2023-11-16 15:21:38,443: INFO: 3524851760: file copied]


  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


[2023-11-16 15:21:39,806: INFO: common: model saved]


  super()._check_params_vs_input(X, default_n_init=10)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfs[i]['Concrete Compressive strength'] = y_cluster[i]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfs[i]['Concrete Compressive strength'] = y_cluster[i]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfs[i]['Concrete Compressive strength'] 

[2023-11-16 15:21:57,278: INFO: common: model saved]
[2023-11-16 15:22:09,028: INFO: common: model saved]
[2023-11-16 15:22:23,128: INFO: common: model saved]
[2023-11-16 15:22:33,274: INFO: common: model saved]
