In [2]:
# Pinned to the version the recorded output below shows being installed, so a
# re-run resolves the same package; %pip installs into the running kernel's
# environment rather than whichever `pip` is first on PATH.
%pip install pymongo==4.4.1

Collecting pymongo




  Downloading pymongo-4.4.1-cp39-cp39-win_amd64.whl (408 kB)
Collecting dnspython<3.0.0,>=1.16.0
  Downloading dnspython-2.3.0-py3-none-any.whl (283 kB)
Installing collected packages: dnspython, pymongo
Successfully installed dnspython-2.3.0 pymongo-4.4.1


## 1. Data Ingestion Pipeline:
- a. Design a data ingestion pipeline that collects and stores data from various sources such as databases, APIs, and streaming platforms.
- b. Implement a real-time data ingestion pipeline for processing sensor data from IoT devices.
- c. Develop a data ingestion pipeline that handles data from different file formats (CSV, JSON, etc.) and performs data validation and cleansing.

##### Data collection from various sources (here, MongoDB)

In [3]:
import pymongo
import pandas as pd
import json
from dataclasses import dataclass
# The MongoDB connection URL is read from the MONGO_DB_URL environment variable (see EnvironmentVariable below).
import os

@dataclass
class EnvironmentVariable:
    """
    Connection URL and AWS credentials taken from the process environment.

    Every default is looked up once, when this class body is executed; a
    variable that is not set in the environment leaves the field as None.
    Values passed explicitly to the constructor override the defaults.
    """
    # MongoDB connection URL, from MONGO_DB_URL
    mongo_db_url:str = os.environ.get("MONGO_DB_URL")
    # AWS key pair, from AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY
    aws_access_key_id:str = os.environ.get("AWS_ACCESS_KEY_ID")
    aws_access_secret_key:str = os.environ.get("AWS_SECRET_ACCESS_KEY")

# Name of the column that the evaluation code reads as the prediction target.
TARGET_COLUMN = "class"

# Settings resolved from the environment, and the MongoDB client built from
# the URL they carry.
env_var = EnvironmentVariable()
mongo_client = pymongo.MongoClient(env_var.mongo_db_url)

In [None]:
import pandas as pd
from sensor.config import mongo_client
import os,sys
import numpy as np
import dill

def get_collection_as_dataframe(database_name:str,collection_name:str)->pd.DataFrame:
    """
    Description: Load every document of a MongoDB collection into a dataframe
    =========================================================
    Params:
    database_name: name of the database to look up on ``mongo_client``
    collection_name: name of the collection inside that database
    =========================================================
    return Pandas dataframe with one row per document; the ``_id`` column
    is removed when the documents carry one
    """
    collection = mongo_client[database_name][collection_name]
    documents = list(collection.find())
    frame = pd.DataFrame(documents)

    # Nothing to strip (for example, an empty collection yields no columns).
    if "_id" not in frame.columns:
        return frame
    return frame.drop(columns=["_id"])

In [None]:
import os,sys
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split

class DataIngestion:
    '''
    Export a MongoDB collection to CSV files: the full data set (feature
    store) plus a train split and a test split.

    ``utils``, ``config_entity`` and ``artifact_entity`` are not defined in
    this cell; they have to be imported from the project modules before
    ``initiate_data_ingestion`` is called. The annotations that mention
    them are quoted so that merely defining the class does not need them.
    '''
    def __init__(self,data_ingestion_config:"config_entity.DataIngestionConfig" ):
        # Object providing database_name, collection_name, test_size and the
        # feature store / train / test file paths read below.
        self.data_ingestion_config = data_ingestion_config

    @staticmethod
    def _ensure_parent_dir(file_path:str)->None:
        """Create the directory that will contain ``file_path`` if it is missing."""
        parent_dir = os.path.dirname(file_path)
        # A bare file name has no directory component, and os.makedirs("")
        # raises, so only create a directory when there is one to create.
        if parent_dir:
            os.makedirs(parent_dir,exist_ok=True)

    def initiate_data_ingestion(self)->"artifact_entity.DataIngestionArtifact":
        """
        Read the configured collection, write it out and split it.

        Steps:
        1. Load the collection through ``utils.get_collection_as_dataframe``.
        2. Replace the literal string "na" with NaN.
        3. Write the whole frame to ``feature_store_file_path``.
        4. Split with ``train_test_split`` (``test_size`` from the config,
           fixed ``random_state=42``) and write both parts to
           ``train_file_path`` and ``test_file_path``.

        Parent directories of all three output files are created on demand.

        Returns:
            ``artifact_entity.DataIngestionArtifact`` carrying the three
            file paths that were written.
        """
        config = self.data_ingestion_config

        # Exporting collection data as pandas dataframe
        df:pd.DataFrame  = utils.get_collection_as_dataframe(
            database_name=config.database_name,
            collection_name=config.collection_name)

        # Replace the "na" marker with NaN. np.nan is the canonical NumPy
        # spelling; the upper-case aliases are not portable across versions.
        df = df.replace(to_replace="na",value=np.nan)

        # Save the complete data set in the feature store
        self._ensure_parent_dir(config.feature_store_file_path)
        df.to_csv(path_or_buf=config.feature_store_file_path,index=False,header=True)

        # Split dataset into train and test set
        train_df,test_df = train_test_split(df,test_size=config.test_size,random_state=42)

        # The train and test files are not guaranteed to share a directory,
        # so make sure each destination exists before writing to it.
        for output_path in (config.train_file_path,config.test_file_path):
            self._ensure_parent_dir(output_path)

        # Save the train and test splits
        train_df.to_csv(path_or_buf=config.train_file_path,index=False,header=True)
        test_df.to_csv(path_or_buf=config.test_file_path,index=False,header=True)

        # Prepare artifact
        data_ingestion_artifact = artifact_entity.DataIngestionArtifact(
            feature_store_file_path=config.feature_store_file_path,
            train_file_path=config.train_file_path,
            test_file_path=config.test_file_path)

        return data_ingestion_artifact

## 2. Model Training:
- a. Build a machine learning model to predict customer churn based on a given dataset. Train the model using appropriate algorithms and evaluate its performance.
- b. Develop a model training pipeline that incorporates feature engineering techniques such as one-hot encoding, feature scaling, and dimensionality reduction.
- c. Train a deep learning model for image classification using transfer learning and fine-tuning techniques.

In [None]:
from source import artifact_entity,config_entity
from typing import Optional
import os,sys 
from xgboost import XGBClassifier
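# TODO: import the helper code defined above (the `utils` functions used by ModelTrainer) from wherever it is saved as a module.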
from sklearn.metrics import f1_score


class ModelTrainer:
    """
    Train an ``XGBClassifier`` on the transformed arrays and accept it only
    if its F1 scores pass the configured checks.

    ``model_trainer_config`` supplies ``expected_score``,
    ``overfitting_threshold`` and ``model_path``;
    ``data_transformation_artifact`` supplies ``transformed_train_path``
    and ``transformed_test_path``. Annotations naming project types are
    quoted so they are not evaluated when the class is defined.
    """
    def __init__(self,model_trainer_config:"config_entity.ModelTrainerConfig",
                data_transformation_artifact:"artifact_entity.DataTransformationArtifact"
                ):

        self.model_trainer_config=model_trainer_config
        self.data_transformation_artifact=data_transformation_artifact

    def train_model(self,x,y):
        """Fit an ``XGBClassifier`` with default parameters on ``x``/``y`` and return it."""
        xgb_clf =  XGBClassifier()
        xgb_clf.fit(x,y)
        return xgb_clf

    def initiate_model_trainer(self,)->"artifact_entity.ModelTrainerArtifact":
        """
        Train, score, validate and persist the model.

        The train and test arrays are loaded with
        ``utils.load_numpy_array_data``; the last column of each array is
        used as the target and the remaining columns as features.

        Returns:
            ``artifact_entity.ModelTrainerArtifact`` with the model path and
            the train/test F1 scores.

        Raises:
            Exception: if the test F1 score is below ``expected_score``, or
                if the absolute train/test score gap exceeds
                ``overfitting_threshold``. Nothing is saved in either case.
        """
        config = self.model_trainer_config

        train_arr = utils.load_numpy_array_data(file_path=self.data_transformation_artifact.transformed_train_path)
        test_arr = utils.load_numpy_array_data(file_path=self.data_transformation_artifact.transformed_test_path)

        # Last column is the target, everything before it is a feature.
        x_train,y_train = train_arr[:,:-1],train_arr[:,-1]
        x_test,y_test = test_arr[:,:-1],test_arr[:,-1]

        model = self.train_model(x=x_train,y=y_train)

        yhat_train = model.predict(x_train)
        f1_train_score  =f1_score(y_true=y_train, y_pred=yhat_train)

        yhat_test = model.predict(x_test)
        f1_test_score  =f1_score(y_true=y_test, y_pred=yhat_test)

        # Reject a model that does not reach the expected score. The message
        # is built from adjacent literals: a backslash continuation inside
        # the string would embed the next source line's indentation in it.
        if f1_test_score<config.expected_score:
            raise Exception(
                "Model is not good as it is not able to give "
                f"expected accuracy: {config.expected_score}: model actual score: {f1_test_score}")

        # Reject a model whose train and test scores are too far apart.
        diff = abs(f1_train_score-f1_test_score)

        if diff>config.overfitting_threshold:
            raise Exception(f"Train and test score diff: {diff} is more than overfitting threshold {config.overfitting_threshold}")

        #save the trained model
        utils.save_object(file_path=config.model_path, obj=model)

        #prepare artifact
        model_trainer_artifact  = artifact_entity.ModelTrainerArtifact(model_path=config.model_path,
        f1_train_score=f1_train_score, f1_test_score=f1_test_score)
        return model_trainer_artifact

##### A lot of the code is missing here, so please refer to this GitHub repository for the complete version: [Link](https://github.com/geijinchan/In-Progress)

## 3. Model Validation:
- a. Implement cross-validation to evaluate the performance of a regression model for predicting housing prices.
- b. Perform model validation using different evaluation metrics such as accuracy, precision, recall, and F1 score for a binary classification problem.
- c. Design a model validation strategy that incorporates stratified sampling to handle imbalanced datasets.

In [None]:
        def initiate_model_evaluation(self)->"artifact_entity.ModelEvaluationArtifact":
            """
            Score the newly trained model against the latest saved model.

            Both models are evaluated with ``f1_score`` on the test CSV named
            by ``self.data_ingestion_artifact.test_file_path``, each one using
            its own transformer and target encoder.

            Returns:
                ``artifact_entity.ModelEvaluationArtifact`` where
                ``improved_accuracy`` is the current score minus the previous
                score and ``is_model_accepted`` is True only when the current
                model scores strictly higher. When there is no saved model
                the current one is accepted with ``improved_accuracy=None``.
            """
            # Without a saved model there is nothing to compare against.
            latest_dir_path = self.model_resolver.get_latest_dir_path()
            if latest_dir_path is None:
                model_eval_artifact = artifact_entity.ModelEvaluationArtifact(is_model_accepted=True,
                improved_accuracy=None)
                return model_eval_artifact

            #Finding location of transformer model and target encoder
            transformer_path = self.model_resolver.get_latest_transformer_path()
            model_path = self.model_resolver.get_latest_model_path()
            target_encoder_path = self.model_resolver.get_latest_target_encoder_path()

            #Previous trained  objects
            transformer = load_object(file_path=transformer_path)
            model = load_object(file_path=model_path)
            target_encoder = load_object(file_path=target_encoder_path)

            #Currently trained model objects
            current_transformer = load_object(file_path=self.data_transformation_artifact.transform_object_path)
            current_model  = load_object(file_path=self.model_trainer_artifact.model_path)
            current_target_encoder = load_object(file_path=self.data_transformation_artifact.target_encoder_path)

            test_df = pd.read_csv(self.data_ingestion_artifact.test_file_path)
            target_df = test_df[TARGET_COLUMN]

            # Score of the previously saved model, using its own
            # transformer and target encoder.
            y_true =target_encoder.transform(target_df)
            input_feature_name = list(transformer.feature_names_in_)
            input_arr =transformer.transform(test_df[input_feature_name])
            y_pred = model.predict(input_arr)
            print(f"Prediction using previous model: {target_encoder.inverse_transform(y_pred[:5])}")
            previous_model_score = f1_score(y_true=y_true, y_pred=y_pred)

            # Score of the currently trained model, again with its own
            # transformer and target encoder.
            input_feature_name = list(current_transformer.feature_names_in_)
            input_arr =current_transformer.transform(test_df[input_feature_name])
            y_pred = current_model.predict(input_arr)
            y_true =current_target_encoder.transform(target_df)
            print(f"Prediction using trained model: {current_target_encoder.inverse_transform(y_pred[:5])}")
            current_model_score = f1_score(y_true=y_true, y_pred=y_pred)

            # Only report the new model as accepted when it actually beats the
            # saved one; an unconditional True would hide a regression.
            model_eval_artifact = artifact_entity.ModelEvaluationArtifact(
                is_model_accepted=bool(current_model_score > previous_model_score),
                improved_accuracy=current_model_score-previous_model_score)
            return model_eval_artifact

## 4. Deployment Strategy:
- a. Create a deployment strategy for a machine learning model that provides real-time recommendations based on user interactions.
- b. Develop a deployment pipeline that automates the process of deploying machine learning models to cloud platforms such as AWS or Azure.
- c. Design a monitoring and maintenance strategy for deployed models to ensure their performance and reliability over time.

Deployment Strategy:
- Creating a deployment strategy for a machine learning model that provides real-time recommendations based on user interactions would involve designing an API or service to handle user requests and provide recommendations based on the model's predictions.
- Developing a deployment pipeline that automates the process of deploying machine learning models to cloud platforms such as AWS or Azure would typically involve using infrastructure-as-code tools like Terraform or AWS CloudFormation to provision the necessary resources, containerization technologies like Docker, and deployment automation tools like Jenkins or GitLab CI/CD.

- Designing a monitoring and maintenance strategy for deployed models involves setting up monitoring tools to track model performance, resource utilization, and potential issues. It may also include implementing strategies for model retraining and updating based on new data or changes in the environment.