In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.base import BaseEstimator,TransformerMixin
import tensorflow as tf

2024-01-02 02:13:51.025043: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-01-02 02:13:53.498955: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-01-02 02:13:53.499048: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-01-02 02:13:53.853453: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-01-02 02:13:54.617448: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-01-02 02:13:54.620041: I tensorflow/core/platform/cpu_feature_guard.cc:1

In [2]:
# Project locations are resolved relative to the parent of the current
# working directory.
root_dir = Path.cwd().parent
model_dir = root_dir / 'artifacts'

# Read the raw laptop listings and preview the first three rows.
csv_path = root_dir / 'laptopPrice.csv'
data = pd.read_csv(csv_path)
display(data.head(3))

Unnamed: 0,brand,processor_brand,processor_name,processor_gnrtn,ram_gb,ram_type,ssd,hdd,os,os_bit,graphic_card_gb,weight,warranty,Touchscreen,msoffice,Price,rating,Number of Ratings,Number of Reviews
0,ASUS,Intel,Core i3,10th,4 GB,DDR4,0 GB,1024 GB,Windows,64-bit,0 GB,Casual,No warranty,No,No,34649,2 stars,3,0
1,Lenovo,Intel,Core i3,10th,4 GB,DDR4,0 GB,1024 GB,Windows,64-bit,0 GB,Casual,No warranty,No,No,38999,3 stars,65,5
2,Lenovo,Intel,Core i3,10th,4 GB,DDR4,0 GB,1024 GB,Windows,64-bit,0 GB,Casual,No warranty,No,No,39999,3 stars,8,1


In [3]:
# Separate the regression target from the predictor columns.
target_col = 'Price'
y = data[target_col]
X = data.drop(columns=[target_col])

# Hold out 20% of the rows; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [14]:
import json

# Serialise the first held-out row as JSON records, normalise it through a
# parse/dump round trip, then rebuild a DataFrame from the resulting string.
# X_test is used through .iloc / .to_json here, i.e. as a pandas object.
first_row_json = X_test.iloc[0:1].to_json(orient="records")
first_row_records = json.loads(first_row_json)
json_vals = json.dumps(first_row_records)
pd.DataFrame(json.loads(json_vals))

Unnamed: 0,brand,processor_brand,processor_name,processor_gnrtn,ram_gb,ram_type,ssd,hdd,os,os_bit,graphic_card_gb,weight,warranty,Touchscreen,msoffice,rating,Number of Ratings,Number of Reviews
0,APPLE,M1,M1,10th,8 GB,DDR4,512 GB,0 GB,Mac,64-bit,0 GB,Casual,1 year,No,No,4 stars,6725,629


In [7]:
class LogScaling(BaseEstimator, TransformerMixin):
    """Stateless transformer that applies ``log(1 + x)`` element-wise."""

    def fit(self, X, y=None):
        """Learn nothing and return ``self``.

        Args:
            X: Input features (unused).
            y: Optional targets (unused). Accepted so the transformer can be
                fitted by callers that pass targets along, e.g.
                ``fit_transform(X, y)`` or a ``ColumnTransformer`` /
                ``Pipeline`` fitted with ``y``; without this parameter such
                calls raise ``TypeError``.

        Returns:
            This transformer instance.
        """
        return self

    def transform(self, X):
        """Return ``np.log1p(X)`` for the given features."""
        return np.log1p(X)
    
class TransformationPipeline:
    """Builds the unfitted ``ColumnTransformer`` used to prepare the features."""

    def __init__(self) -> None:
        pass

    def preprocess(self, frame=None):
        """Assemble the preprocessing ``ColumnTransformer``.

        Args:
            frame: Optional DataFrame whose ``object``-dtype columns are
                treated as categorical. When omitted, the notebook-level
                ``data`` frame is used (the original behaviour).

        Returns:
            An unfitted ``ColumnTransformer`` with three branches:
            ``log1p`` of the two count columns, median-impute + standardise of
            the same two count columns, and impute + one-hot + scale of every
            categorical column. Columns not named in any branch are forwarded
            unchanged (``remainder='passthrough'``).
        """
        # Fall back to the notebook global only when no frame is supplied.
        source = data if frame is None else frame
        cat_cols = source.select_dtypes('object').columns
        num_cols = ['Number of Ratings', 'Number of Reviews']

        num_pipeline = Pipeline(
            steps=[
                ("imputer", SimpleImputer(strategy="median")),
                ("scaler", StandardScaler()),
            ]
        )

        cat_pipeline = Pipeline(
            steps=[
                ("imputer", SimpleImputer(strategy="most_frequent")),
                # handle_unknown="ignore": a category that was not present at
                # fit time is encoded as all zeros instead of raising, so the
                # fitted preprocessor can be applied to new rows.
                ("one_hot_encoder", OneHotEncoder(handle_unknown="ignore")),
                # with_mean=False: scale without centering so the encoder's
                # sparse output can be scaled.
                ("scaler", StandardScaler(with_mean=False)),
            ]
        )

        # NOTE(review): both the log branch and the numeric branch consume
        # num_cols, so each count column contributes two output features.
        preprocessor = ColumnTransformer(
            [
                ("log_transform", LogScaling(), num_cols),
                ("num_pipeline", num_pipeline, num_cols),
                ("cat_pipelines", cat_pipeline, cat_cols),
            ],
            remainder='passthrough',
        )

        return preprocessor


In [17]:
# Build the preprocessing transformer and fit it on the training split only.
process = TransformationPipeline()
preprocessor = process.preprocess()
proc_obj = preprocessor.fit(X_train)

# NOTE: X_train / X_test are rebound to their transformed versions here, so
# re-running this cell without re-running the split cell would feed the
# already-transformed matrices back into the preprocessor.
X_train, X_test = (
    preprocessor.transform(X_train),
    preprocessor.transform(X_test),
)
print(X_train.shape, X_test.shape)

(658, 81) (165, 81)


In [20]:
import joblib

# Make sure the artifacts directory exists before writing into it; the dump
# fails with FileNotFoundError when the parent directory is missing.
model_dir.mkdir(parents=True, exist_ok=True)
joblib.dump(proc_obj, model_dir.joinpath('preprocessor.pkl'))

['/workspaces/devops-for-mlops/artifacts/preprocessor.pkl']

In [9]:
# Inspect the container type that the preprocessor's transform returned.
type(X_train)

scipy.sparse._csr.csr_matrix

In [10]:
class RegressorModel:
    """Trains a small fully connected Keras regressor and saves it.

    Args:
        X_train: Training feature matrix, either dense array-like or a sparse
            matrix exposing ``toarray()``.
        X_test: Validation feature matrix, same accepted forms as ``X_train``.
        model_path: Location passed to ``model.save``.
        y_train: Optional training targets. When omitted, the notebook-level
            ``y_train`` is used (the original behaviour).
        y_test: Optional validation targets. When omitted, the notebook-level
            ``y_test`` is used (the original behaviour).
    """

    def __init__(self, X_train, X_test, model_path: Path, y_train=None, y_test=None) -> None:
        self.X_train = X_train
        self.X_test = X_test
        self.model_path = model_path
        self.y_train = y_train
        self.y_test = y_test

    @staticmethod
    def _to_dense(features):
        """Return ``features`` as a dense array, densifying sparse input."""
        if hasattr(features, "toarray"):
            return features.toarray()
        return np.asarray(features)

    def model_train(self, epochs=20, activation=None):
        """Fit the network on the stored matrices and save it to ``model_path``.

        Args:
            epochs: Number of training epochs (default 20, as before).
            activation: Activation applied to the hidden ``Dense`` layers.
                The default ``None`` keeps the original layers unchanged.
                NOTE(review): with no activation every layer is linear, so
                the stack can only represent a linear function of the
                inputs; consider e.g. ``"relu"``.

        Returns:
            None. The trained model is written to ``self.model_path``.
        """
        # Use the matrices handed to the constructor rather than whatever
        # X_train / X_test happen to be bound to at notebook level.
        train_features = self._to_dense(self.X_train)
        val_features = self._to_dense(self.X_test)

        # Targets fall back to the notebook globals only when not supplied.
        train_targets = y_train if self.y_train is None else self.y_train
        val_targets = y_test if self.y_test is None else self.y_test

        model = tf.keras.Sequential()
        model.add(tf.keras.layers.Dense(79, activation=activation))
        model.add(tf.keras.layers.Dense(200, activation=activation))
        model.add(tf.keras.layers.Dense(200, activation=activation))
        model.add(tf.keras.layers.Dense(200, activation=activation))
        model.add(tf.keras.layers.Dense(1))
        model.compile(
            loss='mse',
            optimizer=tf.keras.optimizers.Adam(),
            metrics=[tf.keras.metrics.RootMeanSquaredError(name='rmse')],
        )
        model.fit(
            train_features,
            train_targets,
            validation_data=(val_features, val_targets),
            epochs=epochs,
        )
        model.save(self.model_path)

In [11]:
# Train on the preprocessed matrices and save the network under model_dir.
model_obj = RegressorModel(
    X_train=X_train,
    X_test=X_test,
    model_path=model_dir,
)
model_obj.model_train()

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
INFO:tensorflow:Assets written to: /workspaces/devops-for-mlops/artifacts/assets


INFO:tensorflow:Assets written to: /workspaces/devops-for-mlops/artifacts/assets


In [21]:
# Reload the network from model_dir, the path the training cell saved it to.
model = tf.keras.models.load_model(model_dir)


In [22]:
# reshape(-1, 1) yields a single column without hard-coding the number of
# test rows, so the cell keeps working if the split size changes.
preds = model.predict(X_test.toarray()).reshape(-1, 1)
preds



array([[120568.34 ],
       [ 25586.377],
       [ 33304.6  ],
       [ 40663.75 ],
       [ 69370.375],
       [ 57484.82 ],
       [131387.75 ],
       [ 36300.78 ],
       [ 48738.273],
       [ 44257.46 ],
       [118047.99 ],
       [ 73605.57 ],
       [ 83549.51 ],
       [ 49792.11 ],
       [ 52635.402],
       [ 43263.305],
       [ 32579.996],
       [ 56225.203],
       [ 46406.355],
       [113693.69 ],
       [105214.8  ],
       [ 59212.23 ],
       [ 68719.61 ],
       [ 65525.094],
       [ 80597.91 ],
       [ 50652.32 ],
       [ 58237.36 ],
       [ 53220.445],
       [ 33585.383],
       [ 33057.348],
       [ 75786.7  ],
       [ 90358.78 ],
       [ 54899.668],
       [ 33906.914],
       [ 60822.19 ],
       [133744.55 ],
       [ 42255.09 ],
       [ 48503.96 ],
       [ 91332.125],
       [ 27284.85 ],
       [ 54482.633],
       [ 36665.47 ],
       [ 54075.387],
       [117176.44 ],
       [ 55678.79 ],
       [ 30502.346],
       [ 53396.867],
       [ 4820

### Test the FastAPI microservice with the `requests` library

In [3]:
import json

import requests

# Rebuild the raw split so that X_test is a pandas object again (it is used
# through .iloc / .to_json below).
X = data.drop(columns=['Price'])
y = data['Price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

url = 'http://0.0.0.0:8080/predict/'

# to_json already returns a JSON string; it is sent as-is, i.e. the service
# receives the row as a string nested under 'test_data'.
json_vals = X_test.iloc[0:1].to_json(orient="records")
json_values = {'test_data': json_vals}

# Bound the wait: without a timeout the call can block forever if the
# service is down or unresponsive.
x = requests.post(url, json=json_values, timeout=30)
x.json()

{'Price': 133136.859375}