In [1]:
# Data Ingestion
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

data = pd.read_csv('data/gemstone.csv',index_col='id')
data.head() 

Unnamed: 0_level_0,carat,cut,color,clarity,depth,table,x,y,z,price
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [2]:
# it is a oridinal encoding
cut_categories=["Fair","Good","Very Good","Premium","Ideal"]
clarity_categories = ["I1","SI2","SI1","VS2", "VS1" , "VVS2" , "VVS1" ,"IF"]
color_categories = ["D" ,"E" ,"F" , "G" ,"H" , "I", "J"]

In [3]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder,StandardScaler

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer,make_column_selector

In [4]:
X = data.drop(['price'],axis=1)
y = data['price']

In [5]:
num_pipeline = Pipeline([("impute",SimpleImputer(strategy='mean')),
                         ("scaler",StandardScaler())])

num_pipeline

In [6]:
OrdinalEncoder()

In [7]:
cat_pipeline = Pipeline([("impute",SimpleImputer(strategy='most_frequent')),
               ("encoder",OrdinalEncoder(categories=[cut_categories,color_categories,clarity_categories])),
                         ("scaler",StandardScaler())])

cat_pipeline

In [8]:
preprocessing = ColumnTransformer([
    ('num',num_pipeline,make_column_selector(dtype_include='float')),
    ("cat",cat_pipeline,make_column_selector(dtype_include="object"))
])

preprocessing

In [9]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)

In [10]:
preprocessing.fit_transform(X_train)

array([[-0.19606969,  0.90607122,  0.92386228, ..., -0.132631  ,
         0.29923693, -0.64954469],
       [-0.60727286,  0.90607122, -0.63967556, ...,  0.87240773,
        -0.93296005,  0.01597861],
       [ 0.47484073, -0.66303749,  0.402683  , ..., -0.132631  ,
         0.29923693, -0.64954469],
       ...,
       [-1.06176056, -0.57073697, -1.16085484, ...,  0.87240773,
         0.29923693,  1.34702521],
       [-1.06176056, -0.10923441, -0.11849628, ...,  0.87240773,
         0.29923693,  2.01254851],
       [ 0.28006029,  0.99837174,  0.402683  , ..., -0.132631  ,
        -0.93296005, -1.31506799]])

In [11]:
import pandas as pd
import numpy as np
test_data = pd.read_csv("../artifacts/test_data.csv")
print(test_data.head().to_string())

       id  carat        cut color clarity  depth  table     x     y     z  price
0   72471   1.00      Ideal     H     SI1   62.5   57.0  6.34  6.35  3.97   5033
1   11426   0.39      Ideal     E     SI1   61.0   56.0  4.74  4.71  2.89    613
2  115662   0.61  Very Good     E     VS2   61.6   57.0  5.43  5.47  3.35   2041
3  101946   0.31    Premium     F     VS1   60.5   59.0  4.40  4.37  2.65    758
4  112880   2.01    Premium     I     SI1   58.7   60.0  8.30  8.25  4.83  14341


In [12]:
np.c_[[1]*len(test_data),np.array(test_data['price'])]

array([[    1,  5033],
       [    1,   613],
       [    1,  2041],
       ...,
       [    1,  1024],
       [    1, 15563],
       [    1,  1814]], dtype=int64)

In [13]:
from sklearn.metrics import mean_squared_error as mse,r2_score
from sklearn.linear_model import LinearRegression,Ridge,Lasso,ElasticNet,SGDRegressor

In [14]:
full_pipeline = Pipeline([("preprocessing",preprocessing),
                    ("lin_reg",LinearRegression())])

full_pipeline

In [15]:
full_pipeline.fit(X_train,y_train)

In [16]:
y_train_pred = full_pipeline.predict(X_train)

In [17]:
rmse_score = mse(y_train,y_train_pred,squared=False)
r2_score = r2_score(y_train,y_train_pred)
rmse_score,r2_score

(1013.7811526471509, 0.9366787333598693)

In [18]:
from sklearn.model_selection import cross_val_score

In [19]:
lin_reg_rmses = -cross_val_score(
    full_pipeline,
    X_train,
    y_train,
    cv=3,
    scoring="neg_root_mean_squared_error"
)

In [20]:
pd.Series(lin_reg_rmses).describe()

count       3.000000
mean     1015.567847
std         9.482998
min      1004.941196
25%      1011.767400
50%      1018.593603
75%      1020.881172
max      1023.168741
dtype: float64

In [21]:
mse(y_test,full_pipeline.predict(X_test),squared=False)

1019.3522760492173

# data preparation

In [22]:
X_train_prep = preprocessing.fit_transform(X_train)
X_train_prep

array([[-0.19606969,  0.90607122,  0.92386228, ..., -0.132631  ,
         0.29923693, -0.64954469],
       [-0.60727286,  0.90607122, -0.63967556, ...,  0.87240773,
        -0.93296005,  0.01597861],
       [ 0.47484073, -0.66303749,  0.402683  , ..., -0.132631  ,
         0.29923693, -0.64954469],
       ...,
       [-1.06176056, -0.57073697, -1.16085484, ...,  0.87240773,
         0.29923693,  1.34702521],
       [-1.06176056, -0.10923441, -0.11849628, ...,  0.87240773,
         0.29923693,  2.01254851],
       [ 0.28006029,  0.99837174,  0.402683  , ..., -0.132631  ,
        -0.93296005, -1.31506799]])

In [23]:
from sklearn.linear_model import LinearRegression,Ridge,Lasso,ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor


In [24]:
models = {'lin_reg':LinearRegression(),
          'ridge':Ridge(alpha=0.1),
          'lasso':Lasso(alpha=1),
          'elastic_net':ElasticNet(),
          'tree':DecisionTreeRegressor(),
          'random_forest':RandomForestRegressor()}

In [29]:
model_scores = pd.DataFrame(columns=['training_RMSE_Score',"CV_RMSE_mean_score","CV_RMSE_precision(std)"])

for model_key in models:
    model = models[model_key]

    model.fit(X_train_prep,y_train)
    y_pred = model.predict(X_train_prep)

    train_rmse_score = mse(y_train,y_pred,squared=False)

    cross_val_rmses = -cross_val_score(model,
                                      X_train_prep,
                                      y_train,
                                      cv=3,
                                      scoring="neg_root_mean_squared_error")
    
    cross_val_rmse_score = pd.Series(cross_val_rmses).mean()
    cross_val_score_std = pd.Series(cross_val_rmses).std()
    
    model_scores.loc[model_key] = [train_rmse_score,cross_val_rmse_score,cross_val_score_std]
    
    


In [32]:
model_scores

Unnamed: 0,training_RMSE_Score,CV_RMSE_mean_score,CV_RMSE_precision(std)
lin_reg,1013.781153,1015.567847,9.482998
ridge,1013.781153,1015.568398,9.483305
lasso,1013.91011,1014.679736,8.376937
elastic_net,1535.482755,1535.559575,8.634958
tree,14.107197,833.219788,10.980587
random_forest,226.662934,608.136614,3.351403


In [41]:
best_model_key = model_scores.sort_values(by=['CV_RMSE_mean_score','training_RMSE_Score'],ascending=[True,True]).index[0]
models[best_model_key]

In [92]:
model_scores.index = ['training_RMSE_Score',"CV_RMSE_mean_score","CV_RMSE_precision(std)"]
model_scores.drop(['training_RMSE_Score',"CV_RMSE_mean_score","CV_RMSE_precision(std)"],axis=1,inplace=True)
model_scores

Unnamed: 0,lin_reg,ridge,lasso,elastic_net,tree,random_forest
training_RMSE_Score,1012.309758,1012.309759,1012.381065,1537.221415,14.351319,228.842965
CV_RMSE_mean_score,1012.577574,1012.577224,1012.583802,1537.347175,838.548271,614.977304
CV_RMSE_precision(std),7.36793,7.368971,7.424074,1.928046,11.431172,5.803528


#### RandomForests are performing better for the given regression task 

In [1]:
from DaimondPricePrediction.logger import logging

ModuleNotFoundError: No module named 'DaimondPricePrediction'

In [10]:
from datetime import datetime 
datetime.now().strftime("%d_%m_%Y_%H:%M:%S")

'03_11_2023_16:43:41'

In [52]:
from src.DaimondPricePrediction.utils.utils import load_pickle_object

load_pickle_object("artifacts/model.pkl")

CustomException: Error occured in python script :name [f:\ml_end_to_end_project\src\DaimondPricePrediction\utils\utils.py] line number [29] error message [[Errno 2] No such file or directory: 'artifacts/model.pkl']