### **Feature Engineering**

In [87]:
import pandas as pd
import matplotlib.pyplot as plt

In [88]:
df = pd.read_csv(r"D:\Downloads\diamond.csv")
df

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.70,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.50,2772
3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,4,1.70,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453
...,...,...,...,...,...,...,...,...,...,...,...
193568,193568,0.31,Ideal,D,VVS2,61.1,56.0,4.35,4.39,2.67,1130
193569,193569,0.70,Premium,G,VVS2,60.3,58.0,5.75,5.77,3.47,2874
193570,193570,0.73,Very Good,F,SI1,63.1,57.0,5.72,5.75,3.62,3036
193571,193571,0.34,Very Good,D,SI1,62.9,55.0,4.45,4.49,2.81,681


In [89]:
df1 = df.copy()

In [90]:
import seaborn as sns

In [91]:
df['cut'].unique()

array(['Premium', 'Very Good', 'Ideal', 'Good', 'Fair'], dtype=object)

In [92]:
cut_map = {"Fair":1, "Good":2, "Very Good": 3, "Premium":4, "Ideal":5}

In [93]:
df['color'].unique()

array(['F', 'J', 'G', 'E', 'D', 'H', 'I'], dtype=object)

In [94]:
color_map = {"D":1, "E":2, "F":3, "G":4, "H":5, "I":6, "J":7}

In [95]:
df['clarity'].unique()

array(['VS2', 'SI2', 'VS1', 'SI1', 'IF', 'VVS2', 'VVS1', 'I1'],
      dtype=object)

In [96]:
clarity_map = {"I1":1, "SI2":2, "SI1":3, "VS2":4, "VS1":5, "VVS2":6, "VVS1":7, "IF":8}

#### **Apply Map Function on 'cut', 'color', 'clarity' features of dataset**

In [97]:
df['cut'] = df['cut'].map(cut_map)
df['color'] = df['color'].map(color_map)
df['clarity'] = df['clarity'].map(clarity_map)

In [98]:
df.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,4,3,4,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,3,7,2,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.7,5,4,5,61.2,57.0,5.69,5.73,3.5,2772
3,3,0.32,5,4,5,61.6,56.0,4.38,4.41,2.71,666
4,4,1.7,4,4,4,62.6,59.0,7.65,7.61,4.77,14453


### **Model Training**

#### Drop 'id' feature from Dataset

In [99]:
df1.drop('id', axis = 1, inplace=True)

In [100]:
df1.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


### **Split the Dataset into Independent & Dependent Variables**

In [101]:
X = df1.drop('price', axis = 1)
y = df1.price

In [102]:
X

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55
1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05
2,0.70,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.50
3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71
4,1.70,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77
...,...,...,...,...,...,...,...,...,...
193568,0.31,Ideal,D,VVS2,61.1,56.0,4.35,4.39,2.67
193569,0.70,Premium,G,VVS2,60.3,58.0,5.75,5.77,3.47
193570,0.73,Very Good,F,SI1,63.1,57.0,5.72,5.75,3.62
193571,0.34,Very Good,D,SI1,62.9,55.0,4.45,4.49,2.81


In [103]:
y

0         13619
1         13387
2          2772
3           666
4         14453
          ...  
193568     1130
193569     2874
193570     3036
193571      681
193572     2258
Name: price, Length: 193573, dtype: int64

#### **Seperate the Categorical & Numerical Features From Dataset**

In [104]:
categorical_features = X.columns[X.dtypes=='O']
numerical_features = X.columns[X.dtypes != "O"]

In [105]:
categorical_features

Index(['cut', 'color', 'clarity'], dtype='object')

In [106]:
numerical_features

Index(['carat', 'depth', 'table', 'x', 'y', 'z'], dtype='object')

In [107]:
cut_categories = ['Premium', 'Very Good', 'Ideal', 'Good', 'Fair']
color_categories = ['F', 'J', 'G', 'E', 'D', 'H', 'I']
clarity_categories = ['VS2', 'SI2', 'VS1', 'SI1', 'IF', 'VVS2', 'VVS1', 'I1']

In [108]:
%pip install scikit-learn


Note: you may need to restart the kernel to use updated packages.


In [109]:
 #Handling Missing Values
from sklearn.impute import SimpleImputer     
# Feature Scaling Purpose
from sklearn.preprocessing import StandardScaler
# Ordinal Encoding for categorical features
from sklearn.preprocessing import OrdinalEncoder
# pipeline
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [110]:
# Numerical Pipeline

num_pipeline = Pipeline(steps = [
    ('imputer', SimpleImputer(strategy = 'median')),
    ('sacler', StandardScaler())
])

# Categorical Pipeline
cat_pipeline = Pipeline(steps = [
    ('imputer', SimpleImputer(strategy = 'most_frequent')),
    ('ordinalencoder', OrdinalEncoder(categories = [cut_categories, color_categories, clarity_categories])),
    ('scaler', StandardScaler())
])

# Column Trasfer add Two Pipelines
preprocessor = ColumnTransformer([('num_pipeline', num_pipeline, numerical_features),
                                  ('cat_pipeline', cat_pipeline, categorical_features)
])

In [111]:
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [112]:
preprocessor.fit_transform(X_train)

array([[-0.82314374, -1.12998781, -0.64189666, ...,  0.65242448,
         0.05875301,  1.60539392],
       [ 0.94502267, -1.77782269,  0.92190185, ..., -0.38150613,
         1.12307004, -0.07924587],
       [ 1.9584839 ,  0.16568195,  0.40063568, ..., -1.41543675,
         1.12307004, -1.20233906],
       ...,
       [ 0.92345966,  0.90606467,  0.40063568, ..., -1.41543675,
        -0.47340551, -1.20233906],
       [-1.03877378, -0.66724861, -0.64189666, ..., -0.38150613,
        -0.47340551,  2.16694052],
       [-1.03877378, -0.01941373,  0.92190185, ..., -0.38150613,
        -0.47340551, -0.64079246]])

In [113]:
preprocessor.transform(X_test)

array([[-0.62907669,  0.25822979, -0.12063049, ...,  0.65242448,
         0.59091152,  0.48230073],
       [ 2.60537405, -2.14801405, -0.12063049, ..., -0.38150613,
        -0.47340551, -0.64079246],
       [-1.1250258 , -1.22253565,  0.92190185, ..., -1.41543675,
         0.05875301,  2.16694052],
       ...,
       [-0.82314374, -0.01941373, -0.64189666, ...,  0.65242448,
        -0.47340551,  2.16694052],
       [ 0.90189666, -0.66724861,  1.44316802, ..., -1.41543675,
         1.65522856,  0.48230073],
       [ 0.47063656,  0.90606467, -0.64189666, ..., -0.38150613,
         0.05875301, -1.20233906]])

In [114]:
X_train = pd.DataFrame(preprocessor.fit_transform(X_train), columns = preprocessor.get_feature_names_out())

In [115]:
X_train

Unnamed: 0,num_pipeline__carat,num_pipeline__depth,num_pipeline__table,num_pipeline__x,num_pipeline__y,num_pipeline__z,cat_pipeline__cut,cat_pipeline__color,cat_pipeline__clarity
0,-0.823144,-1.129988,-0.641897,-0.780451,-0.835103,-0.876024,0.652424,0.058753,1.605394
1,0.945023,-1.777823,0.921902,1.073226,1.166389,0.946633,-0.381506,1.123070,-0.079246
2,1.958484,0.165682,0.400636,1.703116,1.755063,1.742237,-1.415437,1.123070,-1.202339
3,-0.995648,-0.574701,-0.641897,-1.122391,-1.161138,-1.165334,0.652424,-1.537723,2.166941
4,-0.995648,0.258230,0.400636,-1.176382,-1.152082,-1.136403,-0.381506,1.655229,0.482301
...,...,...,...,...,...,...,...,...,...
135496,-0.629077,-1.500179,1.964434,-0.546492,-0.518125,-0.644575,-0.381506,0.058753,0.482301
135497,2.411307,0.443325,2.485700,1.919078,1.872797,1.930288,-0.381506,-1.537723,0.482301
135498,0.923460,0.906065,0.400636,0.992240,0.921862,1.047891,-1.415437,-0.473406,-1.202339
135499,-1.038774,-0.667249,-0.641897,-1.212375,-1.197364,-1.252127,-0.381506,-0.473406,2.166941


In [116]:
X_test = pd.DataFrame(preprocessor.transform(X_test), columns = preprocessor.get_feature_names_out())

In [117]:
X_test

Unnamed: 0,num_pipeline__carat,num_pipeline__depth,num_pipeline__table,num_pipeline__x,num_pipeline__y,num_pipeline__z,cat_pipeline__cut,cat_pipeline__color,cat_pipeline__clarity
0,-0.629077,0.258230,-0.120630,-0.600482,-0.581521,-0.572248,0.652424,0.590912,0.482301
1,2.605374,-2.148014,-0.120630,2.126042,2.198832,1.959219,-0.381506,-0.473406,-0.640792
2,-1.125026,-1.222536,0.921902,-1.374347,-1.414721,-1.469110,-1.415437,0.058753,2.166941
3,-1.017211,-0.574701,0.921902,-1.158385,-1.161138,-1.194265,-1.415437,1.655229,2.166941
4,0.858771,0.628421,-0.641897,0.947248,0.985258,1.004495,0.652424,1.123070,0.482301
...,...,...,...,...,...,...,...,...,...
58067,0.255007,0.535873,0.921902,0.416340,0.369414,0.425874,-0.381506,1.655229,-0.640792
58068,-0.607514,0.535873,-0.641897,-0.528495,-0.554351,-0.499920,0.652424,0.590912,-1.202339
58069,-0.823144,-0.019414,-0.641897,-0.834441,-0.862273,-0.847093,0.652424,-0.473406,2.166941
58070,0.901897,-0.667249,1.443168,1.046230,0.967145,0.932167,-1.415437,1.655229,0.482301


### **Model Training**

In [131]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.metrics import r2_score
regression = LinearRegression()
regression.fit(X_train, y_train)

In [119]:
y_pred = regression.predict(X_test)
y_pred

array([ 1372.79398597, 15792.2130202 ,   733.22229683, ...,
        1315.65210015,  6780.28837042,  5249.95868253])

In [120]:
regression.coef_

array([ 6538.82211476,  -232.4061404 ,  -177.05602534, -2090.53215704,
        -556.05883268,   -44.19321465,   -10.72128479,  -181.811724  ,
         121.17907723])

In [121]:
regression.intercept_

3976.8787389022987

In [122]:
import numpy as np

In [133]:
# Automate
def evaluate_model(Actual, Predicted):
    mse = mean_squared_error(Actual, Predicted)
    mae = mean_absolute_error(Actual, Predicted)
    rmse = np.sqrt(mse)
    score = r2_score(Actual, Predicted)
    return mse, mae, rmse, score

In [134]:
evaluate_model(y_test, y_pred)

(2665184.939656333, 1135.0835947511694, 1632.539414426596, 0.835064099195437)

### **Train Multiple Models**

In [125]:
models = {"LinearRegression": LinearRegression(),
          "Lasso": Lasso(),
          "Ridge":Ridge(),
          "ElasticNet":ElasticNet()}

In [126]:
Model_list = []
r2_list = []

In [127]:
for i in range(len(list(models))):
    print(i)

0
1
2
3


In [128]:
models.values()

dict_values([LinearRegression(), Lasso(), Ridge(), ElasticNet()])

In [129]:
models.keys()

dict_keys(['LinearRegression', 'Lasso', 'Ridge', 'ElasticNet'])

In [137]:
for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train)
    #Make predictions
    y_pred = model.predict(X_test)
    mse, mae, rmse, score = evaluate_model(y_test, y_pred)
    Model_list.append(list(models.keys())[i])
    print("Model Training Performance")
    print(f"MSE:{mse}")
    print(f"MAE : {mae}")
    print(f"RMSE : {rmse}")
    print(f"R2SQUARE : {score*100}")
    
    r2_list.append(score)
    print("*" * 80)
    print('\n')

Model Training Performance
MSE:1432389.797210269
MAE : 723.6034569047462
RMSE : 1196.824881597249
R2SQUARE : 91.1356057138532
********************************************************************************


Model Training Performance
MSE:1432315.1270561789
MAE : 725.055055982241
RMSE : 1196.7936860863608
R2SQUARE : 91.13606781270963
********************************************************************************


Model Training Performance
MSE:1432403.9294425463
MAE : 723.6370749375686
RMSE : 1196.8307856345216
R2SQUARE : 91.13551825604019
********************************************************************************


Model Training Performance
MSE:2665184.939656333
MAE : 1135.0835947511694
RMSE : 1632.539414426596
R2SQUARE : 83.5064099195437
********************************************************************************


