Model Training

In [1]:
import pandas as pd

In [6]:
# Load the raw gemstone dataset from disk and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="data/gemstone.csv")
df.head(5)

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [7]:
# Remove the `id` column from the frame. `errors="ignore"` keeps this cell
# re-runnable: without it, a second execution raises KeyError because `id`
# has already been dropped from `df`.
df = df.drop(columns=["id"], errors="ignore")


In [8]:
# Display the frame after the `id` column has been dropped.
df

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,0.70,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.50,2772
3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,1.70,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453
...,...,...,...,...,...,...,...,...,...,...
193568,0.31,Ideal,D,VVS2,61.1,56.0,4.35,4.39,2.67,1130
193569,0.70,Premium,G,VVS2,60.3,58.0,5.75,5.77,3.47,2874
193570,0.73,Very Good,F,SI1,63.1,57.0,5.72,5.75,3.62,3036
193571,0.34,Very Good,D,SI1,62.9,55.0,4.45,4.49,2.81,681


In [10]:
# Separate the predictors from the regression target:
# X holds every column except `price`, Y holds the `price` column.
X = df.drop(columns=["price"])
Y = df["price"]


In [12]:
# Partition the feature columns by dtype: object-dtype columns are handled as
# categorical (ordinal-encoded) features, all remaining columns as numerical.
# NOTE: the spelling `caetgorical_cols` is kept because the ColumnTransformer
# cell further down refers to this exact name.
caetgorical_cols = X.select_dtypes(include=["object"]).columns
numerical_cols = X.select_dtypes(exclude=["object"]).columns

In [13]:
# Custom ranking for each ordinal feature. These lists are handed to
# OrdinalEncoder, so the position of a label in its list determines the
# integer code it is mapped to.
cut_categories = [
    "Fair",
    "Good",
    "Very Good",
    "Premium",
    "Ideal",
]
color_categories = [
    "D",
    "E",
    "F",
    "G",
    "H",
    "I",
    "J",
]
clarity_categories = [
    "I1",
    "SI2",
    "SI1",
    "VS2",
    "VS1",
    "VVS2",
    "VVS1",
    "IF",
]

In [14]:
from sklearn.impute import SimpleImputer #handling missing values
from sklearn.preprocessing import StandardScaler #Handling feaetrure scaling
from sklearn.preprocessing import OrdinalEncoder #Ordinal Encoding
##Pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [15]:
# Numerical pipeline: fill missing values with the column median, then
# standardise each column with StandardScaler.
num_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

In [23]:
# Categorical pipeline: fill missing values with the most frequent label, map
# each label to its position in the custom ranking lists, then standardise the
# resulting codes.
# The `categories` lists are matched to the input columns by position, so their
# order must follow the order of the categorical columns (cut, color, clarity).
# NOTE(review): re-check this if the column order of the dataset ever changes.
cat_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        (
            "ordnialencoder",
            OrdinalEncoder(
                categories=[cut_categories, color_categories, clarity_categories]
            ),
        ),
        ("scaler", StandardScaler()),
    ]
)

In [24]:
# Route each group of columns through its own pipeline: numerical columns go
# through `num_pipeline`, categorical columns through `cat_pipeline`.
preprocessor = ColumnTransformer(
    transformers=[
        ("num_pipeline", num_pipeline, numerical_cols),
        ("cat_pipeline", cat_pipeline, caetgorical_cols),
    ]
)

In [29]:
## Train / test split

from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.30,
    random_state=40,
)

In [30]:
# Fit the preprocessor on the training split and display the transformed array.
preprocessor.fit_transform(X_train)

array([[ 0.45410567, -0.29654463, -0.63784219, ...,  0.87150191,
        -0.31702606, -0.6503144 ],
       [ 3.02766057, -1.58999042,  1.96548435, ..., -0.13314643,
         0.91504383, -1.31534777],
       [ 0.54061172,  1.45884607, -1.15850749, ..., -1.13779478,
         0.91504383, -1.31534777],
       ...,
       [ 2.61675685,  1.82840201, -0.63784219, ..., -1.13779478,
         0.91504383, -1.31534777],
       [ 0.45410567,  0.99690115, -1.6791728 , ..., -2.14244312,
         0.29900888,  0.67975234],
       [-0.88673805, -1.12804549,  2.48614965, ..., -1.13779478,
        -0.93306101,  1.34478571]])

In [27]:
# Preview the transformed training features as a DataFrame labelled with the
# preprocessor's output feature names. `data` is evaluated first, so the
# preprocessor is fitted before its feature names are requested.
pd.DataFrame(
    data=preprocessor.fit_transform(X_train),
    columns=preprocessor.get_feature_names_out(),
)

Unnamed: 0,num_pipeline__carat,num_pipeline__depth,num_pipeline__table,num_pipeline__x,num_pipeline__y,num_pipeline__z,cat_pipeline__cut,cat_pipeline__color,cat_pipeline__clarity
0,0.454106,-0.296545,-0.637842,0.636818,0.663672,0.622410,0.871502,-0.317026,-0.650314
1,3.027661,-1.589990,1.965484,2.494366,2.433180,2.182754,-0.133146,0.915044,-1.315348
2,0.540612,1.458846,-1.158507,0.672887,0.645523,0.811984,-1.137795,0.915044,-1.315348
3,0.562238,-2.513880,1.444819,0.817163,0.790714,0.549496,-1.137795,-0.317026,-1.315348
4,-0.843485,-0.019378,-0.637842,-0.851023,-0.888051,-0.865022,0.871502,1.531079,1.344786
...,...,...,...,...,...,...,...,...,...
135496,-0.000051,-0.019378,-0.637842,0.176940,0.200877,0.199512,0.871502,-0.933061,-0.650314
135497,-0.540714,-0.573712,1.444819,-0.409179,-0.443405,-0.471290,-0.133146,-0.317026,0.014719
135498,2.616757,1.828402,-0.637842,2.106625,2.061130,2.255668,-1.137795,0.915044,-1.315348
135499,0.454106,0.996901,-1.679173,0.564681,0.627374,0.680740,-2.142443,0.299009,0.679752


In [31]:
# Replace the raw splits with their preprocessed versions. The preprocessor is
# fitted on the training split only and merely applied to the test split.
# NOTE(review): this overwrites X_train / X_test with frames whose columns are
# the transformed feature names, so re-running this cell without re-running
# the train/test split cell first is expected to fail on column lookup.
X_train = pd.DataFrame(
    data=preprocessor.fit_transform(X_train),
    columns=preprocessor.get_feature_names_out(),
)
X_test = pd.DataFrame(
    data=preprocessor.transform(X_test),
    columns=preprocessor.get_feature_names_out(),
)

In [32]:
# Show the first five rows of the preprocessed training features.
X_train.head(5)

Unnamed: 0,num_pipeline__carat,num_pipeline__depth,num_pipeline__table,num_pipeline__x,num_pipeline__y,num_pipeline__z,cat_pipeline__cut,cat_pipeline__color,cat_pipeline__clarity
0,0.454106,-0.296545,-0.637842,0.636818,0.663672,0.62241,0.871502,-0.317026,-0.650314
1,3.027661,-1.58999,1.965484,2.494366,2.43318,2.182754,-0.133146,0.915044,-1.315348
2,0.540612,1.458846,-1.158507,0.672887,0.645523,0.811984,-1.137795,0.915044,-1.315348
3,0.562238,-2.51388,1.444819,0.817163,0.790714,0.549496,-1.137795,-0.317026,-1.315348
4,-0.843485,-0.019378,-0.637842,-0.851023,-0.888051,-0.865022,0.871502,1.531079,1.344786


In [33]:
## Model Training

from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error

In [34]:
# Baseline model: ordinary least-squares regression fitted on the
# preprocessed training split.
regression = LinearRegression()
regression.fit(X=X_train, y=y_train)

In [35]:
# Fitted coefficients of the baseline linear model, one per preprocessed feature column.
regression.coef_

array([ 6444.18494694,   -95.79974015,   -67.01577837, -1823.10963   ,
          -8.19306509,  -460.63495897,    74.53336566,  -464.42173169,
         650.12826018])

In [36]:
# Fitted intercept term of the baseline linear model.
regression.intercept_

3966.0501029512675

In [41]:
import numpy as np
def evaluate_model(true, predicted):
    """Compute regression error metrics for a set of predictions.

    Parameters
    ----------
    true : array-like
        Ground-truth target values.
    predicted : array-like
        Predicted values, aligned element-wise with ``true``.

    Returns
    -------
    tuple
        ``(mae, rmse, r2_square)``: the mean absolute error, the root mean
        squared error and the R^2 score, in that order.
    """
    mae = mean_absolute_error(true, predicted)
    # Compute the MSE once and derive the RMSE from it instead of calling
    # mean_squared_error a second time.
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mse)
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [45]:
# Train several linear models on the preprocessed training split and report
# their error metrics on the held-out test split.
models = {
    "LinearRegression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "ElasticNet": ElasticNet(),
}

model_list = []  # model names, in training order
r2_list = []     # R^2 score of each model, parallel to model_list

# Iterate over the (name, estimator) pairs directly instead of rebuilding
# list(models.keys()) / list(models.values()) on every iteration.
for model_name, model in models.items():
    model.fit(X_train, y_train)

    # Make predictions on the held-out test split
    y_pred = model.predict(X_test)

    mae, rmse, r2_square = evaluate_model(y_test, y_pred)

    print(model_name)
    model_list.append(model_name)

    # The metrics are computed from X_test / y_test, so the header says
    # "Test" rather than "Training".
    print('Model Test Performance')
    print("RMSE: ", rmse)
    print("MAE: ", mae)
    print("R2 score", r2_square * 100)

    r2_list.append(r2_square)

    print("=" * 40)
    print('\n')

LinearRegression
Model Training Performance
RMSE:  1017.1722564866208
MAE:  674.6638217378082
R2 score 93.6343140306353


Lasso
Model Training Performance
RMSE:  1015.6056722113335
MAE:  675.3910275839801
R2 score 93.65390698366912


Ridge
Model Training Performance
RMSE:  1017.1731135131284
MAE:  674.6868170797188
R2 score 93.63430330371294


ElasticNet
Model Training Performance
RMSE:  1536.886461343751
MAE:  1065.5716610505156
R2 score 85.46751887552743




In [46]:
# Names of the models trained in the loop above, in training order.
model_list

['LinearRegression', 'Lasso', 'Ridge', 'ElasticNet']