Model Training

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the gemstone CSV, using its first column as the row index,
# then preview the first rows as a quick sanity check.
data = pd.read_csv(
    './data/gemstone.csv',
    index_col=0,
)
data.head()

Unnamed: 0_level_0,carat,cut,color,clarity,depth,table,x,y,z,price
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [3]:
# Separate the regression target ('price') from the predictor columns.
target = data['price']
features = data.drop(columns=['price'])

In [4]:
# Partition the feature columns by dtype: object-typed columns are treated as
# categorical, every other column as numerical.
nums_cols = features.select_dtypes(exclude=['object']).columns
cats_cols = features.select_dtypes(include=['object']).columns

In [47]:
# Explicit category orderings handed to the OrdinalEncoder via `categories`;
# a label's position in its list determines its encoded value.
# NOTE(review): cut and clarity look like they are listed worst-to-best while
# color runs D -> J; confirm that the direction of `color_map` is intended.
cut_map = [
    'Fair',
    'Good',
    'Very Good',
    'Premium',
    'Ideal',
]
clarity_map = [
    'I1',
    'SI2',
    'SI1',
    'VS2',
    'VS1',
    'VVS2',
    'VVS1',
    'IF',
]
color_map = [
    'D',
    'E',
    'F',
    'G',
    'H',
    'I',
    'J',
]

In [7]:
from sklearn.impute import SimpleImputer # Handling Missing Values
from sklearn.preprocessing import StandardScaler # Handling Feature Scaling
from sklearn.preprocessing import OrdinalEncoder # Handling Categorical Features
from sklearn.pipeline import Pipeline # Creates a Pipeline
from sklearn.compose import ColumnTransformer

In [53]:
# Numerical Pipeline: fill missing values with the column median, then
# standardise each column.

nums_pipe = Pipeline(
    steps = [
        ('imputer', SimpleImputer(strategy = 'median')),
        ('scaling', StandardScaler()),
    ]
)

# Categorical Pipeline: fill missing values with the most frequent label,
# ordinal-encode with the explicit orderings, then standardise.
#
# The orderings are looked up by column name, so every column receives its own
# list no matter in which order the columns appear in `cats_cols`. A
# categorical column without an entry here raises a KeyError instead of being
# silently paired with the wrong ordering.
category_orders = {
    'cut': cut_map,
    'color': color_map,
    'clarity': clarity_map,
}

cats_pipe = Pipeline(
    steps = [
        ('imputer', SimpleImputer(strategy = 'most_frequent')),
        ('ordinalencoder', OrdinalEncoder(categories = [category_orders[col] for col in cats_cols])),
        ('scaler', StandardScaler()),
    ]
)

# Route each column group through its own pipeline; the transformed numerical
# block comes first, followed by the categorical block.
preprocessor = ColumnTransformer(
    transformers = [
        ('Numerical_Pipeline', nums_pipe, nums_cols),
        ('Categorical_Pipeline', cats_pipe, cats_cols),
    ]
)

In [13]:
from sklearn.model_selection import train_test_split # train test split

In [54]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.3,
    random_state=1,
)

In [57]:
# Fit the preprocessing on the training split only, then apply the fitted
# transformer to both splits.
# NOTE: X_train / X_test are rebound to the transformed output here, so re-run
# the train/test split cell before re-running this one.
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [63]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [65]:
# Baseline: a plain linear regression fitted on the preprocessed training split.
regressor = LinearRegression()
regressor.fit(X_train, y_train)

LinearRegression()

In [66]:
# Inspect the coefficients learned by the baseline linear model.
regressor.coef_

array([ 6428.87814082,   -91.88110272,   -66.15819502, -1508.99672807,
        -275.0081191 ,  -487.32468042,    71.84138685,  -463.98810432,
         652.71737887])

In [67]:
# Inspect the intercept learned by the baseline linear model.
regressor.intercept_

3973.592984553613

In [68]:
def evalute_model(y_true, y_pred):
    """Score a set of regression predictions against the ground truth.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, aligned with ``y_true``.

    Returns
    -------
    tuple
        ``(mae, rmse, r2)``: the mean absolute error, the square root of the
        mean squared error, and the R2 score, in that order.
    """
    mean_abs_err = mean_absolute_error(y_true, y_pred)
    # RMSE is derived from the MSE rather than requested directly.
    root_mean_sq_err = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    return mean_abs_err, root_mean_sq_err, r2

In [71]:
# Training Multiple Models
#
# Fit every candidate on the preprocessed training split and score it on the
# held-out test split so the candidates can be compared side by side.

models = {
    'LinearRegression': LinearRegression(),
    'Lasso': Lasso(),
    'Ridge': Ridge(),
    'ElasticNet': ElasticNet()
}

model_list = []  # model names, in evaluation order
r2_list = []     # R2 score of each model, aligned with model_list

for model, main_model in models.items():
    main_model.fit(X = X_train, y = y_train)

    # predictions on the held-out split
    y_pred = main_model.predict(X_test)

    # performance on the held-out split
    mae, rmse, r2_square = evalute_model(y_test, y_pred)

    print(model)
    model_list.append(model)
    r2_list.append(r2_square)
    # The metrics are computed from X_test / y_test, so they are reported as
    # test performance rather than training performance.
    print('Model Test Performance')
    print('RMSE:', rmse)
    print('MAE:', mae)
    print('R2 SCORE:', r2_square)
    print('#'*20)



LinearRegression
Model Training Performance
RMSE: 1012.1792803995619
MAE: 672.9617720149348
R2 SCORE: 0.9366157287500738
####################
Lasso
Model Training Performance
RMSE: 1010.402436940046
MAE: 673.8172864700618
R2 SCORE: 0.9368380709307652
####################
Ridge
Model Training Performance
RMSE: 1012.2023151018992
MAE: 672.9936437915959
R2 SCORE: 0.9366128437780875
####################
ElasticNet
Model Training Performance
RMSE: 1534.6566639577798
MAE: 1063.3673862260225
R2 SCORE: 0.8542900951017094
####################


In [72]:
# R2 scores collected by the training loop, in the same order as `model_list`.
r2_list

[0.9366157287500738,
 0.9368380709307652,
 0.9366128437780875,
 0.8542900951017094]