In [1]:
import pandas as pd

In [2]:
# Load the raw gemstone data and preview the first rows.
DATA_PATH = "data_set/gemstone.csv"
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [3]:
# Discard the `id` column before modelling.
df = df.drop(columns=["id"])

In [4]:
# Preview the frame again to confirm the `id` column is gone.
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [5]:
# Separate the predictors from the regression target (`price`).
Y = df["price"]
X = df.drop(columns=["price"])

In [6]:
# Hold out 30% of the rows as a test set; the fixed seed keeps the split reproducible.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=0)

# Row/column counts of the two feature splits.
X_train.shape, X_test.shape

((135501, 9), (58072, 9))

In [15]:
# Numeric (non-object) feature columns. The mask is built from X_train itself so the
# selection never depends on the dtypes of a different frame.
num_col = X_train.columns[X_train.dtypes != "object"]

In [16]:
# Pairwise correlation matrix of the numeric training features.
X_train[num_col].corr()

Unnamed: 0,carat,depth,table,x,y,z
carat,1.0,0.0293,0.225181,0.980684,0.980248,0.972174
depth,0.0293,1.0,-0.230916,-0.008771,-0.009573,0.083495
table,0.225181,-0.230916,1.0,0.237376,0.232878,0.211533
x,0.980684,-0.008771,0.237376,1.0,0.999209,0.987605
y,0.980248,-0.009573,0.232878,0.999209,1.0,0.987513
z,0.972174,0.083495,0.211533,0.987605,0.987513,1.0


In [17]:
# With the following function we can select highly correlated features.
# For each highly correlated pair it flags the later column (in column order),
# so the earlier feature of the pair is the one that is kept.

def correlation(dataset, threshold):
    """Return the names of columns that are highly correlated with an earlier column.

    The pairwise correlation matrix of ``dataset`` is computed and its lower
    triangle is scanned. Whenever the absolute correlation between column ``i``
    and some earlier column ``j < i`` is strictly greater than ``threshold``,
    column ``i`` is flagged. The earlier column of each pair is therefore kept.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose columns are compared via ``dataset.corr()``.
    threshold : float
        Absolute correlation above which a column is flagged.

    Returns
    -------
    set
        Names of the flagged columns (empty if nothing exceeds the threshold).
    """
    # Use the magnitude so strong negative correlations are caught as well.
    corr_matrix = dataset.corr().abs()
    columns = corr_matrix.columns
    col_corr = set()  # names of the flagged columns
    for i in range(len(columns)):
        for j in range(i):
            if corr_matrix.iloc[i, j] > threshold:
                col_corr.add(columns[i])
                break  # column i is already flagged; no need to check more pairs
    return col_corr

In [18]:
# Flag numeric training features that are highly correlated with an earlier column.
CORR_THRESHOLD = 0.9
corr_features = correlation(X_train[num_col], CORR_THRESHOLD)
len(corr_features)  # correlation() already returns a set, so no set() wrapper is needed

3

In [19]:
# Show which columns were flagged as highly correlated.
corr_features

{'x', 'y', 'z'}

In [20]:
# Remove the flagged, highly correlated columns from the training features.
X_train = X_train.drop(columns=list(corr_features))

In [22]:
# Preview the training features after the correlated columns were dropped.
X_train.head()

Unnamed: 0,carat,cut,color,clarity,depth,table
192266,0.42,Ideal,G,VS1,62.0,55.0
146063,1.11,Premium,I,VS2,62.3,58.0
44674,1.01,Premium,F,SI1,62.3,59.0
38298,1.28,Premium,G,SI2,61.5,62.0
38911,0.62,Premium,D,SI1,58.9,61.0


In [23]:
# Drop exactly the columns that were flagged on the training split. Using the
# computed set instead of a hard-coded list keeps train and test in sync.
X_test = X_test.drop(columns=list(corr_features))

In [25]:
# Preview the test features after the correlated columns were dropped.
X_test.head()

Unnamed: 0,carat,cut,color,clarity,depth,table
137280,1.51,Good,I,SI1,63.1,58.0
145587,0.3,Premium,F,IF,62.1,58.0
14616,1.21,Ideal,H,SI1,59.5,60.0
20279,0.5,Very Good,F,VS2,62.6,56.0
133733,1.01,Good,G,VS2,63.6,56.0


In [26]:
# Apply the same column removal to the full feature frame, again driven by the
# computed set so it cannot drift from what was dropped in X_train.
X = X.drop(columns=list(corr_features))

In [27]:
# Preview the full feature frame after the correlated columns were dropped.
X.head()

Unnamed: 0,carat,cut,color,clarity,depth,table
0,1.52,Premium,F,VS2,62.2,58.0
1,2.03,Very Good,J,SI2,62.0,58.0
2,0.7,Ideal,G,VS1,61.2,57.0
3,0.32,Ideal,G,VS1,61.6,56.0
4,1.7,Premium,G,VS2,62.6,59.0


In [28]:
# Split the remaining feature names by dtype: object columns are treated as
# categorical, everything else as numerical.
is_object_dtype = X.dtypes == "object"
categorical_cols = X.columns[is_object_dtype]
numerical_cols = X.columns[~is_object_dtype]

In [29]:
# Explicit level order for each categorical feature, passed to the ordinal encoder.
# NOTE(review): cut and clarity look ordered worst -> best while colour D..J looks
# best -> worst -- confirm this direction is intended.
cut_categories = ["Fair", "Good", "Very Good", "Premium", "Ideal"]
color_categories = list("DEFGHIJ")
clarity_categories = ["I1", "SI2", "SI1", "VS2", "VS1", "VVS2", "VVS1", "IF"]

In [30]:
from sklearn.impute import SimpleImputer #Handling missing values
from sklearn.preprocessing import StandardScaler #Handling feature scaling
from sklearn.preprocessing import OrdinalEncoder #Ordinal encoding of categorical levels
##Pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer


In [31]:
# Numeric features: fill missing values with the column median, then standardise.
num_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy='median')),
    ("scaler", StandardScaler()),
])

In [32]:
# Categorical features: fill gaps with the most frequent level, encode each level
# using the explicit category lists, then standardise the resulting codes.
# The encoder matches category lists to columns by position, so this list must
# follow the order of the categorical columns fed to the pipeline.
ordinal_levels = [cut_categories, color_categories, clarity_categories]
cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy="most_frequent")),
    ("ordnialencoder", OrdinalEncoder(categories=ordinal_levels)),
    ("scaler", StandardScaler()),
])

In [33]:
# Route each column group through its own pipeline: numeric columns through
# num_pipeline, categorical columns through cat_pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ('num_pipeline', num_pipeline, numerical_cols),
        ('cat_pipeline', cat_pipeline, categorical_cols),
    ]
)

In [34]:
# Fit the preprocessor on the training features and show the transformed array.
preprocessor.fit_transform(X_train)

array([[-0.80000867,  0.16547629, -1.15996256,  0.87257335,  0.30166625,
         0.68343039],
       [ 0.69032242,  0.44354549,  0.40291595, -0.13444957,  1.53374552,
         0.01751037],
       [ 0.4743324 ,  0.44354549,  0.92387545, -0.13444957, -0.31437339,
        -0.64840964],
       ...,
       [ 2.02946049,  0.25816602,  0.40291595, -0.13444957,  1.53374552,
         0.01751037],
       [-0.97280068, -0.85411078, -0.11804355,  0.87257335, -0.93041303,
         1.3493504 ],
       [ 1.53268346,  0.99968389,  0.40291595, -0.13444957, -0.93041303,
        -0.64840964]])

In [35]:
# Same transformation, wrapped in a DataFrame labelled with the generated feature
# names for readability. Note that this call fits the preprocessor again.
pd.DataFrame(preprocessor.fit_transform(X_train),columns=preprocessor.get_feature_names_out())

Unnamed: 0,num_pipeline__carat,num_pipeline__depth,num_pipeline__table,cat_pipeline__cut,cat_pipeline__color,cat_pipeline__clarity
0,-0.800009,0.165476,-1.159963,0.872573,0.301666,0.68343
1,0.690322,0.443545,0.402916,-0.134450,1.533746,0.01751
2,0.474332,0.443545,0.923875,-0.134450,-0.314373,-0.64841
3,1.057505,-0.297972,2.486754,-0.134450,0.301666,-1.31433
4,-0.368029,-2.707905,1.965794,-0.134450,-1.546453,-0.64841
...,...,...,...,...,...,...
135496,0.949510,1.741202,-0.639003,-2.148495,0.301666,-1.31433
135497,-0.929603,-0.946801,-0.118044,0.872573,-0.314373,0.01751
135498,2.029460,0.258166,0.402916,-0.134450,1.533746,0.01751
135499,-0.972801,-0.854111,-0.118044,0.872573,-0.930413,1.34935


In [36]:
# Fit the preprocessor on the training split only, then apply the same fitted
# transformation to the test split.
# NOTE: this cell overwrites X_train / X_test with their transformed versions
# (columns are renamed), so it is not meant to be re-run on its own; re-run from
# the train/test split cell instead.
X_train_transformed = preprocessor.fit_transform(X_train)
feature_names = preprocessor.get_feature_names_out()
# Keep the original row labels so the frames stay index-aligned with y_train / y_test.
X_train = pd.DataFrame(X_train_transformed, columns=feature_names, index=X_train.index)
X_test = pd.DataFrame(preprocessor.transform(X_test), columns=feature_names, index=X_test.index)

In [37]:
from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error

In [38]:
# Fit a plain linear regression on the preprocessed training features.
regression = LinearRegression()
regression.fit(X_train,y_train)

In [39]:
# Fitted coefficients, one per preprocessed feature, in the column order of X_train
# (carat, depth, table, cut, color, clarity).
regression.coef_

array([4223.85149673,  -44.62479758,  -68.75447686,   79.55647925,
       -443.12681024,  737.02295163])

In [40]:
# Intercept of the fitted linear model.
regression.intercept_

3967.1136744378264

In [41]:
import numpy as np
def evaluate_model(true,predicted):
    """Compute regression error metrics for a set of predictions.

    Parameters
    ----------
    true : array-like
        Ground-truth target values.
    predicted : array-like
        Predicted target values, in the same order as ``true``.

    Returns
    -------
    tuple
        ``(mae, rmse, r2_square)``: mean absolute error, root mean squared
        error and R^2 score, in that order.
    """
    mae = mean_absolute_error(true, predicted)
    # Compute the MSE once and take its root (previously it was computed twice
    # and one result was never used).
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mse)
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [42]:
# Candidate regressors, each created with its default constructor arguments.
models = {

    "LinearRegression" : LinearRegression(),
    "Lasso" : Lasso(),
    "Ridge" : Ridge(),
    "ElasticNet" : ElasticNet()
}

model_list = []  # model names, in evaluation order
r2_list = []     # matching R^2 scores (as fractions, not percentages)

# Iterate over name/estimator pairs directly instead of rebuilding
# list(models.keys()) / list(models.values()) on every iteration.
for model_name, model in models.items():
    model.fit(X_train,y_train)

    # Make predictions on the held-out test split
    y_pred = model.predict(X_test)

    mae,rmse,r2_square = evaluate_model(y_test,y_pred)

    print(model_name)
    model_list.append(model_name)

    # NOTE(review): the heading says "Training", but the metrics printed below are
    # computed on X_test / y_test. The text is kept so recorded outputs stay valid.
    print('Model Training Performance')
    print("RMSE: ", rmse )
    print("MAE: ",mae)
    print("R2 score",r2_square*100)

    r2_list.append(r2_square)

    print("="*40)
    print('\n')

LinearRegression
Model Training Performance
RMSE:  1096.0538396528743
MAE:  807.1331595184494
R2 score 92.60411864208866


Lasso
Model Training Performance
RMSE:  1096.0530105157382
MAE:  806.805431680334
R2 score 92.60412983168054


Ridge
Model Training Performance
RMSE:  1096.0537920426939
MAE:  807.1277842441931
R2 score 92.60411928461045


ElasticNet
Model Training Performance
RMSE:  1831.1991475515304
MAE:  1244.7272067606416
R2 score 79.35583530504596




In [43]:

# Names of the evaluated models, in evaluation order.
model_list

['LinearRegression', 'Lasso', 'Ridge', 'ElasticNet']