In [58]:
# import data
import pandas as pd
df = pd.read_csv('diamonds.csv') 
df = df.rename(columns={"Unnamed: 0": "id"})
df

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,1,0.23,Ideal,E,SI2,61.5,55.0,3.95,3.98,2.43,326
1,2,0.21,Premium,E,SI1,59.8,61.0,3.89,3.84,2.31,326
2,3,0.23,Good,E,VS1,56.9,65.0,4.05,4.07,2.31,327
3,4,0.29,Premium,I,VS2,62.4,58.0,4.20,4.23,2.63,334
4,5,0.31,Good,J,SI2,63.3,58.0,4.34,4.35,2.75,335
...,...,...,...,...,...,...,...,...,...,...,...
53935,53936,0.72,Ideal,D,SI1,60.8,57.0,5.75,5.76,3.50,2757
53936,53937,0.72,Good,D,SI1,63.1,55.0,5.69,5.75,3.61,2757
53937,53938,0.70,Very Good,D,SI1,62.8,60.0,5.66,5.68,3.56,2757
53938,53939,0.86,Premium,H,SI2,61.0,58.0,6.15,6.12,3.74,2757


# Initial Data Analysis



In [59]:
# Preprocess Data
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OrdinalEncoder


from sklearn.model_selection import train_test_split
seed = 309

y = df[['price']]
x = df[['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'x', 'y', 'z']]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=seed)

display(x_train)

# ordinal encode cut
cutOrder = [['Ideal'], ['Premium'], ['Very Good'], ['Good'], ['Fair']]
cutEnc = OrdinalEncoder().fit(cutOrder)
print(cutEnc.categories_)
print(cutEnc.transform(cutOrder))
encoder_df = pd.DataFrame(cutEnc.transform(x_train[['cut']]))
x_train['cut'] = encoder_df
encoder_df = pd.DataFrame(cutEnc.transform(x_test[['cut']]))
x_test['cut'] = encoder_df

# ordinal encoder color
# "The scale begins with the letter D, representing colorless, and continues with increasing presence of color to the letter Z, or light yellow or brown." - google
# https://www.gia.edu/gia-about/4cs-color#:~:text=GIA's%20color%2Dgrading%20scale%20for,defined%20range%20of%20color%20appearance.
# because that's a spectrum, ordinal encode
colorOrder = ['D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']
colEnc = OrdinalEncoder(categories=[colorOrder])
encoder_df = pd.DataFrame(colEnc.fit_transform(x_train[['color']]))
x_train['color'] = encoder_df
encoder_df = pd.DataFrame(colEnc.transform(x_test[['color']]))
x_test['color'] = encoder_df

# ordinal encode clarity
# see dataset documentation
clarityOrder = ['I1', 'SI1', 'SI2', 'VS1', 'VS2', 'VVS1', 'VVS2', 'IF']
claEnc = OrdinalEncoder(categories=[clarityOrder])
encoder_df = pd.DataFrame(claEnc.fit_transform(x_train[['clarity']]))
x_train['clarity'] = encoder_df
encoder_df = pd.DataFrame(claEnc.transform(x_test[['clarity']]))
x_test['clarity'] = encoder_df


# scale
toScale = ['carat', 'depth', 'table', 'x', 'y', 'z']
scalers = []
for label in toScale:
    scaler = MinMaxScaler()
    x_train[label] = scaler.fit_transform(x_train[[label]])
    x_test[label] = scaler.transform(x_test[[label]])
    scalers.append(scalers)
    
priceScaler = MinMaxScaler()
y_train['price'] = scaler.fit_transform(y_train[['price']])
y_test['price'] = scaler.transform(y_test[['price']])

display(x_train)
display(y_train)

display(x_test)
display(y_test)

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
36878,0.42,Ideal,E,VS2,61.7,56.0,4.80,4.82,2.97
18820,1.03,Ideal,G,VVS1,61.7,56.0,6.45,6.56,4.00
53406,0.82,Ideal,E,SI2,62.1,55.0,6.04,6.01,3.74
25954,1.40,Ideal,G,VVS1,60.8,57.0,7.21,7.23,4.39
13825,1.00,Very Good,E,VS2,63.5,56.0,6.37,6.32,4.03
...,...,...,...,...,...,...,...,...,...
33475,0.32,Ideal,F,VS2,61.5,56.0,4.42,4.40,2.71
9617,1.33,Premium,G,SI2,61.3,59.0,7.06,6.93,4.29
5089,1.02,Fair,F,SI2,65.5,59.0,6.27,6.24,4.10
45319,0.32,Good,E,SI1,63.5,56.0,4.34,4.38,2.77


[array(['Fair', 'Good', 'Ideal', 'Premium', 'Very Good'], dtype=object)]
[[2.]
 [3.]
 [4.]
 [1.]
 [0.]]




Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
36878,0.045738,2.0,2.0,1.0,0.519444,0.250000,0.446927,0.081834,0.093396
18820,0.172557,3.0,3.0,1.0,0.519444,0.250000,0.600559,0.111375,0.125786
53406,0.128898,,,,0.530556,0.230769,0.562384,0.102037,0.117610
25954,0.249480,2.0,3.0,3.0,0.494444,0.269231,0.671322,0.122750,0.138050
13825,0.166320,3.0,1.0,3.0,0.569444,0.250000,0.593110,0.107301,0.126730
...,...,...,...,...,...,...,...,...,...
33475,0.024948,2.0,4.0,2.0,0.513889,0.250000,0.411546,0.074703,0.085220
9617,0.234927,2.0,2.0,1.0,0.508333,0.307692,0.657356,0.117657,0.134906
5089,0.170478,4.0,4.0,3.0,0.625000,0.307692,0.583799,0.105942,0.128931
45319,0.024948,,,,0.569444,0.250000,0.404097,0.074363,0.087107


Unnamed: 0,price
36878,0.034115
18820,0.399059
53406,0.126568
25954,0.800551
13825,0.286873
...,...
33475,0.027087
9617,0.232861
5089,0.185283
45319,0.010651


Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
48107,0.062370,,,,0.486111,0.230769,0.479516,0.087097,0.097799
38948,0.022869,,,,0.577778,0.250000,0.397579,0.073345,0.086164
9301,0.166320,1.0,1.0,1.0,0.580556,0.269231,0.587523,0.107640,0.127044
9206,0.220374,4.0,4.0,2.0,0.605556,0.269231,0.626629,0.113582,0.136792
53638,0.020790,,,,0.500000,0.269231,0.399441,0.073514,0.082704
...,...,...,...,...,...,...,...,...,...
48792,0.124740,,,,0.477778,0.307692,0.565177,0.101019,0.113836
44389,0.043659,,,,0.536111,0.269231,0.444134,0.080306,0.093082
1211,0.126819,3.0,3.0,1.0,0.477778,0.269231,0.567970,0.102886,0.115094
2871,0.145530,2.0,4.0,7.0,0.461111,0.326923,0.593110,0.105942,0.118239


Unnamed: 0,price
48107,0.086938
38948,0.008759
9301,0.229563
9206,0.228374
53638,0.012165
...,...
48792,0.091641
44389,0.067853
1211,0.141166
2871,0.159440


# Exploratory Data Analysis


The correlation of data with price shows to be strongest among carat and the size, which is reasonable and to be expected; a larger, better quality diamond will be more expensive than a small, bad quality one. The ID can be ignored.

In [60]:
# data analysis

import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt

df = pd.concut([x_train, x_test], axis=1)

cor = df.corr(method='pearson')
sns.heatmap(cor, annot=True, cmap=plt.cm.Reds)

for label in ['cut', 'color', 'clarity', 'carat', 'depth', 'table', 'x', 'y', 'z']: 
    df.plot.scatter(x = 'price', y =  label);

AttributeError: module 'pandas' has no attribute 'concut'

In [None]:
# Build classification (or regression) models using the training data

# You should use the following 10 regression algorithms implemented in scikit-learn to perform regression. These 10
# algorithms are very popular regression methods: (1) linear regression, (2) k-neighbors regression, (3) Ridge regression,
# (4) decision tree regression, (5) random forest regression, (6) gradient Boosting regression, (7) SGD regression, (8)
# support vector regression (SVR), (9) linear SVR, and (10) multi-layer perceptron regression. You are encouraged to
# read the documentation (and provided references if you would like to know more details) about these methods from
# scikit-learn, e.g. linear regression is implemented in sklearn.linear model.LinearRegression.

from sklearn.svm import LinearSVR
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import Ridge

import time


models = [LinearRegression(),  KNeighborsRegressor(), Ridge(), DecisionTreeRegressor(), 
          RandomForestRegressor(), GradientBoostingRegressor(), SGDRegressor(),
          SVR(), LinearSVR(), MLPRegressor()]

names = ["Linear Regression", "K Neighbours Regressor", "Ridge Regression", "Decision Tree Regressor", 
        "Random Forest Regressor", "Gradient Boosting Regressor", "SGD Regressor",  "SVR", "SVR Linear", "Multi Layer Perceptron Regressor"]

answers = []
scores = []
timeDelta = []

for i in range (len (models)):
    print(names[i])
    
    time1 = time.time()
    
    model = models[i]
    model.fit(x_train, y_train)
    ans = model.predict(x_test)
    
    time2 = time.time()
    
    timeDelta.append(time2-time1)
    
    answers.append(ans)
    
    sc = model.score(x_test, y_test)
    print("--> Score: ", sc)
    scores.append(sc)
    

y_test = y_test.to_frame()

In [None]:
# Evaluate models by using cross validation (Optional)

In [None]:
# Assess model on the test data.
# Report the results (keep 2 decimals) of all the 10 regression algorithms on the test data in terms of mean
# squared error (MSE), root mean squared error (RMSE), relative squared error (RSE), mean absolute error
# (MAE), and execution time. You should report them in a table.

import math

def meanSquaredError(myAnswer, trueAnswer): 
    ans = 0.0
    
    for i in range (len (trueAnswer)):
        ans = ans + ((trueAnswer.iat[i, 0] - myAnswer[i]) * (trueAnswer.iat[i, 0] - myAnswer[i]))
    
    return (ans/len(trueAnswer))

def rootMeanSquaredError(myAnswer, trueAnswer):
    ans = 0.0
    
    for i in range (len (trueAnswer)):
        ans = ans + ((trueAnswer.iat[i, 0] - myAnswer[i])* (trueAnswer.iat[i, 0] - myAnswer[i]))
    
    return math.sqrt(ans/len(trueAnswer))


def meanAbsoluteError(myAnswer, trueAnswer):
    ans = 0.0
    
    for i in range (len (trueAnswer)):
        ans = ans + (abs(trueAnswer.iat[i, 0] - myAnswer[i]))
    
    return (ans/len(trueAnswer))


def relativeSquaredError(myAnswer, trueAnswer):
    # meanTrue
    meanTrue = 0.0
    for i in range (len (trueAnswer)):
        meanTrue = meanTrue + trueAnswer.iat[i, 0]
        
    meanTrue = meanTrue / len(trueAnswer)
    
    sqrtError1 = meanSquaredError(myAnswer, trueAnswer)
    sqrtError2 = meanSquaredError(myAnswer, pd.DataFrame([meanTrue] * len(trueAnswer)))
    
    return sqrtError1 / sqrtError2

header = ["Types of Error"]
header = header + names
msq = ["Mean Squared Error"]
rmsq = ["Root Mean Squared Error"]
rse = ["Relative Squared Error"]
mae = ["Mean Absolute Error"]
time = ["Execution Time"]          
time = time + timeDelta
sc = ["Scores"]
sc = sc + scores

for i in range ( len(answers) ) : 
    result = answers[i]
    
    msq.append(meanSquaredError(result, y_test))
    rmsq.append(rootMeanSquaredError(result, y_test))
    rse.append(relativeSquaredError(result, y_test))
    mae.append(meanAbsoluteError(result, y_test))
    


**Part 1 - Regression - Report**

*• Based on exploratory data analysis, discuss what preprocessing that you need to do before regression, and provide evidence and justifications.*

The basic pre-processing that needs to be done is scaling all numeric data (carat, x, y, z, table, depht, price) and encoding all categorical data. All given categorical data can be ordinal encoded, as all are a range of values that are clearly ordered (colour: range from clear to yellow/brown, which is clearly ordered, general quality (following clear quality standards), cut quality (also following clear standards)). Sources for each order have been noted as comments with the respective encoders.

*• Report the results (keep 2 decimals) of all the 10 regression algorithms on the test data in terms of mean squared error (MSE), root mean squared error (RMSE), relative squared error (RSE), mean absolute error (MAE), and execution time. You should report them in a table.*

In [None]:
import tabulate
data = [header, msq, rmsq, rse, mae, time, sc]

for i in range (1, len (data) - 1):
    for j in range (1, len (data[i])):
        data[i][j] = round(data[i][j], 2)
        
for i in range (1, len(sc)): 
    data[6][i] = str( round(sc[i], 2)) + "%"
        
table = tabulate.tabulate(data, tablefmt='html')
table

*• Compare the performance of different regression algorithms in terms of MSE, RMSE, RSE, and MAE, then analyse and discuss their differences and provide conclusions.*

