In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [None]:
# Read the diamond price table from the CSV file in the sibling data directory.
df = pd.read_csv(filepath_or_buffer='../data/Diamond_Prices.csv')
# Preview the first five rows.
df.head(n=5)

In [None]:
# Print the column names, dtypes and non-null counts of the raw frame.
df.info()

In [None]:
# Count missing values per column and show them as a one-column table.
null_counts = df.isna().sum()
null_counts.to_frame(name='Null Count')

In [None]:
# Summarise duplicated rows instead of listing one True/False flag per row:
# the per-row listing is truncated on display and cannot show whether any
# duplicates exist. The index is the duplicate flag (False / True) and the
# single column is the number of rows carrying that flag.
df.duplicated().value_counts().to_frame('Row Count')

In [None]:
# Cut grades listed from worst to best; the rating is the 1-based position
# in this list (Fair -> 1 ... Ideal -> 5).
cut_order = ['Fair', 'Good', 'Very Good', 'Premium', 'Ideal']
cut_to_rating = {grade: rank for rank, grade in enumerate(cut_order, start=1)}

# Add the numeric rating as a new column; the original 'cut' column is kept.
df['cut_rating'] = df['cut'].map(cut_to_rating)

In [None]:
# Clarity grades listed from worst (I1) to best (IF); the rating is the
# 1-based position in this list (I1 -> 1 ... IF -> 8).
clarity_order = ['I1', 'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1', 'IF']
clarity_to_rating = {grade: rank for rank, grade in enumerate(clarity_order, start=1)}

# Add the numeric rating as a new column; the original 'clarity' column is kept.
df['clarity_rating'] = df['clarity'].map(clarity_to_rating)

In [None]:
# Colour grades listed from worst (J) to best (D); the rating is the
# 1-based position in this sequence (J -> 1 ... D -> 7).
color_order = ['J', 'I', 'H', 'G', 'F', 'E', 'D']
color_to_rating = {grade: rank for rank, grade in enumerate(color_order, start=1)}

# Add the numeric rating as a new column; the original 'color' column is kept.
df['color_rating'] = df['color'].map(color_to_rating)

In [None]:
# Keep only the numeric columns used for analysis and modelling
# (the three encoded ratings replace the original text columns).
model_columns = [
    'carat',
    'cut_rating',
    'clarity_rating',
    'color_rating',
    'price',
    'depth',
    'table',
    'length',
    'width',
]
filtered_df = df[model_columns]

In [None]:
# Preview the first rows of the numeric-only frame.
filtered_df.head()

In [None]:
# Print the dtypes and non-null counts of the selected columns.
filtered_df.info()

In [None]:
# Pairwise Pearson correlation between all selected columns
# ('pearson' is the pandas default, spelled out here for clarity).
corr = filtered_df.corr(method='pearson')

In [None]:
# Display the correlation matrix as a table.
corr

In [None]:
# Draw the correlation matrix as a heatmap, labelling both axes with the
# matrix's column names.
axis_labels = corr.columns
sns.heatmap(data=corr, xticklabels=axis_labels, yticklabels=axis_labels)

In [None]:
# Scatter-plot every pair of selected columns against each other.
sns.pairplot(data=filtered_df)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Feature matrix: every selected column except the target.
feature_columns = [
    'carat',
    'cut_rating',
    'clarity_rating',
    'color_rating',
    'depth',
    'table',
    'length',
    'width',
]
X = filtered_df[feature_columns]

# Target vector: the price column.
y = filtered_df.loc[:, 'price']

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the target.
y.describe()

In [None]:
# price_range = filtered_df['price'].max() - filtered_df['price'].min()

In [None]:
# price_scaled = (df['price'] - df['price'].min()) / price_range

In [None]:
# price_scaled = price_scaled * 100

In [None]:
# y = price_scaled

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split repeatable across runs.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=31,
)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Carat vs. price for the training rows (blue) and the test rows (red).
fig, ax = plt.subplots()
ax.scatter(X_train['carat'], y_train, color='blue')
ax.scatter(X_test['carat'], y_test, color='red')
ax.set_xlabel("Carat")
ax.set_ylabel("Price")
plt.show()

In [None]:
from sklearn.linear_model import LinearRegression

# Baseline model: ordinary least squares on the features in X.
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
# fit() returns the estimator itself, so both names refer to the same object.
lr_baseline = lr_model

# Predictions for the held-out test rows.
y_predict = lr_baseline.predict(X_test)

# One row per feature with its fitted coefficient.
feature_names = X.columns
coefficients = lr_baseline.coef_
coef_df = pd.DataFrame({'Feature': feature_names, 'Coefficient': coefficients})
print(coef_df)

print ('Coefficients: ', coefficients)
print ('Intercept: ', lr_baseline.intercept_)

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Score the baseline predictions against the held-out test prices.
mae = mean_absolute_error(y_test, y_predict)
mse = mean_squared_error(y_test, y_predict)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_predict)

# Each (template, value) pair is one report line, printed in this order.
report_lines = (
    ("MAE: %.2f", mae),
    ("MSE: %.2f", mse),
    ("RMSE: %.2f", rmse),
    ("R2-score: %.6f", r2),
)
for template, value in report_lines:
    print(template % value)

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline

In [None]:
from sklearn.model_selection import KFold

# Three shuffled folds; the fixed seed makes the fold assignment repeatable.
kf = KFold(n_splits=3, shuffle=True, random_state=72018)

In [None]:
from sklearn.model_selection import GridSearchCV

# Two-step pipeline: expand the features into polynomial terms, then fit
# ordinary least squares on the expanded features.
poly_steps = [
    ("polynomial_features", PolynomialFeatures()),
    ("linear_regression", LinearRegression()),
]
estimator = Pipeline(poly_steps)

# Only the polynomial degree is tuned.
params = {'polynomial_features__degree': [1, 2, 3]}

# Grid search over the degree, cross-validated with the folds defined by kf.
lr_poly = GridSearchCV(estimator, params, cv=kf)

In [None]:
# Walk through the folds kf produces for the training split, printing the
# first ten positions and the size of each part.
#
# After the loop, Xcv_train / Xcv_test / ycv_train / ycv_test hold the split
# from the LAST fold only; later cells use those four names.
for train_index, test_index in kf.split(X_train):
    print("Train index:", train_index[:10], len(train_index))
    print("Test index:",test_index[:10], len(test_index))
    print('')
    # kf.split(X_train) yields integer POSITIONS within X_train, so they must
    # be applied positionally (.iloc) to X_train / y_train. Applying them to
    # the full X / y would select different rows, including rows that belong
    # to the held-out test set.
    Xcv_train, Xcv_test, ycv_train, ycv_test = (X_train.iloc[train_index, :],
                                                X_train.iloc[test_index, :],
                                                y_train.iloc[train_index],
                                                y_train.iloc[test_index])

In [None]:
# Run the polynomial-degree grid search on the fold-level training rows
# (Xcv_train / ycv_train as left assigned by the last iteration of the fold loop).
lr_poly.fit(Xcv_train, ycv_train)

In [None]:
# Rebuild the polynomial pipeline with the degree the grid search selected.
best_degree = lr_poly.best_params_['polynomial_features__degree']
pf = PolynomialFeatures(degree=best_degree)

lr_poly2 = Pipeline([
    ("make_higher_degree", pf),
    ("linear_regression", LinearRegression()),
])

# fit() returns the pipeline itself, so no reassignment is needed.
lr_poly2.fit(Xcv_train, ycv_train)

# R^2 on the same rows the pipeline was fitted on (training score).
lr_poly2.score(Xcv_train, ycv_train)

In [None]:
# List the names of the polynomial terms generated by the fitted
# "make_higher_degree" step.
lr_poly2.named_steps["make_higher_degree"].get_feature_names_out()

In [None]:
# Predict prices for the fold-level test rows with the refit polynomial pipeline.
y_poly2_pred = lr_poly2.predict(Xcv_test)

In [None]:
# R^2 of the polynomial pipeline's predictions on the fold-level test rows.
r2_score(ycv_test, y_poly2_pred)

In [None]:
# Carat vs. price on the fold-level test rows: actual prices in blue,
# polynomial-pipeline predictions in red.
fig, ax = plt.subplots()
test_carat = Xcv_test['carat']
ax.scatter(test_carat, ycv_test, color='blue')
ax.scatter(test_carat, y_poly2_pred, color='red')
ax.set_xlabel("Carat")
ax.set_ylabel("Price")
plt.show()

In [None]:
# Score the polynomial pipeline's predictions against the fold-level test prices.
mae = mean_absolute_error(ycv_test, y_poly2_pred)
mse = mean_squared_error(ycv_test, y_poly2_pred)
rmse = np.sqrt(mse)
r2 = r2_score(ycv_test, y_poly2_pred)

# Each (template, value) pair is one report line, printed in this order.
report_lines = (
    ("MAE: %.2f", mae),
    ("MSE: %.2f", mse),
    ("RMSE: %.2f", rmse),
    ("R2-score: %.6f", r2),
)
for template, value in report_lines:
    print(template % value)

In [None]:
from sklearn.preprocessing import StandardScaler

# Pipeline that standardises the features and then fits ordinary least squares.
# NOTE(review): no later cell shown here references `estimator`, `s` or `lr`;
# confirm whether this pipeline is still needed.
s = StandardScaler()
lr = LinearRegression()
estimator = Pipeline(steps=[
    ("scaler", s),
    ("regression", lr),
])

In [None]:
from sklearn.linear_model import Lasso

# Three-step pipeline: standardise, expand to polynomial terms, fit Lasso.
lasso_steps = [
    ("scaler", StandardScaler()),
    ("polynomial_features", PolynomialFeatures()),
    ("lasso_regression", Lasso()),
]
estimator2 = Pipeline(steps=lasso_steps)

# Tune the polynomial degree together with the Lasso regularisation strength.
params2 = {
    'polynomial_features__degree': [1, 2, 3],
    'lasso_regression__alpha': [0.0001, 0.001, 0.05, 0.1, 1],
}

# Grid search over both parameters, cross-validated with the folds defined by kf.
lr_opt = GridSearchCV(estimator=estimator2, param_grid=params2, cv=kf)

In [None]:
# Run the Lasso grid search on the fold-level training rows; fit() returns the
# search object itself, so lr_poly_opt and lr_opt refer to the same object.
lr_poly_opt = lr_opt.fit(Xcv_train, ycv_train)

In [None]:
# Show the best cross-validated score and the parameter combination that produced it.
lr_poly_opt.best_score_, lr_poly_opt.best_params_

In [None]:
# Rebuild the scaled polynomial Lasso pipeline with the parameters the grid
# search selected.
best_lasso_params = lr_poly_opt.best_params_
best_lasso_degree = best_lasso_params['polynomial_features__degree']
best_lasso_alpha = best_lasso_params['lasso_regression__alpha']

lr_poly2_l1_001 = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("polynomial_features", PolynomialFeatures(degree=best_lasso_degree)),
    ("lasso_regression", Lasso(alpha=best_lasso_alpha)),
])

# fit() returns the pipeline itself, so both names refer to the same fitted object.
lr_poly2_l1 = lr_poly2_l1_001.fit(Xcv_train, ycv_train)

# R^2 on the same rows the pipeline was fitted on (training score).
lr_poly2_l1.score(Xcv_train, ycv_train)

In [None]:
# Predict prices for the fold-level test rows with the refit Lasso pipeline.
y_lr_poly2_l1_pred = lr_poly2_l1.predict(Xcv_test)

In [None]:
# Carat vs. price on the fold-level test rows: actual prices in blue,
# Lasso-pipeline predictions in red. Both series use the same rows so the
# predictions are compared with the prices they are meant to reproduce
# (the predictions were made for Xcv_test, not for the training rows).
plt.scatter(Xcv_test['carat'], ycv_test, color='blue')
plt.scatter(Xcv_test['carat'], y_lr_poly2_l1_pred, color='red')
plt.xlabel("Carat")
plt.ylabel("Price")
plt.show()

In [None]:
# Score the Lasso pipeline's predictions against the fold-level test prices.
mae = mean_absolute_error(ycv_test, y_lr_poly2_l1_pred)
mse = mean_squared_error(ycv_test, y_lr_poly2_l1_pred)
rmse = np.sqrt(mse)
r2 = r2_score(ycv_test, y_lr_poly2_l1_pred)

print("MAE: %.2f" % mae)
print("MSE: %.2f" % mse)
# This line reports the root of the MSE, so it is labelled RMSE
# (it was previously printed under a second "MSE" label).
print("RMSE: %.2f" % rmse)
print("R2-score: %.6f" % r2)

In [None]:
# Display the fold-level test features.
Xcv_test

In [None]:
# Display the fold-level test prices.
ycv_test

In [None]:
# Feature values for a single hand-entered diamond.
carat = 1.52
cut_rating = 5
clarity_rating = 5
color_rating = 6
# NOTE(review): confirm that 4.59 is in the same unit as the 'depth' column
# the model was trained on.
depth = 4.59
table = 54.0
length = 7.32
width = 7.39

# Build a one-row DataFrame whose column names and order match the training
# features. The pipeline was fitted on a DataFrame, so passing named columns
# avoids scikit-learn's "X does not have valid feature names" warning and makes
# the value-to-feature mapping explicit instead of relying on list position.
sample = pd.DataFrame([{
    'carat': carat,
    'cut_rating': cut_rating,
    'clarity_rating': clarity_rating,
    'color_rating': color_rating,
    'depth': depth,
    'table': table,
    'length': length,
    'width': width,
}])

test_pred = lr_poly2_l1.predict(sample)

In [None]:
# Display the predicted price for the hand-entered diamond.
test_pred