# Predicting Diamond Prices

### Importing libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
%matplotlib inline

## Our first Machine Learning model

In [None]:
# Location of the raw diamonds CSV, relative to this notebook
DATA_DIR = '../data'
FILE_NAME = 'diamonds.csv'
data_path = os.path.join(DATA_DIR, FILE_NAME)
diamonds = pd.read_csv(data_path)

## Preparation done from Chapter 2
# Keep only rows where at least one of x / y is strictly positive
has_positive_dim = (diamonds['x'] > 0) | (diamonds['y'] > 0)
diamonds = diamonds.loc[has_positive_dim]

# Overwrite x and z of the row labelled 11182 with the respective column medians
# NOTE(review): presumably this row has invalid x/z values -- confirm against Chapter 2
for dim in ('x', 'z'):
    diamonds.loc[11182, dim] = diamonds[dim].median()

# Discard rows whose y or z exceeds 30
is_oversized = (diamonds['y'] > 30) | (diamonds['z'] > 30)
diamonds = diamonds.loc[~is_oversized]

# Append dummy columns for each categorical feature (first level dropped);
# the original categorical columns are kept alongside them
categorical_cols = ['cut', 'color', 'clarity']
dummy_frames = [
    pd.get_dummies(diamonds[col], prefix=col, drop_first=True)
    for col in categorical_cols
]
diamonds = pd.concat([diamonds] + dummy_frames, axis=1)

In [None]:
# Average of the per-diamond price / carat ratio
price_per_carat = diamonds['price'] / diamonds['carat']
w = np.mean(price_per_carat)
w

In [None]:
def first_ml_model(carat, price_per_carat=4008.024):
    """Predict a diamond price as a constant price-per-carat times its weight.

    Parameters
    ----------
    carat : number or numpy.ndarray
        Weight(s) in carats to price.
    price_per_carat : float, default 4008.024
        Multiplier applied to ``carat``. The default is the value that was
        previously hard-coded in this function.
        NOTE(review): presumably the rounded value of ``w`` (mean price/carat
        ratio) computed earlier in the notebook -- confirm.

    Returns
    -------
    number or numpy.ndarray
        ``price_per_carat * carat``, with the same shape as ``carat``.
    """
    return price_per_carat * carat

In [None]:
# Carat grid in half-carat steps starting at 0.5 (the stop value 5.5 is excluded)
carat_values = np.arange(0.5, 5.5, 0.5)
preds = first_ml_model(carat_values)

# Side-by-side table of the inputs and the model's predicted prices
prediction_table = pd.DataFrame({"Carat": carat_values, "Predicted price": preds})
prediction_table

## Practical Considerations Before Modelling

### Train-test split

In [None]:
# Inspect the prepared DataFrame, including the dummy columns added during preparation
diamonds

In [None]:
# Feature matrix: everything except the raw categorical columns (their dummy
# columns stay) and the target itself
non_feature_cols = ['cut', 'color', 'clarity', 'price']
X = diamonds.drop(columns=non_feature_cols)

# Target vector: the diamond price
y = diamonds['price']

In [None]:
# Inspect the target Series (the 'price' column of diamonds)
y

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=123,
)

### Dimensionality reduction using PCA  

In [None]:
# Pairwise scatter plots of the three dimension columns (small markers)
dimension_cols = ['x', 'y', 'z']
sns.pairplot(X_train[dimension_cols], plot_kws={"s": 3});

In [None]:
# Step 1: import the transformer class
from sklearn.decomposition import PCA

# The three dimension columns are the only inputs to the PCA
dimension_features = X_train[['x','y','z']]

# Step 2: instantiate it, keeping all three components
pca = PCA(n_components=3, random_state=123)
# Step 3: learn the components from the training data
pca.fit(dimension_features)
# Step 4: project the training data onto the learned components
princ_comp = pca.transform(dimension_features)

In [None]:
# Share of the total variance captured by each principal component
np.round(pca.explained_variance_ratio_, 3)

In [None]:
# Wrap the transformed array in a DataFrame with one named column per component
pc_names = ['pc1', 'pc2', 'pc3']
princ_comp = pd.DataFrame(data=princ_comp, columns=pc_names)

# Pairwise scatter plots of the components (small markers)
sns.pairplot(princ_comp, plot_kws={"s": 3});

In [None]:
# Correlation matrix of the principal components, rounded to 4 decimals
pc_correlations = princ_comp.corr()
pc_correlations.round(4)

In [None]:
## Get only the first principal component
pca = PCA(n_components=1, random_state=123)
## Train the pca transformer
pca.fit(X_train[['x','y','z']])
# Project x, y, z onto the single component; flatten the (n, 1) result to 1-D
dim_index = pca.transform(X_train[['x','y','z']]).flatten()
# Drop x, y, and z and append the new feature as the last column.
# Building a new frame (rather than assigning a column and dropping in place)
# avoids mutating the frame returned by train_test_split, which can raise
# SettingWithCopyWarning.
X_train = X_train.drop(columns=['x','y','z']).assign(dim_index=dim_index)

In [None]:
# Preview the first rows of the training features after x, y, z were replaced by dim_index
X_train.head()

### Standardization: centering and scaling

In [None]:
# Numeric columns to standardize; 'dim_index' is the PCA-derived column that replaced x, y, z
numerical_features = ['carat', 'depth', 'table', 'dim_index']

In [None]:
# Step 1: import the transformer class
from sklearn.preprocessing import StandardScaler

# Step 2: instantiate it
scaler = StandardScaler()
# Step 3: learn the scaling parameters from the training data only
scaler.fit(X_train[numerical_features])
# Step 4: compute the scaled values and write them back over the numeric columns
scaled_values = scaler.transform(X_train[numerical_features])
X_train.loc[:, numerical_features] = scaled_values

In [None]:
# Preview the numeric columns after scaling
X_train.loc[:, numerical_features].head()

In [None]:
# Summary statistics of the scaled numeric columns, rounded to 4 decimals
numeric_summary = X_train[numerical_features].describe()
numeric_summary.round(4)

## Multiple Linear Regression

In [None]:
# Step 1: import the estimator class
from sklearn.linear_model import LinearRegression

# Step 2: instantiate it with default settings
ml_reg = LinearRegression()

# Step 3: fit on the training features and prices
ml_reg.fit(X_train, y_train)

# Step 4: predict on the same training rows
y_pred_ml_reg = ml_reg.predict(X_train)

In [None]:
# Fitted coefficients labelled by feature name, largest first
ml_reg_coefs = pd.Series(ml_reg.coef_, index=X_train.columns)
ml_reg_coefs.sort_values(ascending=False).round(2)

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# Mean squared error of the regression on the training set
train_mse = mean_squared_error(y_train, y_pred_ml_reg)
print("Mean squared error: %.2f" % train_mse)


In [None]:
# R^2 score on the training set: 1 is perfect prediction
train_r2 = r2_score(y_train, y_pred_ml_reg)
print('Variance score: %.2f' % train_r2)


In [None]:
# training the model without carat
# Build the reduced feature frame once and reuse it for fit, predict and the
# coefficient labels (previously the drop was recomputed three times).
X_train_no_carat = X_train.drop('carat', axis=1)
ml_reg.fit(X_train_no_carat, y_train)
# Note: this overwrites the predictions of the full model
y_pred_ml_reg = ml_reg.predict(X_train_no_carat)
# Coefficients of the reduced model, largest first
pd.Series(ml_reg.coef_, index=X_train_no_carat.columns).sort_values(ascending=False).round(2)

In [None]:
# Mean squared error of the current predictions on the training set
train_mse = mean_squared_error(y_train, y_pred_ml_reg)
print("Mean squared error: %.2f" % train_mse)

In [None]:
# R^2 score of the current predictions: 1 is perfect prediction
train_r2 = r2_score(y_train, y_pred_ml_reg)
print('Variance score: %.2f' % train_r2)

In [None]:
# Refit on the full feature set (the estimator was last fit without carat)
# and refresh the training predictions; fit() returns the estimator itself.
y_pred_ml_reg = ml_reg.fit(X_train, y_train).predict(X_train)

In [None]:
from sklearn.metrics import mean_squared_error

# Training MSE of the linear regression, reported in millions
mse_ml_reg = mean_squared_error(y_true=y_train, y_pred=y_pred_ml_reg)
mse_ml_reg_millions = mse_ml_reg/1e6
print('{:0.2f}M'.format(mse_ml_reg_millions))

In [None]:
# Baseline "null model": predict the mean training price for every row
train_mean_price = y_train.mean()
y_pred_null_model = np.full(y_train.shape, train_mean_price)

# Training MSE of the baseline, reported in millions
mse_null_model = mean_squared_error(y_true=y_train, y_pred=y_pred_null_model)
mse_null_model_millions = mse_null_model/1e6
print('{:0.2f}M'.format(mse_null_model_millions))

## Lasso regression

In [None]:
# Step 1: import the estimator class
from sklearn.linear_model import Lasso

# Step 2: instantiate it with regularization strength alpha=10
lasso = Lasso(alpha=10)

# Step 3: fit on the training data
lasso.fit(X_train, y_train)

# Step 4: predict on the training rows
y_pred_lasso = lasso.predict(X_train)

## Training MSE, reported in millions
mse_lasso = mean_squared_error(y_true=y_train, y_pred=y_pred_lasso)
mse_lasso_millions = mse_lasso/1e6
print('{:0.2f}M'.format(mse_lasso_millions))

In [None]:
# Lasso coefficients labelled by feature name, largest first
lasso_coefs = pd.Series(lasso.coef_, index=X_train.columns)
lasso_coefs.sort_values(ascending=False).round(2)

## KNN Regression

In [None]:
# Step 1: import the estimator class
from sklearn.neighbors import KNeighborsRegressor

# Step 2: instantiate it with 12 neighbors
knn = KNeighborsRegressor(n_neighbors=12)

# Step 3: fit on the training data
knn.fit(X_train, y_train)

# Step 4: predict on the training rows
y_pred_knn = knn.predict(X_train)

In [None]:
# Training MSE of the KNN model, reported in millions
mse_knn = mean_squared_error(y_true=y_train, y_pred=y_pred_knn)
mse_knn_millions = mse_knn/1e6
print('{:0.2f}M'.format(mse_knn_millions))

## Evaluating on the testing dataset

### KNN with perfect performance

In [None]:
# A 1-nearest-neighbor model, evaluated on the very rows it was fitted on
perfect_knn = KNeighborsRegressor(n_neighbors=1)
perfect_knn.fit(X_train, y_train)

# Training-set MSE of that model
perfect_knn_train_pred = perfect_knn.predict(X_train)
mean_squared_error(y_true=y_train, y_pred=perfect_knn_train_pred)

### Applying the transformations

In [None]:
## Replacing x, y, z with dim_index using PCA: notice we are not training, just transforming
# Flatten the (n, 1) projection to 1-D so it can be stored as a single column
test_dim_index = pca.transform(X_test[['x','y','z']]).flatten()

# Remove x, y and z from the dataset and append dim_index as the last column.
# Building a new frame (rather than assigning a column and dropping in place)
# avoids mutating the frame returned by train_test_split, which can raise
# SettingWithCopyWarning.
X_test = X_test.drop(columns=['x','y','z']).assign(dim_index=test_dim_index)

## Scale the numerical features with the scaler already fitted on the training data
X_test.loc[:, numerical_features] = scaler.transform(X_test[numerical_features])

In [None]:
# Fitted models to compare, keyed by the row label used in the results table
model_dict = {'MLR': ml_reg, 'Lasso': lasso, 'KNN': knn}

# Collect one record per model, then build the frame once. Filling an empty
# frame cell-by-cell leaves object-dtype columns; building it from numeric
# records gives float columns.
mse_rows = []
for name, model in model_dict.items():
    mse_rows.append({
        # Errors are divided by 1e6, i.e. reported in millions
        'train': mean_squared_error(y_true=y_train, y_pred=model.predict(X_train))/1e6,
        'test': mean_squared_error(y_true=y_test, y_pred=model.predict(X_test))/1e6,
    })

mse = pd.DataFrame(mse_rows, index=list(model_dict), columns=['train', 'test'])
    

In [None]:
# Show the train/test MSE table (values are MSE / 1e6, i.e. in millions)
mse

In [None]:
fig, ax = plt.subplots()
# The table may hold object-dtype columns if it was filled cell-by-cell; cast
# to float so sorting and plotting do not depend on dtype inference.
ranked_mse = mse.astype(float).sort_values(by='test', ascending=False)
ranked_mse.plot(kind='barh', ax=ax, zorder=3)
# Values in the table are MSE / 1e6, so label the axis accordingly
ax.set(xlabel='MSE (millions)', ylabel='Model', title='Train vs. test MSE by model')
# Draw the grid behind the bars
ax.grid(zorder=0)

In [None]:
# First ten test rows, copied so the demo frame is independent of X_test
demo_pred = X_test.iloc[:10].copy()
# Matching true prices: use .iloc so the slice is explicitly positional and
# consistent with the feature slice above (y_test keeps the shuffled labels
# from the train/test split).
pred_dict = {'y_true': y_test.iloc[:10]}
# One rounded prediction column per fitted model
for name, model in model_dict.items():
    pred_dict['pred_'+name] = model.predict(demo_pred).round(1)

pd.DataFrame(pred_dict)