In [1]:
# Import necessary modules
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import linear_model
from sklearn.model_selection import cross_val_score
from sklearn import preprocessing
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

In [2]:
# Explore the dataset: load the CSV and print a first overview
df = pd.read_csv('../input/diamonds.csv')
# Drop the 'Unnamed: 0' column (presumably the row index saved with the CSV)
df = df.drop('Unnamed: 0', axis=1)
print(df.head(5))
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
df.info()
print(df.describe())

In [3]:
# Histograms of carat and price, stacked as two panels of a single figure
sns.set()
plt.figure(figsize=(12, 8), dpi=200)

# One entry per panel: (column to bin, x-axis label, panel title)
hist_panels = [
    ('carat', 'Carat', 'Distribution of number of diamonds in dataset versus carat'),
    ('price', 'Price, $', 'Distribution of number of diamonds in dataset versus price'),
]
for panel_no, (column, x_label, heading) in enumerate(hist_panels, start=1):
    plt.subplot(2, 1, panel_no)
    # Assigning to `_` keeps the returned artists out of the cell output
    _ = plt.hist(df[column], bins=100)
    _ = plt.xlabel(x_label)
    _ = plt.ylabel('Count')
    _ = plt.title(heading)

plt.tight_layout()
plt.savefig('Price and carat distribution')
plt.show()

In [4]:
# Convert the three grade columns to ordered categoricals with an explicit
# category order. Bracket assignment is used instead of `df.cut = ...`:
# attribute-style assignment only updates a column while the name does not
# clash with a DataFrame attribute, and silently sets an attribute otherwise.
df['cut'] = pd.Categorical(values=df['cut'], categories=['Fair', 'Good', 'Very Good', 'Premium', 'Ideal'], ordered=True)
df['color'] = pd.Categorical(values=df['color'], categories=['D', 'E', 'F', 'G', 'H', 'I', 'J'], ordered=True)
df['clarity'] = pd.Categorical(values=df['clarity'], categories=['I1', 'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1', 'IF'], ordered=True)

# Three stacked count plots (cut, color, clarity) sharing the same x-range
plt.figure(figsize=(12, 8), dpi=400)
plt.subplot(3, 1, 1)
sns.countplot(y='cut', data=df, color="c")
plt.xlim((0, 22000))
plt.xlabel('')
plt.ylabel('Cut')

plt.subplot(3, 1, 2)
sns.countplot(y='color', data=df, color="r")
plt.ylabel('Color')
plt.xlim((0, 22000))
plt.xlabel('')

plt.subplot(3, 1, 3)
sns.countplot(y='clarity', data=df)
# Only the bottom panel carries the shared x-axis label
plt.xlabel('Number of diamonds in the dataset')
plt.ylabel('Clarity')
plt.xlim((0, 22000))

plt.tight_layout()
plt.show()

In [5]:
# Jittered strip plot of price for each color grade
ax = sns.stripplot(data=df, x='color', y='price', jitter=True, size=3)
ax.set_xlabel('Color')
ax.set_ylabel('Price, $')
plt.show()

In [6]:
# Jittered strip plot of price for each clarity grade
ax = sns.stripplot(data=df, x='clarity', y='price', jitter=True, size=3)
ax.set_xlabel('Clarity')
ax.set_ylabel('Price, $')
plt.show()

In [7]:
# Price per cut grade: a light-gray violin outline with the individual
# diamonds drawn on top of it as small jittered points
ax = sns.violinplot(data=df, x='cut', y='price', inner=None, color='lightgray')
sns.stripplot(data=df, x='cut', y='price', jitter=True, size=1.5, ax=ax)
ax.set(xlabel='Cut', ylabel='Price, $')
plt.show()

In [8]:
# Pair plot of the dataset, with points colored by cut grade.
# The matplotlib style is switched first; the change stays in effect for
# every figure drawn afterwards.
plt.style.use('fivethirtyeight')
pair_grid = sns.pairplot(data=df, hue='cut')
plt.show()

In [9]:
# Scatter of price against carat, one series (and legend entry) per color grade
color_range = ['D', 'E', 'F', 'G', 'H', 'I', 'J']
plt.style.use('fivethirtyeight')
plt.figure(figsize=(12, 8), dpi=200)
for col in color_range:
    # Filter the frame once per grade and reuse the subset for both axes,
    # instead of evaluating the same boolean mask twice per iteration
    diamonds_of_color = df[df['color'] == col]
    plt.scatter(diamonds_of_color['carat'], diamonds_of_color['price'], alpha=0.8, marker='D', label=col)
plt.xlabel('Carat')
plt.ylabel('Price, $')
plt.legend()
plt.title('''Distribution of diamond's price via carat and color''')
plt.savefig('''Distribution of diamond's price via carat and color''')
plt.show()

In [12]:
# Categorical features preprocessing: one-hot encode the categorical columns.
# dtype=float pins the indicator columns to a numeric dtype. The default has
# changed between pandas versions (uint8, later bool), and bool indicators
# mixed with float columns make a later `.values` return an object array.
df_all = pd.get_dummies(df, dtype=float)

In [13]:
# Train and test data split and scaling data
y = df_all['price'].values
X = df_all.drop('price', axis=1).values

# Split BEFORE scaling: standardizing the full matrix first would let the
# test rows influence the mean/std applied to the training data (leakage).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both splits.
scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Full feature matrix standardized with the training statistics; the name is
# kept so any code that still refers to X_scale continues to work.
X_scale = scaler.transform(X)

In [14]:
# Linear regression model: 5-fold cross-validation on the training split,
# followed by a fit on the whole training split scored on the test split
kf = KFold(n_splits=5, shuffle=True, random_state=42)
reg = linear_model.LinearRegression()

# Mean of the per-fold scores (the estimator's default scorer is used)
cv_results = cross_val_score(estimator=reg, X=X_train, y=y_train, cv=kf)
print(cv_results.mean())

reg.fit(X=X_train, y=y_train)
y_pred = reg.predict(X=X_test)
linreg_score = reg.score(X=X_test, y=y_test)
print(linreg_score)

In [15]:
# GradientBoosting model: fit on the training split, then predict and score
# on the test split
gb = GradientBoostingRegressor(
    n_estimators=300,
    random_state=241,
    verbose=True,  # report training progress while fitting
)
gb.fit(X=X_train, y=y_train)
y_pred = gb.predict(X=X_test)
gb_test_score = gb.score(X=X_test, y=y_test)
print(gb_test_score)

In [None]:
# The GradientBoosting model reaches an R^2 score of about 0.975 for price prediction on the test set.