# Diamond Prices

In [1]:
# load all necessary libraries
import numpy as np
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns

# set the random seed for reproducibility
np.random.seed(42)

# load the diamonds dataset
diamonds = pd.read_csv('./data/diamonds.csv')

# display the first few rows of the dataset
print(diamonds.head())

# display information about the dataset
print(diamonds.info())

   Unnamed: 0  carat      cut color clarity  depth  table  price     x     y  \
0           1   0.23    Ideal     E     SI2   61.5   55.0    326  3.95  3.98   
1           2   0.21  Premium     E     SI1   59.8   61.0    326  3.89  3.84   
2           3   0.23     Good     E     VS1   56.9   65.0    327  4.05  4.07   
3           4   0.29  Premium     I     VS2   62.4   58.0    334  4.20  4.23   
4           5   0.31     Good     J     SI2   63.3   58.0    335  4.34  4.35   

      z  
0  2.43  
1  2.31  
2  2.31  
3  2.63  
4  2.75  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  53940 non-null  int64  
 1   carat       53940 non-null  float64
 2   cut         53940 non-null  object 
 3   color       53940 non-null  object 
 4   clarity     53940 non-null  object 
 5   depth       53940 non-null  float64
 6   table       53940

## 01. Price

You decide to start by investigating the mean diamond price and price by cut. You'll test your stats knowledge to set up each calculation to create the confidence intervals!

In [2]:
# calculate the sample mean of the price
mean_price = diamonds['price'].mean()
print(f"Mean price: {mean_price}")
# calculate the sample standard deviation of the price
std_price = diamonds['price'].std()
print(f"Standard deviation of price: {std_price}")
# calculate the sample size
n = diamonds['price'].count()
print(f"Sample size: {n}")

Mean price: 3932.799721913237
Standard deviation of price: 3989.4397381463023
Sample size: 53940


In [8]:
# calculate the margin of error for the 95% confidence interval
confidence_level = 0.95
z_score = stats.norm.ppf((1 + confidence_level) / 2)
print(f"Z-score for {confidence_level*100}% confidence level: {z_score}")

margin_of_error = z_score * (std_price / np.sqrt(n))
# calculate the confidence interval
confidence_interval = (mean_price - margin_of_error, mean_price + margin_of_error)
print(f"95% confidence interval for the mean price: {confidence_interval}")

Z-score for 95.0% confidence level: 1.959963984540054
95% confidence interval for the mean price: (3899.1327133652608, 3966.4667304612135)


In [None]:
# separate the diamonds by cut
cuts = diamonds['cut'].unique()
cut_means = diamonds.groupby('cut')['price'].mean()
cut_stds = diamonds.groupby('cut')['price'].std()
cut_ns = diamonds.groupby('cut')['price'].count()

# calculate the confidence intervals for each cut
cut_conf_intervals = {}
for cut in cuts:
    cut_mean = cut_means[cut]
    cut_std = cut_stds[cut]
    cut_n = cut_ns[cut]

    z_score_cut = stats.norm.ppf((1 + confidence_level) / 2)
    margin_of_error_cut = z_score_cut * (cut_std / np.sqrt(cut_n))

    cut_conf_intervals[cut] = (cut_mean - margin_of_error_cut, cut_mean + margin_of_error_cut)

print("95% confidence intervals for the mean price by cut:")
for cut, interval in cut_conf_intervals.items():
    print(f"{cut}: {interval}")

95% confidence intervals for the mean price by cut:
Ideal: (3406.695938058631, 3508.388002361767)
Premium: (4511.670497203647, 4656.844911396164)
Good: (3825.8448914480405, 4031.884011935572)
Very Good: (3911.579051759853, 4051.9407297332777)
Fair: (4184.844657613898, 4532.670870336413)


Now let's check if the mean price for the Premium cut is statistically significant different from the cut Fair.
For this we will use a two-sample t-test (we will assume a normal distribution).

In [12]:
import pandas as pd
from scipy.stats import ttest_ind

# Example: Extract prices
premium_prices = diamonds.loc[diamonds['cut'] == 'Premium', 'price']
fair_prices = diamonds.loc[diamonds['cut'] == 'Fair', 'price']

# Two-sample t-test (Welch's version, doesn't assume equal variances)
stat, p_value = ttest_ind(premium_prices, fair_prices, equal_var=False)

print(f"T-statistic: {stat:.4f}")
print(f"P-value: {p_value:.4f}")

# Interpretation at alpha=0.05
if p_value < 0.05:
    print("Reject H0: Significant difference in means (and possibly medians).")
else:
    print("Fail to reject H0: No significant difference.")


T-statistic: 2.3453
P-value: 0.0191
Reject H0: Significant difference in means (and possibly medians).


In [13]:
# let's test normality for each group

from scipy.stats import shapiro

# Test normality for each group
print(shapiro(premium_prices.sample(500)))  # test a subset if the dataset is huge
print(shapiro(fair_prices.sample(500)))


ShapiroResult(statistic=0.8408276893675948, pvalue=5.422507993282913e-22)
ShapiroResult(statistic=0.797370405899994, pvalue=1.677553807008937e-24)


In [15]:
# since we can not assume normality, we will use a non-parametric test
from scipy.stats import mannwhitneyu

# Mann-Whitney U test
stat, p_value_mw = mannwhitneyu(premium_prices, fair_prices, alternative='two-sided')

print(f"Mann-Whitney U statistic: {stat:.4f}")
print(f"P-value: {p_value_mw:.4f}")

if p_value_mw < 0.05:
    print("Reject H0: Significant difference in distributions.")
    print("The distributions of Premium and Fair cuts are significantly different.")
else:
    print("Fail to reject H0: No significant difference in distributions.")
    print("The distributions of Premium and Fair cuts are not significantly different.")
    print("This suggests that the Premium cut does not have a significantly higher price than the Fair cut.")


Mann-Whitney U statistic: 10367091.5000
P-value: 0.0000
Reject H0: Significant difference in distributions.
The distributions of Premium and Fair cuts are significantly different.


## 02. Cut proportions

You're expecting a new shipment of diamonds and you'd like to know what cuts you can expect. 
Most diamonds sold at the shop are 'Premium' or 'Ideal' cut, so you hope to receive mostly diamonds of these cuts, but how what proportion can you actually expect? To answer your question, you decide to investigate the proportion of all diamonds that are 'Premium' or 'Ideal' cut.

In [16]:
total_ideal = diamonds[diamonds['cut'] == 'Ideal'].shape[0]
total_premium = diamonds[diamonds['cut'] == 'Premium'].shape[0]
total_diamonds = diamonds.shape[0]

# calculate the proportions
proportion_ideal = total_ideal / total_diamonds
proportion_premium = total_premium / total_diamonds

print(f"Proportion of Ideal cut diamonds: {proportion_ideal:.2%}")
print(f"Proportion of Premium cut diamonds: {proportion_premium:.2%}")

Proportion of Ideal cut diamonds: 39.95%
Proportion of Premium cut diamonds: 25.57%


In [17]:
# z-score for 90% confidence level
confidence_level_90 = 0.90
z_score_90 = stats.norm.ppf((1 + confidence_level_90) / 2)
print(f"Z-score for {confidence_level_90*100}% confidence level: {z_score_90}")

Z-score for 90.0% confidence level: 1.6448536269514722


In [20]:
# calculate the margin of error for proportions for the 90% confidence interval

margin_of_error_ideal = z_score_90 * np.sqrt((proportion_ideal * (1 - proportion_ideal)) / total_diamonds)
margin_of_error_premium = z_score_90 * np.sqrt((proportion_premium * (1 - proportion_premium)) / total_diamonds)
print(f"Margin of error for Ideal cut: {margin_of_error_ideal:.4f}")
print(f"Margin of error for Premium cut: {margin_of_error_premium:.4f}")

# confidence intervals for proportions
confidence_interval_ideal = (proportion_ideal - margin_of_error_ideal, proportion_ideal + margin_of_error_ideal)
confidence_interval_premium = (proportion_premium - margin_of_error_premium, proportion_premium + margin_of_error_premium)
print(f"90% confidence interval for Ideal cut: {confidence_interval_ideal}")
print(f"90% confidence interval for Premium cut: {confidence_interval_premium}")

Margin of error for Ideal cut: 0.0035
Margin of error for Premium cut: 0.0031
90% confidence interval for Ideal cut: (0.3960676097092352, 0.40300543441386455)
90% confidence interval for Premium cut: (0.25258341384049005, 0.2587625260927691)
