In [35]:
import pandas as pd
import numpy as np
from scipy import stats

In [36]:
# Read the red-wine quality CSV (comma-separated) into a DataFrame.
wine = pd.read_csv(
    "/content/drive/MyDrive/Colab Notebooks/EDA/winequality-red.csv",
    sep=",",
)

In [37]:
# Remove fully duplicated rows, keeping the first occurrence of each.
wine = wine.drop_duplicates(keep="first")

In [38]:
# A negative alcohol value is treated as invalid and replaced by the column
# median. The median is taken over the valid (non-negative) rows only, so the
# invalid readings cannot bias the value that is used to replace them;
# Series.median also skips missing values instead of propagating NaN.
wine.loc[wine["alcohol"] < 0, "alcohol"] = wine.loc[wine["alcohol"] >= 0, "alcohol"].median()

In [39]:
# Sanity check: frame dimensions, per-column dtypes, and missing-value counts.
print(wine.shape, wine.dtypes, wine.isna().sum(), sep="\n")

(1359, 12)
fixed acidity           float64
volatile acidity        float64
citric acid             float64
residual sugar          float64
chlorides               float64
free sulfur dioxide     float64
total sulfur dioxide    float64
density                 float64
pH                      float64
sulphates               float64
alcohol                 float64
quality                   int64
dtype: object
fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64


In [40]:
# Descriptive statistics, transposed so that each column of `wine` becomes a row.
desc = wine.describe().transpose()
print(desc)

                       count       mean        std      min      25%      50%  \
fixed acidity         1359.0   8.310596   1.736990  4.60000   7.1000   7.9000   
volatile acidity      1359.0   0.529478   0.183031  0.12000   0.3900   0.5200   
citric acid           1359.0   0.272333   0.195537  0.00000   0.0900   0.2600   
residual sugar        1359.0   2.523400   1.352314  0.90000   1.9000   2.2000   
chlorides             1359.0   0.088124   0.049377  0.01200   0.0700   0.0790   
free sulfur dioxide   1359.0  15.893304  10.447270  1.00000   7.0000  14.0000   
total sulfur dioxide  1359.0  46.825975  33.408946  6.00000  22.0000  38.0000   
density               1359.0   0.996709   0.001869  0.99007   0.9956   0.9967   
pH                    1359.0   3.309787   0.155036  2.74000   3.2100   3.3100   
sulphates             1359.0   0.658705   0.170667  0.33000   0.5500   0.6200   
alcohol               1359.0  10.432315   1.082065  8.40000   9.5000  10.2000   
quality               1359.0

In [41]:
# NOTE(review): skew/kurtosis from scipy are not used in this cell; the import is
# kept in case other cells rely on these names.
from scipy.stats import skew, kurtosis

# Add shape statistics to the describe() summary table.
# numeric_only=True keeps this cell re-runnable once a text column such as
# "quality_label" has been added to `wine`: pandas 2.x no longer silently drops
# non-numeric columns in these reductions and raises instead.
desc["skewness"] = wine.skew(numeric_only=True)
desc["kurtosis"] = wine.kurtosis(numeric_only=True)
print(desc)

                       count       mean        std      min      25%      50%  \
fixed acidity         1359.0   8.310596   1.736990  4.60000   7.1000   7.9000   
volatile acidity      1359.0   0.529478   0.183031  0.12000   0.3900   0.5200   
citric acid           1359.0   0.272333   0.195537  0.00000   0.0900   0.2600   
residual sugar        1359.0   2.523400   1.352314  0.90000   1.9000   2.2000   
chlorides             1359.0   0.088124   0.049377  0.01200   0.0700   0.0790   
free sulfur dioxide   1359.0  15.893304  10.447270  1.00000   7.0000  14.0000   
total sulfur dioxide  1359.0  46.825975  33.408946  6.00000  22.0000  38.0000   
density               1359.0   0.996709   0.001869  0.99007   0.9956   0.9967   
pH                    1359.0   3.309787   0.155036  2.74000   3.2100   3.3100   
sulphates             1359.0   0.658705   0.170667  0.33000   0.5500   0.6200   
alcohol               1359.0  10.432315   1.082065  8.40000   9.5000  10.2000   
quality               1359.0

In [42]:
# Binary target: a quality score of 6 or more is labelled "good", anything else "bad".
wine["quality_label"] = (wine["quality"] >= 6).map({True: "good", False: "bad"})

# Class balance, first as absolute counts and then as proportions.
print(wine["quality_label"].value_counts(normalize=False))
print(wine["quality_label"].value_counts(normalize=True))

quality_label
good    719
bad     640
Name: count, dtype: int64
quality_label
good    0.529065
bad     0.470935
Name: proportion, dtype: float64


In [43]:
# Descriptive statistics computed separately for each quality label.
groups = wine.groupby("quality_label")
for name, group in groups:
    # Header line followed by the transposed describe() table for this label.
    print(f"==== {name} ====", group.describe().T, sep="\n")

==== bad ====
                      count       mean        std      min      25%  \
fixed acidity         640.0   8.141719   1.561787  4.60000   7.1000   
volatile acidity      640.0   0.592930   0.184137  0.18000   0.4600   
citric acid           640.0   0.237547   0.184963  0.00000   0.0800   
residual sugar        640.0   2.526797   1.339649  1.20000   1.9000   
chlorides             640.0   0.094144   0.058115  0.03900   0.0740   
free sulfur dioxide   640.0  16.659375  10.875864  3.00000   8.0000   
total sulfur dioxide  640.0  55.146094  36.996713  6.00000  23.7500   
density               640.0   0.997044   0.001596  0.99256   0.9961   
pH                    640.0   3.309016   0.154503  2.74000   3.2000   
sulphates             640.0   0.620484   0.181235  0.33000   0.5275   
alcohol               640.0   9.920781   0.773478  8.40000   9.4000   
quality               640.0   4.885937   0.364018  3.00000   5.0000   

                            50%       75%        max  
fixed a

In [44]:
# Restrict the analysis to the numeric columns of `wine`.
data = wine.select_dtypes(include=[np.number])

# Report location and dispersion measures for every numeric column.
for col in data.columns:
    values = data[col].to_numpy()

    # stats.mode returns a result object; keepdims=True keeps .mode/.count indexable
    mode_result = stats.mode(values, keepdims=True)
    q1, q3 = np.percentile(values, [25, 75])

    report_lines = [
        f"\n=== {col} ===",
        f"Mean: {np.mean(values):.4f}",
        f"Median: {np.median(values):.4f}",
        f"Mode: {mode_result.mode[0]} (Count={mode_result.count[0]})",
        # ddof=1 gives the sample variance / sample standard deviation
        f"Variance: {np.var(values, ddof=1):.4f}",
        f"Standard Deviation: {np.std(values, ddof=1):.4f}",
        f"Range: {np.max(values) - np.min(values):.4f}",
        f"Q1 (25%): {q1:.4f}",
        f"Q3 (75%): {q3:.4f}",
        f"IQR: {q3 - q1:.4f}",
    ]
    print("\n".join(report_lines))


=== fixed acidity ===
Mean: 8.3106
Median: 7.9000
Mode: 7.2 (Count=49)
Variance: 3.0171
Standard Deviation: 1.7370
Range: 11.3000
Q1 (25%): 7.1000
Q3 (75%): 9.2000
IQR: 2.1000

=== volatile acidity ===
Mean: 0.5295
Median: 0.5200
Mode: 0.5 (Count=37)
Variance: 0.0335
Standard Deviation: 0.1830
Range: 1.4600
Q1 (25%): 0.3900
Q3 (75%): 0.6400
IQR: 0.2500

=== citric acid ===
Mean: 0.2723
Median: 0.2600
Mode: 0.0 (Count=118)
Variance: 0.0382
Standard Deviation: 0.1955
Range: 1.0000
Q1 (25%): 0.0900
Q3 (75%): 0.4300
IQR: 0.3400

=== residual sugar ===
Mean: 2.5234
Median: 2.2000
Mode: 2.0 (Count=133)
Variance: 1.8288
Standard Deviation: 1.3523
Range: 14.6000
Q1 (25%): 1.9000
Q3 (75%): 2.6000
IQR: 0.7000

=== chlorides ===
Mean: 0.0881
Median: 0.0790
Mode: 0.08 (Count=50)
Variance: 0.0024
Standard Deviation: 0.0494
Range: 0.5990
Q1 (25%): 0.0700
Q3 (75%): 0.0910
IQR: 0.0210

=== free sulfur dioxide ===
Mean: 15.8933
Median: 14.0000
Mode: 6.0 (Count=121)
Variance: 109.1455
Standard Deviatio