In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Apply seaborn's white-grid theme to every figure drawn in this notebook
sns.set_theme(style="whitegrid")

# Suppress every warning category for the rest of the session
warnings.simplefilter("ignore")

In [None]:
# Load the wine-quality CSV from the Kaggle input directory, then print the
# column dtypes and non-null counts
df = pd.read_csv(
    filepath_or_buffer="/kaggle/input/wine-quality-dataset/WineQT.csv",
)
df.info()

In [None]:
# Preview the first five rows
df.head(n=5)

In [None]:
# Summary statistics, transposed so that each feature is one row
df.describe().transpose()

In [None]:
# Data type of every column
df.dtypes

In [None]:
# Number of missing values in each column
df.isna().sum()

In [None]:
# Show rows whose measurements repeat an earlier row.
# "Id" is a row identifier, so comparing entire rows would never report a
# duplicate; the comparison is therefore restricted to the remaining columns.
df[df.duplicated(subset=df.columns.drop("Id", errors="ignore").tolist())]

In [None]:
# Distinct quality scores, in order of first appearance
pd.unique(df["quality"])

In [None]:
# Number of wines for each quality score, most frequent first
df.loc[:, "quality"].value_counts()

<h2>First Look Observations</h2>
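<p>All input features in the dataset are numeric, with no missing (null) values. The target variable "quality" is numeric but can be treated as an ordinal categorical feature. The variance comparisons below are based on the raw, unscaled summary statistics, so they are only a rough first indication of which features may matter.</p>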

<ul>
  <li><strong>Density and pH</strong> have low variance; they may be weak in explaining quality.</li>
  <li><strong>Alcohol</strong> shows relatively high variance; it could be an important factor for distinguishing quality.</li>
  <li><strong>Volatile acidity</strong> has moderate-to-high variance; as it affects taste and aroma, it may strongly explain quality.</li>
  <li><strong>Sulphates</strong> show noticeable variance; they could influence quality as a preservative component.</li>
  <li><strong>Total sulfur dioxide</strong> has a wide spread; it may create differing impacts on quality.</li>
  <li><strong>Residual sugar</strong> is mostly low but includes outliers; it might play a role in explaining quality in certain cases.</li>
</ul>


In [None]:
# Correlation heatmap of the features and the target.
# "Id" is a row identifier rather than a measurement, so it is excluded from
# the correlation matrix.
corr_matrix = df.drop(columns="Id", errors="ignore").corr()

heatmap_fig, heatmap_ax = plt.subplots(figsize=(15, 6))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", ax=heatmap_ax)
heatmap_ax.set_title("Correlation matrix")
plt.show()  # render the figure without echoing the Axes repr

<h2>Heatmap Observations</h2>
<ul>
  <li><strong>Alcohol</strong> shows a relatively high positive correlation (0.48); it may be an important factor for distinguishing quality.</li>
  <li><strong>Sulphates</strong> and <strong>Citric acid</strong> show a moderate positive correlation (0.24–0.26); they could influence quality as preservative components.</li>
  <li><strong>Volatile acidity</strong> has a negative correlation (-0.41); it may reduce quality by negatively affecting taste and aroma.</li>
  <li><strong>Total sulfur dioxide</strong> has a weak negative correlation (-0.18); it may lower quality by causing undesirable chemical imbalances.</li>

</ul>


In [None]:
# Mean of every feature per quality score.
# "Id" is a row identifier, so its mean carries no information and is dropped.
df.drop(columns="Id", errors="ignore").groupby("quality").mean()

In [None]:
# Bar chart of how many wines received each quality score.
# sort_index() orders the bars by score (quality is ordinal) instead of by
# frequency, which makes the shape of the distribution readable.
quality_counts = df["quality"].value_counts().sort_index()
quality_counts.plot(kind="bar")
plt.title("Number of wines per quality score")
plt.xlabel("Quality")
plt.ylabel("Count")
plt.show()

In [None]:
# Pairwise scatter plots / histograms of all measurements.
# "Id" is a row identifier, so it is excluded to keep the grid meaningful
# (and one row/column smaller).
sns.pairplot(df.drop(columns="Id", errors="ignore"))
plt.show()  # render the grid without echoing the PairGrid repr

In [None]:
import math
import seaborn as sns
import matplotlib.pyplot as plt

def draw_multivariate_plot(df, plot_type="box", target="quality",
                           cols=None, ncols=4, figsize=(25, 12), **kwargs):
    """
    Draw a grid of subplots, one per feature, each plotting that feature
    (y axis) against a target column (x axis).

    Parameters
    ----------
    df : pandas.DataFrame
        Data passed to seaborn as ``data``.
    plot_type : {"box", "point", "strip", "scatter"}, default "box"
        Which seaborn plotting function to use for every subplot.
    target : str, default "quality"
        Value passed to seaborn as ``x``.
    cols : iterable of str, optional
        Features to plot, passed to seaborn as ``y`` one at a time. Any
        iterable is accepted (list, tuple, pandas Index, ...). When omitted,
        all numeric columns except ``target`` are used.
    ncols : int, default 4
        Number of subplot columns; the number of rows is derived from it.
    figsize : tuple, default (25, 12)
        Size of the whole figure, forwarded to ``plt.subplots``.
    **kwargs
        Forwarded unchanged to the seaborn plotting function.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.

    Raises
    ------
    ValueError
        If there are no columns to plot, if ``ncols`` is smaller than 1,
        or if ``plot_type`` is not one of the supported names.
    """

    # If no specific columns are given, select all numeric columns except the target
    if cols is None:
        cols = df.select_dtypes("number").columns.drop(target, errors="ignore")

    # Materialise as a list so that any iterable works: testing the truth
    # value of a pandas Index directly raises an unrelated ValueError.
    cols = list(cols)
    if not cols:
        raise ValueError("No columns found to plot.")

    # Guard the row computation below against division by zero / negative grids
    if ncols < 1:
        raise ValueError(f"ncols must be a positive integer, got {ncols!r}")

    # Map plot type strings to seaborn plotting functions
    plot_funcs = {
        "box": sns.boxplot,
        "point": sns.pointplot,
        "strip": sns.stripplot,
        "scatter": sns.scatterplot,
    }

    # Validate that the chosen plot_type is supported
    if plot_type not in plot_funcs:
        raise ValueError(f"Invalid plot_type: {plot_type}")
    draw = plot_funcs[plot_type]

    n = len(cols)
    nrows = math.ceil(n / ncols)
    # squeeze=False keeps `axes` two-dimensional even for a single row or column
    fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=figsize, squeeze=False)
    axes = axes.flatten()

    for ax, col in zip(axes, cols):
        draw(x=target, y=col, data=df, ax=ax, **kwargs)
        ax.set_title(f"{col} vs {target}")
        ax.grid(True, alpha=0.3)

    # Hide any unused subplot axes in the last row
    for ax in axes[n:]:
        ax.set_visible(False)

    plt.tight_layout()
    plt.show()


In [None]:
# Every column except the target and the row identifier is a feature to plot
col = [name for name in df.columns if name not in ("quality", "Id")]

# Box plot of each feature, grouped by quality score
draw_multivariate_plot(
    df,
    plot_type="box",
    target="quality",
    cols=col,
    figsize=(25, 14),
)


In [None]:
# Point plot of each feature, grouped by quality score
draw_multivariate_plot(
    df,
    plot_type="point",
    target="quality",
    cols=col,
    figsize=(22, 14),
)

In [None]:
# Scatter plot of each feature against the quality score
draw_multivariate_plot(
    df,
    plot_type="scatter",
    target="quality",
    cols=col,
    figsize=(22, 14),
)

In [None]:
# Strip plot of each feature, grouped by quality score
draw_multivariate_plot(
    df,
    plot_type="strip",
    target="quality",
    cols=col,
    figsize=(22, 14),
)

In [None]:
# Fixed acidity against density, coloured by quality score
sns.scatterplot(data=df, x="fixed acidity", y="density", hue="quality")
plt.title("Density vs fixed acidity, coloured by quality")
plt.show()  # render the figure without echoing the Axes repr

In [None]:

# Density curve of every feature, one curve per quality score.
col = [c for c in df.columns if c not in ("quality", "Id")]

# Size the grid from the number of features instead of hard-coding 4x4, so
# no empty row is left at the bottom of the figure.
kde_ncols = 4
kde_nrows = max(1, -(-len(col) // kde_ncols))  # ceiling division
fig, ax = plt.subplots(kde_nrows, kde_ncols,
                       figsize=(25, 5 * kde_nrows), squeeze=False)
ax = ax.flatten()

for axis, column in zip(ax, col):
    sns.kdeplot(data=df, x=column, hue="quality", ax=axis)
    axis.set_title(f"{column} Distribution")
    axis.set_xlabel(None)  # the title already names the feature

# Hide the axes left over in the last row; slicing by len(col) avoids
# depending on the loop variable still being defined after the loop.
for axis in ax[len(col):]:
    axis.set_visible(False)

plt.tight_layout()
plt.show()


In [None]:
# Select all columns except 'quality' and 'Id'
columns = [c for c in df.columns if c not in ["quality", "Id"]]

# One row of panels per feature. The row count is derived from the data
# rather than hard-coded, so the cell keeps working if the number of feature
# columns changes. squeeze=False keeps `ax` two-dimensional for a single row.
n_features = len(columns)
fig, ax = plt.subplots(n_features, 2, figsize=(15, 4 * n_features), squeeze=False)
plt.subplots_adjust(hspace=0.5)   # Add some vertical space between rows

for i, column in enumerate(columns):
    # --- Left panel: Boxplot (distribution of each feature) ---
    sns.boxplot(x=column, data=df, ax=ax[i, 0])
    ax[i, 0].set_title(f"Distribution of {column}")

    # --- Right panel: Scatterplot (feature vs quality) ---
    # Only the first scatterplot draws a legend; the others would repeat it.
    sns.scatterplot(x=column, y="quality", data=df, hue="quality",
                    legend="auto" if i == 0 else False, ax=ax[i, 1])
    ax[i, 1].set_title(f"{column} vs Quality")

# Move the legend from the first scatterplot outside the plot area (for cleaner layout)
handles, labels = ax[0, 1].get_legend_handles_labels()
ax[0, 1].legend(handles, labels, title="quality",
                bbox_to_anchor=(1.02, 1), loc="upper left")

plt.show()


<div style="border: 3px solid purple; padding: 15px; border-radius: 8px; background-color: transparent;">

<h2>Conclusion</h2>

<p>After conducting a thorough exploratory data analysis on the Red Wine Quality dataset, several key insights emerged:</p>
<p><strong>Alcohol content</strong> shows the strongest positive correlation with wine quality. Higher alcohol levels tend to be associated with better-rated wines.</p>

<p><strong>Volatile acidity</strong> has the strongest negative correlation with quality (about -0.41). Wines with higher volatile acidity are generally rated lower.</p>

<p><strong>Sulphates</strong> and <strong>citric acid</strong> also show moderate positive correlations with quality, suggesting their role in enhancing flavor and preservation.</p>

<p>Most wines in the dataset cluster around a quality score of 5–6, while truly high-quality (≥8) or very low-quality (≤3) wines are rare, showing the dataset is slightly imbalanced toward average wines.</p>

<p>Several features such as residual sugar, density, and pH show weak or negligible correlation with quality, implying limited influence.</p>

<p>Outlier analysis revealed a few extreme values in alcohol and volatile acidity, which may affect model performance if not treated.</p>

<p>This imbalance in quality scores should be taken into account in future modeling efforts, for example through stratified splitting or class weighting.</p>

<p>Overall, wine quality in this dataset appears to be driven most strongly by a combination of alcohol, acidity balance, and preservation factors. This suggests that improving these chemical properties—particularly raising alcohol within acceptable limits and lowering volatile acidity—could be key strategies for producing higher-quality red wines.</p>

</div>
