Import dependencies

In [None]:
# For typing
from typing import Dict, TypeAlias

# For Data science
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np

from scipy import stats
from sklearn import (
    linear_model,
    model_selection,
    preprocessing,
    metrics,
)

# Type aliases for the scikit-learn classes, for use in type annotations.
LinearRegressionModel: TypeAlias = linear_model.LinearRegression
StandardScaler: TypeAlias = preprocessing.StandardScaler

# plt.close("all")

# Getting data, observations and preliminary processing
Get datasets

In [None]:
# Load the red and white wine quality tables (semicolon-separated CSV files)
red_wine = pd.read_csv("src/winequality-red.csv", sep=";")
white_wine = pd.read_csv("src/winequality-white.csv", sep=";")

Show dataset info

In [None]:
# Print the DataFrame.info() summary of the red wine dataset (verbose output disabled)
red_wine.info(verbose=False)

In [None]:
# Print the DataFrame.info() summary of the white wine dataset (verbose output disabled)
white_wine.info(verbose=False)

Look for missing values

In [None]:
# Check BOTH datasets for missing values: the first `.any()` reduces per column,
# the second reduces the per-column flags to a single boolean.
if red_wine.isna().any().any() or white_wine.isna().any().any():
    print("There are missing values in datasets")
else:
    print("There is no missing values in datasets")

Look for correlations

In [None]:
def get_heatmap(dataframe: pd.DataFrame, name: str) -> None:
    """Draw an annotated heatmap of the correlation matrix of ``dataframe``.

    The output of ``np.triu`` on the correlation matrix (upper triangle
    including the diagonal, zeros below it) is passed to seaborn as the mask,
    which gives the triangle-shaped visualization.

    Args:
        dataframe: Data whose pairwise column correlations are plotted.
        name: Label inserted into the plot title.
    """
    corr_matrix = dataframe.corr()
    upper_triangle = np.triu(corr_matrix)

    # One 15x15 inch figure holding a single axes for the heatmap
    _, axes = plt.subplots(figsize=(15, 15))
    sns.heatmap(corr_matrix, mask=upper_triangle, annot=True, ax=axes)
    axes.set_title(f"Heatmap of Correlation Matrix for {name}.")


# Plot the correlation heatmap for the red wine dataset
get_heatmap(red_wine, "red_wine");

In [None]:
# Plot the correlation heatmap for the white wine dataset
get_heatmap(white_wine, "white_wine")

There is a significant correlation between the density and residual sugar features. Let's replace the two correlated features with a single combined feature.

In [None]:
# Replace the two correlated columns with their product.
# The result is built as a NEW frame instead of rebinding / in-place dropping
# on `white_wine`, so this cell can be re-run without a KeyError on the
# already-dropped "density" / "residual sugar" columns.
white_wine_combined = white_wine.assign(
    density_to_residuals=white_wine["residual sugar"] * white_wine["density"]
).drop(columns=["density", "residual sugar"])

# Move the combined column to the first position
new_column_order = ["density_to_residuals"] + [
    column
    for column in white_wine_combined.columns
    if column != "density_to_residuals"
]
white_wine_ordered = white_wine_combined.reindex(columns=new_column_order)

# Get heatmap for updated dataset
get_heatmap(white_wine_ordered, "white_wine")

Now there are no strongly correlated features in the dataset. Let's look at the target distribution.

In [None]:
# Count plot of the "quality" target column for the red wine dataset
sns.countplot(data=red_wine, x="quality");

In [None]:
# Count plot of the "quality" target column for the white wine dataset
sns.countplot(data=white_wine, x="quality");

The target distributions look approximately normal. Let's check normality with a statistical test.

In [None]:
# Shapiro test on each "quality" column; only the first element of the result
# (the test statistic) is kept.
red_wine_normality, white_wine_normality = (
    stats.shapiro(wine["quality"])[0] for wine in (red_wine, white_wine)
)

print(
    f"Normality test statistics for the both dataset targets are: {red_wine_normality:.2f} and {white_wine_normality:.2f}."
)

Not a perfect result, but an acceptable one.

# Linear regression for red_wine dataset
Splitting dataframe to train and test subsets

In [None]:
# The target is the last column; the features are every column before it
red_wine_target = red_wine.iloc[:, -1]
red_wine_features = red_wine.iloc[:, :-1]

# Hold out 30% of the rows for testing, with a fixed random state
(X_train, X_test, y_train, y_test) = model_selection.train_test_split(
    red_wine_features,
    red_wine_target,
    test_size=0.3,
    random_state=100,
)

Scale features

In [None]:
# Get scaler
red_wine_scaler = preprocessing.StandardScaler()

# Fit the scaler on the training features only and scale them
red_wine_X_train_scaled = red_wine_scaler.fit_transform(X_train)
# Scale the test features with the statistics learned from the training set.
# Re-fitting on the test subset would scale it with different parameters than
# the ones the model was trained with.
red_wine_X_test_scaled = red_wine_scaler.transform(X_test)

Check if split target data is balanced

In [None]:
# Wrap the training target in a one-column DataFrame for plotting
train_df = pd.DataFrame(y_train, columns=["quality"])

# Count plot of the quality grades in the training subset
plt.figure(figsize=(8, 6))
sns.countplot(data=train_df, x="quality").set_title("Count plot for train_df")
plt.show()

# Relative frequency of each quality grade
y_train.value_counts(normalize=True)

Repeat for test subset

In [None]:
# Wrap the test target in a one-column DataFrame for plotting
test_df = pd.DataFrame(y_test, columns=["quality"])

# Count plot of the quality grades in the test subset
plt.figure(figsize=(8, 6))
sns.countplot(data=test_df, x="quality").set_title("Count plot for test_df")
plt.show()

# Relative frequency of each quality grade
y_test.value_counts(normalize=True)

Figures and portion values show balanced status.

Get linear regression

In [None]:
# Linear regression estimator (not yet fitted)
red_wine_reg = linear_model.LinearRegression()

# 5-fold cross-validation on the scaled training subset
cross_validation_score = model_selection.cross_val_score(
    estimator=red_wine_reg,
    X=red_wine_X_train_scaled,
    y=y_train,
    cv=5,
)

# Fit on the whole training subset
red_wine_reg.fit(X=red_wine_X_train_scaled, y=y_train)

# Score the fitted model on the train and test subsets
red_wine_train_score = red_wine_reg.score(X=red_wine_X_train_scaled, y=y_train)
red_wine_test_score = red_wine_reg.score(X=red_wine_X_test_scaled, y=y_test)

print(f"Cross validation score: {cross_validation_score[:]}.")
print(f"Train score: {red_wine_train_score:.2f}.")
print(f"Test score: {red_wine_test_score:.2f}.")

Model looks stable

Let's evaluate model

In [None]:
def evaluate_model(
    model: LinearRegressionModel,
    x_train: np.ndarray,
    x_test: np.ndarray,
    y_train: pd.Series,
    y_test: pd.Series,
) -> Dict[str, float]:
    """Compute MSE and RMSE of a fitted model on the train and test subsets.

    Args:
        model: Fitted regression model exposing ``predict``.
        x_train: Feature matrix of the training subset.
        x_test: Feature matrix of the test subset.
        y_train: True target values of the training subset.
        y_test: True target values of the test subset.

    Returns:
        Dictionary with the keys ``"MSE trained"``, ``"MSE tested"``,
        ``"RMSE trained"`` and ``"RMSE tested"``, each rounded to 3 decimals.
    """
    y_train_predicted = model.predict(x_train)
    y_test_predicted = model.predict(x_test)

    # Get mean square metrics
    mse_train = float(metrics.mean_squared_error(y_train, y_train_predicted))
    mse_test = float(metrics.mean_squared_error(y_test, y_test_predicted))

    # Get RMS metrics as the square root of the MSE. This avoids the
    # `squared=False` keyword of `mean_squared_error`, which newer
    # scikit-learn releases deprecate and remove. The two are equal for a
    # single-target regression, which is how this notebook uses the function.
    rmse_train = float(np.sqrt(mse_train))
    rmse_test = float(np.sqrt(mse_test))

    return {
        "MSE trained": round(mse_train, 3),
        "MSE tested": round(mse_test, 3),
        "RMSE trained": round(rmse_train, 3),
        "RMSE tested": round(rmse_test, 3),
    }


# Error metrics of the red wine model on its train and test subsets
evaluate_model(
    model=red_wine_reg,
    x_train=red_wine_X_train_scaled,
    x_test=red_wine_X_test_scaled,
    y_train=y_train,
    y_test=y_test,
)

Trained and test metrics look quite similar. Let's visualize errors.

In [None]:
# Overlay the density of the true test targets and of the model predictions
sns.kdeplot(y_test, fill=True, color="r", label="test subset")
sns.kdeplot(
    red_wine_reg.predict(red_wine_X_test_scaled),
    fill=True,
    color="b",
    label="predicted",
)
# Title fixed: the original contained a duplicated word ("and and")
plt.title("Distribution of observations in test dataset and predicted dataset")
plt.legend();

There are regions below quality grade 4 and above quality grade 8 which are not covered by the model. It is suspected that the reason is the presence of outliers in the dataset.

# Linear regression for white_wine dataset
Splitting dataframe to train and test subsets

In [None]:
# The target is the last column; the features are every column before it
white_wine_target = white_wine_ordered.iloc[:, -1]
white_wine_features = white_wine_ordered.iloc[:, :-1]

# Hold out 30% of the rows for testing, with a fixed random state
(X_train, X_test, y_train, y_test) = model_selection.train_test_split(
    white_wine_features,
    white_wine_target,
    test_size=0.3,
    random_state=100,
)

Scale features

In [None]:
# Get scaler
white_wine_scaler = preprocessing.StandardScaler()

# Fit the scaler on the training features only and scale them
white_wine_X_train_scaled = white_wine_scaler.fit_transform(X_train)
# Scale the test features with the statistics learned from the training set.
# Re-fitting on the test subset would scale it with different parameters than
# the ones the model was trained with.
white_wine_X_test_scaled = white_wine_scaler.transform(X_test)

Check if split target data is balanced

In [None]:
# Wrap the training target in a one-column DataFrame for plotting
train_df = pd.DataFrame(y_train, columns=["quality"])

# Count plot of the quality grades in the training subset
plt.figure(figsize=(8, 6))
sns.countplot(data=train_df, x="quality").set_title("Count plot for train_df")
plt.show()

# Relative frequency of each quality grade
y_train.value_counts(normalize=True)

Repeat for test subset

In [None]:
# Wrap the test target in a one-column DataFrame for plotting
test_df = pd.DataFrame(y_test, columns=["quality"])

# Count plot of the quality grades in the test subset
plt.figure(figsize=(8, 6))
sns.countplot(data=test_df, x="quality").set_title("Count plot for test_df")
plt.show()

# Relative frequency of each quality grade
y_test.value_counts(normalize=True)

Figures and portion values show balanced status.

Get linear regression

In [None]:
# Linear regression estimator (not yet fitted)
white_wine_reg = linear_model.LinearRegression()

# 5-fold cross-validation on the scaled training subset
cross_validation_score = model_selection.cross_val_score(
    estimator=white_wine_reg,
    X=white_wine_X_train_scaled,
    y=y_train,
    cv=5,
)

print(f"Cross validation score: {cross_validation_score[:]}")

# Fit on the whole training subset
white_wine_reg.fit(X=white_wine_X_train_scaled, y=y_train)

# Score the fitted model on the train and test subsets
white_wine_train_score = white_wine_reg.score(X=white_wine_X_train_scaled, y=y_train)
white_wine_test_score = white_wine_reg.score(X=white_wine_X_test_scaled, y=y_test)

print(f"Train score: {white_wine_train_score:.2f}")
print(f"Test score: {white_wine_test_score:.2f}")

Model looks stable

Let's evaluate model

In [None]:
# Error metrics of the white wine model on its train and test subsets
evaluate_model(
    model=white_wine_reg,
    x_train=white_wine_X_train_scaled,
    x_test=white_wine_X_test_scaled,
    y_train=y_train,
    y_test=y_test,
)

Trained and test metrics look quite similar. Let's visualize errors.

In [None]:
# Overlay the density of the true test targets and of the model predictions
sns.kdeplot(y_test, fill=True, color="r", label="test subset")
sns.kdeplot(
    white_wine_reg.predict(white_wine_X_test_scaled),
    fill=True,
    color="b",
    label="predicted",
)
# Title fixed: the original contained a duplicated word ("and and")
plt.title("Distribution of observations in test dataset and predicted dataset")
plt.legend();

There are regions below quality grade 4 and above quality grade 8 which are not covered by the model. It is suspected that the reason is the presence of outliers in the dataset.

# Summary
1. Both datasets were analysed, and correlated features were combined into a new feature.
2. Both regression models (for red and white wine) were successfully trained.
3. Values of the coefficient of determination for both models are relatively low (0.24–0.38), which may be caused by outliers in the datasets.
4. Outlier detection and cleaning are required to improve the models.