In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
from pandas.plotting import andrews_curves
from statsmodels.nonparametric.smoothers_lowess import lowess
from scipy import stats
import matplotlib.patches as mpatches
#Load the red and white datasets; both files are semicolon-delimited (sep=";") and are read from paths relative to the working directory
red_wine=pd.read_csv("winequality-red.csv",sep=";")
white_wine=pd.read_csv("winequality-white.csv",sep=";")

In [None]:
#Column renaming according to usual conventions (spaces -> underscores), plus two derived columns:
#  type          -> "red" / "white"
#  quality_label -> ordered three-way binning of quality: <=5 "low", 6 "medium", >=7 "high"
quality_categories=["low","medium","high"]
for frame,wine_type in ((red_wine,"red"),(white_wine,"white")):
    frame.columns=frame.columns.str.replace(" ","_")
    frame["type"]=wine_type
    #Categorical with explicit categories so the low/medium/high order is known to pandas
    frame["quality_label"]=pd.Categorical(
        frame["quality"].apply(lambda value: "low" if value<=5 else "medium" if value<7 else "high"),
        categories=quality_categories,
    )
#Useful user-created variables
#Human-readable tick labels for the 12 numeric columns, in dataset column order
labels=["Fixed acidity","Volatile acidity","Citric acid","Residual sugar","Chlorides","Free SO2","Total SO2","Density","pH","Sulphates","Alcohol","Quality"]

In [None]:
#Stack both datasets, shuffle the rows with a fixed seed (reproducible) and renumber the index
wines=(
    pd.concat([red_wine,white_wine])
    .sample(frac=1,random_state=42)
    .reset_index(drop=True)
)

In [None]:
#FUNCTIONS
#Frequency analysis for specific feature on both types
def freq(x):
    """Plot side-by-side histograms of column ``x`` of the global ``wines`` frame, split by wine type.

    x: name of a column in ``wines`` (underscored form, e.g. "free_sulfur_dioxide").
    The two panels share both axes; nothing is returned.
    """
    fig,axes=plt.subplots(nrows=1,ncols=2,sharex=True,sharey=True,figsize=(10,4),facecolor="snow")
    hist_style={"color":"crimson","edgecolor":"black","linewidth":1}
    wines.hist(column=x,by="type",ax=axes,**hist_style)
    #Turn the column name into a readable caption, e.g. "free_sulfur_dioxide" -> "Free sulfur dioxide"
    pretty_name=x.replace("_"," ").capitalize()
    fig.supxlabel(pretty_name)
    plt.suptitle(pretty_name+" content in wines",x=0.5,y=1.05,ha="center",fontsize="x-large")
    fig.supylabel("Frequency")
    for axis,panel_title in zip(axes,("Red Wines","White Wines")):
        axis.set_title(panel_title)
#Creates boxplots of columns x and y, each grouped by column z, for the winetype dataset
def boxplotter(x,y,z,winetype,out):
    """Draw two side-by-side boxplots: x grouped by z (left) and y grouped by z (right).

    x, y: names of the columns to plot.
    z: name of the grouping column.
    winetype: DataFrame holding those columns.
    out: passed to ``showfliers``; False hides the outlier markers.
    """
    fig,axes=plt.subplots(1,2,figsize=(10,10),facecolor="snow")
    for axis,feature in zip(axes,(x,y)):
        winetype[[z,feature]].boxplot(by=z,ax=axis,showfliers=out,color="dimgray")
        #Blank the per-axes title, then label the axes explicitly
        axis.set_title("")
        axis.set_xlabel(z)
        axis.set_ylabel(feature)
#Normalizes a dataset
def normalizer(df):
    """Return a min-max scaled copy of ``df`` without the "type" and "quality_label" columns.

    df: DataFrame containing "type" and "quality_label" (``drop`` raises KeyError if either is
        missing); every remaining column is passed to MinMaxScaler.
    Returns a new DataFrame with the same column names and the same index as the input,
    so the scaled rows can still be aligned with the original frame.
    """
    features=df.drop(["type","quality_label"],axis=1)
    #fit_transform already fits the scaler; a separate fit() beforehand is redundant
    scaled=MinMaxScaler().fit_transform(features)
    return pd.DataFrame(scaled,columns=features.columns,index=features.index)
#Takes a,b variables for x and y axis respectively; nor is a boolean value for normalization
def plotter(a,b,c,d,nor):
    """Scatter column ``b`` against column ``a`` with a LOWESS trend line, once per dataset.

    a, b: column names for the x and y axis respectively.
    c, d: DataFrames drawn on the left and right panel respectively.
    nor: when truthy, both frames are first passed through ``normalizer``.
    Returns a tuple with the two objects returned by ``sns.regplot`` (left, right).
    """
    if nor:
        c,d=normalizer(c),normalizer(d)
    fig,axes=plt.subplots(1,2,figsize=(14,7),facecolor="snow")
    panels=[]
    for axis,frame in zip(axes,(c,d)):
        panels.append(
            sns.regplot(
                ax=axis,
                x=frame[a],
                y=frame[b],
                lowess=True,
                scatter_kws={"color":"crimson"},
                line_kws={"color":"black"},
            )
        )
    return tuple(panels)
#Graphs relationship between x,y variables on df dataset
def rel_check(x,y,df):
    """Build a seaborn JointGrid for columns ``x`` and ``y`` of ``df``.

    The joint panel is a regression plot (crimson); the marginals are histograms with a KDE
    overlay (goldenrod). Returns the grid object.
    """
    return (
        sns.JointGrid(x=x,y=y,data=df)
        .plot_joint(sns.regplot,color="crimson")
        .plot_marginals(sns.histplot,kde=True,color="goldenrod")
    )
#Multigraphs kde,scatter and hist for all variables (It takes time)
def exp(df):
    """Build a seaborn PairGrid over every column of ``df``.

    Upper triangle: filled bivariate KDE; diagonal: histogram; lower triangle: scatter plot.
    Returns the grid object. Slow on wide frames because one panel is drawn per column pair.
    """
    grid=sns.PairGrid(df)
    #fill= replaces the deprecated shade= keyword of kdeplot; thresh=0 keeps the lowest-density level drawn
    grid=grid.map_upper(sns.kdeplot,cmap="hot",fill=True,thresh=0)
    grid=grid.map_diag(plt.hist,color="darkgrey")
    grid=grid.map_lower(sns.scatterplot,color="crimson",edgecolor="black")
    return grid
#Used for the removal of outliers with quantile principle
def correction(df):
    """Return the rows of ``df`` that lie within the 1.5*IQR fences of every listed feature.

    Only the columns named in ``feature_columns`` are kept (pH and the non-numeric columns are
    not part of the result). Columns are filtered one after another, so each column's quartiles
    are computed on the rows that survived the previous columns.
    Values lying exactly on a fence are kept, which also means a column with zero IQR
    (e.g. a constant column) no longer wipes out every row.
    """
    feature_columns=["fixed_acidity","volatile_acidity","citric_acid","residual_sugar","chlorides","free_sulfur_dioxide","total_sulfur_dioxide","density","sulphates","alcohol","quality"]
    filtered=df[feature_columns]
    for column in feature_columns:
        q1=filtered[column].quantile(0.25)
        q3=filtered[column].quantile(0.75)
        iqr=q3-q1
        low=q1-1.5*iqr
        high=q3+1.5*iqr
        #between() is inclusive on both ends
        filtered=filtered.loc[filtered[column].between(low,high)]
    return filtered
#Multivariate analysis with x,y features of df dataset, with quality label hue
def multivariate(x,y,df):
    """Scatter column ``y`` against column ``x`` of ``df``, colouring points by "quality_label".

    x, y: column names (underscored form).
    df: DataFrame with the two feature columns plus "quality_label" (low/medium/high) and
        "type" ("red"/"white"); "type" is only used to pick the title prefix.
    Shows the figure; nothing is returned.
    """
    colors={"low":"yellow","medium":"crimson","high":"black"}
    fig,axes=plt.subplots(figsize=(12,8),facecolor="snow")
    axes.scatter(df[x],df[y],c=df["quality_label"].map(colors),alpha=0.5)
    x_name=x.replace("_"," ")
    y_name=y.replace("_"," ")
    #Axis labels carry the plain feature names; the joining punctuation belongs to the title only
    axes.set_xlabel(x_name.capitalize(),fontsize=10)
    axes.set_ylabel(y_name.capitalize(),fontsize=10)
    #"type" holds the strings "red"/"white", so the membership test must use the string
    wine_types=set(df["type"].unique())
    if len(wine_types)==2:
        title="Wines"
    elif "red" in wine_types:
        title="Red wines"
    else:
        title="White wines"
    axes.set_title(title+" - "+x_name.capitalize()+", "+y_name+" and quality")
    legend_handles=[
        mpatches.Patch(color=colors["low"],label="Low quality wines"),
        mpatches.Patch(color=colors["medium"],label="Medium quality wines"),
        mpatches.Patch(color=colors["high"],label="Higher quality wines"),
    ]
    axes.legend(handles=legend_handles)
    plt.show()

In [None]:
#Fixed acidity refers to tartaric acid, the primary acid taste in wine; it is essential to mouthfeel and balance and adds complexity to the wine
#Citric acid gives the fruity flavour. It adds some acidity, but is used less frequently due to its strong flavour and its tendency to break down into vinegar/ethanol.
#Volatile acidity is the gaseous acidic component of a wine. It creates a vinegar/nail-polish-remover smell and is usually undesirable

In [None]:
#Only pH seems to be normally distributed. All other variables are positively (right) skewed
#Outliers in all variables except pH should be removed with an IQR-distance filter

In [None]:
#Alcohol is the strongest indicator of a good quality wine, irrespective of type
#For red wines, a similarly strong indicator is volatile acidity, which is inversely proportional to quality
#Sulphates and citric acid also have a small positive correlation
#For white wines, apart from alcohol, the strongest indicators are chlorides, volatile acidity and total SO2, all inversely proportional to quality
#For the combined dataset, the strong features are alcohol(+), volatile acidity(-) and chlorides(-)

#Some coupling between features are:
#(Free SO2-Total SO2)(Alcohol-Density)
#(Total SO2-Residual sugar)(Density-Residual sugar)

#We elected to not use density, as it is a property of the wine's composition

Checking the skewness of the variables

In [None]:
#One histogram with a KDE overlay per column of the white-wine frame, each titled with its column name
for column_name,column_values in white_wine.items():
    sns.displot(column_values,kde=True)
    plt.title(column_name)
    plt.show()

In [None]:
#One histogram with a KDE overlay per column of the red-wine frame, each titled with its column name
for column_name,column_values in red_wine.items():
    sns.displot(column_values,kde=True)
    plt.title(column_name)
    plt.show()

In [None]:
#ANOVA
#One-way ANOVA on alcohol content, with the three quality labels as groups
alcohol_by_quality=[
    wines.loc[wines["quality_label"]==level,"alcohol"]
    for level in ("low","medium","high")
]
f,p=stats.f_oneway(*alcohol_by_quality)
print("ANOVA test for mean alcohol level across wine samples with different quality ratings")
print("F Statistic:",f,"\tp-value:",p)

In [None]:
#Descriptive statistics of both datasets, formatted to two decimals and stacked under one key per wine type
wines_sum=pd.concat(
    {
        "Red Wine Statistics":red_wine.describe().applymap("{:.2f}".format),
        "White Wine Statistics":white_wine.describe().applymap("{:.2f}".format),
    },
    axis=0,
)
wines_sum

In [None]:
#Share of red vs. white samples in the combined dataset
#plt.subplots returns (figure, axes); unpack both and draw on the axes explicitly
fig,ax=plt.subplots(figsize=(5,5),facecolor="lightgrey")
type_counts=wines["type"].value_counts()
#Tie each wedge colour to its wine type instead of relying on the ordering of the counts
wedge_colors=[{"white":"yellow","red":"red"}[wine_type] for wine_type in type_counts.index]
type_counts.plot(kind="pie",ax=ax,autopct="%.1f%%",wedgeprops={"alpha":0.5},colors=wedge_colors,title="Distribution of wines based on colour")

In [None]:
#Number of wines per quality label for each type.
#value_counts() orders by count, so the counts are reindexed to low/medium/high explicitly;
#otherwise the hard-coded label list is paired with the wrong counts and the two panels show their bars in different orders
quality_order=["low","medium","high"]
tempr=pd.DataFrame(
    {"quality_label":quality_order,"count":red_wine["quality_label"].value_counts().reindex(quality_order).to_numpy()},
    index=quality_order,
)
tempw=pd.DataFrame(
    {"quality_label":quality_order,"count":white_wine["quality_label"].value_counts().reindex(quality_order).to_numpy()},
    index=quality_order,
)
#Normalized for better understanding due to the radically different dataframe sizes
#(each dataset is rescaled on its own to the 0.1-1 range, so bar heights are relative within a panel)
fig, axes = plt.subplots(1,2, sharey=True, squeeze=True, facecolor="snow")
tempr["count"]=preprocessing.minmax_scale(tempr["count"], feature_range=(0.1, 1))
tempw["count"]=preprocessing.minmax_scale(tempw["count"], feature_range=(0.1, 1))
tempr["count"].plot(kind='bar',ax=axes[0],color=["crimson"]); axes[0].set_title('Red'); axes[0].set_xlabel("Wine quality")
tempw["count"].plot(kind='bar',ax=axes[1],color=["crimson"]); axes[1].set_title('White'); axes[1].set_xlabel("Wine quality")

In [None]:
#Univariate frequency analysis for red wines
#One 15-bin histogram per numeric column; the layout rectangle and title position are pushed
#beyond the unit square so the grid is enlarged and the title sits above it
red_wine.hist(
    bins=15,
    color="crimson",
    edgecolor="black",
    linewidth=1,
    xlabelsize=8,
    ylabelsize=8,
    grid=False,
)
plt.tight_layout(rect=(0,0,1.2,1.2))
plt.suptitle(
    "Red wine univariate plots",
    x=0.65,
    y=1.25,
    fontsize=15,
)

In [None]:
#Univariate frequency analysis for white wines
#One 15-bin histogram per numeric column; the layout rectangle and title position are pushed
#beyond the unit square so the grid is enlarged and the title sits above it
white_wine.hist(
    bins=15,
    color="crimson",
    edgecolor="black",
    linewidth=1,
    xlabelsize=8,
    ylabelsize=8,
    grid=False,
)
plt.tight_layout(rect=(0,0,1.2,1.2))
plt.suptitle(
    "White wine univariate plots",
    x=0.65,
    y=1.25,
    fontsize=15,
)

In [None]:
#Heatmap for red wines
#Correlate numeric columns only: "type" (string) and "quality_label" (categorical) cannot be
#correlated and make DataFrame.corr() raise on pandas >= 2.0. The matrix is computed once and reused.
red_corr=red_wine.select_dtypes(include="number").corr()
plt.figure(figsize=(12,8))
sns.heatmap(red_corr.round(2),color="k",linecolor="snow",linewidth=0.2,annot=True,yticklabels=labels,xticklabels=labels)
plt.xticks(rotation=45)
plt.title("Heatmap of red wines", fontsize=15)
plt.show()
#Features ranked by their correlation with quality
red_corr[["quality"]].sort_values(by="quality",ascending=False)

In [None]:
#Heatmap for white wines
#Correlate numeric columns only: "type" (string) and "quality_label" (categorical) cannot be
#correlated and make DataFrame.corr() raise on pandas >= 2.0. The matrix is computed once and reused.
white_corr=white_wine.select_dtypes(include="number").corr()
plt.figure(figsize=(12,8))
sns.heatmap(white_corr.round(2),color="k",linecolor="snow",linewidth=0.2,annot=True,yticklabels=labels,xticklabels=labels)
plt.xticks(rotation=45)
plt.title("Heatmap of white wines", fontsize=15)
plt.show()
#Features ranked by their correlation with quality
white_corr[["quality"]].sort_values(by="quality",ascending=False)

In [None]:
#Heatmap for all wines
#Correlate numeric columns only: "type" (string) and "quality_label" (categorical) cannot be
#correlated and make DataFrame.corr() raise on pandas >= 2.0. The matrix is computed once and reused.
wines_corr=wines.select_dtypes(include="number").corr()
plt.figure(figsize=(12,8))
sns.heatmap(wines_corr.round(2),color="k",linecolor="snow",linewidth=0.2,annot=True,yticklabels=labels,xticklabels=labels)
plt.xticks(rotation=45)
plt.title("Heatmap of all wines", fontsize=15)
plt.show()
#Alcohol, volatile acidity, density and chlorides influence a wine's quality
wines_corr[["quality"]].sort_values(by="quality",ascending=False)

In [None]:
#Correlation of every numeric red-wine feature with volatile acidity, strongest positive first
#(numeric columns only: DataFrame.corr() raises on the string/categorical columns with pandas >= 2.0)
red_wine.select_dtypes(include="number").corr()[["volatile_acidity"]].sort_values(by="volatile_acidity",ascending=False)

In [None]:
#Red wines: total SO2 against alcohol, points coloured by quality label
multivariate("alcohol","total_sulfur_dioxide",red_wine)

In [None]:
#exp(wines)

In [None]:
#exp(wines[["alcohol","free_sulfur_dioxide","volatile_acidity","chlorides"]])

In [None]:
#Histograms of free SO2, red and white wines side by side
freq("free_sulfur_dioxide")

In [None]:
#Create a new long-format dataframe g1 (one row per wine and feature), with quality and quality label columns serving as identifiers
#The identifier columns are excluded from value_vars so no column is both an identifier and a melted value
id_columns=["quality","quality_label"]
feature_columns=[column for column in wines.columns[:12] if column not in id_columns]
g1=pd.melt(wines,id_vars=id_columns,value_vars=feature_columns,var_name="variable",value_name="value")
#Median of every feature per quality label (observed=False keeps all label categories, stated explicitly)
g1=g1.groupby(by=["variable","quality_label"],as_index=False,observed=False)[["value"]].median()
#One bar chart per feature: median value for low/medium/high quality, each facet with its own axes
pp=sns.FacetGrid(g1,col="variable",col_wrap=4,sharey=False,sharex=False,height=3,aspect=1,
hue="quality_label",hue_order=["low","medium","high"],palette="rocket")
pp.map(sns.barplot,"quality_label","value",order=["low","medium","high"])
plt.show()

In [None]:
#Having done all that, we can check the relationships between pairs of significant variables, and between each of them and quality

In [None]:
#Red wines: joint regression plot of total SO2 against free SO2, with marginal histograms
rel_check("free_sulfur_dioxide","total_sulfur_dioxide",red_wine)

In [None]:
#Red wines: joint regression plot of alcohol against chlorides, with marginal histograms
rel_check("chlorides","alcohol",red_wine)

In [None]:
#Quality against free SO2 with a LOWESS trend line; red wines left, white wines right, no normalization
plotter("free_sulfur_dioxide","quality",red_wine,white_wine,False)

In [None]:
#Quality against chlorides with a LOWESS trend line; red wines left, white wines right, no normalization
plotter("chlorides","quality",red_wine,white_wine,False)

In [None]:
#Quality against volatile acidity with a LOWESS trend line; red wines left, white wines right, no normalization
plotter("volatile_acidity","quality",red_wine,white_wine,False)

In [None]:
#Quality against alcohol with a LOWESS trend line; red wines left, white wines right, no normalization
plotter("alcohol","quality",red_wine,white_wine,False)