# Wine Data analysis

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from scipy.spatial.distance import cdist
from sklearn import metrics
from sklearn import preprocessing as prep
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler


# Relative paths to the two Excel workbooks that the loading cell reads
# with pd.read_excel (resolved against the notebook's working directory).
redwine_data = 'data/winequality-red.xlsx'
whitewine_data = 'data/winequality-white.xlsx'

In [None]:
# Both workbooks are read with identical options: row index 1 of the sheet
# supplies the column names and the literal string 'NA' is parsed as missing.
excel_options = dict(skiprows=0, header=1, na_values='NA')
df_white, df_red = (
    pd.read_excel(workbook_path, **excel_options)
    for workbook_path in (whitewine_data, redwine_data)
)

df_white

## Data cleaning

In [None]:

# Number of missing values per column (isna is the canonical name of isnull).
df_white.isna().sum()

In [None]:
# Summary statistics of the white-wine columns (count, mean, std, min, quartiles, max).
df_white.describe()

In [None]:
# Peek at five random rows; the fixed seed keeps the preview identical
# across "Restart Kernel -> Run All" executions.
df_white.sample(5, random_state=42)

In [None]:
# Column dtypes; the z-score filter further down is applied to the whole frame,
# so every column has to be numeric at this point.
df_white.dtypes

### Remove outliers based on z-score (3 or more standard deviations from the mean)

In [None]:
# Absolute z-score of each 'free sulfur dioxide' value in the white-wine data;
# entries at 3 or more standard deviations from the mean are listed as outliers.
white_sulfur_z = stats.zscore(df_white['free sulfur dioxide'])
z_sulf = np.abs(white_sulfur_z)
z_sulf_outliers = z_sulf[z_sulf >= 3]
z_sulf_outliers

In [None]:
# Keep only the rows whose absolute z-score is below 3 in every column.
white_abs_z = np.abs(stats.zscore(df_white))
white_inlier_rows = (white_abs_z < 3).all(axis=1)
df_white = df_white[white_inlier_rows]
df_white
# 634 rows have been dropped

## Do the same with red wine data

In [None]:
# (rows, columns) of the red-wine frame before outlier removal.
df_red.shape
# (1599, 12)

In [None]:
# Number of missing values per column (isna is the canonical name of isnull).
df_red.isna().sum()

In [None]:
# Summary statistics of the red-wine columns. This section mirrors the
# white-wine cleaning steps, so the red frame is the one to describe here.
df_red.describe()

In [None]:
# Column dtypes; the z-score filter below is applied to the whole frame,
# so every column has to be numeric at this point.
df_red.dtypes

In [None]:
# Absolute z-score of each 'free sulfur dioxide' value in the red-wine data;
# entries at 3 or more standard deviations from the mean are listed as outliers.
# Note that this rebinds z_sulf / z_sulf_outliers from the white-wine section.
red_sulfur_z = stats.zscore(df_red['free sulfur dioxide'])
z_sulf = np.abs(red_sulfur_z)
z_sulf_outliers = z_sulf[z_sulf >= 3]
z_sulf_outliers

In [None]:
# Keep only the rows whose absolute z-score is below 3 in every column.
red_abs_z = np.abs(stats.zscore(df_red))
red_inlier_rows = (red_abs_z < 3).all(axis=1)
df_red = df_red[red_inlier_rows]
df_red
# 50 rows have been dropped

## Combine dataframes

In [None]:
# Tag every white-wine row with its color before the frames are combined.
# df_white is the result of a boolean filter, so assigning a column in place
# can raise pandas' SettingWithCopyWarning; .assign() builds a new frame instead.
df_white = df_white.assign(color='white')
df_white

In [None]:
# Tag every red-wine row with its color before the frames are combined.
# df_red is the result of a boolean filter, so assigning a column in place
# can raise pandas' SettingWithCopyWarning; .assign() builds a new frame instead.
df_red = df_red.assign(color='red')
df_red

In [None]:
# Stack the red rows on top of the white rows; ignore_index=True gives the
# combined frame a fresh 0..n-1 row index.
df_combined = pd.concat([df_red, df_white], ignore_index=True)
df_combined

## Data exploration

In [None]:
# Column dtypes of the combined frame.
df_combined.dtypes

In [None]:
# (rows, columns) of the combined frame.
df_combined.shape

In [None]:
# Summary statistics of the combined frame's numeric columns.
df_combined.describe()

In [None]:
# The box plot shows the median, the 25th and 75th percentiles, the range, and the outliers
df_combined['total sulfur dioxide'].plot(kind='box')

In [None]:
# Same box-plot view for the free sulfur dioxide column.
df_combined['free sulfur dioxide'].plot(kind='box')

As we can see, the standard deviation is relatively small for most features. Features with a higher standard deviation, e.g. 'free sulfur dioxide', have a few outliers at the upper end, but nothing serious.
Generally, the data follows a normal distribution.

### Comparing white and red

In [None]:
# Convert color to category
# (the correlation section later reads the integer codes through the .cat accessor)
df_combined["color"] = df_combined["color"].astype('category')
df_combined.dtypes

In [None]:
# One box-plot panel per listed column, each panel split by wine color.
df_combined.boxplot(column=['fixed acidity', 'residual sugar', 'alcohol', 'quality'], by='color')

From these plots we can see that red wines tend to have slightly higher acidity, while white wines usually have around the same acidity.
White wines seem to have a wide range of sugar content and a higher average, whereas red wines usually have a lower sugar content.
The alcohol percentage seems to follow around the same distribution for both types, with a median of around 10%. The white wine average seems to be slightly higher.

The overall quality seems to be around the same distribution for both types.


In [None]:
# Free vs. total sulfur dioxide: the two are expected to be correlated
df_combined.plot(
    kind='scatter',
    x='free sulfur dioxide',
    y='total sulfur dioxide',
    figsize=(6, 6),
)

Here is an interesting correlation between the alcohol percentage and the density of the wine.
The points are colored by wine type (white wine in green, red wine in red), showing us that red wine seems to have a higher overall density than white wine, which might be interesting to businesses or customers.

In [None]:
# Split the combined frame by wine color; `white` and `red` are reused by later cells.
white = df_combined[df_combined['color'] == 'white']
red = df_combined[df_combined['color'] == 'red']

# One scatter layer per wine type: white wines in green, red wines in red.
for wines, point_color, wine_label in ((white, 'green', 'white'), (red, 'red', 'red')):
    plt.scatter(wines['alcohol'], wines['density'], color=point_color, label=wine_label)

plt.xlabel('Alcohol')
plt.ylabel('Density')
plt.legend()

### Creating subsets

In [None]:
# Independent working copy holding only pH and color. Taking an explicit
# .copy() means the column added to bin_data in the next cell is written to
# this frame without pandas' SettingWithCopyWarning.
bin_data = df_combined[['pH', 'color']].copy()

In [None]:
# Bucket pH into five labelled, right-closed intervals:
# (0, 2.9], (2.9, 3.1], (3.1, 3.3], (3.3, 3.5], (3.5, 3.8].
# pd.cut assigns NaN to any value outside (0, 3.8].
ph_edges = [0, 2.9, 3.1, 3.3, 3.5, 3.8]
ph_labels = ["Low", "Low-Mid", "Mid", "High-Mid", "High"]
bin_data['pH_bin'] = pd.cut(df_combined['pH'], bins=ph_edges, labels=ph_labels)
bin_data

Here we can see that most of the wines have a pH value between 3.1 and 3.3.

In [None]:
# Number of wines per pH bin; value_counts() sorts by descending count,
# so the bars are ordered by frequency rather than by pH level.
bin_data['pH_bin'].value_counts().plot(kind='bar')

### Correlation

In [None]:
# Numeric encoding of the wine color (category codes) for the correlation matrix.
# NOTE: 'color_cat' is added to df_combined itself, not to a copy, so it stays
# a column of df_combined in all later cells (including the PCA input).
df_combined["color_cat"] = df_combined["color"].cat.codes
# df_cat_num is a new frame: all columns of df_combined except the text 'color'.
df_cat_num = df_combined.drop(columns=['color'])

In [None]:
# Pairwise correlation matrix of all columns in df_cat_num
# (DataFrame.corr uses Pearson correlation by default).
df_corr = df_cat_num.corr()
df_corr

In the heatmap below, we can see the strong negative correlation between alcohol and density.
We can also see that the quality of the wine doesn't correlate strongly with anything, but correlates partly with density, volatile acidity, chlorides and alcohol percentage.

In [None]:
# Correlation heatmap for the combined data: square cells, no numeric annotations.
sns.heatmap(df_corr, annot=False, square=True)

Let's split it up into its categories:

In [None]:
# Numeric inputs for the per-color correlation matrices.
# Only the text 'color' column is removed. No 'color_cat' column is added:
# within a single color the category code is constant, so its correlation with
# every other column is undefined (NaN) and would only show up as an empty
# row/column in the heatmap.
# .drop() returns new frames, so `white` and `red` are left unmodified and no
# SettingWithCopyWarning is raised (assigning into the filtered subsets would).
df_cat_num_white = white.drop(columns=['color'])

df_cat_num_red = red.drop(columns=['color'])

In [None]:
# Separate correlation matrices for the white-wine and red-wine subsets.
df_corr_white = df_cat_num_white.corr()
df_corr_red = df_cat_num_red.corr()

White wine correlation map:

In [None]:
# Correlation heatmap for the white-wine subset.
sns.heatmap(df_corr_white, annot=False, square=True)

Red wine correlation map:

In [None]:
# Correlation heatmap for the red-wine subset.
sns.heatmap(df_corr_red, annot=False, square=True)

Funnily enough, it seems that acidity correlates more strongly with the pH value in white wines than in red wines.
The white wines seem to have a stronger correlation between alcohol and density than the red wines.
In white wines, the quality of the wine seems to correlate most with density and chlorides,
while in red wines the quality seems to correlate more closely with the pH value and the sulphates of the wine.

## Preparing data for further analysis

In [None]:
# Look for remaining outliers in residual sugar with a box plot
df_combined['residual sugar'].plot(kind='box')

In [None]:
# Largest residual sugar value in the combined data.
df_combined['residual sugar'].max()

In [None]:
# Show the rows that hold the maximum residual sugar value (2 outlier rows)
max_residual_sugar = df_combined['residual sugar'].max()
df_combined.loc[df_combined['residual sugar'] == max_residual_sugar]

In [None]:
# Drop the rows holding the maximum residual sugar value, i.e. the outlier
# rows displayed by the preceding cell. They are selected by value rather than
# by the row labels 1859 and 4845, which are only valid for one particular
# state of df_combined and raise KeyError otherwise.
# NOTE: re-running this cell would remove the next-largest value as well.
sugar_outlier_rows = df_combined.index[
    df_combined['residual sugar'] == df_combined['residual sugar'].max()
]
df_combined = df_combined.drop(index=sugar_outlier_rows)

In [None]:
# Get index of feature least correlating with quality
# (smallest absolute value in the 'quality' column of the combined correlation matrix)
min_corr_column = df_corr['quality'].abs().idxmin()
min_corr_column

In [None]:
# Remove the feature that correlates least with quality, taken from
# `min_corr_column` (computed in the previous cell) so the dropped column
# always matches that result, together with the text 'color' column, which
# cannot be part of the numeric matrix built for PCA.
df_combined = df_combined.drop(columns=[min_corr_column, 'color'])
df_combined

In [None]:
# Make sure the text 'color' column is gone before the numeric matrix is built.
# The preceding cell normally removes it already; errors='ignore' keeps this
# cell from raising KeyError in that case while still dropping the column
# whenever it is present.
df_combined = df_combined.drop(columns=['color'], errors='ignore')

## PCA

In [None]:
# NumPy matrix of all remaining df_combined columns (one row per wine).
X = df_combined.to_numpy()
X

In [None]:
# Quick look at the raw input: scatter of the first two columns of X.
fig, ax = plt.subplots()
ax.set_title('Input data')

# axis limits: observed range of each column, padded by one unit on both sides
first_feature, second_feature = X[:, 0], X[:, 1]
x_min, x_max = first_feature.min() - 1, first_feature.max() + 1
y_min, y_max = second_feature.min() - 1, second_feature.max() + 1
ax.set_xlim(x_min, x_max)
ax.set_ylim(y_min, y_max)

# hollow circles with a black outline
ax.scatter(first_feature, second_feature, color='black', s=80, marker='o', facecolors='none')

plt.show()

In [None]:
# Method L1: Least Absolute Deviation
# Each row (sample) of X is divided by the sum of its absolute values, so
# if we add the absolute normalized values in each row, the sum is always 1
nl1 = prep.normalize(X, norm='l1')
nl1


In [None]:
# Standardize the L1-normalized matrix column by column (zero mean, unit
# variance); X is rebound to the scaled result.
sc = StandardScaler()
X = sc.fit_transform(nl1)
X

In [None]:
# Project the standardized data onto its first four principal components.
# PCA is imported in the notebook's import cell at the top.
# random_state pins the randomized SVD solver, which scikit-learn may pick
# automatically depending on version and data shape, so that the components
# are reproducible across runs.
pca = PCA(n_components=4, random_state=42)
pca_data = pca.fit_transform(X)
pca_data

In [None]:
# Wrap the projected data in a DataFrame with one 'pc <k>' column per
# component. The names are derived from the array width so they stay in sync
# with whatever n_components the PCA was fitted with.
pc_names = [f'pc {k}' for k in range(1, pca_data.shape[1] + 1)]
pcadf = pd.DataFrame(data=pca_data, columns=pc_names)
pcadf

In [None]:
# Fraction of the total variance explained by each fitted component.
# first principal component 29% variance
explained_variance = pca.explained_variance_ratio_  
explained_variance

In [None]:
# Plot the explained_variance per component.
# The line color is given only through the keyword: a color inside the format
# string ('bx-') combined with c='red' makes matplotlib warn about a
# redundantly defined color. The x axis counts components from 1 so a tick
# reads directly as "component k" instead of a 0-based array position.
component_numbers = np.arange(1, len(explained_variance) + 1)
plt.plot(component_numbers, explained_variance, 'x-', color='red')
plt.xticks(component_numbers)
plt.xlabel('component')
plt.ylabel('variance')
plt.title('The optimal number of components')
plt.show()

In [None]:
# Plot the cumulative explained_variance.
# The line color is given only through the keyword: a color inside the format
# string ('b*-') combined with c='green' makes matplotlib warn about a
# redundantly defined color. The x axis counts components from 1 so a tick
# reads directly as "variance retained by the first k components".
cumulative = np.cumsum(explained_variance)
component_counts = np.arange(1, len(cumulative) + 1)
plt.plot(component_counts, cumulative, '*-', color='green')
plt.xticks(component_counts)
plt.xlabel('components')
plt.ylabel('cumulative')
plt.title('The optimal number of components')
plt.show()


It seems to me that the optimal number of principal components is 3, since we retain about 75% of the variance of the dataset by using these 3 principal components.

In [None]:
# Weights of the input features in the first principal component
# (row 0 of pca.components_), labelled with the df_combined column names.
loadings = pca.components_[0]
loadings_df = pd.DataFrame({'Loadings': loadings}, index=df_combined.columns)
loadings_df

In [None]:
# The three features with the largest absolute weight in the first principal component.
p_comps = loadings_df['Loadings'].abs().nlargest(3)
p_comps

In [None]:
# Create a new dataframe from the original features that carry the largest
# absolute weight in the first principal component, plus the target 'quality'.
# 'quality' is part of the PCA input, so it can itself be among the top
# features; dict.fromkeys() removes such a duplicate while preserving order,
# which avoids selecting the same column twice.
indexes = list(dict.fromkeys([*p_comps.index, 'quality']))
p_df = df_combined[indexes]
# Fixed seed so the preview is the same on every run.
p_df.sample(10, random_state=42)