In [87]:
import numpy as np
import pandas as pd

In [88]:
# Read the two wine-quality CSV files; the white-wine file is read with ';' as its delimiter.
red_wine_data = pd.read_csv("winequality-red.csv")
white_wine_data = pd.read_csv("winequality-white.csv", sep=";")

# Show the first five rows of each frame, with a rule of asterisks in between.
print(red_wine_data.head(n=5), '*' * 120, white_wine_data.head(n=5), sep='\n')

   fixed acidity  volatile acidity  citric acid  ...  sulphates  alcohol  quality
0            7.4              0.70         0.00  ...       0.56      9.4        5
1            7.8              0.88         0.00  ...       0.68      9.8        5
2            7.8              0.76         0.04  ...       0.65      9.8        5
3           11.2              0.28         0.56  ...       0.58      9.8        6
4            7.4              0.70         0.00  ...       0.56      9.4        5

[5 rows x 12 columns]
************************************************************************************************************************
   fixed acidity  volatile acidity  citric acid  ...  sulphates  alcohol  quality
0            7.0              0.27         0.36  ...       0.45      8.8        6
1            6.3              0.30         0.34  ...       0.49      9.5        6
2            8.1              0.28         0.40  ...       0.44     10.1        6
3            7.2              0.23  

In [89]:
# Check dtypes and missing values.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line to the output.
red_wine_data.info()
print()
print('Number of missing values in red wine dataset = ', red_wine_data.isnull().sum().sum())
print('*' * 120)
white_wine_data.info()
print()
print('Number of missing values in white wine dataset = ', white_wine_data.isnull().sum().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB
None 

Number of missing values in red wine dataset =  0
*********************************************************************

<p>
Since neither dataset has any missing values, no imputation is required in this case.
</p>

In [90]:
# Summary statistics for every column of the red wine dataset.
red_wine_data.describe(include="all")

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983,5.636023
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668,0.807569
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,5.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2,6.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73,11.1,6.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,8.0


In [91]:
# Summary statistics for every column of the white wine dataset.
white_wine_data.describe(include="all")

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0
mean,6.854788,0.278241,0.334192,6.391415,0.045772,35.308085,138.360657,0.994027,3.188267,0.489847,10.514267,5.877909
std,0.843868,0.100795,0.12102,5.072058,0.021848,17.007137,42.498065,0.002991,0.151001,0.114126,1.230621,0.885639
min,3.8,0.08,0.0,0.6,0.009,2.0,9.0,0.98711,2.72,0.22,8.0,3.0
25%,6.3,0.21,0.27,1.7,0.036,23.0,108.0,0.991723,3.09,0.41,9.5,5.0
50%,6.8,0.26,0.32,5.2,0.043,34.0,134.0,0.99374,3.18,0.47,10.4,6.0
75%,7.3,0.32,0.39,9.9,0.05,46.0,167.0,0.9961,3.28,0.55,11.4,6.0
max,14.2,1.1,1.66,65.8,0.346,289.0,440.0,1.03898,3.82,1.08,14.2,9.0


In [92]:
# "Cardinality" means the number of unique values in a column.
# Split each dataset's column names by dtype: numeric features, and categorical
# (object-dtype) features that would need encoding if any were present.
# select_dtypes(include="number") matches every numeric dtype rather than only
# int64/float64, so columns stored as e.g. int32 or float32 are not left out.
# Column order is the same as in the source frame.
red_wine_numeric_features = red_wine_data.select_dtypes(include="number").columns.tolist()
red_wine_categorical_features = red_wine_data.select_dtypes(include="object").columns.tolist()

white_wine_numeric_features = white_wine_data.select_dtypes(include="number").columns.tolist()
white_wine_categorical_features = white_wine_data.select_dtypes(include="object").columns.tolist()

In [93]:
# Inspect the categorical columns that would need to be encoded.
# Both lists are expected to be empty for these datasets; printing them keeps
# that check reproducible on every run instead of leaving it commented out.
print(red_wine_categorical_features, white_wine_categorical_features)

<p>
The cell above is for inspecting the categorical columns in both datasets. Neither dataset contains any categorical columns, so no feature encoding is necessary in this case.
</p>

In [94]:
# No preprocessing (categorical feature encoding or imputation of missing
# values) is needed for these datasets.
# Since categorical features are not present at all, it is safe to keep only
# the numeric features in the final data.
final_red_wine_data = red_wine_data[red_wine_numeric_features]
final_white_wine_data = white_wine_data[white_wine_numeric_features]

In [95]:
# Show the first ten rows of each final dataset, with a rule of asterisks in between.
print(final_red_wine_data.head(n=10), '*' * 120, final_white_wine_data.head(n=10), sep='\n')

   fixed acidity  volatile acidity  citric acid  ...  sulphates  alcohol  quality
0            7.4              0.70         0.00  ...       0.56      9.4        5
1            7.8              0.88         0.00  ...       0.68      9.8        5
2            7.8              0.76         0.04  ...       0.65      9.8        5
3           11.2              0.28         0.56  ...       0.58      9.8        6
4            7.4              0.70         0.00  ...       0.56      9.4        5
5            7.4              0.66         0.00  ...       0.56      9.4        5
6            7.9              0.60         0.06  ...       0.46      9.4        5
7            7.3              0.65         0.00  ...       0.47     10.0        7
8            7.8              0.58         0.02  ...       0.57      9.5        7
9            7.5              0.50         0.36  ...       0.80     10.5        5

[10 rows x 12 columns]
**************************************************************************

In [96]:
# Stack the red and white rows into a single dataset, then display it.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# is the supported replacement. Each frame keeps its own row labels, exactly as
# append() did, so labels repeat across the red and white halves.
combined_wine_dataset = pd.concat([final_red_wine_data, final_white_wine_data])
display(combined_wine_dataset)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
1,7.8,0.88,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6
4,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
...,...,...,...,...,...,...,...,...,...,...,...,...
4893,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,6
4894,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,5
4895,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6
4896,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7


In [97]:
# Write the merged dataset to 'merged.csv' (without the index column) so the
# next notebook can use it for data visualization.
combined_wine_dataset.to_csv("merged.csv", index=False)