# Loading Libraries

In [1]:
import pandas as pd # Data analysis and manipulation tool
import numpy as np # Fundamental package for linear algebra and multidimensional arrays


# Loading Dataset

In [4]:
wine_dataset = pd.read_csv("https://raw.githubusercontent.com/dphi-official/Datasets/master/Wine_Dataset/winequality-red.csv", sep=";")

# Initial Review

In [5]:
wine_dataset.head()  # .head() function of pandas library returns the first five observations of the dataset

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [7]:
wine_dataset.shape    # .shape returns the total number of rows and columns in the dataset

(1599, 12)

In [8]:
wine_dataset.describe()  # .describe() returns various summary statistics

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983,5.636023
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668,0.807569
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,5.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2,6.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73,11.1,6.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,8.0


#  Exploring the Features

In [10]:
wine_dataset.columns   # quality is the target variable here as we are trying to know which of the two types of wine have better quality

Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'quality'],
      dtype='object')

In [12]:
wine_dataset['quality'].unique()   # Unique values of Quality(Target variable)

array([5, 6, 7, 4, 8, 3], dtype=int64)

# Frequency Counts of each Quality Value

In [13]:
wine_dataset['quality'].value_counts()

5    681
6    638
7    199
4     53
8     18
3     10
Name: quality, dtype: int64

### Observations:
This tells us the vote count of each quality score in descending order.
“quality” has most values concentrated in the categories 5, 6 and 7.
Only a few observations made for the categories 3 & 8.

# Renaming Columns
Let's rename the columns which contain spaces in their names and replace the spaces with underscores.

In [14]:
wine_dataset.rename(columns={
    'fixed acidity' : 'fixed_acidity', 'citric acid' : 'citric_acid', 'volatile acidity' : 'volatile_acidity', 'residual sugar' : 'residual_sugar', 'free sulphur dioxide' : 'free_sulphur_dioxide', 'total sulphur dioxide' : 'total_sulphur_dioxide'
}, inplace = True)

### 
rename() function is used to rename the columns
implace = True, makes changes in the dataframe itself

#  Checking for Missing Values
Pandas provides isnull(), isna() functions to detect missing values. Both of them do the same thing.

df.isna() returns the dataframe with boolean values indicating missing values.
You can also choose to use notna() which is just the opposite of isna().
df.isna().any() returns a boolean value for each column. If there is at least one missing value in that column, the result is True.
df.isna().sum() returns the number of missing values in each column.
 

In [15]:
wine_dataset.isna().sum()

fixed_acidity           0
volatile_acidity        0
citric_acid             0
residual_sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64

In [16]:
wine_dataset.isnull().sum()

fixed_acidity           0
volatile_acidity        0
citric_acid             0
residual_sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64

In [17]:
wine_dataset.isnull()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...
1594,False,False,False,False,False,False,False,False,False,False,False,False
1595,False,False,False,False,False,False,False,False,False,False,False,False
1596,False,False,False,False,False,False,False,False,False,False,False,False
1597,False,False,False,False,False,False,False,False,False,False,False,False


### Observation:
There is no missing value in any column

# info()
df.info returns information about the data frame including the data types of each column, number of null values in each column and memory usage of the entire data.

In [19]:
wine_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed_acidity         1599 non-null   float64
 1   volatile_acidity      1599 non-null   float64
 2   citric_acid           1599 non-null   float64
 3   residual_sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB


### Observations:
The data has only float and integer values.

There are no missing values

# Checking for Duplicates
Duplicates might or might not affect the quality of data. Before deciding if they should be removed, it is essential to understand why they might have occurred in the first place.

Duplicates can be checked using the duplicated() method.

 

In [20]:
duplicate_entries = wine_dataset[wine_dataset.duplicated()]
duplicate_entries.shape

(240, 12)

### Observations:
There are 240 duplicates. The quality ratings for the same/similar wine were given by different wine tasters so there is a possibility of similar reviews. We can thus keep these duplicates.

 