## Q1. Install and load the latest versions of TensorFlow and Keras. Print their versions.

In [1]:
# !pip install --upgrade tensorflow

In [2]:
import tensorflow as tf
from tensorflow import keras

In [3]:
print(f"Tensorflow version is: {tf.__version__}")
print(f"Keras version is: {keras.__version__}")

Tensorflow version is: 2.9.1
Keras version is: 2.9.0


## Q2. Load the Wine Quality dataset and explore its dimensions.

#### Dataset link - https://www.kaggle.com/datasets/nareshbhat/wine-quality-binary-classification

In [4]:
import pandas as pd

wine_df = pd.read_csv(filepath_or_buffer="./data/wine.csv")
wine_df.head(5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,bad
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,bad
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,bad
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,good
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,bad


##### Information about the dataset

This Data set contains the information related red wine , Various factors affecting the quality. This data set was prepossessed and downloaded from the UCI Machine Learning Repository. This data set was simple, cleaned, practice data set for classification modelling.

**Input variables (based on physicochemical tests):**

1. fixed acidity  
2. volatile acidity  
3. citric acid  
4. residual sugar  
5. chlorides  
6. free sulfur dioxide  
7. total sulfur dioxide  
8. density  
9. pH  
10. sulphates  
11. alcohol  

**Output variable (based on sensory data):**

12. quality ('good' and 'bad' based on score >5 and <5)

In [5]:
print(f"The shape of the dataset is: {wine_df.shape}")

The shape of the dataset is: (1599, 12)


## Q3. Check for null values, identify categorical variables, and encode them.

In [6]:
wine_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   object 
dtypes: float64(11), object(1)
memory usage: 150.0+ KB


In [7]:
wine_df.isna().sum()

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64

##### So, there are no null values in the dataset

In [8]:
categorical_cols = wine_df.select_dtypes(include=["object", "category"]).columns
numerical_cols = wine_df.select_dtypes(exclude=["object", "category"]).columns

print(f"The categorical columns are: {categorical_cols}")
print(f"The numerical columns are: {numerical_cols}")

The categorical columns are: Index(['quality'], dtype='object')
The numerical columns are: Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol'],
      dtype='object')


##### Let's encode the only categorical column "Quality" in the dataset

In [9]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
wine_df["quality"] = label_encoder.fit_transform(wine_df["quality"])
wine_df.head(5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,0
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,0
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,0
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,1
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,0


In [10]:
wine_df["quality"].value_counts()

quality
1    855
0    744
Name: count, dtype: int64

##### Here, 0 represents "bad" and 1 represents "good" quality respectively.

## Q4. Separate the features and target variables from the dataframe.

In [11]:
features = wine_df.drop(columns=["quality"], axis=1)
target = wine_df["quality"]

## Q5. Perform a train-test split and divide the data into training, validation, and test datasets.

In [12]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
# X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

print(f"The training dataset size: {X_train.shape, y_train.shape}")
# print(f"The validation dataset size: {X_val.shape, y_val.shape}")
print(f"The testing dataset size: {X_test.shape, y_test.shape}")

The training dataset size: ((1279, 11), (1279,))
The testing dataset size: ((320, 11), (320,))


## Q6. Perform scaling on the dataset.

In [13]:
X_train.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
fixed acidity,1279.0,8.32369,1.724243,4.6,7.1,7.9,9.2,15.9
volatile acidity,1279.0,0.530559,0.179275,0.12,0.4,0.52,0.64,1.58
citric acid,1279.0,0.272471,0.195448,0.0,0.1,0.26,0.43,1.0
residual sugar,1279.0,2.555473,1.43579,0.9,1.9,2.2,2.6,15.5
chlorides,1279.0,0.088448,0.049332,0.012,0.071,0.08,0.091,0.611
free sulfur dioxide,1279.0,15.876075,10.313517,1.0,7.0,14.0,21.0,68.0
total sulfur dioxide,1279.0,46.657154,32.941962,6.0,22.0,38.0,63.0,289.0
density,1279.0,0.996774,0.001856,0.99007,0.995655,0.9968,0.997845,1.00369
pH,1279.0,3.31165,0.154016,2.74,3.21,3.31,3.4,4.01
sulphates,1279.0,0.660023,0.174605,0.37,0.55,0.62,0.73,2.0


In [14]:
X_train.head(4)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
493,8.7,0.69,0.31,3.0,0.086,23.0,81.0,1.0002,3.48,0.74,11.6
354,6.1,0.21,0.4,1.4,0.066,40.5,165.0,0.9912,3.25,0.59,11.9
342,10.9,0.39,0.47,1.8,0.118,6.0,14.0,0.9982,3.3,0.75,9.8
834,8.8,0.685,0.26,1.6,0.088,16.0,23.0,0.99694,3.32,0.47,9.4


In [15]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
columns = X_train.columns
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=columns)
X_train.head(4)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,0.218332,0.889712,0.192092,0.309726,-0.049642,0.691007,1.042934,1.846696,1.0935,0.458223,1.123177
1,-1.290166,-1.788783,0.652753,-0.80508,-0.455214,2.388473,3.59387,-3.004491,-0.400439,-0.401197,1.408272
2,1.494753,-0.784347,1.011045,-0.526378,0.599272,-0.95796,-0.991742,0.768655,-0.075669,0.515517,-0.58739
3,0.276351,0.861811,-0.063831,-0.665729,-0.009085,0.01202,-0.718427,0.089488,0.054238,-1.088733,-0.967516


## Q7. Create at least 2 hidden layers and an output layer for the binary categorical variables.

In [16]:
import tensorflow as tf
from tensorflow.keras import layers, models

layer1 = layers.Dense(units=128, activation='relu', input_shape=(X_train.shape[1],))
layer2 = layers.BatchNormalization()
layer3 = layers.Dropout(rate=0.2)

layer4 = layers.Dense(units=64, activation='relu')
layer5 = layers.BatchNormalization()
layer6 = layers.Dropout(rate=0.2)

output = layers.Dense(1, activation='sigmoid')

## Q8. Create a Sequential model and add all the layers to it.

In [17]:
model = models.Sequential()
model.add(layer1)
model.add(layer2)
model.add(layer3)
model.add(layer4)
model.add(layer5)
model.add(layer6)
model.add(output)

##### Summary of the Neural Network Model

In [18]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 128)               1536      
                                                                 
 batch_normalization (BatchN  (None, 128)              512       
 ormalization)                                                   
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense_1 (Dense)             (None, 64)                8256      
                                                                 
 batch_normalization_1 (Batc  (None, 64)               256       
 hNormalization)                                                 
                                                                 
 dropout_1 (Dropout)         (None, 64)                0

## Q9. Implement a TensorBoard callback to visualize and monitor the model's training process.

In [19]:
# !pip install tensorboard

In [20]:
from tensorflow.keras.callbacks import TensorBoard
import time

log_dir = "./logs/fit/" + time.strftime("%Y%m%d-%H%M%S")
tensorboard_callback = TensorBoard(log_dir=log_dir, histogram_freq=1)

## Q10. Use Early Stopping to prevent overfitting by monitoring a chosen metric and stopping the training if no improvement is observed.

In [21]:
from tensorflow.keras.callbacks import EarlyStopping

early_stopping = EarlyStopping(monitor="val_loss", patience=10)

## Q11. Implement a ModelCheckpoint callback to save the best model based on a chosen metric during training.

In [22]:
from tensorflow.keras.callbacks import ModelCheckpoint

checkpoint_path = "./models/best_model.h5"
model_checkpoint = ModelCheckpoint(filepath=checkpoint_path, monitor='val_loss', save_best_only=True, mode='min', verose=1)

## Q12. Print the model summary.

In [23]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 128)               1536      
                                                                 
 batch_normalization (BatchN  (None, 128)              512       
 ormalization)                                                   
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense_1 (Dense)             (None, 64)                8256      
                                                                 
 batch_normalization_1 (Batc  (None, 64)               256       
 hNormalization)                                                 
                                                                 
 dropout_1 (Dropout)         (None, 64)                0

## Q13. Use binary cross-entropy as the loss function, Adam optimizer, and include the metric ['accuracy'].

In [24]:
from keras.optimizers import Adam

optimizer = Adam(learning_rate=0.0001)
loss = 'binary_crossentropy'
metrics = ['accuracy']

## Q14. Compile the model with the specified loss function, optimizer, and metrics.

In [25]:
model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=metrics
)

## Q15. Fit the model to the data, incorporating the TensorBoard, Early Stopping, and ModelCheckpoint callbacks.

In [30]:
# !pip uninstall -y cython h5py
!pip install cython h5py



In [31]:
from importlib import reload

reload(keras.models)

<module 'keras.api._v2.keras.models' from 'C:\\Users\\LENOVO\\anaconda3\\lib\\site-packages\\keras\\api\\_v2\\keras\\models\\__init__.py'>

In [32]:
model.fit(
    X_train,
    y_train,
    epochs=500,
    validation_split=0.2,
    batch_size=64,
    callbacks=[early_stopping, model_checkpoint, tensorboard_callback]
)

Epoch 1/500
 1/16 [>.............................] - ETA: 1s - loss: 0.8766 - accuracy: 0.5781

ImportError: `save_model()` using h5 format requires h5py. Could not import h5py.