## Import Libraries

In [7]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [9]:
import numpy as np # conda install numpy
import tensorflow as tf 
import matplotlib.pyplot as plt # conda install matplotlib
import pandas as pd # conda install pandas
import seaborn as sns # conda install seaborn

from sklearn.preprocessing import StandardScaler
# %matplotlib inline

from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)


AttributeError: module 'numpy' has no attribute 'typeDict'

## Data Loading

In [1]:
# snake_case names for the twelve columns, assigned positionally below.
wine_columns = [
    'fixed_acidity', 'volatile_acidity', 'citric_acid', 'residual_sugar',
    'chlorides', 'free_sulfur_dioxide', 'total_sulfur_dioxide', 'density',
    'pH', 'sulphates', 'alcohol', 'quality',
]

# Read the red-wine table and swap in the names above.
data = pd.read_csv('./winequality-red.csv')
data.columns = wine_columns
data

NameError: name 'pd' is not defined

## EDA and Exploring features

In [None]:
# Dimensions of the frame as a (rows, columns) tuple.
data.shape

In [None]:
# Check the data type of each column.
data.dtypes

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) per column.
data.describe()

In [None]:
# Per-column non-null counts and dtypes, plus overall memory usage.
data.info()

In [None]:
# Count the missing entries in every column.
data.isna().sum()

We can see that there are no null values in any column.

In [None]:
# Distinct ratings that occur in the quality column.
data['quality'].unique()

In [None]:
# Number of wines recorded at each quality rating.
data['quality'].value_counts()

In [None]:
# Correlation of every column with the wine quality rating.
correlations = data.corr()
correlations.loc[:, 'quality']

<h3>Heat Map of Correlation Matrix</h3>

In [None]:
# Annotated heat map of the pairwise correlation matrix.
# `linewidths` (plural) is seaborn's documented parameter for the lines that
# separate cells; the singular spelling is not part of heatmap's signature and
# only reaches matplotlib through pass-through keyword arguments.
plt.figure(figsize=(10, 10))
sns.heatmap(data.corr(), annot=True, linewidths=0.5, center=0, cmap='coolwarm')
plt.show()

In [None]:
# Quality is an integer score, so use unit-width bins centred on each score
# (edges at score - 0.5 and score + 0.5). A plain `bins=6` produces fractional
# edges, which draws the bars off-centre relative to the integer tick marks.
quality_bin_edges = np.arange(data.quality.min() - 0.5, data.quality.max() + 1.5)
plt.hist(data.quality, bins=quality_bin_edges, alpha=0.5, histtype='bar', ec='black')
plt.title('Distribution of the Quality')
plt.xlabel('Quality')
plt.ylabel('Count')
plt.show()

The above plot shows the distribution of wine quality in the dataset: most of the wines are of average quality, i.e. rated between 5 and 7.

In [None]:
# Spread of pH at each quality rating.
ax = sns.boxplot(data=data, x='quality', y='pH', palette='GnBu_d')
ax.set_title("Boxplot of Quality and pH")
plt.show()

In [None]:
# Spread of alcohol content at each quality rating.
ax = sns.boxplot(data=data, x='quality', y='alcohol', palette='GnBu_d')
ax.set_title("Boxplot of Quality and Alcohol")
plt.show()

The above plot shows that wine quality increases with alcohol content: quality is positively related to the amount of alcohol, so wines with more alcohol tend to be rated better. The pH boxplot, by contrast, shows a slight decrease in pH for the better-rated wines.

In [None]:
# Spread of residual sugar at each quality rating.
ax = sns.boxplot(data=data, x="quality", y="residual_sugar", palette="GnBu_d")
ax.set_title("Boxplot of Quality and residual sugar")
plt.show()

Residual sugar has little visible effect on quality.

In [None]:
# Spread of density at each quality rating.
ax = sns.boxplot(data=data, x="quality", y="density", palette="GnBu_d")
ax.set_title("Boxplot of Quality and Density")
plt.show()

The lower the density of the wine, the better its quality tends to be. From the above boxplot we can see that wine quality increases as density decreases.

In [None]:
# Spread of sulphates at each quality rating.
ax = sns.boxplot(data=data, x="quality", y="sulphates", palette="GnBu_d")
ax.set_title("Boxplot of Quality and Sulphates")
plt.show()

The above plot shows that the quality of the wine increases with the amount of sulphates in it.

In [None]:
# Spread of chlorides at each quality rating.
ax = sns.boxplot(data=data, x="quality", y="chlorides", palette="GnBu_d")
ax.set_title("Boxplot of Quality and Chlorides")
plt.show()

The amount of chlorides in the wine has little visible effect on its quality.

In [None]:
# Spread of citric acid at each quality rating.
ax = sns.boxplot(data=data, x="quality", y="citric_acid", palette="coolwarm")
ax.set_title("Boxplot of Quality and Citric Acid")
plt.show()

The quality of the wine increases with the amount of citric acid in it.

In [None]:
# Spread of volatile acidity at each quality rating.
ax = sns.boxplot(data=data, x="quality", y="volatile_acidity", palette="coolwarm")
ax.set_title("Boxplot of Quality and Volatile Acidity")
plt.show()

The quality of wine increases with the decrease in the amount of volatile acids.

In [None]:
# Spread of fixed acidity at each quality rating.
ax = sns.boxplot(data=data, x="quality", y="fixed_acidity", palette="coolwarm")
ax.set_title("Boxplot of Quality and Fixed Acidity")
plt.show()

Fixed acidity has little effect on the quality of the wine; there is only a slight increase.

## Data preprocessing

In [None]:
# Predictors are every column except the rating; the rating itself is the target.
target = data['quality']
features = data.drop(columns=['quality'])

We will use a random forest to analyse the feature importance.

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Fit a random forest on the predictors purely to rank them by importance.
model = RandomForestRegressor(random_state=1, max_depth=12)
model.fit(features, target)
display(model.feature_importances_)

# Take the tick labels from the frame the model was actually fitted on, so
# names and importances are guaranteed to line up one-to-one. Labelling from
# `data.columns` only worked because the extra `quality` column came last.
feat = features.columns

imp = model.feature_importances_
# Positions of the predictors sorted from least to most important.
indices = np.argsort(imp)

In [None]:
# Horizontal bar chart of the importances, least important at the bottom.
fig, ax = plt.subplots()
bar_positions = range(len(indices))
ax.barh(bar_positions, imp[indices], color='b', align='center')
ax.set_yticks(bar_positions)
ax.set_yticklabels([feat[i] for i in indices])
ax.set_title('Feature imp')
ax.set_xlabel('Relative Importance')
plt.show()

The features that most affect the quality of the wine are alcohol, sulphates and volatile acidity. The least important features are fixed acidity, free sulfur dioxide and citric acid, so these can be removed.

In [None]:
# Remove the three least important predictors.
# drop(..., errors='ignore') returns a new frame and tolerates columns that
# are already gone, so this cell can be re-run safely; the previous in-place
# `del` raised KeyError on a second execution.
low_importance_columns = ['fixed_acidity', 'free_sulfur_dioxide', 'citric_acid']
features = features.drop(columns=low_importance_columns, errors='ignore')
features.head()

In [None]:
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Map the quality ratings onto consecutive class ids starting at 0.
# The target is selected by name rather than by position (`iloc[:, -1]`), so
# the encoding does not silently pick up another column if the column order
# of `data` ever changes.
le = LabelEncoder()
y = le.fit_transform(data['quality'])
# Single-column frame of class ids that shares the row index of `data`.
y = pd.DataFrame(y.reshape(-1, 1), index=data.index)

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
split_parts = train_test_split(
    features,
    y,
    test_size=0.2,
    random_state=0,
)
x_train, x_test, y_train, y_test = split_parts

In [None]:
# One-hot encode the class ids for the Keras model (six classes).
n_quality_classes = 6
y_train_cat, y_test_cat = (
    tf.keras.utils.to_categorical(labels, n_quality_classes)
    for labels in (y_train, y_test)
)

## Tensorflow DNNs

In [None]:
def create_feature_column():
    """Build one numeric TensorFlow feature column per retained predictor.

    Returns:
        list: numeric feature columns, in the order volatile_acidity,
        residual_sugar, chlorides, total_sulfur_dioxide, density, pH,
        sulphates, alcohol.
    """
    predictor_names = (
        'volatile_acidity',
        'residual_sugar',
        'chlorides',
        'total_sulfur_dioxide',
        'density',
        'pH',
        'sulphates',
        'alcohol',
    )
    return [tf.feature_column.numeric_column(name) for name in predictor_names]


feature_column = create_feature_column()

In [None]:
def traineval(x_train, y_train, training=False, batch_size=128, shuffle_buffer_size=1000):
    """Input function that feeds an estimator for training or evaluation.

    Args:
        x_train: Mapping-like table of features (e.g. a DataFrame); it is
            converted with ``dict(...)`` so each key becomes a named input.
        y_train: Labels sliced in step with the features.
        training: When True the dataset is shuffled and repeated
            indefinitely; when False it is iterated once in order.
        batch_size: Number of examples per batch.
        shuffle_buffer_size: Size of the shuffle buffer used in training
            mode. Defaults to the previously hard-coded 1000; pass at least
            the number of rows for a full uniform shuffle.

    Returns:
        A batched ``tf.data.Dataset`` of ``(features_dict, labels)`` pairs.
    """
    # Convert the inputs to a Dataset.
    dataset = tf.data.Dataset.from_tensor_slices((dict(x_train), y_train))

    # Shuffle and repeat only in training mode; evaluation must see each row once.
    if training:
        dataset = dataset.shuffle(shuffle_buffer_size).repeat()

    return dataset.batch(batch_size)


In [None]:
# One numeric feature column per predictor actually fed to the model.
# Iterate over `features`, not `data`: `data` still contains the label
# (`quality`) and the dropped predictors, none of which are present in the
# dictionaries produced by the input function.
my_feature_columns = [
    tf.feature_column.numeric_column(key=key) for key in features.keys()
]

In [None]:
# Estimator-based classifier: three hidden layers of 20 ReLU units each,
# six output classes, no dropout.
dnn = tf.estimator.DNNClassifier(
    feature_columns=feature_column,
    hidden_units=[20, 20, 20],
    activation_fn=tf.nn.relu,
    n_classes=6,
    dropout=None,
)

In [None]:
def train_input_fn():
    """Supply shuffled, repeating training batches to the estimator."""
    return traineval(x_train, y_train, training=True)


# Run 5000 training steps.
dnn.train(input_fn=train_input_fn, steps=5000)

In [None]:
def eval_input_fn():
    """Supply the held-out rows once, in order, without shuffling."""
    return traineval(x_test, y_test, training=False)


# Score the estimator on the test split, then request its predictions for the same rows.
eval_result = dnn.evaluate(input_fn=eval_input_fn)
predictions = dnn.predict(input_fn=eval_input_fn)

In [None]:
# Show the metrics returned by dnn.evaluate on the test split.
print(eval_result)

In [None]:
# Small fully connected classifier: inputs -> 16 -> 8 -> one unit per class.
nn = tf.keras.models.Sequential()
# The input shape must be a tuple; derive the width from the training frame
# instead of hard-coding it.
nn.add(tf.keras.layers.Input(shape=(x_train.shape[1],)))
nn.add(tf.keras.layers.Dense(units=16, activation="relu"))
nn.add(tf.keras.layers.Dense(units=8, activation="relu"))
# Softmax makes the outputs a probability distribution over the mutually
# exclusive quality classes, which is what categorical cross-entropy expects.
# Independent sigmoids do not sum to 1 and are meant for multi-label targets.
nn.add(tf.keras.layers.Dense(units=y_train_cat.shape[1], activation="softmax"))
nn.summary()
nn.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
# The test split doubles as validation data, so the val_* curves are not an
# independent estimate of generalisation.
history = nn.fit(
    x_train,
    y_train_cat,
    batch_size=32,
    epochs=150,
    validation_data=(x_test, y_test_cat),
)



<h2>Model Evaluation</h2>

In [None]:
# Plot the training loss against the validation loss, epoch by epoch.
# The model is compiled with categorical cross-entropy, so the curves are
# labelled as that loss rather than as MAE.
plt.plot(history.history['loss'], label='training data')
plt.plot(history.history['val_loss'], label='validation data')
plt.legend()
plt.title('Loss for model')
plt.ylabel('Categorical cross-entropy loss')
plt.xlabel('epoch')
plt.show()

*Here* we are plotting the training accuracy vs validation accuracy. 

In [None]:
# Training accuracy against validation accuracy, epoch by epoch.
fig, ax = plt.subplots()
accuracy_curves = (
    ('accuracy', 'Accuracy training data'),
    ('val_accuracy', 'Accuracy validation data'),
)
for history_key, legend_text in accuracy_curves:
    ax.plot(history.history[history_key], label=legend_text)
ax.legend()
ax.set_title('Model accuracy')
ax.set_ylabel('Accuracy')
ax.set_xlabel('epoch')
plt.show()

<h2>CONCLUSION</h2>

We can see that the loss is high in the beginning but decreases over the iterations. The training data has a higher loss than the validation data. Also, the model is not properly trained, for reasons such as the small validation set and the biased data; these can make the model appear more robust at test time and lead to higher testing accuracy. The plot likewise shows better accuracy on the validation data than on the training data.