In [1]:
#Importing required packages.
import time
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
%matplotlib inline

  from numpy.core.umath_tests import inner1d


In [2]:
#Loading dataset
wine = pd.read_csv('/home/xtian209/Documents/Machine_Learning/workspace/red_wine_quality/dataset/winequality-red.csv')

In [3]:
# wine['fixed acidity'] = np.log(wine['fixed acidity'])

In [4]:
# Look at the values within the data set
wine[:10]

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
5,7.4,0.66,0.0,1.8,0.075,13.0,40.0,0.9978,3.51,0.56,9.4,5
6,7.9,0.6,0.06,1.6,0.069,15.0,59.0,0.9964,3.3,0.46,9.4,5
7,7.3,0.65,0.0,1.2,0.065,15.0,21.0,0.9946,3.39,0.47,10.0,7
8,7.8,0.58,0.02,2.0,0.073,9.0,18.0,0.9968,3.36,0.57,9.5,7
9,7.5,0.5,0.36,6.1,0.071,17.0,102.0,0.9978,3.35,0.8,10.5,5


In [5]:
# Descriptive distribution of the data in each column
# print("Data shape: (" + str(wine.shape[0]) + ", " + str(wine.shape[1]) + ")\n")
round(wine.describe(), 3)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.32,0.528,0.271,2.539,0.087,15.875,46.468,0.997,3.311,0.658,10.423,5.636
std,1.741,0.179,0.195,1.41,0.047,10.46,32.895,0.002,0.154,0.17,1.066,0.808
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99,2.74,0.33,8.4,3.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.996,3.21,0.55,9.5,5.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.997,3.31,0.62,10.2,6.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.998,3.4,0.73,11.1,6.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.004,4.01,2.0,14.9,8.0


<h2>Column Information</h2>
<p>The list below illustrates information with regards to how each column affects the wine's taste.</p>
<ul>
    <li>Fixed Acidity: acids that is known for the sourness or tartness that is a fundamental feature in wine taste. Predominant fixed acids in wine are tartaric, malic, citric, and succinic.</li>
    <li>Volatile Acidity: otherwise known as "wine spoilage," is a property known to determine the strength of the vinegar taste in the wine. This is due to the creatin of acetic acid in the fermentation process.</li>
    <li>Citric Acid: acid that is used as a flavor additive to the wine. However, it may cause the creation of more acetic acid and can cause the growth of unwanted microbes.</li>
    <li>Residual Sugar: measure of the amount of sugar solids in a given volume of wine following the end of fermentation and any sugar addition of the wine creation process. This is dependent on the type of wine that one is making. Sweetness in dry wine such as Chardonnay is considered a wine fault while a dessert wine such as Muscato is good.</li>
    <li>Chlorides: the amount of salty taste that exists in the wine</li>
    <li>Free Sulfur Dioxide: used as a preservative in wine and a cleaning agent for barrels and winery facilities. Free sulfur dioxide and pH levels have some relations to each other.</li>
    <li>Density: has more to do with the looks of the wine. However studies show a strong correlation between density of a wine's color and the total tannins, which makes wine dry, in the wine. This also is used to measure percent alcohol and sugar content.</li>
    <li>pH: due to the acidity contents, wine should be towards the acidic side of the pH scale.</li>
    <li>Sulphates: used for the preservation of wine and slows chemical reactions, letting wine last longer.</li>
    <li>Alcohol: the percentage of alcohol in the wine (ABV)</li>
</ul>

<p>Sources:
    <ul>
        <li>http://waterhouse.ucdavis.edu/whats-in-wine/fixed-acidity</li>
        <li>http://waterhouse.ucdavis.edu/whats-in-wine/volatile-acidity</li>
        <li>http://www.calwineries.com/learn/wine-chemistry/wine-acids/citric-acid</li>
        <li>https://winemakermag.com/501-measuring-residual-sugar-techniques</li>
        <li>http://wineoscope.com/2015/10/02/when-a-wine-is-salty-and-why-it-shouldnt-be/</li>
        <li>https://winobrothers.com/2011/10/11/sulfur-dioxide-so2-in-wine/</li>
        <li>https://www.winewordswisdom.com/winetastingtips.html</li>
        <li>https://winefolly.com/review/understanding-acidity-in-wine/</li>
        <li>https://winefolly.com/tutorial/sulfites-in-wine/</li>
        <li>You should know...</li>
    </ul>
        
</p>

<h2>Data Preprocessing</h2>

In [6]:
# Create a new column that splits the data set based on whether the quality is good (>= 7) or bad (< 7)
wine['category'] = wine['quality'].apply(lambda value: 0 if value < 7 else 1)

In [7]:
wine[:10]

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,category
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,0
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,0
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,0
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,0
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,0
5,7.4,0.66,0.0,1.8,0.075,13.0,40.0,0.9978,3.51,0.56,9.4,5,0
6,7.9,0.6,0.06,1.6,0.069,15.0,59.0,0.9964,3.3,0.46,9.4,5,0
7,7.3,0.65,0.0,1.2,0.065,15.0,21.0,0.9946,3.39,0.47,10.0,7,1
8,7.8,0.58,0.02,2.0,0.073,9.0,18.0,0.9968,3.36,0.57,9.5,7,1
9,7.5,0.5,0.36,6.1,0.071,17.0,102.0,0.9978,3.35,0.8,10.5,5,0


In [8]:
# Changing from DataFrame to matrix and column
X = wine.iloc[:, :-2].values
y = wine.iloc[:, -1].values

In [9]:
# Train, Cross Validation, and Test Split
#np.random.seed(0)
# np.random.randint(low = 0, high = 50)
# print(seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)
X_train, X_cross, y_train, y_cross = train_test_split(X_train, y_train, test_size =  0.3)

In [10]:
# Standardize Data Set
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.fit_transform(X_test)
X_cross = sc.fit_transform(X_cross)

<h2>Principal Component Analysis (PCA) and K-Means Clustering</h2>

In [None]:
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

In [None]:
reduced_data = PCA(n_components = 2).fit_transform(X_train)

In [None]:
print(reduced_data)

In [None]:
fig, ax = plt.subplots(figsize = (20, 6))
sns.set_style('dark')
fig.suptitle('PCA', fontsize = 18)
sns.scatterplot(x = reduced_data[:, 0], y = reduced_data[:, 1], hue = y_train)
plt.locator_params(axis = 'x', nbins = 7)

In [None]:
kmeans = KMeans(n_clusters = 2, n_init = 100).fit(reduced_data)
centroids = kmeans.cluster_centers_
print(centroids)

In [None]:
fig, ax = plt.subplots(figsize = (20, 6))
sns.set_style('dark')
fig.suptitle('PCA', fontsize = 18)
sns.scatterplot(x = reduced_data[:, 0], y = reduced_data[:, 1], hue = y_train)
plt.locator_params(axis = 'x', nbins = 7)
plt.scatter(centroids[:, 0], centroids[:, 1], marker = 'x', s = 500, linewidths = 3, color = 'red')

In [None]:
del kmeans

<h2>Mini Batch K-Means Clustering</h2>

In [11]:
from sklearn.cluster import MiniBatchKMeans
from sklearn import metrics
from time import time

In [12]:
kmeans = MiniBatchKMeans(n_clusters = 2, n_init = 100, batch_size = 256).fit(X_train)

In [None]:
def k_means_benchmark():
    t0 = time()
    #estimator.fit(data)
    print('init\t\ttime\tinertia\thomo\tcompl\tv-meas\tARI\tAMI\tsilhouette')

In [13]:
print(X_train[:10,0])
print(X_train[:10,1])
print(y_train[:10])
print(kmeans.cluster_centers_[0])
print(kmeans.cluster_centers_[1])

[ 1.84932540e+00 -6.33538301e-01 -5.18056268e-01 -7.49020333e-01
 -5.61281387e-02 -3.44833220e-01  2.36899454e+00  1.61287755e-03
  1.74835926e-01 -4.60315252e-01]
[-0.46367338 -0.97306904  0.27212037  2.50780135 -0.12407626 -0.2372753
 -1.02966856  0.15892133 -0.2372753  -0.85987001]
[0 1 0 0 0 0 0 0 0 0]
[-0.48668229  0.31935685 -0.50386172 -0.04037364 -0.14513611  0.20219887
  0.20857847 -0.24365132  0.36145199 -0.24553686 -0.10054695]
[ 1.01845307 -0.66058668  1.02791872  0.12560581  0.24107312 -0.33895052
 -0.32103252  0.54134782 -0.78371841  0.49804223  0.15936607]


In [None]:
# fig, ax = plt.subplots(figsize = (20, 6))
# sns.set_style('dark')
# fig.suptitle('Fixed Acidity vs. Volatile Acidity', fontsize = 18)
# sns.scatterplot(x = X_train[:, 0], y = X_train[:, 1], hue = y_train[:])
# plt.locator_params(axis = 'x', nbins = 7)
# plt.scatter(kmeans.cluster_centers_[0], kmeans.cluster_centers_[1], marker = 'x', s = 500, linewidths = 3, color = 'red')

In [14]:
labels_true = y_train
labels_pred = kmeans.fit_predict(X_train)

print("Adjusted Rand Index:", metrics.adjusted_rand_score(labels_true, labels_pred))
print("Adjsuted Mutual Information:", metrics.adjusted_mutual_info_score(labels_true, labels_pred))
print("Homogenity:", metrics.homogeneity_score(labels_true, labels_pred))
print("Completeness:", metrics.completeness_score(labels_true, labels_pred))


Adjusted Rand Index: 0.10634088296351418
Adjsuted Mutual Information: 0.046048710541179844
Homogenity: 0.07586874064168166
Completeness: 0.046879663149107825


In [None]:
# correct = 0
# for i in range(len(X_train)):
#     if (y_train[i] == kmeans.labels_[i]):
#         correct += 1
# print("Number of correct guesses in training set: {} out of {}, which is {}".format(correct, len(X_train), round(correct/len(X_train), 5)))
# print("Number of incorrect guesses in training set: {} out of {}, which is {}".format(len(X_train) - correct, len(X_train), round((len(X_train) - correct)/len(X_train), 5)))

In [None]:
kmeans.cluster_centers_

In [None]:
kmeans.get_params

In [None]:
del kmeans

<h2>Random Forest Algorithm</h2>

In [None]:
# initial = time.clock()
random_forest = RandomForestClassifier(n_estimators = 275, criterion = 'entropy', max_depth = 15, min_samples_split = 3, max_features = 3, max_leaf_nodes = 23)
random_forest.fit(X_train, y_train)
pred_random_forest = random_forest.predict(X_cross)
pred_test_forest = random_forest.predict(X_test)
# print(str((time.clock() - initial) * 100) + " ms")
# , max_depth = 15, min_samples_split = 3

In [None]:
print("Accuracy on the training subset: {:.5f}".format(random_forest.score(X_train, y_train)))
print("Accuracy on the cross validation subset: {:.5f}".format(random_forest.score(X_cross, y_cross)))
print("Accuracy on the test subset: {:.5f}".format(random_forest.score(X_test, y_test)))

In [None]:
from sklearn.metrics import confusion_matrix
con_mat = confusion_matrix(y_cross, pred_random_forest)
print("Below is the confusion matrix results for the cross validation set:")
print(con_mat)
# row: prediction
# column : cross validation
#                 Actual Zero   Actual One
# Predicted Zero
# Predicted One

print("\nBelow is the classification report for the cross validation set:")
print(classification_report(y_cross, pred_random_forest))


con_mat2 = confusion_matrix(y_test, pred_test_forest)
print("\n\nBelow is the confusion matrix results for the test set:")
print(con_mat2)
# row: prediction
# column : cross validation
#                 Actual Zero   Actual One
# Predicted Zero
# Predicted One

print("\nBelow is the classification report for the test set:")
print(classification_report(y_test, pred_test_forest))

In [None]:
random_forest.get_params()

In [None]:
# from sklearn.model_selection import RandomizedSearchCV

In [None]:
# n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# max_features = ['auto', 'sqrt']
# max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
# max_depth.append(None)
# min_samples_split = [2, 5, 10]
# min_samples_leaf = [1, 2, 4]
# bootstrap = [True, False]

In [None]:
# random_grid = {'n_estimators': n_estimators,
#                'max_features': max_features,
#                'max_depth': max_depth,
#                'min_samples_split': min_samples_split,
#                'min_samples_leaf': min_samples_leaf,
#                'bootstrap': bootstrap}

In [None]:
# print(random_grid)

In [None]:
# from sklearn.ensemble import RandomForestRegressor

# # Use the random grid to search for best hyperparameters
# # First create the base model to tune
# rf = RandomForestRegressor()
# # Random search of parameters, using 3 fold cross validation, 
# # search across 100 different combinations, and use all available cores
# rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# # Fit the random search model
# rf_random.fit(X_train, X_test)

In [None]:
# rf_random.best_params_

<h2>Deep Neural Network</h2>

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
# from keras.optimizers import SGD

In [None]:
model = Sequential()
model.add(Dense(512, input_dim = 11, activation = 'relu'))
model.add(Dropout(0.5))
model.add(Dense(256, activation = 'relu'))
model.add(Dropout(0.5))
model.add(Dense(128, activation = 'relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation = 'sigmoid'))
model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
model.summary()
# scores = model.evaluate(X_cross, y_cross)

In [None]:
model.fit(X_train, y_train, batch_size = 64, verbose = 2, epochs = 100, validation_data = (X_cross, y_cross))

In [None]:
scores = model.evaluate(X_test, y_test)
print("Test Loss: " + str(scores[0]) + "\nTest Accuracy: " + str(scores[1]))

In [None]:
predictions = model.predict(X_test)
print("First prediction: ", predictions[0])