In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression, Ridge, LogisticRegression
from sklearn.model_selection import train_test_split, RepeatedKFold, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, explained_variance_score

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

In [3]:
!pip install ucimlrepo
!pip install tensorflow



In [4]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
data = fetch_ucirepo(id=109).data 

In [5]:
data.features.head(3)

Unnamed: 0,Alcohol,Malicacid,Ash,Alcalinity_of_ash,Magnesium,Total_phenols,Flavanoids,Nonflavanoid_phenols,Proanthocyanins,Color_intensity,Hue,0D280_0D315_of_diluted_wines,Proline
0,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185


In [6]:
data.targets.head(3)

Unnamed: 0,class
0,1
1,1
2,1


In [7]:
# Join the features and target back into one dataset
dataset = data.features.join(data.targets)

dataset.head(3)

Unnamed: 0,Alcohol,Malicacid,Ash,Alcalinity_of_ash,Magnesium,Total_phenols,Flavanoids,Nonflavanoid_phenols,Proanthocyanins,Color_intensity,Hue,0D280_0D315_of_diluted_wines,Proline,class
0,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065,1
1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050,1
2,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185,1


In [8]:
dataset.isna().sum()

Alcohol                         0
Malicacid                       0
Ash                             0
Alcalinity_of_ash               0
Magnesium                       0
Total_phenols                   0
Flavanoids                      0
Nonflavanoid_phenols            0
Proanthocyanins                 0
Color_intensity                 0
Hue                             0
0D280_0D315_of_diluted_wines    0
Proline                         0
class                           0
dtype: int64

In [9]:
dataset.describe()

Unnamed: 0,Alcohol,Malicacid,Ash,Alcalinity_of_ash,Magnesium,Total_phenols,Flavanoids,Nonflavanoid_phenols,Proanthocyanins,Color_intensity,Hue,0D280_0D315_of_diluted_wines,Proline,class
count,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0
mean,13.000618,2.336348,2.366517,19.494944,99.741573,2.295112,2.02927,0.361854,1.590899,5.05809,0.957449,2.611685,746.893258,1.938202
std,0.811827,1.117146,0.274344,3.339564,14.282484,0.625851,0.998859,0.124453,0.572359,2.318286,0.228572,0.70999,314.907474,0.775035
min,11.03,0.74,1.36,10.6,70.0,0.98,0.34,0.13,0.41,1.28,0.48,1.27,278.0,1.0
25%,12.3625,1.6025,2.21,17.2,88.0,1.7425,1.205,0.27,1.25,3.22,0.7825,1.9375,500.5,1.0
50%,13.05,1.865,2.36,19.5,98.0,2.355,2.135,0.34,1.555,4.69,0.965,2.78,673.5,2.0
75%,13.6775,3.0825,2.5575,21.5,107.0,2.8,2.875,0.4375,1.95,6.2,1.12,3.17,985.0,3.0
max,14.83,5.8,3.23,30.0,162.0,3.88,5.08,0.66,3.58,13.0,1.71,4.0,1680.0,3.0


In [10]:
dataset['class'].value_counts()

2    71
1    59
3    48
Name: class, dtype: int64

In [11]:
wine_shuffled = dataset.sample(n=len(dataset), random_state=1)

wine_shuffled

Unnamed: 0,Alcohol,Malicacid,Ash,Alcalinity_of_ash,Magnesium,Total_phenols,Flavanoids,Nonflavanoid_phenols,Proanthocyanins,Color_intensity,Hue,0D280_0D315_of_diluted_wines,Proline,class
161,13.69,3.26,2.54,20.0,107,1.83,0.56,0.50,0.80,5.88,0.96,1.82,680,3
117,12.42,1.61,2.19,22.5,108,2.00,2.09,0.34,1.61,2.06,1.06,2.96,345,2
19,13.64,3.10,2.56,15.2,116,2.70,3.03,0.17,1.66,5.10,0.96,3.36,845,1
69,12.21,1.19,1.75,16.8,151,1.85,1.28,0.14,2.50,2.85,1.28,3.07,718,2
53,13.77,1.90,2.68,17.1,115,3.00,2.79,0.39,1.68,6.30,1.13,2.93,1375,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133,12.70,3.55,2.36,21.5,106,1.70,1.20,0.17,0.84,5.00,0.78,1.29,600,3
137,12.53,5.51,2.64,25.0,96,1.79,0.60,0.63,1.10,5.00,0.82,1.69,515,3
72,13.49,1.66,2.24,24.0,87,1.88,1.84,0.27,1.03,3.74,0.98,2.78,472,2
140,12.93,2.81,2.70,21.0,96,1.54,0.50,0.53,0.75,4.60,0.77,2.31,600,3


In [12]:
pd.get_dummies(wine_shuffled['class']).head()

Unnamed: 0,1,2,3
161,0,0,1
117,0,1,0
19,1,0,0
69,0,1,0
53,1,0,0


In [13]:
wine_shuffled.drop('class', axis=1)

Unnamed: 0,Alcohol,Malicacid,Ash,Alcalinity_of_ash,Magnesium,Total_phenols,Flavanoids,Nonflavanoid_phenols,Proanthocyanins,Color_intensity,Hue,0D280_0D315_of_diluted_wines,Proline
161,13.69,3.26,2.54,20.0,107,1.83,0.56,0.50,0.80,5.88,0.96,1.82,680
117,12.42,1.61,2.19,22.5,108,2.00,2.09,0.34,1.61,2.06,1.06,2.96,345
19,13.64,3.10,2.56,15.2,116,2.70,3.03,0.17,1.66,5.10,0.96,3.36,845
69,12.21,1.19,1.75,16.8,151,1.85,1.28,0.14,2.50,2.85,1.28,3.07,718
53,13.77,1.90,2.68,17.1,115,3.00,2.79,0.39,1.68,6.30,1.13,2.93,1375
...,...,...,...,...,...,...,...,...,...,...,...,...,...
133,12.70,3.55,2.36,21.5,106,1.70,1.20,0.17,0.84,5.00,0.78,1.29,600
137,12.53,5.51,2.64,25.0,96,1.79,0.60,0.63,1.10,5.00,0.82,1.69,515
72,13.49,1.66,2.24,24.0,87,1.88,1.84,0.27,1.03,3.74,0.98,2.78,472
140,12.93,2.81,2.70,21.0,96,1.54,0.50,0.53,0.75,4.60,0.77,2.31,600


In [14]:
wine_list = pd.concat([wine_shuffled.drop('class', axis=1), pd.get_dummies(wine_shuffled['class'])], axis=1)

wine_list

Unnamed: 0,Alcohol,Malicacid,Ash,Alcalinity_of_ash,Magnesium,Total_phenols,Flavanoids,Nonflavanoid_phenols,Proanthocyanins,Color_intensity,Hue,0D280_0D315_of_diluted_wines,Proline,1,2,3
161,13.69,3.26,2.54,20.0,107,1.83,0.56,0.50,0.80,5.88,0.96,1.82,680,0,0,1
117,12.42,1.61,2.19,22.5,108,2.00,2.09,0.34,1.61,2.06,1.06,2.96,345,0,1,0
19,13.64,3.10,2.56,15.2,116,2.70,3.03,0.17,1.66,5.10,0.96,3.36,845,1,0,0
69,12.21,1.19,1.75,16.8,151,1.85,1.28,0.14,2.50,2.85,1.28,3.07,718,0,1,0
53,13.77,1.90,2.68,17.1,115,3.00,2.79,0.39,1.68,6.30,1.13,2.93,1375,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133,12.70,3.55,2.36,21.5,106,1.70,1.20,0.17,0.84,5.00,0.78,1.29,600,0,0,1
137,12.53,5.51,2.64,25.0,96,1.79,0.60,0.63,1.10,5.00,0.82,1.69,515,0,0,1
72,13.49,1.66,2.24,24.0,87,1.88,1.84,0.27,1.03,3.74,0.98,2.78,472,0,1,0
140,12.93,2.81,2.70,21.0,96,1.54,0.50,0.53,0.75,4.60,0.77,2.31,600,0,0,1


In [15]:
wine_list = wine_list[['Alcohol','Malicacid','Ash','Alcalinity_of_ash',
                       'Magnesium','Total_phenols','Nonflavanoid_phenols',
                       'Proanthocyanins','Color_intensity','Hue','0D280_0D315_of_diluted_wines',
                       'Proline',1,2,3, 
                       'Flavanoids',]]

wine_list

Unnamed: 0,Alcohol,Malicacid,Ash,Alcalinity_of_ash,Magnesium,Total_phenols,Nonflavanoid_phenols,Proanthocyanins,Color_intensity,Hue,0D280_0D315_of_diluted_wines,Proline,1,2,3,Flavanoids
161,13.69,3.26,2.54,20.0,107,1.83,0.50,0.80,5.88,0.96,1.82,680,0,0,1,0.56
117,12.42,1.61,2.19,22.5,108,2.00,0.34,1.61,2.06,1.06,2.96,345,0,1,0,2.09
19,13.64,3.10,2.56,15.2,116,2.70,0.17,1.66,5.10,0.96,3.36,845,1,0,0,3.03
69,12.21,1.19,1.75,16.8,151,1.85,0.14,2.50,2.85,1.28,3.07,718,0,1,0,1.28
53,13.77,1.90,2.68,17.1,115,3.00,0.39,1.68,6.30,1.13,2.93,1375,1,0,0,2.79
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133,12.70,3.55,2.36,21.5,106,1.70,0.17,0.84,5.00,0.78,1.29,600,0,0,1,1.20
137,12.53,5.51,2.64,25.0,96,1.79,0.63,1.10,5.00,0.82,1.69,515,0,0,1,0.60
72,13.49,1.66,2.24,24.0,87,1.88,0.27,1.03,3.74,0.98,2.78,472,0,1,0,1.84
140,12.93,2.81,2.70,21.0,96,1.54,0.53,0.75,4.60,0.77,2.31,600,0,0,1,0.50


In [16]:
wine_list.rename(columns={1: '1', 2: '2', 3: '3'}, inplace=True)

In [17]:
wine_list.corr()

Unnamed: 0,Alcohol,Malicacid,Ash,Alcalinity_of_ash,Magnesium,Total_phenols,Nonflavanoid_phenols,Proanthocyanins,Color_intensity,Hue,0D280_0D315_of_diluted_wines,Proline,1,2,3,Flavanoids
Alcohol,1.0,0.094397,0.211545,-0.310235,0.270798,0.289101,-0.155929,0.136698,0.546364,-0.071747,0.072343,0.64372,0.647232,-0.726383,0.114941,0.236815
Malicacid,0.094397,1.0,0.164045,0.2885,-0.054575,-0.335167,0.292977,-0.220746,0.248985,-0.561296,-0.36871,-0.192011,-0.205847,-0.295175,0.544042,-0.411007
Ash,0.211545,0.164045,1.0,0.443367,0.286587,0.12898,0.18623,0.009652,0.258887,-0.074667,0.003911,0.223626,0.229268,-0.362457,0.156738,0.115077
Alcalinity_of_ash,-0.310235,0.2885,0.443367,1.0,-0.083333,-0.321113,0.361922,-0.197327,0.018732,-0.273955,-0.276769,-0.440597,-0.519646,0.181764,0.35065,-0.35137
Magnesium,0.270798,-0.054575,0.286587,-0.083333,1.0,0.214401,-0.256294,0.236441,0.19995,0.055398,0.066004,0.393351,0.326171,-0.296972,-0.018306,0.195784
Total_phenols,0.289101,-0.335167,0.12898,-0.321113,0.214401,1.0,-0.449935,0.612413,-0.055136,0.433681,0.699949,0.498115,0.61496,-0.047301,-0.600119,0.864564
Nonflavanoid_phenols,-0.155929,0.292977,0.18623,0.361922,-0.256294,-0.449935,1.0,-0.365845,0.139057,-0.26264,-0.50327,-0.311385,-0.40768,0.011868,0.419347,-0.5379
Proanthocyanins,0.136698,-0.220746,0.009652,-0.197327,0.236441,0.612413,-0.365845,1.0,-0.02525,0.295544,0.519067,0.330417,0.3805,0.056208,-0.465629,0.652692
Color_intensity,0.546364,0.248985,0.258887,0.018732,0.19995,-0.055136,0.139057,-0.02525,1.0,-0.521813,-0.428815,0.3161,0.143221,-0.694679,0.614582,-0.172379
Hue,-0.071747,-0.561296,-0.074667,-0.273955,0.055398,0.433681,-0.26264,0.295544,-0.521813,1.0,0.565468,0.236183,0.323088,0.353213,-0.732443,0.543479


In [18]:
PREDICTIVE_FEATURES = ['Alcohol','Malicacid','Ash','Alcalinity_of_ash',
                       'Magnesium','Total_phenols','Nonflavanoid_phenols',
                       'Proanthocyanins','Color_intensity','Hue','0D280_0D315_of_diluted_wines',
                       'Proline','1','2','3']
TARGET = 'Flavanoids'

In [19]:
X, y = wine_list.values[:,:-1], wine_list.values[:,-1]


In [20]:
### Function from David to reduce features based on correlation
def corr_calc(df, target='', x=''):
    corr = df.corr()[target].abs() > x
    features=[]
    for i in corr.index:
        if corr[i] == True:
            features.append(i)
    features.pop(features.index(target))
    return features

In [21]:
features = corr_calc(wine_list, TARGET, x=0.2)

features

['Alcohol',
 'Malicacid',
 'Alcalinity_of_ash',
 'Total_phenols',
 'Nonflavanoid_phenols',
 'Proanthocyanins',
 'Hue',
 '0D280_0D315_of_diluted_wines',
 'Proline',
 '1',
 '3']

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=42)

In [23]:
from sklearn.metrics import mean_squared_error as mse

lm = LinearRegression().fit(X_train, y_train)
mse(lm.predict(X_train), y_train, squared=False), mse(lm.predict(X), y, squared=False)

(0.3177205740764004, 0.32345234829227204)

In [24]:
from sklearn.neighbors import KNeighborsRegressor

knn = KNeighborsRegressor(n_neighbors=2).fit(X_train, y_train)
mse(knn.predict(X_train), y_train, squared=False), mse(knn.predict(X), y, squared=False)

(0.5321973119748346, 0.5617913763683079)

In [25]:
from sklearn.ensemble import RandomForestRegressor

rfr = RandomForestRegressor(max_depth=12).fit(X_train, y_train)
mse(rfr.predict(X_train), y_train, squared=False), mse(rfr.predict(X), y, squared=False)


(0.14423832078471469, 0.1952640311347703)

In [26]:
from sklearn.ensemble import GradientBoostingRegressor

gbr = GradientBoostingRegressor(n_estimators=30).fit(X_train, y_train)
mse(gbr.predict(X_train), y_train, squared=False), mse(gbr.predict(X), y, squared=False)


(0.16056936972785033, 0.1968213580284825)

In [27]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import *
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.metrics import RootMeanSquaredError
from tensorflow.keras.optimizers import Adam

simple_nn = Sequential()
simple_nn.add(InputLayer((15,)))
simple_nn.add(Dense(2, 'relu'))
simple_nn.add(Dense(1, 'linear'))

opt = Adam(learning_rate=.1)
cp = ModelCheckpoint('models/simple_nn', save_best_only=True)
simple_nn.compile(optimizer=opt, loss='mse', metrics=[RootMeanSquaredError()])
simple_nn.fit(x=X_train, y=y_train, validation_data=(X,y), callbacks=[cp], epochs=100)

Epoch 1/100
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
INFO:tensorflow:Assets written to: models\simple_nn\assets
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
1/5 [=====>........................] - ETA: 0s - loss: 7.8734 - root_mean_squared_error: 2.8060INFO:tensorflow:Assets written to: models\simple_nn\assets
Epoch 13/100
1/5 [=====>........................] - ETA: 0s - loss: 8.0439 - root_mean_squared_error: 2.8362INFO:tensorflow:Assets written to: models\simple_nn\assets
Epoch 14/100
1/5 [=====>........................] - ETA: 0s - loss: 5.7927 - root_mean_squared_error: 2.4068INFO:tensorflow:Assets written to: models\simple_nn\assets
Epoch 15/100
1/5 [=====>........................] - ETA: 0s - loss: 6.08

<tensorflow.python.keras.callbacks.History at 0x134c7ade370>