In [181]:
import pandas as pd

# Load the training table (it carries the 'quality' target used below) and the
# separate test table that the final predictions are made for.
# Both paths are relative to the notebook's working directory.
wine = pd.read_csv('../input/winequalityred/winequality-red.csv')
wine_test = pd.read_csv('../input/winequalityred/winequality-red_test.csv')

In [182]:
# Preview the first rows of the training table (a row sample, not a distribution)
wine.head()

In [183]:
# Preview the first rows of the test table
wine_test.head()

In [184]:
# Per-column summary of the training table, as reported by DataFrame.info()
wine.info()

In [185]:
import seaborn
# Switch matplotlib to seaborn's default look before drawing.
seaborn.set()

# One scatter panel per input feature, each plotted against the quality score.
# The trailing semicolon suppresses the grid object's repr in the cell output.
seaborn.pairplot(
    data=wine,
    x_vars=[
        'fixed acidity',
        'volatile acidity',
        'citric acid',
        'residual sugar',
        'chlorides',
        'free sulfur dioxide',
        'total sulfur dioxide',
        'density',
        'pH',
        'sulphates',
        'alcohol',
    ],
    y_vars=['quality'],
    height=5,
    aspect=1,
);

In [186]:
# Binarise the quality score with two bins: (0, 5] -> 'bad', (5, 10] -> 'good'
# (pd.cut bins are right-closed by default).
# NOTE(review): this overwrites wine['quality'] in place, so re-running the cell
# applies pd.cut to the already-binned labels; use Restart & Run All instead.
bins = (0, 5, 10)
group_names = ['bad', 'good']
wine['quality'] = pd.cut(wine['quality'], bins = bins, labels = group_names)

In [187]:
# Check that 'quality' now holds the bad/good labels
wine.head()

In [188]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Encoder that will turn the quality labels into integer class ids
label_quality = LabelEncoder()

In [189]:
# Replace the labels with the encoder's integer codes; per the original author's
# note, 'bad' becomes 0 and 'good' becomes 1.
wine['quality'] = label_quality.fit_transform(wine['quality'])

In [190]:
# Class balance of the encoded target
wine['quality'].value_counts()

In [191]:
# Features are every column except the target; the target is the encoded quality
X = wine.drop('quality', axis = 1)
y = wine['quality']

In [192]:
# Split the dataset into training and validation parts
# train : val = 9 : 1
# shuffle=False is passed, so the split is not randomised.
# NOTE(review): random_state presumably has no effect while shuffle=False, and an
# unshuffled split is only representative if the rows are not ordered - confirm both.

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.1, random_state = 42, shuffle=False)

In [193]:
# Inspect the training features
X_train

In [194]:
# Inspect the validation features
X_val

In [195]:
# Swap the pandas objects for their underlying .values arrays, which the
# hand-written model below operates on.
# NOTE(review): not idempotent - after one run these names no longer refer to
# pandas objects, so re-running the cell fails on the .values attribute access.
X_train = X_train.values
y_train = y_train.values

X_val = X_val.values
y_val = y_val.values

# Sanity check: feature-matrix shape and target shape of the training split
print(X_train.shape, y_train.shape)

# model

In [196]:
!pip install autograd

In [197]:
from autograd import numpy
from autograd import grad 

In [198]:
def logistic(z):
    """Logistic (sigmoid) function: computes 1 / (1 + exp(-z)).

    Works on scalars and, through numpy.exp, element-wise on arrays.
    """
    exp_neg_z = numpy.exp(-z)
    return 1. / (1. + exp_neg_z)

def logistic_model(X, params):
    """Logistic-regression output for the rows of X.

    params is indexed positionally: params[0] is dotted with X (the weights)
    and params[1] is added to the result (the bias). The sum is passed
    through logistic().
    """
    weights = params[0]
    bias = params[1]
    linear_part = numpy.dot(X, weights) + bias
    return logistic(linear_part)

def log_loss(X, y, params, _lambda=1.0):
    """Summed cross-entropy of the logistic model plus an L2 weight penalty.

    X, y    : inputs and 0/1 targets the model is scored on
    params  : (weights, bias) as consumed by logistic_model
    _lambda : multiplier of the penalty sum(weights ** 2); the bias is not penalised

    1e-15 is added inside both logarithms so that a prediction of exactly
    0 or 1 does not produce log(0).
    """
    eps = 1e-15
    y_pred = logistic_model(X, params)

    # log-likelihood contributions of the positive and negative targets
    positive_term = numpy.dot(y, numpy.log(y_pred+eps))
    negative_term = numpy.dot(1.-y, numpy.log(1.-y_pred+eps))

    penalty = _lambda * numpy.sum(params[0]**2)
    return - (positive_term + negative_term) + penalty

In [199]:
def classify(X, params):
    """Hard 0/1 labels: 1 where the model output is at least 0.5, else 0."""
    return (logistic_model(X, params) >= 0.5).astype(int)

In [200]:
def performance(predictions, answers, beta=1.0):
    """Accuracy of binary predictions against the true labels.

    predictions : array-like of predicted labels (0 or 1)
    answers     : array-like of true labels (0 or 1), same shape as predictions
    beta        : unused; kept so existing callers that pass it keep working

    Returns (TP + TN) / (TP + FN + FP + TN) as a float. Only entries whose
    answer and prediction are both 0 or 1 are counted. If nothing is counted
    (e.g. empty input) the function returns 0.0 instead of dividing by zero.
    """
    # Coerce to arrays: with plain lists `answers == 1` is a single False, which
    # would index element 0 instead of building a mask and give a wrong result.
    predictions = numpy.asarray(predictions)
    answers = numpy.asarray(answers)

    true_idx = (answers == 1)  # the location where the answers are 1
    false_idx = (answers == 0)  # the location where the answers are 0

    # true positive: answers are 1 and predictions are also 1
    n_tp = numpy.count_nonzero(predictions[true_idx] == 1)

    # false positive: answers are 0 but predictions are 1
    n_fp = numpy.count_nonzero(predictions[false_idx] == 1)

    # true negative: answers are 0 and predictions are also 0
    n_tn = numpy.count_nonzero(predictions[false_idx] == 0)

    # false negative: answers are 1 but predictions are 0
    n_fn = numpy.count_nonzero(predictions[true_idx] == 0)

    total = n_tp + n_fn + n_fp + n_tn
    if total == 0:
        return 0.0

    return (n_tp + n_tn) / total

In [201]:
# Gradient function of log_loss with respect to its third positional argument
# (argnum=2, i.e. params); it is called with the same arguments as log_loss.
gradients = grad(log_loss, argnum=2)

In [202]:

# Standardise the features with the per-column mean and standard deviation of
# the training split; the validation split reuses the same mu and sigma.
# NOTE(review): X_train is overwritten, so re-running this cell recomputes
# mu/sigma from already-standardised data, and the later X_test scaling would
# then use those wrong values. A constant column also gives sigma == 0 and a
# division by zero.
mu = numpy.mean(X_train, axis=0)
sigma = numpy.std(X_train, axis=0)

X_train = (X_train - mu) / sigma
X_val = (X_val - mu) / sigma

# PCA

In [203]:
# Optional PCA step, currently disabled (left commented out by the author):
#from sklearn.decomposition import PCA
#pca = PCA()
#pca.n_components = 5
#pca.fit(X_train)
#X_train = pca.transform(X_train)
#X_val = pca.transform(X_val)

# Shape of the training matrix that goes into training
print(X_train.shape)

# training

In [204]:
# Alternative weight initialisations (disabled):
#w = numpy.zeros(X_train.shape[1], dtype=float)
#w = numpy.ones(X_train.shape[1], dtype=float)
# Active choice: seeded random weights, one per feature column; bias starts at 0.
numpy.random.seed(777)
w = numpy.random.rand(X_train.shape[1])
b = 0.

In [205]:
# Gradient descent on the training loss. The loop stops when the relative change
# of the validation loss falls below 1e-5, or after 10000 iterations.
lr = 1e-5

change = numpy.inf

i = 0

# tiny non-zero starting value so the first relative change can be computed
old_val_loss = 1e-15

while change >= 1e-5 and i < 10000:

    # calculate gradients and take one gradient-descent step
    grads = gradients(X_train, y_train, (w, b))
    w -= (grads[0] * lr)
    b -= (grads[1] * lr)

    # validation loss
    val_loss = log_loss(X_val, y_val, (w, b))

    # calculate the relative change in validation loss
    change = numpy.abs((val_loss-old_val_loss)/old_val_loss)

    # update the counter and old_val_loss
    i += 1
    old_val_loss = val_loss

    # print the progress every 100 steps
    if i % 100 == 0:
        print("{}...".format(i), end="")
        print(val_loss)

# The validation accuracy is only reported once, so it is computed here from the
# final parameters instead of being recomputed on every iteration.
pred_labels_val = classify(X_val, (w, b))
score = performance(pred_labels_val, y_val)

print("")
print("")
print("Upon optimization stopped:")
print("    Iterations:", i)
print("    Validation loss:", val_loss)
print("    Validation Accuracy:", score)
print("    Change in validation loss:", change)

In [206]:
# Classify the validation split with the current parameters and report the
# accuracy as a percentage.
pred_labels_val = classify(X_val, (w,b))
perf = performance(pred_labels_val, y_val)

print("Final Accuracy: {:.1f}%".format(perf*100))

# Test

In [207]:
# Scale the test features with the mu/sigma computed from the training split.
# NOTE(review): assumes wine_test holds exactly the feature columns of X, in the
# same order and without a 'quality' column - confirm against the test file.
X_test = wine_test.values

X_test = (X_test - mu) / sigma

In [208]:
#X_test = pca.transform(X_test)

In [209]:
# Predict 0/1 labels for the test rows with the current parameters
pred_labels_test = classify(X_test, (w,b))
print(pred_labels_test)

In [210]:
import csv

print(pred_labels_test)

fields = ['ID','quality']

# Write the predictions as CSV: a header row, then one row per test sample with
# IDs numbered from 1.
# newline='' lets the csv module control line endings itself; without it,
# platforms that translate '\n' on write end up with an extra '\r' per row.
with open('./test_no_pca.csv', 'w', encoding='euc-kr', newline='') as fd:
    writer = csv.writer(fd)
    writer.writerow(fields)
    # enumerate() yields (ID, label) pairs and avoids reusing the global counter `i`
    writer.writerows(enumerate(pred_labels_test, start=1))