In [55]:
from pathlib import Path
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

# Seed NumPy's global random number generator so draws made from it later
# in the notebook are repeatable.
np.random.seed(1001)

# The data directory is a sibling of the directory the notebook runs from:
# <working directory>/../data
project_root = Path('.').resolve().parents[0]
datadir = project_root / "data"
datadir

PosixPath('/home/mddevine/projects/ml-wine/data')

In [56]:
# Read the interim wine table, report how many rows it has, and preview it.
wine_csv = datadir / "interim" / "wine_df_nice_cols.csv"
df = pd.read_csv(wine_csv)
print(f"{len(df)} observations.")
df.head()

178 observations.


Unnamed: 0,class,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280_over_od315,proline
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [58]:
# One uniform draw per row; rows whose draw is <= 0.75 are flagged as
# training rows, so the split is roughly (not exactly) 75% / 25%.
split_draws = np.random.uniform(0, 1, len(df))
df['is_train'] = split_draws <= .75
df['is_train'].value_counts()

True     142
False     36
Name: is_train, dtype: int64

In [59]:
# Partition the rows on the boolean flag. Each side is an independent copy,
# so later edits to `train` / `test` do not write back into `df`.
in_train = df['is_train']
train = df[in_train].copy()
test = df[~in_train].copy()
print(f"train = {len(train)}, test = {len(test)}")

train = 142, test = 36


In [60]:
# features == all columns except the class (our target) and `is_train`.
# Select them by label rather than by position, so the result does not
# depend on `class` being the first column and `is_train` the last one.
# A missing label raises KeyError instead of silently picking wrong columns.
features = df.columns.drop(['class', 'is_train'])
features

Index(['alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium',
       'total_phenols', 'flavanoids', 'nonflavanoid_phenols',
       'proanthocyanins', 'color_intensity', 'hue', 'od280_over_od315',
       'proline'],
      dtype='object')

In [61]:
# Pull the target column out of each partition: `y` is the training target,
# `target_y` holds the true classes of the held-out rows.
y, target_y = train['class'], test['class']
print(f"dtype = {y.dtype}")
y.value_counts()

dtype = int64


2    55
1    48
3    39
Name: class, dtype: int64

In [62]:
# Keep only the feature columns: remove the target and the split flag
# from both partitions.
non_feature_cols = ['class', 'is_train']
train = train.drop(columns=non_feature_cols)
test = test.drop(columns=non_feature_cols)

In [63]:
# for sgd, we need to scale the data
from sklearn.preprocessing import StandardScaler

# Don't cheat - learn the scaling statistics from the training data only
scaler = StandardScaler()
scaler.fit(train)

# Rebuild DataFrames from the transformed values, restoring the original
# column names AND the original row index. Keeping the index means the scaled
# rows still line up with `y` / `target_y` (which carry the original row
# labels) in any index-aligned pandas operation.
train_normalized = pd.DataFrame(
    scaler.transform(train), columns=train.columns, index=train.index
)
# apply same transformation to test data
test_normalized = pd.DataFrame(
    scaler.transform(test), columns=test.columns, index=test.index
)

In [64]:
from sklearn.linear_model import SGDClassifier

# Linear classifier trained by SGD with hinge loss and an L2 penalty,
# limited to 20 iterations, with shuffling of the training data enabled.
# NOTE(review): no random_state is passed; presumably the shuffling then
# relies on the global NumPy seed set at the top of the notebook - confirm.
sgd_settings = {"loss": "hinge", "penalty": "l2", "max_iter": 20, "shuffle": True}
clf = SGDClassifier(**sgd_settings)
clf.fit(train_normalized, y)

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=20,
              n_iter_no_change=5, n_jobs=None, penalty='l2', power_t=0.5,
              random_state=None, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)

In [65]:
# Classify the held-out rows from their scaled features.
predictions = clf.predict(X=test_normalized)
predictions

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3])

In [66]:
# Confusion table: true classes down the rows, predicted classes across
# the columns.
pd.crosstab(
    index=target_y,
    columns=predictions,
    rownames=['Actual Wine'],
    colnames=['Predicted Wine'],
)

Predicted Wine,1,2,3
Actual Wine,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,11,0,0
2,0,16,0
3,0,0,9


In [67]:
def check_correct(x):
    """Score one row: 1 if its prediction equals its class, otherwise 0.

    Parameters
    ----------
    x : mapping-like row
        Anything indexable by the keys 'prediction' and 'class'.

    Returns
    -------
    int
        1 when x['prediction'] == x['class'], else 0.
    """
    return 1 if x['prediction'] == x['class'] else 0


# Attach the predictions and the true classes to the held-out rows.
# `predictions` is assigned positionally; `target_y` is a Series and is
# aligned on the row index it shares with `test`.
test['prediction'] = predictions
test['class'] = target_y
# Vectorised comparison instead of a row-by-row `apply`: it yields the same
# 0/1 values without a Python call per row, and it also works when `test`
# has no rows.
test['correct'] = (test['prediction'] == test['class']).astype(int)
print(f"accuracy = {round(test['correct'].mean(), 2)}")

accuracy = 1.0
