<a href="https://colab.research.google.com/github/danielegrattarola/ml-18-19/blob/master/04_model_selection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Cross-validation

In [0]:
# Load the red-wine quality data, build a binary target, split off a test set and
# standardize the features.
# numpy and matplotlib are imported here because this cell (and the later ones) use
# `np` and `plt`; without these imports a fresh kernel fails with NameError.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'
data = pd.read_csv(url, delimiter=';')

# Extract features: every column except the last one.
# astype(float) yields a fresh, writable float array, so nothing below writes into
# (or is blocked by) the DataFrame's own buffer.
x = data[data.columns[:-1]].values.astype(float)

# Extract targets: 1 when quality >= 6, 0 otherwise
quality = data['quality'].values
y = (quality >= 6).astype(int)

# Split the data into training and test (10% test, class proportions preserved)
x, x_test, y, y_test = train_test_split(x, y, test_size=0.1, stratify=y)

# Normalize features with statistics computed on the training split only, then
# apply the same shift/scale to the test split, so no test information leaks
# into the preprocessing.
x_mean = np.mean(x, axis=0)
x_std = np.std(x, axis=0)
x = (x - x_mean) / x_std
x_test = (x_test - x_mean) / x_std

print('Training set size: {}'.format(x.shape[0]))
print('Test set size: {}'.format(x_test.shape[0]))

print('\nX train')
print(x)

print('\nY train')
plt.hist(y);

In [0]:
# Stratified K-fold cross-validation of logistic regression on the training data
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression

k = 10
kf = StratifiedKFold(n_splits=k, shuffle=True)

scores_lr = []  # validation accuracy of each fold, in fold order
for train_indices, val_indices in kf.split(x, y):
    print('###################################################################')
    print('Training set size:   {}'.format(len(train_indices)))
    print('Validation set size: {}'.format(len(val_indices)))

    # Slice out this fold's training and validation portions
    x_train, y_train = x[train_indices], y[train_indices]
    x_val, y_val = x[val_indices], y[val_indices]

    # Solver is passed explicitly to disable the warning
    model = LogisticRegression(solver='liblinear').fit(x_train, y_train)

    score = model.score(x_val, y_val)
    print('Accuracy: {}'.format(score))
    scores_lr.append(score)

In [0]:
# Cross-validation estimate: mean and spread of the per-fold accuracies
acc_mean, acc_std = np.mean(scores_lr), np.std(scores_lr)
print('Expected accuracy: {:.2f} +- {:.2f}'.format(acc_mean, acc_std))

# Held-out check: refit on the full training set and score on the test set.
# Solver is passed explicitly to disable the warning.
model = LogisticRegression(solver='liblinear').fit(x, y)
score = model.score(x_test, y_test)
print('Actual accuracy:   {:.3f}'.format(score))

## Classification and regression trees

In [0]:
# Stratified K-fold cross-validation of a decision tree on the training data
from sklearn.tree import DecisionTreeClassifier

k = 10
kf = StratifiedKFold(n_splits=k, shuffle=True)

scores_dt = []  # validation accuracy of each fold, in fold order
for train_indices, val_indices in kf.split(x, y):
    print('###################################################################')
    print('Training set size:   {}'.format(len(train_indices)))
    print('Validation set size: {}'.format(len(val_indices)))

    # Slice out this fold's training and validation portions
    x_train, y_train = x[train_indices], y[train_indices]
    x_val, y_val = x[val_indices], y[val_indices]

    # Tree with default hyper-parameters, fitted on this fold only
    model = DecisionTreeClassifier().fit(x_train, y_train)

    score = model.score(x_val, y_val)
    print('Accuracy: {}'.format(score))
    scores_dt.append(score)

In [0]:
# Cross-validation estimate: mean and spread of the per-fold accuracies
acc_mean, acc_std = np.mean(scores_dt), np.std(scores_dt)
print('Expected accuracy: {:.2f} +- {:.2f}'.format(acc_mean, acc_std))

# Held-out check: refit a default decision tree on the full training set
# and score it on the test set
model = DecisionTreeClassifier().fit(x, y)
score = model.score(x_test, y_test)
print('Actual accuracy:   {:.3f}'.format(score))

## Random forests

In [0]:
# Stratified K-fold cross-validation of a random forest, followed by the
# held-out evaluation on the test set
from sklearn.ensemble import RandomForestClassifier

k = 10
kf = StratifiedKFold(n_splits=k, shuffle=True)

scores_rf = []  # validation accuracy of each fold, in fold order
for train_indices, val_indices in kf.split(x, y):
    print('###################################################################')
    print('Training set size:   {}'.format(len(train_indices)))
    print('Validation set size: {}'.format(len(val_indices)))

    # Slice out this fold's training and validation portions
    x_train, y_train = x[train_indices], y[train_indices]
    x_val, y_val = x[val_indices], y[val_indices]

    # Forest of 100 trees, fitted on this fold only
    model = RandomForestClassifier(n_estimators=100).fit(x_train, y_train)

    score = model.score(x_val, y_val)
    print('Accuracy: {}'.format(score))
    scores_rf.append(score)

print('###################################################################')
# Cross-validation estimate: mean and spread of the per-fold accuracies
acc_mean, acc_std = np.mean(scores_rf), np.std(scores_rf)
print('Expected accuracy: {:.2f} +- {:.2f}'.format(acc_mean, acc_std))

# Held-out check: refit a 100-tree forest on the full training set
# and score it on the test set
model = RandomForestClassifier(n_estimators=100).fit(x, y)
score = model.score(x_test, y_test)
print('Actual accuracy:   {:.3f}'.format(score))

## Compare model performance


In [0]:
# Compare the three models using their per-fold cross-validation accuracies.
print(scores_lr)
print(scores_dt)
print(scores_rf)

# Per-model score lists keyed by the short label used in the printed report
cv_scores = {'LR': scores_lr, 'DT': scores_dt, 'RF': scores_rf}

# Compare model averages
print()
for name, scores in cv_scores.items():
    acc_mean = np.mean(scores)
    acc_std = np.std(scores)
    print('{} mean expected accuracy: {:.2f} +- {:.2f}'.format(name, acc_mean, acc_std))

# Compute t-test
from scipy.stats import ttest_ind

# Two-sample t-test for every pair of models. The reported number is the p-value
# under the null hypothesis that both models have the same mean accuracy: a small
# value is evidence against equal means. It is NOT the probability that the means
# are equal, so the message states the hypothesis instead of calling it a probability.
print()
for name_a, name_b in [('LR', 'DT'), ('LR', 'RF'), ('DT', 'RF')]:
    _, p_val = ttest_ind(cv_scores[name_a], cv_scores[name_b])
    print('p-value for H0 "{} and {} have the same mean accuracy": {:.4f}'.format(name_a, name_b, p_val))
