# Comments

In [None]:
# The entire notebook should take less than
# roughly 20 s to run under the default settings.

# Import appropriate packages and set analysis options.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('ggplot') 
import seaborn as sns
sns.set(color_codes=True)
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score, train_test_split
from math import sqrt

# Controls whether seeded operations in this
# notebook are reproducible. When True, the
# fixed seed is discarded and a fresh random
# state is used on every run. When False, the
# seed stays fixed at a specified value so
# that the notebook can be rerun with the
# same randomized variables (see MAGIC_SEED
# below).
randomize_seeding = False

# Define convenient variables and functions.

In [None]:
# This path indicates from where the wine
# data will be loaded.
DATA_PATH = "./data/wine.data"

# The MAGIC_SEED optionally specifies a fixed
# random state/seed so that the notebook can be
# rerun with the same randomized variables (see
# randomize_seeding above).
MAGIC_SEED = 1776
if randomize_seeding:
    # np.random.seed() returns None, so its result
    # must not be mistaken for a seed value. Reseed
    # NumPy's global generator from fresh entropy
    # and state the intent explicitly: a random
    # state of None makes scikit-learn draw from
    # that global generator instead of a fixed seed.
    np.random.seed()
    MAGIC_SEED = None

# The training data provided with this data
# set will be split into two subsets so that
# models can be trained on the first and tested
# on the second. TRAINING_DATA_TEST_SIZE
# indicates the proportion of the training
# data that will be used as test data for
# model evaluation. It is passed to
# train_test_split as a float test_size, so it
# must lie strictly between 0.0 and 1.0 (both
# end points are rejected).
TRAINING_DATA_TEST_SIZE = 0.25

def load_data(path, header_names):
    """
    Read the header-less CSV file at 'path' into a
    Pandas DataFrame whose columns are labelled
    with 'header_names', print the number of rows
    and columns that were loaded, and return the
    DataFrame.
    """
    frame = pd.read_csv(path, names=header_names, header=None)
    n_rows, n_cols = frame.shape
    print("Loaded data dimensions: ", n_rows, "rows, ", n_cols, "columns")
    return frame

def print_nan(nan_cols_counts, col_type):
    """
    Print a short report for a list of
    (feature name, NaN count) entries: a summary
    line naming 'col_type' and the number of
    entries, then (only when the list is non-empty)
    a table header followed by one numbered row
    per entry.
    """
    total = len(nan_cols_counts)
    print("\n", total, " ", col_type, "-type columns with NaN values.", sep='')
    if total > 0:
        # Table header; skipped when there is nothing to list.
        print("    {:<16}{}".format("Feature", "NaN Count"))
        print("    ---------------------")
    # Rows are numbered starting at 1.
    for position, entry in enumerate(nan_cols_counts, start=1):
        print("{:>2}. {:<16}{}".format(position, entry[0], entry[1]))
        
def gather_nan(df, col_type, print_if_nan = True):
    """
    Find all DataFrame columns of type 'col_type'
    which contain NaN values.

    'col_type' selects the dtype family to scan:
    "int" and "float" select those dtypes, and any
    other value selects object-dtype columns.

    Returns a list of (column name, NaN count)
    tuples, ordered by column name, for the
    selected columns that hold at least one NaN.
    When 'print_if_nan' is true the result is also
    reported through print_nan().
    """
    if col_type == "int":
        dtype_name = 'int'
    elif col_type == "float":
        dtype_name = 'float'
    else:
        dtype_name = 'object'
    columns = df.select_dtypes(include=[dtype_name]).columns

    nan_cols_counts = []
    for col in np.sort(columns):
        num_nan = df[col].isnull().sum()
        if num_nan > 0:
            nan_cols_counts.append((col, num_nan))

    # Test the caller's flag, not the print_nan function
    # object: a function is always truthy, which made it
    # impossible to turn the report off.
    if print_if_nan:
        print_nan(nan_cols_counts, col_type)
    return nan_cols_counts

def evaluate_model(x_data, y_data, model):
    """
    Fit 'model' on a training subset of the data,
    predict the held-out subset, print several
    goodness-of-fit figures, and draw a bar chart
    comparing the true and predicted counts of
    classes 1, 2, and 3. Afterwards the model is
    refit on all of the data.

    Returns the accuracy score of the predictions
    made for the held-out subset.
    """
    # Hold out TRAINING_DATA_TEST_SIZE of the samples,
    # fit on the remainder, and predict the held-out part.
    x_train, x_test, y_train, y_test = train_test_split(
        x_data, y_data,
        random_state=MAGIC_SEED,
        test_size=TRAINING_DATA_TEST_SIZE)
    model.fit(x_train, y_train)
    y_test_pred = model.predict(x_test)

    # Report the 5-fold cross-validation scores on the
    # training subset, then the RMS error and the R^2
    # value (printed as "Variance score") of the
    # held-out predictions.
    train_pct = 100.0*(1.0-TRAINING_DATA_TEST_SIZE)
    test_pct = 100.0*TRAINING_DATA_TEST_SIZE
    print("When using %0.1f%% of the data to perform the"
        " fit and %0.1f%% of the data to make the prediction,"
        " the model performed according to the following:"
        % (train_pct, test_pct))
    fold_scores = cross_val_score(model, x_train, y_train, cv=5)
    print(fold_scores)
    rms_error = sqrt(mean_squared_error(y_test, y_test_pred))
    print("RMS Error: %.3f" % rms_error)
    print('Variance score: %.3f' % r2_score(y_test, y_test_pred))

    # Bar heights alternate true count / predicted count
    # for each of the class labels 1, 2, and 3.
    true_labels = y_test.tolist()
    pred_labels = y_test_pred.tolist()
    heights = []
    for label in (1, 2, 3):
        heights.append(true_labels.count(label))
        heights.append(pred_labels.count(label))
    positions = np.arange(6)
    plt.bar(positions, heights,
        color=("green", "cyan", "red", "magenta", "black", "gray"))
    plt.xticks(positions, ("True Class1", "Pred. Class1", "True Class2", "Pred. Class2",
        "True Class3", "Pred. Class3"))
    plt.title('k = %d' % model.get_params()['n_neighbors'])
    plt.show()

    # Leave the caller with a model trained on every sample.
    model.fit(x_data, y_data)

    # The score still refers to the held-out predictions
    # made before the refit above.
    return metrics.accuracy_score(y_test, y_test_pred)

# Load and preview the wine data.

In [None]:
# The data file has no header row, so the column labels
# are supplied here: the class label comes first and the
# remaining columns are the features.
df = load_data(
    DATA_PATH,
    [
        'Class', 'Alcohol', 'Acid', 'Ash', 'Alcalinity',
        'Magnesium', 'TotPhenols', 'Flavanoids',
        'NonflavPhenols', 'Proanth', 'ColIntensity',
        'Hue', 'OD', 'Proline',
    ],
)
df.head()

In [None]:
# Display summary statistics for the loaded columns.
df.describe()

In [None]:
# The data has several features with min or max values
# beyond three standard deviations from the mean - usually
# a good indicator of outliers.

# Clean and process the data.

In [None]:
# Split the table column-wise: the first column
# holds the targets (Y-data) and every remaining
# column is a feature (X-data).
y_data = df.iloc[:, 0]
x_data = df.iloc[:, 1:]

# Scan the features for missing values, one dtype
# family at a time (int, then float, then string).
nan_int_cols, nan_float_cols, nan_string_cols = (
    gather_nan(x_data, dtype_family)
    for dtype_family in ("int", "float", "string")
)

### The data is already tidy!

# Perform k-Nearest Neighbor classification by varying k and finding the optimal value.

In [None]:
# Sweep k from 1 through 49, keeping every fitted
# model together with the accuracy score it earned.
k_values = list(range(1, 50))
models, accuracy_scores = [], []
for k in k_values:
    model = KNeighborsClassifier(n_neighbors=k)
    accuracy_score = evaluate_model(x_data, y_data, model)
    accuracy_scores += [accuracy_score]
    models += [model]

In [None]:
# Accuracy score as a function of the neighbor count k.
plt.plot(k_values, accuracy_scores)
plt.gca().set(xlabel='# Nearest Neighbors [k]', ylabel='Accuracy Score')
plt.show()

### The k=13/14 case seems to perform the best.

# Evaluate the k=13 model by seeing how well it classifies the full data set.

In [None]:
# The value of k whose model will be evaluated.
chosen_model = 13
# Locate the model through k_values rather than assuming
# that position k-1 holds the model for k; the lookup
# stays correct if the range of k values is changed.
model = models[k_values.index(chosen_model)]
y_pred = model.predict(x_data)
# NOTE: evaluate_model() refits each model on the full
# data set before returning, so this score is measured on
# data the model was trained on and is likely optimistic.
metrics.accuracy_score(y_data, y_pred)