# Approach 1

In [16]:
import os
import numpy as np
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix, precision_score, recall_score 
from matplotlib import pyplot as plt


# Geometry constants used to describe one sample.
tile_size = 19
feature_count = 3
# Nominal length of one flattened sample: 19 * 19 * 3 = 1083.
# NOTE(review): the arrays loaded further down report 1080 columns, not 1083 -
# confirm which of the two is the intended vector length.
input_count = feature_count * tile_size ** 2
# Root folder of all dataset files. Kept as a plain string with a trailing slash
# because paths are built from it by string concatenation.
DATA_FILES_ROOT = "../Model/DataFiles/"

In [3]:
def load_data(dataset_name: str) -> np.ndarray:
    """
    Load data files for the given dataset.

    Every file found in ``<DATA_FILES_ROOT><dataset_name>_data/`` is parsed as
    comma-separated numbers (its first line is skipped) and flattened into one
    row of the result.

    The directory listing is sorted by file name. ``os.listdir`` returns entries
    in arbitrary, platform-dependent order, while callers pair row ``i`` of the
    result with row ``i`` of the dataset's truth file, so an unsorted listing can
    silently misalign inputs and labels.
    NOTE(review): confirm that sorted file-name order is the row order of the
    truth file (it is when the file names are zero-padded indices).

    :param dataset_name: Name of the data set.
    :return: np.ndarray with one flattened row per data file, in sorted
        file-name order; an empty array when the directory holds no files.
    """
    data_path = os.path.join(DATA_FILES_ROOT, dataset_name + "_data")
    # Sort for a deterministic row order that does not depend on the filesystem.
    data_files = sorted(os.listdir(data_path))
    data_array = []
    for i, file in enumerate(data_files):
        # Progress message every 1000 files.
        if i % 1000 == 0:
            print(f"loading {i}. file")
        data = np.loadtxt(os.path.join(data_path, file), skiprows=1, delimiter=',')
        data_array.append(data.flatten())
    return np.array(data_array)


def load_labels(dataset_name: str) -> np.ndarray:
    """
    Load data labels for the given dataset.

    Labels are read from the third column of
    ``<DATA_FILES_ROOT><dataset_name>_truth.csv`` (comma separated, no header
    line is skipped).

    :param dataset_name: Name of the data set.
    :return: 1-D np.ndarray of the data labels, one per row of the truth file.
    """
    labels_file = os.path.join(DATA_FILES_ROOT, dataset_name + "_truth.csv")
    # ndmin=2 keeps a file with a single row two-dimensional; without it loadtxt
    # returns a 1-D array and per-row column access fails.
    table = np.loadtxt(labels_file, delimiter=',', ndmin=2)
    if table.size == 0:
        # Empty truth file: no labels.
        return np.array([])
    # Copy so the result does not keep the whole parsed table alive.
    return table[:, 2].copy()

def load_datasets(datasets: list) -> "tuple[np.ndarray, np.ndarray]":
    """
    Load both the input data vectors and labels of the datasets with the given names.

    The datasets are loaded in the given order and stacked along the first axis,
    so row ``i`` of the inputs belongs to entry ``i`` of the labels.

    :param datasets: List with the dataset names.
    :return: tuple of np.ndarrays, one with the input vectors, second with labels.
        Both are empty arrays when no dataset name is given.
    :raises ValueError: if a dataset yields a different number of input vectors
        than labels.
    """
    data_parts = []
    label_parts = []
    for dataset in datasets:
        dataset_data = load_data(dataset)
        dataset_labels = load_labels(dataset)
        # Fail early and name the offending dataset instead of producing
        # misaligned inputs/labels.
        if len(dataset_data) != len(dataset_labels):
            raise ValueError(
                f"Dataset '{dataset}' has {len(dataset_data)} data vectors "
                f"but {len(dataset_labels)} labels."
            )
        # An empty dataset has no feature axis and cannot be stacked with 2-D data.
        if len(dataset_data) > 0:
            data_parts.append(dataset_data)
            label_parts.append(dataset_labels)
    if not data_parts:
        return np.array([]), np.array([])
    # Concatenate once at the end; re-concatenating inside the loop would copy
    # the growing arrays on every iteration.
    return np.concatenate(data_parts, axis=0), np.concatenate(label_parts, axis=0)

# Datasets that make up the train/test corpus; they are loaded and stacked in this order.
datasets = [
    "fruit_03",
    "fruit_04",
]
data, labels = load_datasets(datasets)

loading 0. file
loading 1000. file
loading 2000. file
loading 3000. file
loading 4000. file
loading 5000. file
loading 6000. file
loading 7000. file
loading 0. file
loading 1000. file
loading 2000. file
loading 3000. file
loading 4000. file
loading 5000. file
loading 6000. file
loading 7000. file
loading 8000. file
(16115, 1080)
(16115,)


In [17]:
# Shapes of the stacked inputs and labels, one per line.
print(data.shape, labels.shape, sep="\n")

(16115, 1080)
(16115,)


# Normalization and Split

In [4]:
# Standardise every feature column to zero mean and unit spread, in place.
# The statistics are kept in variables so that exactly the same transform can be
# applied to other data later, e.g. before predicting on a new dataset.
# NOTE(review): the statistics are computed over all samples before the
# train/test split, so the test rows influence the scaling.
feature_mean = np.mean(data, axis=0)
data -= feature_mean
feature_std = np.std(data, axis=0)
# A constant column has zero spread; dividing by 0 would fill it with NaN.
# Dividing by 1 instead leaves such a column at 0 after centring.
feature_std[feature_std == 0] = 1.0
data /= feature_std

In [5]:
# Hold out 30 % of the samples for testing; the fixed random_state makes the
# split repeatable.
train_X, test_X, train_y, test_y = train_test_split(
    data,
    labels,
    test_size=0.3,
    random_state=42,
)

# Number of training samples, then the sum of the training labels.
print(len(train_X), np.sum(train_y), sep="\n")

11280
11160.0


# Support Vector Machine

In [7]:
# RBF-kernel support vector classifier with hand-picked hyper-parameters
# (gamma = 1/10000).
model_svm = svm.SVC(
    kernel='rbf',
    C=100,
    gamma=1e-4,
)
model_svm.fit(train_X, train_y)

# Mean accuracy on the data the model was fitted on, then on the held-out split.
print(f"SVM Train accuracy: {model_svm.score(train_X, train_y)}")
print(f"SVM Test accuracy: {model_svm.score(test_X, test_y)}")

SVM Train accuracy: 1.0
SVM Test accuracy: 0.9950361944157187


In [9]:
# Sum of the test labels divided by their count. With 0/1 labels this is the
# share of label 1, i.e. the accuracy of always predicting 1.
print(np.sum(test_y) / test_y.shape[0])

0.9877973112719752


In [18]:
# 3-fold cross-validated accuracy on the training split; the array of per-fold
# scores is the cell output.
cross_val_score(
    model_svm,
    train_X,
    train_y,
    scoring="accuracy",
    cv=3,
)

array([0.99308511, 0.99760638, 0.99574468])

# Confusion Matrix

In [19]:
# Out-of-fold predictions for every training sample (3 folds), so each sample is
# scored by a model that did not see it during fitting.
train_y_prediction = cross_val_predict(model_svm, train_X, train_y, cv=3)

# NOTE(review): precision_score / recall_score are called with their default
# settings; the confusion matrix shows label 1 is by far the larger class -
# confirm which label the reported precision/recall should describe.
confusion = confusion_matrix(train_y, train_y_prediction)
precision = precision_score(train_y, train_y_prediction)
recall = recall_score(train_y, train_y_prediction)

# Confusion matrix, precision and recall, one after another.
print(confusion, precision, recall, sep="\n")

[[   84    36]
 [   15 11145]]
0.9967802522135766
0.9986559139784946


# Prediction

In [14]:
def load_indices(dataset_name: str) -> list:
    """
    Load the point indices stored in the truth file of the given dataset.

    The indices are read from the second column of
    ``<DATA_FILES_ROOT><dataset_name>_truth.csv`` and converted to ``int``.

    :param dataset_name: Name of the data set.
    :return: list of ints, one per row of the truth file.
    """
    labels_file = os.path.join(DATA_FILES_ROOT, dataset_name + "_truth.csv")
    # ndmin=2 keeps a file with a single row two-dimensional; without it loadtxt
    # returns a 1-D array and per-row column access fails.
    table = np.loadtxt(labels_file, delimiter=',', ndmin=2)
    if table.size == 0:
        # Empty truth file: no indices.
        return []
    return [int(row[1]) for row in table]

def make_prediction(model, dataset_name: str, export: bool = False, preprocess=None) -> np.ndarray:
    """
    Predict the labels of a whole dataset and report how the prediction compares to the truth.

    The dataset's input vectors and labels are loaded, the model predicts one
    value per vector, and the predictions (rounded to the nearest integer) are
    compared with the labels. Two lines are printed: the number of mismatches
    and the number of predictions that round to 0.

    NOTE(review): ``load_data`` returns the values exactly as stored in the data
    files. If the model was fitted on transformed features (for example
    standardised ones), pass the same transform as ``preprocess``; without it
    the model is fed untransformed values.

    :param model: Fitted estimator offering ``predict(data)``.
    :param dataset_name: Name of the data set.
    :param export: When True, the prediction is also written out with
        ``export_prediction``.
    :param preprocess: Optional callable applied to the loaded input vectors
        before prediction; it receives and returns an array. ``None`` (default)
        leaves the data untouched.
    :return: The prediction exactly as returned by ``model.predict``.
    :raises ValueError: if the number of predictions differs from the number of labels.
    """
    data = load_data(dataset_name)
    labels = np.asarray(load_labels(dataset_name))
    if preprocess is not None:
        data = preprocess(data)
    prediction = model.predict(data)

    # Flatten so that column-vector outputs of shape (n, 1) compare element-wise
    # with the 1-D labels instead of broadcasting to an (n, n) matrix.
    rounded = np.rint(np.asarray(prediction, dtype=float)).ravel()
    if rounded.size != labels.size:
        raise ValueError(
            f"Dataset '{dataset_name}': got {rounded.size} predictions "
            f"for {labels.size} labels."
        )

    mistakes = int(np.count_nonzero(rounded != labels.ravel()))
    zeroes = int(np.count_nonzero(rounded == 0))
    print(f"Made {mistakes} mistakes!")
    print(f"There were total of {zeroes} zeroes!")
    if export:
        export_prediction(dataset_name, prediction)
    return prediction

def export_prediction(dataset_name: str, prediction: np.array) -> None:
    """
    Write the prediction for a dataset to ``<DATA_FILES_ROOT><dataset_name>_prediction.csv``.

    One line is written per point index of the dataset, in the form
    ``row_number,point_index,predicted_label`` where the predicted label is the
    prediction rounded to the nearest integer. An existing file is overwritten.

    :param dataset_name: Name of the data set.
    :param prediction: Predicted values, addressed by row number.
    """
    point_indices = load_indices(dataset_name)
    target_path = DATA_FILES_ROOT + dataset_name + "_prediction.csv"
    with open(target_path, 'w') as out:
        for row, point_index in enumerate(point_indices):
            predicted_label = int(round(prediction[row]))
            print(row, point_index, predicted_label, sep=",", file=out)
            
            
# Predict the "fruit_05" dataset with the fitted SVM and export the result.
# NOTE(review): model_svm was fitted on mean/std-normalised features, while the
# data for this call is loaded straight from the files - confirm whether it must
# be normalised with the training statistics before predicting.
make_prediction(model_svm, "fruit_05", export=True)

loading 0. file
loading 1000. file
loading 2000. file
loading 3000. file
loading 4000. file
loading 5000. file
loading 6000. file
Made 327 mistakes!
There were total of 360 zeroes!


array([1., 1., 1., ..., 0., 0., 0.])