# Approach 1

In [1]:
import os
import numpy as np
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix, precision_score, recall_score 
from matplotlib import pyplot as plt


tile_size = 19
feature_count = 3
input_count = tile_size * tile_size * feature_count
DATA_FILES_ROOT = "DataFiles/"
PRED_FILES_ROOT = "Predictions/"

In [2]:
def load_data(dataset_name: str) -> np.array:
    """
    Load data files for the given dataset.

    :param dataset_name: Name of the data set.
    :return: np.array of data files in (input_count, ) shape.
    """
    data_path = DATA_FILES_ROOT + dataset_name + "_data/"
    data_files = os.listdir(data_path)
    data_array = [None] * len(data_files)
    for i, file in enumerate(data_files):
        if i % 1000 == 0:
            print(f"loading {i}. file")
        data = np.loadtxt(data_path + file, skiprows=1, delimiter=',')
        data_array[i] = data.flatten()
    return np.array(data_array)


def load_labels(dataset_name: str) -> np.array:
    """
    Load data labels for the given dataset.

    :param dataset_name: Name of the data set.
    :return: np.array of the data labels.
    """
    labels_file = DATA_FILES_ROOT + dataset_name + "_truth.csv"
    labels = np.loadtxt(labels_file, delimiter=',')
    labels_array = [None] * len(labels)
    for i, label in enumerate(labels):
        labels_array[i] = label[2]
    return np.array(labels_array)

def load_datasets(datasets: list) -> (np.array, np.array):
    """
    Load both the input data vectors and labels of the datasets with the given names.

    :param datasets: List with the dataset names.
    :return: tuple of np.arrays, one with the input vectors, second with labels.
    """
    data = []
    labels = np.array([])
    for dataset in datasets:
        if (len(data) == 0):
            data = load_data(dataset)
        else:
            data = np.concatenate((data, load_data(dataset)), axis=0)
        labels = np.concatenate((labels, load_labels(dataset)), axis=0)
    return data, labels

datasets = ["fruit_02", "fruit_03", "fruit_04"]
data, labels = load_datasets(datasets)
print("done loading datasets")

loading 0. file
loading 1000. file
loading 2000. file
loading 3000. file
loading 4000. file
loading 5000. file
loading 6000. file
loading 7000. file
loading 8000. file
loading 9000. file
loading 10000. file
loading 0. file
loading 1000. file
loading 2000. file
loading 3000. file
loading 4000. file
loading 5000. file
loading 6000. file
loading 7000. file
loading 0. file
loading 1000. file
loading 2000. file
loading 3000. file
loading 4000. file
loading 5000. file
loading 6000. file
loading 7000. file
loading 8000. file


In [3]:
print(data.shape)
print(labels.shape)

(26913, 1080)
(26913,)


# Normalization and Split

In [7]:
data -= np.mean(data, axis=0)
data /= np.std(data, axis=0)

In [8]:
train_X, test_X, train_y, test_y = train_test_split(data, labels, test_size=0.3, random_state=42)

print(len(train_X))
print(np.sum(train_y))

18839
18561.0


# Support Vector Machine

In [22]:
model_svm = svm.SVC(C=1000, gamma=1/10000, kernel='rbf')
#model_svm = svm.SVC(kernel='rbf')
model_svm.fit(train_X, train_y)

print(f"SVM Train accuracy: {model_svm.score(train_X, train_y)}")
print(f"Actual percentage of true points: {np.sum(train_y)/len(train_y)}")

SVM Train accuracy: 1.0
Actual percentage of true points: 0.9852433780986252


In [15]:
print(f"Cross validation scores: {cross_val_score(model_svm, train_X, train_y, cv=3)}")

Cross validation scores: [0.99140127 0.99124204 0.9906036 ]


In [None]:
print(f"SVM Test accuracy: {model_svm.score(test_X, test_y)}")
print(f"Actual percentage of true points: {np.sum(test_y)/len(test_y)}")

# Confusion Matrix

In [23]:
train_y_prediction = cross_val_predict(model_svm, train_X, train_y, cv=3)
confusion = confusion_matrix(train_y, train_y_prediction)
precision = precision_score(train_y, train_y_prediction)
recall = recall_score(train_y, train_y_prediction)

print(confusion)
print(precision)
print(recall)

[[  166   112]
 [   53 18508]]
0.9939849624060151
0.9971445504013793


# Random Forest

In [24]:
model_rfc = RandomForestClassifier(n_estimators=20, max_depth=10)
model_rfc.fit(train_X, train_y)

print(f"RFC Train accuracy: {model_svm.score(train_X, train_y)}")

RFC Train accuracy: 1.0


In [25]:
print(f"Cross validation scores: {cross_val_score(model_rfc, train_X, train_y, cv=3)}")
print(f"RFC Test accuracy: {model_rfc.score(test_X, test_y)}")

Cross validation scores: [0.99442675 0.99458599 0.99331104]
RFC Test accuracy: 0.9946742630666336


# Prediction

In [29]:
def load_indices(dataset_name: str) -> list:
    """
    Load point ids for the given dataset.

    :param dataset_name: Name of the data set.
    :return: List of point ids.
    """
    labels_file = DATA_FILES_ROOT + dataset_name + "_truth.csv"
    labels = np.loadtxt(labels_file, delimiter=',')
    indices = [0] * len(labels)
    for i, label in enumerate(labels):
        indices[i] = int(label[1])
    return indices

def make_prediction(model, dataset_name: str, export: bool = False) -> np.array:
    """
    Make a prediction the given dataset using the provided model.

    :model: Instance of a model with the predict method.
    :param dataset_name: Name of the dataset.
    :export: Whether to export the prediction into a .csv file.
    :return: np.array of prediction vector.
    """
    data = load_data(dataset_name)
    labels = load_labels(dataset_name)
    prediction = model.predict(data)
    mistakes = 0
    zeroes = 0
    for i, label in enumerate(prediction):
        if round(prediction[i]) == 0:
            zeroes += 1
        if labels[i] != round(prediction[i]):
            mistakes += 1
    print(f"Made {mistakes} mistakes!")
    print(f"There were total of {zeroes} zeroes!")
    #print(f"Model accuracy: {model.score(data, labels)}")
    if export:
        export_prediction(dataset_name, prediction)
    return prediction

def export_prediction(dataset_name: str, prediction: np.array) -> None:
    indices = load_indices(dataset_name)
    prediction_file = PRED_FILES_ROOT + dataset_name + "_prediction.csv"
    with open(prediction_file, 'w') as file:
        for i, point_index in enumerate(indices):
            p = round(prediction[i])
            print(f"{i},{point_index},{int(p)}", file=file)
            
            
make_prediction(model_rfc, "fruit_01", True)

loading 0. file
loading 1000. file
loading 2000. file
loading 3000. file
loading 4000. file
loading 5000. file
loading 6000. file
loading 7000. file
loading 8000. file
loading 9000. file
loading 10000. file
loading 11000. file
loading 12000. file
Made 1936 mistakes!
There were total of 2006 zeroes!


array([0., 0., 0., ..., 0., 0., 0.])