# Project

In [1]:
import os
import numpy as np
from sklearn import svm
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt


tile_size = 19
feature_count = 3
input_count = tile_size * tile_size * feature_count
DATA_FILES_ROOT = "../Model/DataFiles/"

In [2]:
def load_train_data(dataset_name: str) -> np.array:
    """
    Load data files for the given dataset.

    :param dataset_name: Name of the data set.
    :return: np.array of data files in (input_count, ) shape
    """
    data_path = DATA_FILES_ROOT + dataset_name + "_data/"
    data_files = os.listdir(data_path)
    data_array = [None] * len(data_files)
    for i, file in enumerate(data_files):
        if i % 1000 == 0:
            print(f"loading {i}. file")
        data = np.loadtxt(data_path + file, skiprows=1, delimiter=',')
        data_array[i] = data.flatten()
    return np.array(data_array)


def load_train_labels(dataset_name: str) -> np.array:
    """
    Load data labels for the given dataset.

    :param dataset_name: Name of the data set.
    :return: np.array of the data labels
    """
    labels_file = DATA_FILES_ROOT + dataset_name + "_truth.csv"
    labels = np.loadtxt(labels_file, delimiter=',')
    labels_array = [None] * len(labels)
    for i, label in enumerate(labels):
        labels_array[i] = label[2]
    return np.array(labels_array)

In [3]:
datasets = ["bmw_02", "fruit_02", "toilet_01"]

data = []
labels = np.array([])

for dataset in datasets:
    if (len(data) == 0):
        data = load_train_data(dataset)
    else:
        data = np.concatenate((data, load_train_data(dataset)), axis=0)
    labels = np.concatenate((labels, load_train_labels(dataset)), axis=0)

print(data.shape)
print(labels.shape)

loading 0. file
loading 1000. file
loading 2000. file
loading 3000. file
loading 4000. file
loading 5000. file
loading 6000. file
loading 0. file
loading 1000. file
loading 2000. file
loading 3000. file
loading 4000. file
loading 5000. file
loading 6000. file
loading 7000. file
loading 8000. file
loading 0. file
loading 1000. file
loading 2000. file
loading 3000. file
loading 4000. file
loading 5000. file
loading 6000. file
(21276, 1080)
(21276,)


In [5]:
train_X, eval_X, train_y, eval_y = train_test_split(data, labels, test_size=0.9, random_state=42)
train_X, test_X, train_y, test_y = train_test_split(train_X, train_y, test_size=0.3, random_state=42)

print(len(train_X))
print(np.sum(train_y))

1488
634.0


# Model fitting

In [11]:
rbf_svm = svm.SVC(C=10, gamma=1/10000, kernel='rbf')
rbf_svm.fit(train_X, train_y)

print("Train accuracy: {}".format(rbf_svm.score(train_X, train_y)))
print("Test accuracy: {}".format(rbf_svm.score(test_X, test_y)))

Train accuracy: 0.6559139784946236
Test accuracy: 0.5743348982785602


Final test accuracy confirms, that the model is right about 80% of the time, not bad! I suppose this confirms the claim, that there really is a correlation between the location of the passanger, his willingness to pay the tip and the type of payment he has used.

In [0]:
print("Test accuracy: {}".format(rbf_svm.score(test_X, test_y)))

Test accuracy: 0.79125




---
