# Project

In [6]:
import os
import numpy as np
from sklearn import svm
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt


# Edge length of one input tile (presumably square, given the squared term below).
tile_size = 19
# Number of feature values stored per tile position -- TODO confirm meaning.
feature_count = 3
# Total number of values in one flattened tile.
input_count = feature_count * tile_size ** 2
# Directory prefix under which the dataset folders and label files are looked up.
DATA_FILES_ROOT = "../Model/DataFiles/"

In [19]:
def load_train_data(dataset_name: str) -> np.ndarray:
    """
    Load data files for the given dataset.

    Every entry of the ``<DATA_FILES_ROOT><dataset_name>_data`` directory is
    parsed as a comma-separated text file whose first line is skipped, and its
    values are flattened into one row of the result.

    Files are read in sorted file-name order. ``os.listdir`` returns entries in
    arbitrary order, which would make the row order differ between machines and
    runs and could silently break the pairing with the labels.
    NOTE(review): the sort is lexicographic -- confirm that this matches the
    row order of the corresponding ``<dataset_name>_truth.csv`` file.

    :param dataset_name: Name of the data set.
    :return: np.ndarray with one flattened row per data file, i.e. shape
        (number of files, values per file). The number of values per file is
        presumably ``input_count`` -- TODO confirm.
    """
    data_path = os.path.join(DATA_FILES_ROOT, dataset_name + "_data")
    # Sort so the row order is deterministic and reproducible.
    data_files = sorted(os.listdir(data_path))
    data_array = []
    for i, file in enumerate(data_files):
        # Lightweight progress report every 1000 files.
        if i % 1000 == 0:
            print(f"loading {i}. file")
        data = np.loadtxt(os.path.join(data_path, file), skiprows=1, delimiter=',')
        data_array.append(data.flatten())
    return np.array(data_array)


def load_train_labels(dataset_name: str) -> np.ndarray:
    """
    Load data labels for the given dataset.

    Reads the comma-separated file ``<DATA_FILES_ROOT><dataset_name>_truth.csv``
    and returns its third column (index 2) as the label of each row.

    :param dataset_name: Name of the data set.
    :return: 1-D np.ndarray with one label per row of the labels file; an
        empty array when the file contains no data.
    :raises IndexError: if the file has fewer than three columns.
    """
    labels_file = os.path.join(DATA_FILES_ROOT, dataset_name + "_truth.csv")
    # ndmin=2 keeps a file with a single row two-dimensional; without it
    # np.loadtxt returns a 1-D array and indexing "column 2" of each element
    # fails on a scalar.
    labels = np.loadtxt(labels_file, delimiter=',', ndmin=2)
    if labels.size == 0:
        return np.array([])
    # Copy so the result does not keep the full table alive through a view.
    return labels[:, 2].copy()

In [41]:
datasets = ["small_bmw"]

# Collect the per-dataset arrays first and join them once at the end.
# Comparing an ndarray against None with `==` is element-wise, so using such a
# comparison in an `if` raises "truth value of an array is ambiguous" as soon
# as a second dataset is listed; accumulating in lists avoids that check.
data_parts = []
label_parts = []
for dataset in datasets:
    data_parts.append(load_train_data(dataset))
    label_parts.append(load_train_labels(dataset))

data = np.concatenate(data_parts, axis=0)
labels = np.concatenate(label_parts, axis=0)

# Every sample row needs exactly one label; a mismatch would silently misalign
# features and targets in the later slicing.
assert len(data) == len(labels), "number of samples and labels differ"

print(data[0])
print(data.shape)
print(labels[0:10])
print(labels.shape)

Loading 0. file.
Loading 1000. file.
Loading 2000. file.
Loading 3000. file.
Loading 4000. file.
Loading 5000. file.
Loading 6000. file.
[1.881135e+00 1.000732e+00 5.479591e+00 ... 3.179000e-03 9.857000e-03
 1.899386e+00]
(6485, 1080)
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
(6485,)


In [42]:
# Window of the loaded samples that is used for this experiment, and the share
# of that window that goes into the training set.
SUBSET_START, SUBSET_STOP = 1000, 2000
TRAIN_FRACTION = 0.8

# Slice into new names instead of overwriting `data` / `labels`: re-running
# this cell would otherwise slice the already-sliced arrays and leave them empty.
data_subset = data[SUBSET_START:SUBSET_STOP]
labels_subset = labels[SUBSET_START:SUBSET_STOP]

# NOTE(review): this is a sequential, unshuffled split. If the samples are
# ordered by label, one side may contain a single class; consider
# train_test_split (already imported) with a fixed random_state.
split = int(len(data_subset) * TRAIN_FRACTION)
train_X = data_subset[:split]
train_y = labels_subset[:split]
test_X = data_subset[split:]
test_y = labels_subset[split:]

print(len(train_X))
print(len(test_X))
print(np.sum(train_y))

800
200
383.0


# Model fitting

In [31]:
# Support vector classifier with an RBF kernel, trained on the training split.
rbf_svm = svm.SVC(kernel='rbf', gamma=1e-5, C=1000)
rbf_svm.fit(X=train_X, y=train_y)

# score() reports the mean accuracy on the given samples and labels.
print("Train accuracy: {}".format(rbf_svm.score(X=train_X, y=train_y)))
print("Test accuracy: {}".format(rbf_svm.score(X=test_X, y=test_y)))

ValueError: The number of classes has to be greater than one; got 1 class

The final test accuracy shows that the model is right about 80% of the time, not bad! I suppose this supports the claim that there really is a correlation between the location of the passenger, their willingness to pay a tip, and the type of payment they used.

In [0]:
# Report the accuracy of the fitted classifier on the held-out split.
print("Test accuracy: {}".format(rbf_svm.score(X=test_X, y=test_y)))

Test accuracy: 0.79125




---
