In [2]:
import numpy as np
from sklearn.svm import LinearSVC
from os.path import abspath, join, exists, splitext#, split
from lab1 import (save_data, load_data, load_scene_categories, n_per_class_split, extract_multiscale_dense_features,
                  compute_features, sample_feature_set, kmeans_fit, compute_bovw, split_into_X_y)


# Random

In [3]:
# Single seeded generator shared by the feature sampling and k-means steps
# further down; the fixed seed is intended to make vocabulary construction
# repeatable between runs.
random_state = np.random.RandomState(12345)

# Data Preparation

## Paths

In [4]:
# Absolute path of the image dataset directory, resolved against the current
# working directory.
dataset_path = abspath('scene_categories')
# Directory for cached artifacts: the per-image .feat / .bovw files and the
# vocabulary file are all read from and written under this path.
# NOTE(review): the recorded outputs in this notebook show a 'cache/' prefix,
# so they appear to come from a run with a different output_path — confirm.
output_path = 'cache_n_50'

## Load dataset

dataset es un diccionario con la siguiente estructura:

```
{
'cname': cname,  # lista con los nombres de los directorios (uno por clase)
'cid': cid,      # lista de índices en cname: la clase (directorio) a la que pertenece cada imagen
'fname': fname   # lista de rutas relativas de las imágenes, p. ej. "CALsuburb/image_0159.jpg"
}
```

In [5]:
# Load the dataset description and report how many images / categories it holds.
dataset = load_scene_categories(dataset_path)
n_images, n_classes = len(dataset['fname']), len(dataset['cname'])
print('{} images of {} categories'.format(n_images, n_classes))

4485 images of 15 categories


## Train-Test Split

train_set y test_set son listas de tuplas, mezcladas aleatoriamente, de la forma:
("CALsuburb/image_0159.jpg", 0)

In [6]:
# Split the dataset into train / test lists of (file name, class id) tuples.
# n=100 is presumably the number of training images taken per class (the
# recorded output shows 1500 training samples for 15 classes) — TODO confirm.
# NOTE(review): no random_state is passed here; verify whether
# n_per_class_split seeds its own shuffling, otherwise the split may differ
# between runs.
train_set, test_set = n_per_class_split(dataset, n=100)
n_train, n_test = len(train_set), len(test_set)
print('{} training samples / {} testing samples'.format(n_train, n_test))

1500 training samples / 2985 testing samples


## Compute and store low level features for all images

Esto crea los archivos `*.feat`, que contienen las features de bajo nivel de todas las imágenes.

In [7]:
# Compute the low-level features of every image in the dataset and cache them
# as .feat files under output_path (the BoVW step below reads them from there).
compute_features(dataset_path, dataset['fname'], output_path)

cache/CALsuburb/image_0131.feat already exists
cache/CALsuburb/image_0159.feat already exists
cache/CALsuburb/image_0004.feat already exists
cache/CALsuburb/image_0072.feat already exists
cache/CALsuburb/image_0082.feat already exists
cache/CALsuburb/image_0175.feat already exists
cache/CALsuburb/image_0241.feat already exists
cache/CALsuburb/image_0112.feat already exists
cache/CALsuburb/image_0110.feat already exists
cache/CALsuburb/image_0184.feat already exists
cache/CALsuburb/image_0100.feat already exists
cache/CALsuburb/image_0045.feat already exists
cache/CALsuburb/image_0105.feat already exists
cache/CALsuburb/image_0235.feat already exists
cache/CALsuburb/image_0129.feat already exists
cache/CALsuburb/image_0200.feat already exists
cache/CALsuburb/image_0119.feat already exists
cache/CALsuburb/image_0143.feat already exists
cache/CALsuburb/image_0001.feat already exists
cache/CALsuburb/image_0202.feat already exists
cache/CALsuburb/image_0099.feat already exists
cache/CALsubu

# Unsupervised Dictionary Learning

In [8]:
# Visual vocabulary: built once with k-means and cached on disk, keyed by the
# number of clusters.
n_samples = int(1e5)
n_clusters = 100
vocabulary_file = join(output_path, 'vocabulary{:d}.dat'.format(n_clusters))

if not exists(vocabulary_file):
    # No cached vocabulary for this cluster count: sample low-level features
    # from the training images only, cluster them and store the result.
    train_files = [entry[0] for entry in train_set]
    # NOTE(review): output_path is passed as both the first and the third
    # argument — confirm against sample_feature_set's signature.
    sample = sample_feature_set(output_path, train_files, output_path,
                                n_samples, random_state=random_state)
    vocabulary = kmeans_fit(sample, n_clusters=n_clusters,
                            random_state=random_state)
    save_data(vocabulary, vocabulary_file)
else:
    # Reuse the vocabulary stored by a previous run.
    vocabulary = load_data(vocabulary_file)

print('{}: {} clusters'.format(vocabulary_file, vocabulary.shape[0]))

cache/vocabulary100.dat: 100 clusters


# Compute BoVW Vectors

In [9]:
from datetime import datetime

# Encode every image as a BoVW vector and cache it next to its .feat file;
# images whose .bovw file already exists are skipped.
# NOTE(review): the .bovw file name does not encode n_clusters (unlike
# vocabulary_file), so vectors cached with a different vocabulary would be
# reused silently — clear the cache when the vocabulary changes.
start_time = datetime.now()
for fname in dataset['fname']:
    # Both cache files share the image path without its extension.
    stem = splitext(fname)[0]
    featfile = join(output_path, stem + '.feat')
    bovwfile = join(output_path, stem + '.bovw')

    if not exists(bovwfile):
        feat = load_data(featfile)
        bovw = compute_bovw(vocabulary, feat, norm=2)
        save_data(bovw, bovwfile)
stop_time = datetime.now()
time_lapse = stop_time - start_time
print("time lapse:", time_lapse.total_seconds())

cache/CALsuburb/image_0131.bovw already exists
cache/CALsuburb/image_0159.bovw already exists
cache/CALsuburb/image_0004.bovw already exists
cache/CALsuburb/image_0072.bovw already exists
cache/CALsuburb/image_0082.bovw already exists
cache/CALsuburb/image_0175.bovw already exists
cache/CALsuburb/image_0241.bovw already exists
cache/CALsuburb/image_0112.bovw already exists
cache/CALsuburb/image_0110.bovw already exists
cache/CALsuburb/image_0184.bovw already exists
cache/CALsuburb/image_0100.bovw already exists
cache/CALsuburb/image_0045.bovw already exists
cache/CALsuburb/image_0105.bovw already exists
cache/CALsuburb/image_0235.bovw already exists
cache/CALsuburb/image_0129.bovw already exists
cache/CALsuburb/image_0200.bovw already exists
cache/CALsuburb/image_0119.bovw already exists
cache/CALsuburb/image_0143.bovw already exists
cache/CALsuburb/image_0001.bovw already exists
cache/CALsuburb/image_0202.bovw already exists
cache/CALsuburb/image_0099.bovw already exists
cache/CALsubu

# Train Classifiers

In [10]:
# Train a linear SVM on the training BoVW vectors and report the accuracy on
# the held-out test set.

# setup training data
X_train, y_train = split_into_X_y(train_set)

# Pin the classifier's random_state: LinearSVC's dual coordinate descent
# shuffles the data, so without a seed the fitted model (and the accuracy
# printed below) may change from run to run. A fixed integer is used instead of
# the shared `random_state` generator so the result does not depend on how many
# random draws earlier cells consumed (e.g. whether the vocabulary was cached).
svm = LinearSVC(C=1.0, verbose=1, random_state=12345)
svm.fit(X_train, y_train)

# setup testing data
X_test, y_test = split_into_X_y(test_set)

y_pred = svm.predict(X_test)

# Count the correct predictions; np.asarray guarantees an elementwise
# comparison even if y_test is a plain Python sequence.
tp = np.sum(np.asarray(y_test) == y_pred)
print('accuracy = {:.3f}'.format(float(tp) / len(y_test)))

[LibLinear]accuracy = 0.653
