In [1]:
# import std libraries
import os, sys
import csv
import numpy as np
from multiprocessing import Pool, TimeoutError
from functools import partial

# Create model from training file

In [2]:
def read_from_csv(file_path):
    """
    Read a tab-delimited file whose first row is a header.

    parameters:
        - file_path: path of the tab-delimited file to read

    returns:
        - the header row without its last column
          (expected to be the label "cell_id" + the list of feature labels)
        - every remaining row as a list of strings, unmodified
          (expected to be cell_id + vector of features + class)

    An empty file yields two empty lists.
    """
    # newline='' is what the csv module asks for, so that line endings
    # embedded in quoted fields reach the parser untranslated
    with open(file_path, newline='') as csvfile:
        rows = csv.reader(csvfile, delimiter='\t')

        # the first row holds the labels; fall back to [] for an empty file
        cell_id_and_features_labels = next(rows, [])

        # every other row is kept exactly as read
        cell_counts = list(rows)

    # the last header column is dropped from the returned labels
    return cell_id_and_features_labels[:-1], cell_counts

def extract_features_and_classes(file_path):
    """
    Read the CSV at file_path and return X and y.

    Each data row is treated as: first column = cell id (discarded),
    last column = class, everything in between = feature values.

    returns:
        - X: numpy array built from the feature columns of every row,
          each value converted to int
        - y: plain Python list holding the last column of every row,
          kept as the strings read from the file
    """
    # the header labels are not needed to build X and y
    _, cell_counts = read_from_csv(file_path)

    # the last column of each row is the class
    y = [row[-1] for row in cell_counts]

    # drop the first column (cell_id) and the last one (class);
    # values come out of the CSV as strings, so convert each one to int
    xs = [[int(value) for value in row[1:-1]] for row in cell_counts]

    # list comprehensions preserve order, so X[i] corresponds to y[i];
    # only the features become a numpy array, y stays a list
    X = np.array(xs)

    return X, y

In [3]:
# define the training file and the test file
training_file_path = '../../../data/w2v_urban/mdetail/baseline/training200.csv'
test_file_path = '../../../data/w2v_urban/mdetail/baseline/test200.csv'

# extract features (X) and classes (y) from each file
X_train, y_train = extract_features_and_classes(training_file_path)
X_test, y_test = extract_features_and_classes(test_file_path)

# Create an SVM model

In [4]:
from sklearn.svm import SVC
import itertools
from sklearn import model_selection, metrics

def divide_sets_by_index(dataset, training_index, test_index):
    """
    Split dataset into a training part and a test part by position.

    parameters:
        - dataset: indexable sequence (list or numpy array) to split
        - training_index: iterable of the positions that go to the
          training part
        - test_index: accepted so a call can pass a (train, test) index
          pair as-is, but not consulted: every position that is not in
          training_index goes to the test part

    returns:
        - training_set, test_set: two lists, each following the original
          order of dataset
    """
    # build the lookup once: membership on a list/array is a linear scan,
    # which made the loop below quadratic in the dataset size
    training_positions = set(training_index)

    training_set = []
    test_set = []

    for position in range(len(dataset)):
        if position in training_positions:
            training_set.append(dataset[position])
        else:
            test_set.append(dataset[position])

    return training_set, test_set

def inner_cross_process(combination_parameters, X_train, y_train):
    """
    Score one (C, gamma) pair with a 5-fold stratified cross-validation.

    parameters:
        - combination_parameters: pair (C, gamma) used to build an
          RBF-kernel SVC
        - X_train, y_train: samples and labels the folds are drawn from

    returns:
        - combination_parameters, unchanged, so the caller can match each
          score to the parameters that produced it
        - the mean of the macro-averaged F1 scores of the 5 folds
    """
    cvalue, gammavalue = combination_parameters

    innerf1 = []
    # fixed seed: every parameter pair is scored on the same folds
    skf = model_selection.StratifiedKFold(n_splits=5, random_state=1234, shuffle=True)

    for train_indexes, test_indexes in skf.split(X_train, y_train):

        X2_train, X2_test = divide_sets_by_index(X_train, train_indexes, test_indexes)
        y2_train, y2_test = divide_sets_by_index(y_train, train_indexes, test_indexes)

        sclf = SVC(C=cvalue, kernel='rbf', gamma=gammavalue)
        sclf.fit(X2_train, y2_train)

        ipred = sclf.predict(X2_test)
        # f1_score takes (y_true, y_pred): ground truth first, predictions second
        innerf1.append(metrics.f1_score(y2_test, ipred, average="macro"))

    return combination_parameters, np.mean(innerf1)

def innerFoldCrossValidation(X_train, y_train):
    """
    Grid-search C and gamma and return the best-scoring pair.

    Every (C, gamma) combination is scored by inner_cross_process in a
    pool of 4 worker processes.

    parameters:
        - X_train, y_train: samples and labels handed to inner_cross_process

    returns:
        - the (C, gamma) tuple with the highest score; when several pairs
          share the highest score, the first one in grid order is returned
    """
    # Values of C which I have to test
    Cvalues = [1e-02, 1e-01, 1e00, 1e01, 1e02]

    # Values of Gamma which I have to test
    Gammavalues = [1e-02, 1e-01, 1e00, 1e01, 1e02]

    # Get the combination of Cs and Gamma parameters
    combination_parameters = list(itertools.product(Cvalues, Gammavalues))

    with Pool(processes=4) as pool:
        combination_and_scores = pool.map(
            partial(inner_cross_process, X_train=X_train, y_train=y_train),
            combination_parameters,
        )

    assert len(combination_and_scores) == len(combination_parameters)

    # max() keeps the first of equally scored pairs, like a strict ">" scan,
    # but still returns a pair when every score is 0 (a scan starting from
    # max_score = 0 would return None there and break the caller's unpacking)
    best_combination, _ = max(combination_and_scores, key=lambda pair: pair[1])

    return best_combination

### Outer cross-validation

In [5]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# per-fold scores collected by the outer cross-validation loop
final_accuracy, final_f1 = [], []
counter = 1

# outer loop: 5 stratified, shuffled folds with a fixed seed
skf = model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=1234)
for train_indexes, test_indexes in skf.split(X_train, y_train):
    print("Fold - ", counter)
    counter += 1

    # materialise the training and test partitions of this fold
    X2_train, X2_test = divide_sets_by_index(X_train, train_indexes, test_indexes)
    y2_train, y2_test = divide_sets_by_index(y_train, train_indexes, test_indexes)

    # the inner cross-validation only sees the training partition of the fold
    best_c, best_gamma = innerFoldCrossValidation(X2_train, y2_train)

    # refit on the whole training partition with the selected parameters
    clf = SVC(kernel='rbf', C=best_c, gamma=best_gamma)
    model = clf.fit(X2_train, y2_train)
    results = model.predict(X2_test)

    # score the predictions on the held-out partition of the fold
    acc = accuracy_score(y2_test, results)
    f1 = f1_score(y2_test, results, average="macro")
    final_accuracy.append(acc)
    final_f1.append(f1)

Fold -  1


  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


Fold -  2


  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


Fold -  3


  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


Fold -  4


  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


Fold -  5


  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


In [6]:
# report the mean of each metric over the outer folds
for metric_name, fold_scores in (("accuracy", final_accuracy), ("f1", final_f1)):
    print(metric_name, np.mean(fold_scores))

accuracy 0.578967119528
f1 0.473802859186
