In [1]:
# import std libraries
import os, sys
import csv
import numpy as np

# Create model from training file

In [2]:
def read_from_csv(file_path):
    """
    Read a tab-separated file whose first row is a header.

    parameters:
        - file_path: path of the tab-separated file to read
    returns:
        - the header row without its last column, i.e. the label "cell_id"
          followed by the feature labels (the label of the class column is
          dropped); an empty list if the file is empty
        - the list of data rows; every row is returned whole
          (cell_id + vector of features + class) as a list of strings.
          Blank lines are skipped.
    """
    # newline='' is what the csv module expects: it lets the reader handle
    # line endings itself instead of having them translated by open()
    with open(file_path, newline='') as csvfile:
        rows = csv.reader(csvfile, delimiter='\t')

        # the first row holds the labels; default to [] for an empty file
        cell_id_and_features_labels = next(rows, [])

        # csv.reader yields [] for a blank line; such rows carry no cell
        # and would break callers that index row[-1], so leave them out
        cell_counts = [row for row in rows if row]

    return cell_id_and_features_labels[:-1], cell_counts

def extract_features_and_classes(file_path):
    """
    Read the CSV at file_path and return X and y.

    Each data row is treated as: cell_id, feature values..., class.
    According to the notes that accompany this code, each feature value is
    the count of how many POIs of a given kind there are in that cell.

    parameters:
        - file_path: path of the file, passed unchanged to read_from_csv
    returns:
        - X: numpy array built from the feature values of every row,
             converted to int (cell_id and class are removed)
        - y: list with the class of every row (its last column, kept as
             the string read from the file)
    raises:
        - ValueError if a feature value cannot be converted by int()

    The i-th row of X corresponds to the i-th element of y, because both
    are built by iterating over the rows in file order.
    """
    # the header labels are not needed to build X and y
    _, cell_counts = read_from_csv(file_path)

    # the class of a cell is the last column of its row
    y = [row[-1] for row in cell_counts]

    # drop cell_id (first column) and class (last column); what is left
    # was read from the CSV as strings, so convert every value to int
    xs = [[int(value) for value in row[1:-1]] for row in cell_counts]

    # only the features become a numpy array; y stays a plain list
    X = np.array(xs)

    return X, y

In [3]:
# locations of the training and test files
training_file_path = '../../../data/w2v_urban/mdetail/baseline/training50.csv'
test_file_path = '../../../data/w2v_urban/mdetail/baseline/test50.csv'

# load the features (X) and the classes (y) of each file
X_train, y_train = extract_features_and_classes(training_file_path)
X_test, y_test = extract_features_and_classes(test_file_path)

# Create a Naive Bayes model using the MultinomialNB
Note: we are using the naive assumption that the features are independent of each other.
This assumption does not strictly hold in this case, because the presence of Shop can be explained by a high number
of residential places.

In [4]:
# MultinomialNB is imported in the cell that first uses it
from sklearn.naive_bayes import MultinomialNB
# build the classifier with its default settings
clf = MultinomialNB()
# train on the training data and keep whatever fit() returns as `model`
# NOTE(review): fit() presumably returns the estimator itself, which would
# make `model` and `clf` the same object - confirm against scikit-learn docs
model = clf.fit(X_train, y_train)

In [5]:
# the two metrics used to evaluate the model on the test data
from sklearn.metrics import accuracy_score, f1_score
# predicted class for every row of X_test
results = model.predict(X_test)
# compare the predictions with the true classes in y_test
acc = accuracy_score(y_test, results)
# F1 score computed with average="macro"
f1 = f1_score(y_test, results, average="macro")
# report both scores
print("accuracy", acc)
print("f1", f1)

accuracy 0.492095301715
f1 0.449333102135
