In [1]:
import os
import csv
from tqdm import tqdm

def load_pois(cell_size):
    """
    Input: cell_size
    Output: (list of pois, labels)
    Description: each poi is a identified by a list of features that you find in labels
    """
    base_path = '../../data/w2v_urban/Foursquare/Manhattan/foursquare_Manhattan_'
    pois_list_path = base_path + str(cell_size) + ".csv"
    
    # fill pois list
    pois = []
    with open(pois_list_path,  encoding="utf8") as csvfile:
        pois_reader = csv.reader(csvfile, delimiter=',')
        # do not read the first element of each row
        # unamed ?
        for i, row in enumerate(pois_reader):
            if i == 0:
                labels = row[1:]
            else:
                pois.append(row[1:])
    
    return pois, labels

def load_cells(cell_size):
    """
    Input: cell_size
    Output: list of cells
    Description: each cells is a dictionary containing keys: cell_id, predominant_class, data_type
    data_type = 0 if is part of training set, 1 otherwise
    """
    test_predominant_per_cell = '../../data/w2v_urban/test_classifier/test_Manhattan/' + str(cell_size) + '.csv'
    training_predominant_per_cell = '../../data/w2v_urban/train_classifier/train_Manhattan/' + str(cell_size) + '.csv'
    
    cells = []
    for f, file_path in enumerate([training_predominant_per_cell, test_predominant_per_cell]):
        with open(file_path) as csvfile:
            pois_reader = csv.reader(csvfile, delimiter='\t')
            for i, row in enumerate(pois_reader):
                if i == 0:
                    continue
                else:
                    cell_id = int(row[0])
                    predominant_class = row[-1]
                    cells.append({'cell_id': cell_id, 'predominant_class': predominant_class, 'data_type': f})
    
    return cells

In [2]:
from collections import Counter

cell_sizes = ['50', '100', '200', '250']
for cell_size in cell_sizes:
    pois, _ = load_pois(cell_size)
    cells = load_cells(cell_size)
    
    # remove pois without any category
    pois = [poi for poi in pois if len(poi[3]) > 0]
    
    # extract categories
    categories = [poi[3].replace(' ', '_').split(':')[-1] for poi in pois]
    
    # remove duplicates
    most_detailed_categories = list(set(categories))
    
    training_cells_count = []
    test_cells_count = []
    for cell in tqdm(cells):
        cell_id = cell['cell_id']
        predominant_class = cell['predominant_class']
        cell_pois = [poi for poi in pois if poi[-1] == str(cell_id)]
        cell_pois_category = [poi[3].replace(' ', '_').split(':')[-1] for poi in cell_pois]
        cell_count = dict(Counter(cell_pois_category))
        counter_array = [cell_count[category] if category in cell_count else 0 for category in most_detailed_categories]
        if cell['data_type'] == 0:
            training_cells_count.append([cell_id] + counter_array + [predominant_class])
        else:
            test_cells_count.append([cell_id] + counter_array + [predominant_class])
            
    # write on files
    # training
    folder = '../../data/w2v_urban/mdetail/baseline/'
    with open(folder + 'training' + str(cell_size) + '.csv', 'w', newline='') as csvfile:
        datawriter = csv.writer(csvfile, delimiter='\t')
        datawriter.writerow(['cellID'] + most_detailed_categories + ['t_predominant'])
        for count_array in training_cells_count:
            datawriter.writerow(count_array)
            
    # training
    with open(folder + 'test' + str(cell_size) + '.csv', 'w', newline='') as csvfile:
        datawriter = csv.writer(csvfile, delimiter='\t')
        datawriter.writerow(['cellID'] + most_detailed_categories + ['t_predominant'])
        for count_array in test_cells_count:
            datawriter.writerow(count_array)

100%|██████████| 16132/16132 [06:37<00:00, 40.57it/s]
100%|██████████| 5223/5223 [02:02<00:00, 42.48it/s]
100%|██████████| 1502/1502 [00:39<00:00, 37.56it/s]
100%|██████████| 1005/1005 [00:25<00:00, 40.10it/s]
