Converting integer CSV to LPU's format:

- features with value zero should be ignored
- features with positive value should be written as "Feature:Value", such as "354:3"
- such pairs are separated with spaces
- within a row, the pairs must appear in ascending order of feature index
- if class labels are provided, each row must start with its label, which is either -1 or +1
- feature indices are 1-based (the first column is feature 1)

In [5]:
import os
import numpy as np
import pandas as pd

In [66]:
def to_lpu(x, y=None):
    """Serialize a 2-D integer matrix into LPU's sparse text format.

    Each row of ``x`` becomes one output line of space-separated
    ``index:value`` pairs, where ``index`` is the 1-based column number.
    Only strictly positive cells are written; zero (and negative) cells
    are skipped. Pairs appear in ascending column order.

    Args:
        x: Two-dimensional array-like exposing ``.shape`` and ``x[i, j]``
            indexing.
        y: Optional one-dimensional label array with one entry per row of
            ``x``. When given, ``str(y[i])`` is written as the first token
            of line ``i``.

    Returns:
        The lines joined with ``'\n'`` (no trailing newline).

    Raises:
        AssertionError: If ``y`` is given and its shape is not
            ``(x.shape[0],)``.
    """
    n_rows, n_cols = x.shape
    if y is not None:
        assert y.shape == (n_rows,)

    lines = []
    for i in range(n_rows):
        # The label, when present, always leads the line.
        tokens = [] if y is None else [str(y[i])]
        cells = ((j + 1, x[i, j]) for j in range(n_cols))
        tokens.extend(f'{index}:{value}' for index, value in cells if value > 0)
        lines.append(' '.join(tokens))

    return '\n'.join(lines)

In [67]:
# Small sanity check without labels: zero cells must be dropped and the
# remaining cells written as 1-based "index:value" pairs.
x = np.array([
    [4, 5, 0, 6],
    [0, 3, 4, 8],
    [0, 0, 1, 0],
])
print(to_lpu(x))

1:4 2:5 4:6
2:3 3:4 4:8
3:1


In [68]:
# Same matrix with one class label per row; each output line should now start
# with its label.
y = np.array([1, 1, -1])
print(to_lpu(x, y=y))

1 1:4 2:5 4:6
1 2:3 3:4 4:8
-1 3:1


In [44]:
# Dataset file: read without a header row, so columns are numbered from 0.
dfX = pd.read_csv(
    os.path.expanduser('~/Documents/code/trabalho-disciplina/set-expansion-structured-data/data/CNAE-9.data'),
    header=None,
)
# Query definitions: read with the first line as the header.
dfquery = pd.read_csv(
    os.path.expanduser('~/Documents/code/trabalho-disciplina/set-expansion-structured-data/data/CNAE-9_query.csv'),
)

In [73]:
# Write, for every row of `dfquery`, three LPU input files named after the
# row's index:
#   <index>.pos      rows of dfX listed in the 'query' column
#   <index>.unlabel  rows of dfX listed in the 'target' column
#   <index>.test     every row of dfX, labelled +1 if listed in 'target'
#                    and -1 otherwise
# Column 0 of dfX is always left out of the exported features
# (NOTE(review): presumably it holds the class label; confirm against the
# data file).

# Output folder, expanded once because it does not depend on the query.
out_dir = os.path.expanduser('~/Documents/code/trabalho-disciplina/set-expansion-structured-data/data/cnae-query/')

# Feature matrix of the whole dataset; identical for every query, so it is
# extracted a single time. `.values` is used instead of `.as_matrix()`, which
# was removed in pandas 1.0.
all_features = dfX.iloc[:, 1:].values

for query_id, fields in dfquery.iterrows():
    # Both columns hold comma-separated positional row indices into dfX.
    positive = np.array([int(i) for i in fields['query'].split(',')])
    target = np.array([int(i) for i in fields['target'].split(',')])
    positive_data = dfX.iloc[positive, 1:]
    target_data = dfX.iloc[target, 1:]

    # Start with every row labelled -1, then flip the target rows to +1.
    # The builtin `int` replaces the `np.int` alias removed in NumPy 1.24.
    target_y = np.full(dfX.shape[0], -1, dtype=int)
    target_y[target] = 1

    positive_lpu = to_lpu(positive_data.values)
    unlabeled_lpu = to_lpu(target_data.values)
    test_lpu = to_lpu(all_features, target_y)

    outputs = (
        ('.pos', positive_lpu),
        ('.unlabel', unlabeled_lpu),
        ('.test', test_lpu),
    )
    for extension, content in outputs:
        with open(os.path.join(out_dir, str(query_id) + extension), 'w') as f:
            f.write(content)