In [None]:
!pip install fasttext

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting fasttext
  Downloading fasttext-0.9.2.tar.gz (68 kB)
[K     |████████████████████████████████| 68 kB 2.9 MB/s 
[?25hCollecting pybind11>=2.2
  Using cached pybind11-2.10.0-py3-none-any.whl (213 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.2-cp37-cp37m-linux_x86_64.whl size=3160889 sha256=2a165dc9dc8d1faea6301e822f7de7dad647d5375d9a8b4432dcbaa47580c57c
  Stored in directory: /root/.cache/pip/wheels/4e/ca/bf/b020d2be95f7641801a6597a29c8f4f19e38f9c02a345bab9b
Successfully built fasttext
Installing collected packages: pybind11, fasttext
Successfully installed fasttext-0.9.2 pybind11-2.10.0


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import json
from urllib.request import urlopen
import socket
import numpy as np
import pandas as pd
import re
import fasttext.util
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

# Socrata API

## Download metadata sets

In [None]:
def get_metadata(category, pages, limit, offset):
  """Fetch Socrata catalog entries of one category and keep the usable ones.

  Issues ``pages`` consecutive requests to the catalog endpoint, advancing
  ``offset`` by ``limit`` after each one. An entry is kept only if it lists at
  least one column field name, has at least one domain tag and declares at
  least one column whose datatype is 'Text'.

  Args:
    category: Category name exactly as it must appear in the query string
      (callers pass it already URL-encoded, e.g. 'Public+Safety').
    pages: Number of requests to issue.
    limit: Maximum number of catalog entries per request.
    offset: Offset of the first entry of the first request.

  Returns:
    List with the catalog entries (dicts) that passed the filter. The size of
    the list is also printed.
  """
  url = 'http://api.us.socrata.com/api/catalog/v1?only=datasets&categories='+category
  metaDL = []
  for _ in range(pages):
    page_url = url+'&offset='+str(offset)+'&limit='+str(limit)
    # The context manager closes the HTTP response even if reading or
    # decoding fails; previously the connection was left open.
    with urlopen(page_url) as response:
      metadata_set = json.loads(response.read())['results']

    metaDL.extend(
        metadata for metadata in metadata_set
        if len(metadata['resource']['columns_field_name']) > 0
        and len(metadata['classification']['domain_tags']) > 0
        and 'Text' in metadata['resource']['columns_datatype']
    )

    offset += limit

  print("MetaDL Size: %d"%len(metaDL))
  return metaDL

In [None]:
# Sanity check: query each category once (a single request of up to 30000
# entries) and let get_metadata print how many catalog entries pass its filter.
keys = ['Finance', 'Social+Services', 'Demographics', 'Infrastructure', 'Health', 'Environment', 'Public+Safety', 'Education', 'Economy', 'Transportation']
for key in keys:
  # metaDL is rebound on every iteration, so only the last category's list stays available afterwards.
  metaDL = get_metadata(key, 1, 30000, 0)

MetaDL Size: 3057
MetaDL Size: 1174
MetaDL Size: 2796
MetaDL Size: 2371
MetaDL Size: 2149
MetaDL Size: 2090
MetaDL Size: 1856
MetaDL Size: 1749
MetaDL Size: 1766
MetaDL Size: 1624


## Download data sets

## FastText

In [None]:
# Alternative (disabled): download and load the full 300-dimensional English model.
# fasttext.util.download_model('en', if_exists='ignore')
# ft_model = fasttext.load_model('/content/drive/MyDrive/Data Lake/cc.en.300.bin')
# Or manually download it from https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz
# The file loaded below is a 100-dimensional model stored on Drive; it is not created by the
# download above — presumably it was reduced from cc.en.300.bin beforehand (TODO confirm how).
ft_model = fasttext.load_model('/content/drive/MyDrive/Data Lake/cc.en.100.bin')



In [None]:
ft_model.get_nearest_neighbors('man')

[(0.8106521368026733, 'boy'),
 (0.7907684445381165, 'woman'),
 (0.7292563319206238, 'man--he'),
 (0.7282201051712036, 'gentleman'),
 (0.7252405285835266, 'family-man'),
 (0.7247761487960815, 'person'),
 (0.7210360765457153, 'man-child'),
 (0.7201767563819885, 'father'),
 (0.7127508521080017, 'policeman'),
 (0.7111547589302063, 'man--a')]

In [None]:
ft_model.get_word_vector('man').shape

(100,)

## Download and process data sets

In [None]:
def get_data(metadata, limit = 100):
  """Download a sample of a dataset's text columns and normalise their text.

  Only the columns whose declared datatype is 'Text' are requested. Every cell
  is lower-cased, stripped of every character that is not a-z or whitespace,
  and has its whitespace runs collapsed and trimmed. Cells that are missing,
  or that end up empty, become NaN; columns containing nothing but NaN are
  dropped.

  Args:
    metadata: Catalog entry providing metadata['metadata']['domain'],
      metadata['resource']['id'], ['columns_datatype'] and
      ['columns_field_name'].
    limit: Maximum number of rows to request.

  Returns:
    pandas.DataFrame holding the cleaned text columns.
  """
  resource = metadata['resource']

  #FILTER COLUMNS WITH TEXT DATA TYPE
  text_columns = [
      name
      for name, datatype in zip(resource['columns_field_name'], resource['columns_datatype'])
      if datatype == 'Text'
  ]

  url = "http://" + metadata['metadata']['domain'] + "/resource/" + resource['id'] + ".json?$limit="+str(limit)
  url += "&$select="+",".join(text_columns)

  missing = float('NaN')

  def clean(cell):
    # Missing values must stay missing: str(NaN) would otherwise turn them
    # into the literal word "nan" and keep empty columns alive.
    if cell is None or cell is pd.NaT or cell is pd.NA or (isinstance(cell, float) and cell != cell):
      return missing
    text = re.sub(r'[^a-z\s]', '', str(cell).lower()) #LOWERCASE + REMOVE NO-LETTERS
    text = re.sub(r'\s+', ' ', text).strip() #REMOVE EXTRA BLANK SPACES
    return text if text else missing # '' -> NaN

  data = pd.read_json(url)

  # Column-wise Series.map is the version-independent spelling of the
  # deprecated DataFrame.applymap.
  data = data.apply(lambda column: column.map(clean))

  # Remove columns only filled with NaN
  return data.dropna(how='all', axis=1)

In [None]:
def build_DL(sizeDL, category, limit, black_list, ft_model):
  """Collect up to ``sizeDL`` usable datasets of one category with their vectors.

  Catalog entries are fetched with a single get_metadata request and visited
  in order. A dataset is skipped when its "domain/id" is already in
  ``black_list``, when downloading or processing it fails for any reason, or
  when none of its columns has a non-zero vector.

  The vector of a column is the sum of the fastText vectors of its cells.
  Missing cells (None/NaN) contribute a zero vector; they used to be embedded
  as the word "nan" because of ``str(NaN)``.

  Args:
    sizeDL: Maximum number of datasets to collect.
    category: Socrata category forwarded to get_metadata.
    limit: Number of catalog entries to request.
    black_list: List of "domain/id" strings already used. Accepted datasets
      are appended to it in place.
    ft_model: fastText model providing get_word_vector and get_dimension.

  Returns:
    Tuple (metaDL, DL, vec_DL, black_list) where the first three are parallel
    lists: the accepted catalog entries, their cleaned DataFrames, and one
    Series per dataset mapping column name -> summed vector. Note that the
    DataFrames in DL still contain columns whose vector was zero, while the
    Series in vec_DL do not.
  """
  candidates = get_metadata(category, 1, limit, 0)
  known_ids = set(black_list)  # constant-time duplicate checks
  zero_vector = np.zeros(ft_model.get_dimension(), dtype=np.float32)

  def embed(cell):
    if cell is None or (isinstance(cell, float) and cell != cell):
      return zero_vector
    return ft_model.get_word_vector(str(cell))

  metaDL = list()
  DL = list()
  vec_DL = list()
  for metadata in candidates:
    if len(DL) >= sizeDL:
      break
    try:
      str_id = metadata['metadata']['domain'] + "/" + metadata['resource']['id']
      if str_id in known_ids:
        continue  # duplicated dataset

      data = get_data(metadata)

      #EXTRACT TOPIC VECTORS
      # Column-wise Series.map replaces the deprecated DataFrame.applymap.
      data_vec = data.apply(lambda column: column.map(embed)).sum(axis = 0)

      #REMOVE COLUMNS WITH ZERO VECTOR REPRESENTATION
      # np.any tests the components themselves; a plain sum could cancel out.
      zero_columns = [col for col in data_vec.index if not np.any(data_vec[col])]
      data_vec = data_vec.drop(zero_columns)

      if data_vec.shape[0] == 0:
        continue  # empty dataset
    except Exception:
      # Deliberately best-effort: unreachable portals, timeouts and malformed
      # payloads simply disqualify the dataset.
      continue

    known_ids.add(str_id)
    black_list.append(str_id)
    metaDL.append(metadata)
    DL.append(data)
    vec_DL.append(data_vec)

  return metaDL, DL, vec_DL, black_list

In [None]:
def syntactic_norm(metaDL, ft_model):
  """Build the tag index of the data lake after syntactic normalisation.

  Every domain tag is split on non-word characters, tokenised, filtered
  against the NLTK stop-word lists and stemmed with the Lancaster stemmer.

  Args:
    metaDL: List of catalog entries; each provides
      ['classification']['domain_tags'] and ['resource']['columns_datatype'].
    ft_model: fastText model used to embed each stem.

  Returns:
    Dict stem -> [set of dataset indices (positions in metaDL) carrying the
    stem, total number of 'Text' columns of those datasets, word vector of
    the stem].
  """
  stemmer = nltk.stem.LancasterStemmer()
  # stopwords.words() reads the stop-word lists from disk; load them once
  # instead of once per token, and use a set for constant-time lookups.
  stop_words = set(stopwords.words())

  #COLLECTING THE DL TAGS INDEXING DATASETS
  tagsDL = dict()
  raw_tags = set()
  for i, metadata in enumerate(metaDL):
    tags = metadata['classification']['domain_tags']
    raw_tags.update(tags)

    # The number of text columns depends only on the dataset, not on the token.
    datatypes = np.array(metadata['resource']['columns_datatype'])
    num_text_cols = np.sum(datatypes == 'Text')

    for text in tags:
      text = re.sub(r'\W+', ' ', text) #REMOVE PUNCTUATION
      for token in word_tokenize(text): #SPLIT THE TEXT IN WORDS
        if token in stop_words: #IGNORE STOP WORDS
          continue
        stem = stemmer.stem(token) #STEMMING (Syntactic Normalization)

        if stem not in tagsDL:
          tagsDL[stem] = [{i}, num_text_cols, ft_model.get_word_vector(stem)]
        elif i not in tagsDL[stem][0]:
          tagsDL[stem][0].add(i)
          tagsDL[stem][1] += num_text_cols

  print('Raw tags space size: %d'%len(raw_tags))
  print('Tags space after Syntatic Norm.: %d'%len(tagsDL))

  return tagsDL

def semantic_norm(metaDL, tagsDL, threshold = 0.9):
  """Merge tags whose word vectors are near-duplicates (semantic normalisation).

  Tags are visited in sorted order and grouped greedily: a tag joins the
  current group only if its cosine similarity to every tag already in the
  group is strictly greater than ``threshold``. Once a full pass over the
  remaining tags is finished, the first remaining tag starts a new group.
  Each group is then collapsed onto its first tag: the dataset sets are
  united and the supports are added, minus the 'Text' columns of the datasets
  both tags share so that those are not counted twice.

  Args:
    metaDL: List of catalog entries; the dataset indices stored in tagsDL are
      positions in this list.
    tagsDL: Dict tag -> [set of dataset indices, support, word vector].
      Modified in place.
    threshold: Minimum (exclusive) cosine similarity for two tags to be
      considered equivalent.

  Returns:
    The same ``tagsDL`` object; merged tags are removed and every surviving
    entry is reduced to [set of dataset indices, support].
  """
  remaining = sorted(tagsDL)
  groups = []
  if remaining:  # an empty tag space simply yields no groups
    groups.append([remaining.pop(0)])

  def similar(tag_a, tag_b):
    vec_a, vec_b = tagsDL[tag_a][2], tagsDL[tag_b][2]
    cosine = np.dot(vec_a, vec_b) / ( np.linalg.norm(vec_a) * np.linalg.norm(vec_b) )
    # Written as "> threshold" so that an undefined similarity (NaN, from a
    # zero-norm vector) counts as dissimilar.
    return cosine > threshold

  i = 0
  while remaining:
    candidate = remaining[i]
    if all(similar(candidate, member) for member in groups[-1]):
      groups[-1].append(candidate)
      remaining.pop(i)  # i now designates the next unchecked tag
    else:
      i += 1
    # End of a pass. This also covers the case where the last remaining tag
    # was just absorbed, which used to leave i one past the end of the list.
    if i >= len(remaining):
      i = 0
      if remaining:
        groups.append([remaining.pop(0)])

  for group in groups:
    meta_tag = group[0]
    tagsDL[meta_tag].pop() #REMOVE VECTOR
    for tag_j in group[1:]:
      domain_diff = 0
      for d in tagsDL[meta_tag][0] & tagsDL[tag_j][0]:
        datatypes = np.array(metaDL[d]['resource']['columns_datatype'])
        domain_diff += np.sum(datatypes == 'Text')

      tagsDL[meta_tag][0].update( tagsDL[tag_j][0] )
      tagsDL[meta_tag][1] += tagsDL[tag_j][1] - domain_diff
      tagsDL.pop(tag_j)

  print('Tags space after Semantic Norm.: %d'%len(tagsDL))

  return tagsDL

def structural_norm(tagsDL, max_support = 20):
  """Remove low-support tags whose datasets are already covered by other tags.

  Two passes are made over the tag space:
    1. A tag is 1-redundant when its dataset set is a subset of another tag's
       set and its support is at most ``max_support``.
    2. A remaining tag with support at most ``max_support`` is 2-redundant
       when its dataset set is contained in the union of the sets of two
       other tags. The tags of every covering pair found are protected from
       being examined for removal later in the same pass.

  Args:
    tagsDL: Dict tag -> [set of dataset indices, support, ...]. Modified in
      place.
    max_support: Largest support a tag may have and still be removed.

  Returns:
    The same ``tagsDL`` object without the redundant tags.
  """
  # Structural Normalization
  #   1-subsumed with support <= max_support
  redundants = set()
  keys = list(tagsDL)
  for i in range(len(keys)):
    key_i = keys[i]
    domain_i, support_i = tagsDL[key_i][0], tagsDL[key_i][1]
    for j in range(i+1, len(keys)):
      key_j = keys[j]
      domain_j = tagsDL[key_j][0]
      if domain_j.issubset(domain_i) and tagsDL[key_j][1] <= max_support:
        redundants.add(key_j)
      elif domain_i.issubset(domain_j) and support_i <= max_support:
        redundants.add(key_i)
        break

  print('1-redundant tags: %d'%len(redundants))

  for key in redundants:
    tagsDL.pop(key)

  #   2-subsumed with support <= max_support
  redundants = set()
  protected = set()  # tags used as covers; a set keeps membership tests O(1)
  keys = list(tagsDL)

  # NOTE(review): the search does not stop at the first covering pair, so every
  # covering pair ends up protected and the pass is cubic in the number of
  # tags. Kept as is because stopping early would change which tags survive.
  for key_i in keys:
    if tagsDL[key_i][1] > max_support or key_i in protected:
      continue
    domain_i = tagsDL[key_i][0]
    for key_j in keys:
      if key_j == key_i or key_j in redundants:
        continue
      domain_j = tagsDL[key_j][0]
      for key_k in keys:
        if key_k == key_i or key_k in redundants:
          continue
        if domain_i.issubset(domain_j | tagsDL[key_k][0]):
          redundants.add(key_i)
          protected.add(key_j)
          protected.add(key_k)

  print('2-redundant tags: %d'%len(redundants))

  for key in redundants:
    tagsDL.pop(key)

  print('Tags space after Structural Norm.: %d'%len(tagsDL))

  return tagsDL

def process_metadata(metaDL, ft_model):
  """Normalise the tag space and list, per dataset, the tags it carries.

  Applies the syntactic, semantic and structural normalisations in that
  order, then numbers the surviving tags by their position in the resulting
  dict.

  Returns:
    Tuple (tags_id, num_tags): tags_id[i] is the list of tag positions whose
    dataset set contains dataset i (one list per entry of metaDL), and
    num_tags is the number of surviving tags.
  """
  tagsDL = structural_norm(
      semantic_norm(metaDL, syntactic_norm(metaDL, ft_model)))

  tag_names = list(tagsDL)
  tags_id = [
      [position for position, name in enumerate(tag_names) if dataset in tagsDL[name][0]]
      for dataset in range(len(metaDL))
  ]
  return tags_id, len(tag_names)

In [None]:
def generate_instance(DL, vec_DL, metaDL, sizeDL, ft_model):
  """Serialise a data lake into the topic-vector text format and a JSON dict.

  Text layout (one item per line):
    "<sizeDL>  <number of tags>  <vector dimension>" header (single spaces),
    then for every dataset
      "100 <number of columns> <number of tags of the dataset>",
      the tag ids of the dataset separated by two spaces,
      and for every column a "0" line (no tags on columns) followed by the
      components of its vector separated by two spaces.

  Args:
    DL: List of cleaned DataFrames.
    vec_DL: List of Series (column -> vector), parallel to DL.
    metaDL: List of catalog entries, parallel to DL.
    sizeDL: Data lake size written to the header.
    ft_model: fastText model (used for the tags and for get_dimension).

  Returns:
    Tuple (vecDL, DL_json): the text described above and a dict
    {'resource': [records of each DataFrame]}. The total number of columns is
    also printed.
  """
  tags_per_dataset, num_tags = process_metadata(metaDL, ft_model)
  lines = [str(sizeDL) + ' ' + str(num_tags) + ' '+ str(ft_model.get_dimension())]
  DL_json = {'resource': list()}
  num_columns = 0

  for i, data in enumerate(DL):
    tags = tags_per_dataset[i]
    vectors = vec_DL[i]

    #GENERATE JSON VERSION
    DL_json['resource'].append(data.to_dict(orient="records"))

    num_columns += vectors.shape[0]
    # NOTE(review): the leading 100 is hard-coded; presumably the per-dataset
    # row limit used by get_data — confirm against the consumer of this file.
    lines.append('100 ' + str(vectors.shape[0]) + ' ' + str(len(tags)))
    # Join explicitly instead of editing str(list): the separator stays the
    # same two spaces, and the output no longer depends on how NumPy prints
    # scalars inside a list.
    lines.append('  '.join(str(tag) for tag in tags))
    for position in range(vectors.shape[0]):
      lines.append('0') # No tags on columns
      # .iloc: the Series is indexed by column name, so integer access must
      # be explicitly positional.
      lines.append('  '.join(str(value) for value in vectors.iloc[position]))

  # Build the text once instead of repeatedly extending a growing string.
  vecDL = '\n'.join(lines) + '\n'

  print("Total number of columns: %d"%num_columns)
  return vecDL, DL_json

In [None]:
def generate_output_instances(final_sizeDL) :
  """Build and save one data-lake instance per category combination.

  Ten category sets are generated: four disjoint ones (1, 2, 3 and 4
  categories) followed by six cumulative ones that extend the last set with
  one more category each. For every set, ``final_sizeDL`` datasets are split
  as evenly as possible among its categories, downloaded with build_DL and
  written to Google Drive as
    Data/metaDL-<size>-<n>.json, Data/DL-<size>-<n>.json and
    Processed_Data/topic_vectors-<size>-<n>.txt
  where <n> is the number of categories of the set.

  Uses the module-level ``ft_model`` and sets the global socket timeout to
  10 seconds.

  Args:
    final_sizeDL: Target number of datasets of each instance.

  Returns:
    List of the "domain/id" identifiers of every dataset used. It is shared
    by all instances, so no dataset is used twice. (The function previously
    returned None although a caller assigns its result.)
  """
  keys = ['Finance', 'Social+Services', 'Demographics', 'Infrastructure', 'Health', 'Environment', 'Public+Safety', 'Education', 'Economy', 'Transportation']
  black_list = list()
  limit = 5000
  path = '/content/drive/MyDrive/Data Lake/Socrata/'

  socket.setdefaulttimeout(10)

  category_sets = [keys[0:1], keys[1:3], keys[3:6], keys[6:]]
  for i in range(5, -1, -1) :
    category_sets.append(category_sets[-1] + [keys[i]])

  print(category_sets)

  for categories in category_sets :
    DL = list()
    vec_DL = list()
    metaDL = list()
    for i, category in enumerate(categories) :
      # Even split; the first (final_sizeDL % len) categories get one extra.
      sizeDL = int( final_sizeDL / len(categories) )
      sizeDL += ( i < ( final_sizeDL % len(categories) ) )
      add_metaDL, add_DL, add_vec_DL, black_list = build_DL(sizeDL, category, limit, black_list, ft_model)
      print(category + " : %d"%len(add_DL))
      metaDL += add_metaDL
      DL += add_DL
      vec_DL += add_vec_DL

    print("DL size : %d"%len(metaDL))

    vecDL, DL_json = generate_instance(DL, vec_DL, metaDL, final_sizeDL, ft_model)
    metaDL = { 'resource' : metaDL }

    suffix = '-'+str(final_sizeDL)+'-'+str(len(categories))
    with open(path + 'Data/metaDL'+suffix+'.json', 'w', encoding='utf-8') as f:
        json.dump(metaDL, f, ensure_ascii=False, indent=4)

    # NOTE(review): missing cells are NaN and json.dump writes them as the
    # non-standard token NaN — confirm that the reader of this file accepts it.
    with open(path + 'Data/DL'+suffix+'.json', 'w', encoding='utf-8') as f:
        json.dump(DL_json, f, ensure_ascii=True, indent = 4)

    # Context manager guarantees the file is closed even if the write fails.
    with open(path + 'Processed_Data/topic_vectors'+suffix+'.txt', "w", encoding='utf-8') as text_file:
        text_file.write(vecDL)

  return black_list

In [None]:
generate_output_instances(500)

[['Finance'], ['Social+Services', 'Demographics'], ['Infrastructure', 'Health', 'Environment'], ['Public+Safety', 'Education', 'Economy', 'Transportation'], ['Public+Safety', 'Education', 'Economy', 'Transportation', 'Environment'], ['Public+Safety', 'Education', 'Economy', 'Transportation', 'Environment', 'Health'], ['Public+Safety', 'Education', 'Economy', 'Transportation', 'Environment', 'Health', 'Infrastructure'], ['Public+Safety', 'Education', 'Economy', 'Transportation', 'Environment', 'Health', 'Infrastructure', 'Demographics'], ['Public+Safety', 'Education', 'Economy', 'Transportation', 'Environment', 'Health', 'Infrastructure', 'Demographics', 'Social+Services'], ['Public+Safety', 'Education', 'Economy', 'Transportation', 'Environment', 'Health', 'Infrastructure', 'Demographics', 'Social+Services', 'Finance']]
MetaDL Size: 3057




Finance : 500
DL size : 500
Raw tags space size: 1613
Tags space after Syntatic Norm.: 1215
Tags space after Semantic Norm.: 1195
1-redundant tags: 667
2-redundant tags: 45
Tags space after Structural Norm.: 483
Total number of columns: 3466
MetaDL Size: 1174
Social+Services : 250
MetaDL Size: 2796
Demographics : 250
DL size : 500
Raw tags space size: 1851
Tags space after Syntatic Norm.: 1446
Tags space after Semantic Norm.: 1418
1-redundant tags: 858
2-redundant tags: 54
Tags space after Structural Norm.: 506
Total number of columns: 3628
MetaDL Size: 2371
Infrastructure : 167
MetaDL Size: 2149
Health : 167
MetaDL Size: 2090
Environment : 166
DL size : 500
Raw tags space size: 1891
Tags space after Syntatic Norm.: 1484
Tags space after Semantic Norm.: 1466
1-redundant tags: 811
2-redundant tags: 44
Tags space after Structural Norm.: 611
Total number of columns: 3896
MetaDL Size: 1856
Public+Safety : 125
MetaDL Size: 1749
Education : 125
MetaDL Size: 1766
Economy : 125
MetaDL Size: 16

In [None]:
generate_output_instances(50)

[['Finance'], ['Social+Services', 'Demographics'], ['Infrastructure', 'Health', 'Environment'], ['Public+Safety', 'Education', 'Economy', 'Transportation'], ['Public+Safety', 'Education', 'Economy', 'Transportation', 'Environment'], ['Public+Safety', 'Education', 'Economy', 'Transportation', 'Environment', 'Health'], ['Public+Safety', 'Education', 'Economy', 'Transportation', 'Environment', 'Health', 'Infrastructure'], ['Public+Safety', 'Education', 'Economy', 'Transportation', 'Environment', 'Health', 'Infrastructure', 'Demographics'], ['Public+Safety', 'Education', 'Economy', 'Transportation', 'Environment', 'Health', 'Infrastructure', 'Demographics', 'Social+Services'], ['Public+Safety', 'Education', 'Economy', 'Transportation', 'Environment', 'Health', 'Infrastructure', 'Demographics', 'Social+Services', 'Finance']]
MetaDL Size: 3097
Finance : 50
DL size : 50
Raw tags space size: 195
Tags space after Syntatic Norm.: 231
Tags space after Semantic Norm.: 230
1-redundant tags: 191
2-r

In [None]:
generate_output_instances(100) #IRACE

[['Finance'], ['Social+Services', 'Demographics'], ['Infrastructure', 'Health', 'Environment'], ['Public+Safety', 'Education', 'Economy', 'Transportation'], ['Public+Safety', 'Education', 'Economy', 'Transportation', 'Environment'], ['Public+Safety', 'Education', 'Economy', 'Transportation', 'Environment', 'Health'], ['Public+Safety', 'Education', 'Economy', 'Transportation', 'Environment', 'Health', 'Infrastructure'], ['Public+Safety', 'Education', 'Economy', 'Transportation', 'Environment', 'Health', 'Infrastructure', 'Demographics'], ['Public+Safety', 'Education', 'Economy', 'Transportation', 'Environment', 'Health', 'Infrastructure', 'Demographics', 'Social+Services'], ['Public+Safety', 'Education', 'Economy', 'Transportation', 'Environment', 'Health', 'Infrastructure', 'Demographics', 'Social+Services', 'Finance']]
MetaDL Size: 3096
Finance : 100
DL size : 100
Raw tags space size: 403
Tags space after Syntatic Norm.: 398
Tags space after Semantic Norm.: 392
1-redundant tags: 286
2



Education : 10
MetaDL Size: 1791
Economy : 10
MetaDL Size: 1640
Transportation : 10
MetaDL Size: 2107
Environment : 10
MetaDL Size: 2157
Health : 10
MetaDL Size: 2396
Infrastructure : 10
MetaDL Size: 2818
Demographics : 10
MetaDL Size: 1183
Social+Services : 10
MetaDL Size: 3096
Finance : 10
DL size : 100
Raw tags space size: 595
Tags space after Syntatic Norm.: 582
Tags space after Semantic Norm.: 579
1-redundant tags: 387
2-redundant tags: 11
Tags space after Structural Norm.: 181
Total number of columns: 932


In [None]:
generate_output_instances(100)

[['Finance'], ['Social+Services', 'Demographics'], ['Infrastructure', 'Health', 'Environment'], ['Public+Safety', 'Education', 'Economy', 'Transportation'], ['Public+Safety', 'Education', 'Economy', 'Transportation', 'Environment'], ['Public+Safety', 'Education', 'Economy', 'Transportation', 'Environment', 'Health'], ['Public+Safety', 'Education', 'Economy', 'Transportation', 'Environment', 'Health', 'Infrastructure'], ['Public+Safety', 'Education', 'Economy', 'Transportation', 'Environment', 'Health', 'Infrastructure', 'Demographics'], ['Public+Safety', 'Education', 'Economy', 'Transportation', 'Environment', 'Health', 'Infrastructure', 'Demographics', 'Social+Services'], ['Public+Safety', 'Education', 'Economy', 'Transportation', 'Environment', 'Health', 'Infrastructure', 'Demographics', 'Social+Services', 'Finance']]
MetaDL Size: 3095
Finance : 100
DL size : 100
Raw tags space size: 380
Tags space after Syntatic Norm.: 375
Tags space after Semantic Norm.: 367
1-redundant tags: 248
2

In [None]:
generate_output_instances(300)

[['Finance'], ['Social+Services', 'Demographics'], ['Infrastructure', 'Health', 'Environment'], ['Public+Safety', 'Education', 'Economy', 'Transportation'], ['Public+Safety', 'Education', 'Economy', 'Transportation', 'Environment'], ['Public+Safety', 'Education', 'Economy', 'Transportation', 'Environment', 'Health'], ['Public+Safety', 'Education', 'Economy', 'Transportation', 'Environment', 'Health', 'Infrastructure'], ['Public+Safety', 'Education', 'Economy', 'Transportation', 'Environment', 'Health', 'Infrastructure', 'Demographics'], ['Public+Safety', 'Education', 'Economy', 'Transportation', 'Environment', 'Health', 'Infrastructure', 'Demographics', 'Social+Services'], ['Public+Safety', 'Education', 'Economy', 'Transportation', 'Environment', 'Health', 'Infrastructure', 'Demographics', 'Social+Services', 'Finance']]
MetaDL Size: 3095
Finance : 300
DL size : 300
Raw tags space size: 1092
Tags space after Syntatic Norm.: 906
Tags space after Semantic Norm.: 892
1-redundant tags: 543


In [None]:
generate_output_instances(500) #iRace

[['Finance'], ['Social+Services', 'Demographics'], ['Infrastructure', 'Health', 'Environment'], ['Public+Safety', 'Education', 'Economy', 'Transportation'], ['Public+Safety', 'Education', 'Economy', 'Transportation', 'Environment'], ['Public+Safety', 'Education', 'Economy', 'Transportation', 'Environment', 'Health'], ['Public+Safety', 'Education', 'Economy', 'Transportation', 'Environment', 'Health', 'Infrastructure'], ['Public+Safety', 'Education', 'Economy', 'Transportation', 'Environment', 'Health', 'Infrastructure', 'Demographics'], ['Public+Safety', 'Education', 'Economy', 'Transportation', 'Environment', 'Health', 'Infrastructure', 'Demographics', 'Social+Services'], ['Public+Safety', 'Education', 'Economy', 'Transportation', 'Environment', 'Health', 'Infrastructure', 'Demographics', 'Social+Services', 'Finance']]
MetaDL Size: 3095
Finance : 500
DL size : 500
Raw tags space size: 1600
Tags space after Syntatic Norm.: 1209
Tags space after Semantic Norm.: 1189
1-redundant tags: 66

In [None]:
generate_output_instances(750)

[['Finance'], ['Social+Services', 'Demographics'], ['Infrastructure', 'Health', 'Environment'], ['Public+Safety', 'Education', 'Economy', 'Transportation'], ['Public+Safety', 'Education', 'Economy', 'Transportation', 'Environment'], ['Public+Safety', 'Education', 'Economy', 'Transportation', 'Environment', 'Health'], ['Public+Safety', 'Education', 'Economy', 'Transportation', 'Environment', 'Health', 'Infrastructure'], ['Public+Safety', 'Education', 'Economy', 'Transportation', 'Environment', 'Health', 'Infrastructure', 'Demographics'], ['Public+Safety', 'Education', 'Economy', 'Transportation', 'Environment', 'Health', 'Infrastructure', 'Demographics', 'Social+Services'], ['Public+Safety', 'Education', 'Economy', 'Transportation', 'Environment', 'Health', 'Infrastructure', 'Demographics', 'Social+Services', 'Finance']]
MetaDL Size: 3095
Finance : 750
DL size : 750
Raw tags space size: 2081
Tags space after Syntatic Norm.: 1467
Tags space after Semantic Norm.: 1440
1-redundant tags: 73

In [None]:
black_list = generate_output_instances(1000)

[['Finance'], ['Social+Services', 'Demographics'], ['Infrastructure', 'Health', 'Environment'], ['Public+Safety', 'Education', 'Economy', 'Transportation'], ['Public+Safety', 'Education', 'Economy', 'Transportation', 'Environment'], ['Public+Safety', 'Education', 'Economy', 'Transportation', 'Environment', 'Health'], ['Public+Safety', 'Education', 'Economy', 'Transportation', 'Environment', 'Health', 'Infrastructure'], ['Public+Safety', 'Education', 'Economy', 'Transportation', 'Environment', 'Health', 'Infrastructure', 'Demographics'], ['Public+Safety', 'Education', 'Economy', 'Transportation', 'Environment', 'Health', 'Infrastructure', 'Demographics', 'Social+Services'], ['Public+Safety', 'Education', 'Economy', 'Transportation', 'Environment', 'Health', 'Infrastructure', 'Demographics', 'Social+Services', 'Finance']]
MetaDL Size: 3095




Finance : 1000
DL size : 1000
Raw tags space size: 2556
Tags space after Syntatic Norm.: 1708
Tags space after Semantic Norm.: 1676
1-redundant tags: 820
2-redundant tags: 84
Tags space after Structural Norm.: 772
Total number of columns: 6995
MetaDL Size: 1183
Social+Services : 500
MetaDL Size: 2818
Demographics : 500
DL size : 1000
Raw tags space size: 2778
Tags space after Syntatic Norm.: 1971
Tags space after Semantic Norm.: 1936
1-redundant tags: 1075
2-redundant tags: 68
Tags space after Structural Norm.: 793
Total number of columns: 7755
MetaDL Size: 2396
Infrastructure : 334
MetaDL Size: 2157
Health : 333
MetaDL Size: 2107
Environment : 333
DL size : 1000
Raw tags space size: 2999
Tags space after Syntatic Norm.: 2078
Tags space after Semantic Norm.: 2050
1-redundant tags: 1070
2-redundant tags: 57
Tags space after Structural Norm.: 923
Total number of columns: 7569
MetaDL Size: 1881
Public+Safety : 250
MetaDL Size: 1793
Education : 250
MetaDL Size: 1791
Economy : 250
MetaDL Si

In [None]:
generate_output_instances(10)

[['Finance'], ['Social+Services', 'Demographics'], ['Infrastructure', 'Health', 'Environment'], ['Public+Safety', 'Education', 'Economy', 'Transportation'], ['Public+Safety', 'Education', 'Economy', 'Transportation', 'Environment'], ['Public+Safety', 'Education', 'Economy', 'Transportation', 'Environment', 'Health'], ['Public+Safety', 'Education', 'Economy', 'Transportation', 'Environment', 'Health', 'Infrastructure'], ['Public+Safety', 'Education', 'Economy', 'Transportation', 'Environment', 'Health', 'Infrastructure', 'Demographics'], ['Public+Safety', 'Education', 'Economy', 'Transportation', 'Environment', 'Health', 'Infrastructure', 'Demographics', 'Social+Services'], ['Public+Safety', 'Education', 'Economy', 'Transportation', 'Environment', 'Health', 'Infrastructure', 'Demographics', 'Social+Services', 'Finance']]
MetaDL Size: 3095
Finance : 10
DL size : 10
Raw tags space size: 36
Tags space after Syntatic Norm.: 51
Tags space after Semantic Norm.: 51
1-redundant tags: 33
2-redun

In [None]:
generate_output_instances(1000) #IRACE

[['Finance'], ['Social+Services', 'Demographics'], ['Infrastructure', 'Health', 'Environment'], ['Public+Safety', 'Education', 'Economy', 'Transportation'], ['Public+Safety', 'Education', 'Economy', 'Transportation', 'Environment'], ['Public+Safety', 'Education', 'Economy', 'Transportation', 'Environment', 'Health'], ['Public+Safety', 'Education', 'Economy', 'Transportation', 'Environment', 'Health', 'Infrastructure'], ['Public+Safety', 'Education', 'Economy', 'Transportation', 'Environment', 'Health', 'Infrastructure', 'Demographics'], ['Public+Safety', 'Education', 'Economy', 'Transportation', 'Environment', 'Health', 'Infrastructure', 'Demographics', 'Social+Services'], ['Public+Safety', 'Education', 'Economy', 'Transportation', 'Environment', 'Health', 'Infrastructure', 'Demographics', 'Social+Services', 'Finance']]
MetaDL Size: 3057




Finance : 1000
DL size : 1000
Raw tags space size: 2543
Tags space after Syntatic Norm.: 1695
Tags space after Semantic Norm.: 1663
1-redundant tags: 817
2-redundant tags: 78
Tags space after Structural Norm.: 768
Total number of columns: 7238
MetaDL Size: 1174
Social+Services : 500
MetaDL Size: 2796
Demographics : 500
DL size : 1000
Raw tags space size: 2800
Tags space after Syntatic Norm.: 1992
Tags space after Semantic Norm.: 1957
1-redundant tags: 1094
2-redundant tags: 70
Tags space after Structural Norm.: 793
Total number of columns: 7531
MetaDL Size: 2371
Infrastructure : 334
MetaDL Size: 2149
Health : 333
MetaDL Size: 2090
Environment : 333
DL size : 1000
Raw tags space size: 2968
Tags space after Syntatic Norm.: 2048
Tags space after Semantic Norm.: 2022
1-redundant tags: 1033
2-redundant tags: 60
Tags space after Structural Norm.: 929
Total number of columns: 7750
MetaDL Size: 1856
Public+Safety : 250
MetaDL Size: 1749
Education : 250
MetaDL Size: 1766
Economy : 250
MetaDL Si