In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from difflib import get_close_matches, SequenceMatcher
from sklearn.metrics import mutual_info_score
from sklearn.metrics import precision_score, recall_score, f1_score
from random import choice, randrange
import requests
import time
import string
import random
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import json
import nltk
nltk.download('words')
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import words
from nltk.corpus import stopwords
!pip install wikipedia
import wikipedia
!pip install pyinterval
import os
import tarfile
import time
import pickle

In [None]:
## data prep
# Shared Porter stemmer instance; the LCS and Wikipedia helpers below call ps.stem().
ps = PorterStemmer()
# Stemmed form of every entry of the NLTK `words` corpus, used by
# getAmbiguousWithLCS as the vocabulary of "known" stems.
# NOTE(review): this is a list, so every `in` test against it is a linear scan;
# a set would be faster if nothing relies on order or duplicates - confirm.
stemmed_words_list = [ps.stem(w) for w in words.words()]

In [None]:
## utils funct
def saveCache(fileName, cache):
    """Serialise ``cache`` to ``fileName`` as a JSON object.

    Every value is converted to a list first (the in-memory caches hold sets,
    which JSON cannot represent). The file is overwritten.
    """
    serialisable = {key: list(members) for key, members in cache.items()}
    with open(fileName, 'w') as handle:
        json.dump(serialisable, handle)
        
def loadCache(fileName):
    """Read a JSON object from ``fileName`` and return it with set values.

    This is the inverse of ``saveCache``: each JSON array becomes a ``set``.
    """
    with open(fileName, 'r') as handle:
        stored = json.load(handle)
    return {key: set(members) for key, members in stored.items()}

def findFromCached(cachedAlias, cachedProvenance, columns):
    """Split ``columns`` into cached entries and columns still to be resolved.

    Returns ``(aliasReturn, provenanceReturn, columnsToSearch)``:
    the alias and provenance entries found in the two caches, and the set of
    columns missing from at least one of the caches.
    """
    aliasReturn, provenanceReturn = {}, {}
    columnsToSearch = set()
    for name in columns:
        aliasKnown = name in cachedAlias
        provenanceKnown = name in cachedProvenance
        if aliasKnown:
            aliasReturn[name] = cachedAlias[name]
        if provenanceKnown:
            provenanceReturn[name] = cachedProvenance[name]
        # A column must be looked up again as soon as either cache lacks it.
        if not (aliasKnown and provenanceKnown):
            columnsToSearch.add(name)

    return aliasReturn, provenanceReturn, columnsToSearch

In [None]:
## caches
# In-memory caches, all keyed by the looked-up word / column name.
# The first four are read and filled by the ConceptNet helpers
# (getSynonym, getRelatedTo, getIsA, getDerivedFrom).
cacheSynonym = {}
cacheRelatedTo = {}
cacheIsA = {}
cacheDerivedFrom = {}
# Filled by getAmbiguityFromWikipedia.
cacheWikipedia = {}
# Final per-column label sets and their per-source breakdown, as produced by getLabel.
cachedAlias = {}
cachedProvenance = {}

In [None]:
## load caches
# Replace the empty caches with the JSON snapshots written by the
# "save caches" cell further down.
# NOTE(review): loadCache raises FileNotFoundError when a snapshot is missing,
# so this cell fails on a first run before any cache file exists.
cacheSynonym = loadCache('./cacheSynonym-small.json')
cacheRelatedTo = loadCache('./cacheRelatedTo-small.json')
cacheIsA = loadCache('./cacheIsA-small.json')
cacheDerivedFrom = loadCache('./cacheDerivedFrom-small.json')
cacheWikipedia = loadCache('./cacheWikipedia-small.json')
cachedAlias = loadCache('./cacheAlias.json')
# The provenance cache is loaded from a pickle in the next cell instead.
#cachedProvenance = loadCache('./cacheProvenance.json')

In [None]:
# Load the provenance cache written by the pickle.dump cell further down.
# The context manager closes the handle, which the original code never did.
# SECURITY: pickle.load can execute arbitrary code; only load a
# cacheProvenance.pkl that this notebook produced itself.
with open("cacheProvenance.pkl", "rb") as a_file:
    cachedProvenance = pickle.load(a_file)

In [None]:
def getLCS(string1, string2):
    """Return the longest contiguous block common to both strings.

    The block is found with ``difflib.SequenceMatcher`` and sliced out of
    ``string1``; an empty string is returned when nothing matches.
    """
    matcher = SequenceMatcher(None, string1, string2)
    start, _, size = matcher.find_longest_match(0, len(string1), 0, len(string2))
    return string1[start:start + size]

## CONCEPTNET.IO ##
def getSynonym(word, limit=10, dropWord=True, useCache=True):
    """Query ConceptNet for ``/r/Synonym`` edges of ``word``.

    Returns ``(labels, skippedRequest)``. ``labels`` is the set of lower-cased
    start and end labels of the returned edges. ``skippedRequest`` is True when
    no HTTP request was made (normalised word of at most one character, or a
    hit in ``cacheSynonym``) and False otherwise. A failed request or a
    response without ``'edges'`` yields an empty set and is not cached.

    ``limit`` caps the number of edges requested, ``dropWord`` removes the
    lower-cased query word from the result, and ``useCache`` enables both
    reading from and writing to ``cacheSynonym``.
    """
    term = word.lower().replace(' ', '_')
    found = set()
    if len(term) <= 1:
        return found, True
    if useCache and word in cacheSynonym:
        return cacheSynonym[word], True

    baseUrl = "https://api.conceptnet.io/query?node=/c/en/$WORD$&other=/c/en&limit=$LIMIT$&rel=/r/Synonym"
    queryUrl = baseUrl.replace("$WORD$", term).replace("$LIMIT$", str(limit))
    try:
        payload = requests.get(queryUrl).json()
    except Exception:
        # Network or JSON decoding problem: report "nothing found, not cached".
        return found, False
    if 'edges' not in payload:
        return found, False

    for edge in payload['edges']:
        # Synonymy is symmetric, so both endpoints of the edge are collected.
        found.add(edge['end']['label'].lower())
        found.add(edge['start']['label'].lower())
    if dropWord:
        found.discard(word.lower())
    if useCache:
        cacheSynonym[word] = found
    return found, False

def getRelatedTo(word, limit=10, dropWord=True, useCache=True):
    """Query ConceptNet for ``/r/RelatedTo`` edges of ``word``.

    Returns ``(labels, skippedRequest)``. ``labels`` is the set of lower-cased
    start and end labels of the returned edges. ``skippedRequest`` is True when
    no HTTP request was made (normalised word of at most one character, or a
    hit in ``cacheRelatedTo``) and False otherwise. A failed request or a
    response without ``'edges'`` yields an empty set and is not cached.
    """
    term = word.lower().replace(' ', '_')
    found = set()
    if len(term) <= 1:
        return found, True
    if useCache and word in cacheRelatedTo:
        return cacheRelatedTo[word], True

    baseUrl = "https://api.conceptnet.io/query?node=/c/en/$WORD$&other=/c/en&limit=$LIMIT$&rel=/r/RelatedTo"
    queryUrl = baseUrl.replace("$WORD$", term).replace("$LIMIT$", str(limit))
    try:
        payload = requests.get(queryUrl).json()
    except Exception:
        # Network or JSON decoding problem: report "nothing found, not cached".
        return found, False
    if 'edges' not in payload:
        return found, False

    for edge in payload['edges']:
        # Both endpoints of each edge are collected.
        found.add(edge['end']['label'].lower())
        found.add(edge['start']['label'].lower())
    if dropWord:
        found.discard(word.lower())
    if useCache:
        cacheRelatedTo[word] = found
    return found, False

def getIsA(word, limit=10, dropWord=True, useCache=True):
    """Query ConceptNet for ``/r/IsA`` edges of ``word``.

    Returns ``(labels, skippedRequest)``. ``labels`` holds the lower-cased
    *end* label of every returned edge. ``skippedRequest`` is True when no HTTP
    request was made (normalised word of at most one character, or a hit in
    ``cacheIsA``) and False otherwise. A failed request or a response without
    ``'edges'`` yields an empty set and is not cached.
    """
    term = word.lower().replace(' ', '_')
    found = set()
    if len(term) <= 1:
        return found, True
    if useCache and word in cacheIsA:
        return cacheIsA[word], True

    baseUrl = "https://api.conceptnet.io/query?node=/c/en/$WORD$&other=/c/en&limit=$LIMIT$&rel=/r/IsA"
    queryUrl = baseUrl.replace("$WORD$", term).replace("$LIMIT$", str(limit))
    try:
        payload = requests.get(queryUrl).json()
    except Exception:
        # Network or JSON decoding problem: report "nothing found, not cached".
        return found, False
    if 'edges' not in payload:
        return found, False

    for edge in payload['edges']:
        # Only the end node of each edge is collected for this relation.
        found.add(edge['end']['label'].lower())
    if dropWord:
        found.discard(word.lower())
    if useCache:
        cacheIsA[word] = found
    return found, False

def getDerivedFrom(word, limit=10, dropWord=True, useCache=True):
    """Query ConceptNet for ``/r/DerivedFrom`` edges of ``word``.

    Returns ``(labels, skippedRequest)``. ``labels`` holds the lower-cased
    *end* label of every returned edge whose label differs from ``word``
    (exact, case-sensitive comparison, applied regardless of ``dropWord``).
    ``skippedRequest`` is True when no HTTP request was made (normalised word
    of at most one character, or a hit in ``cacheDerivedFrom``) and False
    otherwise. A failed request or a response without ``'edges'`` yields an
    empty set and is not cached.
    """
    term = word.lower().replace(' ', '_')
    found = set()
    if len(term) <= 1:
        return found, True
    if useCache and word in cacheDerivedFrom:
        return cacheDerivedFrom[word], True

    baseUrl = "https://api.conceptnet.io/query?node=/c/en/$WORD$&other=/c/en&limit=$LIMIT$&rel=/r/DerivedFrom"
    queryUrl = baseUrl.replace("$WORD$", term).replace("$LIMIT$", str(limit))
    try:
        payload = requests.get(queryUrl).json()
    except Exception:
        # Network or JSON decoding problem: report "nothing found, not cached".
        return found, False
    if 'edges' not in payload:
        return found, False

    for edge in payload['edges']:
        label = edge['end']['label']
        if label == word:
            continue
        found.add(label.lower())
    if dropWord:
        found.discard(word.lower())
    if useCache:
        cacheDerivedFrom[word] = found
    return found, False

## LCS ##
def getAmbiguousWithLCS(useStemming, col1, columns):
    """Collect substrings that ``col1`` shares with the other column names.

    For every ``col2`` in ``columns`` different from ``col1`` the longest
    common substring (``getLCS``) is kept when all of the following hold:

    * the shorter name is at least half as long as the longer one;
    * the substring has more than one character and covers at least half of
      the shorter name;
    * after removing punctuation it is not an English stop word;
    * if ``useStemming`` is true, at least one of its tokens stems to an entry
      of ``stemmed_words_list``.

    Returns the set of kept substrings, punctuation removed and otherwise
    unchanged (the stems are only used as a filter).
    """
    translate_table = dict((ord(char), None) for char in string.punctuation)
    ambiguousValues = set()
    # Loaded once, on first use, instead of re-reading the corpus and scanning
    # a list for every candidate pair. Lazy so that calls with no candidate
    # never touch the corpus.
    englishStopwords = None
    for col2 in columns:
        if col1 == col2:
            continue
        lcs = getLCS(col1, col2)
        minLen = min(len(col1), len(col2))
        maxLen = max(len(col1), len(col2))
        if (2 * minLen) < maxLen:
            continue
        if len(lcs) < (0.5 * minLen) or len(lcs) <= 1:
            continue
        lcs = lcs.translate(translate_table)
        if englishStopwords is None:
            englishStopwords = set(stopwords.words('english'))
        if lcs.lower() in englishStopwords:
            continue
        if useStemming:
            # NOTE(review): stemmed_words_list is a list, so each lookup is a
            # linear scan; converting the module-level vocabulary to a set
            # would speed this up.
            knownStems = [stem for stem in (ps.stem(w) for w in word_tokenize(lcs))
                          if stem in stemmed_words_list]
            if not knownStems:
                continue
        ambiguousValues.add(lcs)
    return ambiguousValues

## WIKIPEDIA ##
def getAmbiguityFromWikipedia(column, useStemming=True, results=2, useCache=True):
    """Derive candidate labels for ``column`` from Wikipedia search titles.

    Returns ``(labels, fromCache)``. ``fromCache`` is True only when the answer
    came from ``cacheWikipedia``. The original returned True for every fresh
    lookup in the stemming branch, which contradicted the non-stemming branch
    and the ConceptNet helpers.

    ``wikipedia.search`` is called with hyphens in ``column`` replaced by
    spaces and ``results`` as the maximum number of titles; titles are
    lower-cased and stripped. With ``useStemming`` each title is reduced to a
    word whose Porter stem also occurs among the stems of ``column``, and
    titles without such a word are dropped.

    If the search raises, an empty set is returned and nothing is cached, so a
    transient network failure no longer poisons the cache with an empty result
    (this matches the ConceptNet helpers).
    """
    if useCache and column in cacheWikipedia:
        return cacheWikipedia[column], True

    try:
        titles = wikipedia.search(column.replace("-", " "), results)
    except Exception:
        # Best effort: a failed search means "no labels", and is not cached.
        return set(), False
    wikipediaResults = {title.lower().strip() for title in titles}

    if useStemming:
        translate_table = dict((ord(char), None) for char in string.punctuation)
        columnNoPunct = column.translate(translate_table)
        stemmed_columns = {ps.stem(w) for w in word_tokenize(columnNoPunct)}
        wikipediaResultsStemmed = set()
        for wikipediaResult in wikipediaResults:
            words_in_wiki = word_tokenize(wikipediaResult.translate(translate_table))
            stemmed_wiki = ""
            for w in words_in_wiki:
                if ps.stem(w) in stemmed_columns:
                    # NOTE(review): this assignment keeps only the LAST matching
                    # word of the title; `+=` may have been intended. Left
                    # unchanged because it alters the generated labels.
                    stemmed_wiki = w + " "
            if stemmed_wiki:
                wikipediaResultsStemmed.add(stemmed_wiki.lower().strip())
        wikipediaResults = wikipediaResultsStemmed

    if useCache:
        cacheWikipedia[column] = wikipediaResults
    return wikipediaResults, False

## FUNCTIONS ##

def getLabel(columns, limit=10, useStemming=True, useLCS=False):
    """Build the label set and its provenance for every column name.

    For each column (hyphens replaced by spaces for the online lookups) the
    ConceptNet synonyms, related terms, IsA and DerivedFrom labels and the
    Wikipedia-derived labels are merged. The LCS-based labels, computed against
    the other entries of ``columns``, are added when ``useLCS`` is true or when
    every other source came back empty.

    Returns ``(labels, provenance)``: ``labels`` maps each column to its set of
    lower-cased, stripped labels; ``provenance`` maps each column to a dict
    holding the raw result of every individual source.

    ``limit`` is forwarded to the ConceptNet helpers and ``useStemming`` to the
    LCS helper; the Wikipedia lookup always runs with stemming enabled.
    Sleeps one second after a column for which none of the four ConceptNet
    lookups could skip its HTTP request.
    """
    labels = {}
    provenance = {}
    for column in columns:
        query = column.replace("-", " ")
        synonyms, cachedSynonyms = getSynonym(query, limit=limit, dropWord=True, useCache=True)
        relatedTo, cachedrelatedTo = getRelatedTo(query, limit=limit, dropWord=True, useCache=True)
        isA, cachedIsA = getIsA(query, limit=limit, dropWord=True, useCache=True)
        derivedFrom, cachedDerivedFrom = getDerivedFrom(query, limit=limit, dropWord=True, useCache=True)
        wikipediaResults, cachedWikipediaResults = getAmbiguityFromWikipedia(query, useStemming=True, results=2, useCache=True)
        lcsAmb = getAmbiguousWithLCS(useStemming, column, columns)

        merged = set()
        for source in (synonyms, relatedTo, wikipediaResults, isA, derivedFrom):
            merged |= set(source)
        # LCS labels are a fallback unless explicitly requested.
        if useLCS or not merged:
            merged |= lcsAmb

        labels[column] = {w.lower().strip() for w in merged}
        provenance[column] = {
            'synonyms': synonyms,
            'relatedTo': relatedTo,
            'isA': isA,
            'derivedFrom': derivedFrom,
            'wikipediaResults': wikipediaResults,
            'lcsAmb': lcsAmb,
        }
        # Throttle only when every ConceptNet lookup had to go to the network.
        if not (cachedSynonyms or cachedrelatedTo or cachedIsA or cachedDerivedFrom):
            time.sleep(1)
    return labels, provenance

def getAmbiguousColumns(columns, aliasDict):
    """Find pairs of columns whose label sets overlap.

    Every ordered pair of different columns is compared through
    ``aliasDict``. Returns ``(colSet, ambiguousLabels, ambiguousAttr)``:

    * ``colSet`` - columns involved in at least one overlap;
    * ``ambiguousLabels`` - column -> union of all labels it shares with others;
    * ``ambiguousAttr`` - column -> set of columns it overlaps with.
    """
    colSet = set()
    ambiguousLabels = {}
    ambiguousAttr = {}
    for col1 in columns:
        for col2 in columns:
            if col1 == col2:
                continue
            shared = set(aliasDict[col1]) & set(aliasDict[col2])
            if not shared:
                continue
            for col in (col1, col2):
                colSet.add(col)
                ambiguousLabels.setdefault(col, set()).update(shared)
            ambiguousAttr.setdefault(col1, set()).add(col2)
    return colSet, ambiguousLabels, ambiguousAttr

def updateAlias(dic, dicAdd, dicRemove, blackList = set()):
    """Apply additions, removals and a blacklist to the alias map ``dic``.

    * ``dicAdd``: values are added to the existing set of a key; a missing key
      is created with a copy of the values.
    * ``dicRemove``: values are discarded from keys that exist in ``dic``;
      unknown keys are ignored.
    * ``blackList``: removed from every entry of ``dic``.

    ``dic`` is modified in place and also returned.
    """
    for key, additions in dicAdd.items():
        if key in dic:
            dic[key].update(additions)
        else:
            dic[key] = set(additions)
    for key, removals in dicRemove.items():
        if key not in dic:
            continue
        target = dic[key]
        for unwanted in removals:
            target.discard(unwanted)
    # Only existing keys are reassigned, so the key set does not change here.
    for key in list(dic):
        dic[key] = dic[key] - blackList
    return dic


In [None]:
# dtype names that getStatsForValues treats as numeric.
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']

def getStatsForValues(df, columns):
    """Profile the given columns of ``df`` as numerical, categorical or text.

    Each column is first converted with ``pd.to_numeric`` when possible; the
    converted column is written back into ``df`` (the frame is mutated).

    * dtype listed in ``numerics`` -> ``numerical[column] =
      (min, max, mean, median, std)`` taken from ``describe()``.
    * otherwise, fewer distinct values than 90% of the rows ->
      ``categorical[column] = value_counts()``.
    * otherwise -> ``text[column] = (smallest value, largest value, mean
      length, median length, std of length)``, where smallest and largest are
      the first and last value after ``sort_values()``.

    Returns ``(numerical, categorical, text)``.

    Changes from the original:

    * ``pd.to_numeric(..., errors='ignore')`` is deprecated; the conversion
      failure is now caught explicitly, which keeps the "leave the column
      untouched" behaviour.
    * text statistics used to be stored in ``numerical`` while the returned
      ``text`` dict was always empty; they are now stored in ``text``.
      ``getIntersection`` already discards string bounds, so
      ``findAmbiguousForNumerical`` yields the same result.
    * locals no longer shadow the built-ins ``min`` and ``max``.
    """
    numerical = {}
    categorical = {}
    text = {}
    for column in columns:
        try:
            df[column] = pd.to_numeric(df[column])
        except (ValueError, TypeError):
            # Not parseable as numbers: keep the column as it is.
            pass
        series = df[column]

        if series.dtype in numerics:
            summary = series.describe()
            numerical[column] = (summary['min'], summary['max'], summary['mean'],
                                 summary['50%'], summary['std'])
            continue

        # Fewer distinct values than 90% of the rows means the values repeat
        # often enough to be treated as categories.
        if series.nunique() < series.shape[0] * 0.9:
            categorical[column] = series.value_counts()
            continue

        lengths = series.map(len)
        ordered = series.sort_values()
        text[column] = (ordered.iloc[0], ordered.iloc[-1],
                        lengths.mean(), lengths.median(), lengths.std())

    return numerical, categorical, text

In [None]:
def findAmbiguousForCategorical(categorical):
    """Find categorical columns that share at least one value.

    ``categorical`` maps a column name to an object exposing ``keys()`` (its
    distinct values). Returns ``(colSet, ambiguousValuesForAttr, setPairs)``:

    * ``colSet`` - columns sharing a value with some other column;
    * ``ambiguousValuesForAttr`` - column -> union of the values it shares;
    * ``setPairs`` - column -> set of columns it shares values with.
    """
    colSet = set()
    ambiguousValuesForAttr = {}
    setPairs = {}
    for c1, freq1 in categorical.items():
        for c2, freq2 in categorical.items():
            if c1 == c2:
                continue
            shared = set(freq1.keys()) & set(freq2.keys())
            if not shared:
                continue
            colSet.update((c1, c2))
            ambiguousValuesForAttr.setdefault(c1, set()).update(shared)
            setPairs.setdefault(c1, set()).add(c2)
    return colSet, ambiguousValuesForAttr, setPairs

def getIntersection(min1, max1, min2, max2):
    """Intersect the closed intervals ``[min1, max1]`` and ``[min2, max2]``.

    Returns the overlap as a ``(low, high)`` tuple, or ``None`` when any bound
    is a ``str``, when an interval is inverted (a message is printed in that
    case), or when the intervals are disjoint.
    """
    # String bounds cannot be intersected numerically.
    if any(type(bound) == str for bound in (min1, max1, min2, max2)):
        return None
    # Inverted interval: report it and give up.
    if max1 < min1 or max2 < min2:
        print("Error min1-max1: {}-{} min2-max2:{}-{}".format(min1, max1, min2, max2))
        return None
    # Disjoint: one interval ends before the other starts.
    if max1 < min2 or max2 < min1:
        return None
    # Order min1, min2, max1, max2: the overlap is [min2, max1].
    if min1 <= min2 <= max1 <= max2:
        return (min2, max1)
    # Order min2, min1, max2, max1: the overlap is [min1, max2].
    if min2 <= min1 <= max2 <= max1:
        return (min1, max2)
    # Order min1, min2, max2, max1: the second interval is contained in the first.
    if min1 <= min2 <= max2 <= max1:
        return (min2, max2)
    # Remaining order min2, min1, max1, max2: the first is contained in the second.
    return (min1, max1)


def findAmbiguousForNumerical(numerical):
    """Find columns whose ``[min, max]`` ranges overlap (naive strategy).

    ``numerical`` maps a column name to a stats sequence whose first two items
    are the minimum and the maximum. Returns
    ``(colSet, ambiguousValuesForAttr, setPairs)``:

    * ``colSet`` - columns whose range overlaps some other column's range;
    * ``ambiguousValuesForAttr`` - column -> list of overlap intervals, one per
      overlapping partner;
    * ``setPairs`` - column -> set of columns it overlaps with.
    """
    colSet = set()
    ambiguousValuesForAttr = {}
    setPairs = {}
    for c1, stats1 in numerical.items():
        for c2, stats2 in numerical.items():
            if c1 == c2:
                continue
            overlap = getIntersection(stats1[0], stats1[1], stats2[0], stats2[1])
            if overlap is None:
                continue
            colSet.update((c1, c2))
            ambiguousValuesForAttr.setdefault(c1, []).append(overlap)
            setPairs.setdefault(c1, set()).add(c2)
    return colSet, ambiguousValuesForAttr, setPairs

In [None]:
def findAmbiguousForLabelAndData(setPairs, setPairsCategorical, setPairsNumerical, ambiguousValuesForAttrCategorical, ambiguousValuesForAttrNumerical):
    """Keep the label-level ambiguities that the data also confirms.

    For every attribute in ``setPairs`` (attribute -> attributes with
    overlapping labels) the partner set is intersected with the attribute's
    partners in ``setPairsCategorical`` and in ``setPairsNumerical``.

    Returns ``(setPairsLabelData, ambiguousValues)``:

    * ``setPairsLabelData`` - attribute -> partners confirmed by either
      data-level map (an empty set when none is confirmed);
    * ``ambiguousValues`` - value -> set of confirmed partner attributes it was
      recorded for. For a categorical partner the values are the entries of
      ``ambiguousValuesForAttrCategorical[partner]``; for a numerical partner
      they are both bounds of every interval in
      ``ambiguousValuesForAttrNumerical[partner]``.

    Raises ``KeyError`` if a confirmed partner is missing from the matching
    ``ambiguousValuesForAttr*`` map (as the original did).

    Changes from the original: the attribute is looked up directly instead of
    scanning every key of the data-level maps, and a leftover debug
    ``print(intersection)`` in the numerical branch was removed.
    """
    setPairsLabelData = {}
    ambiguousValues = {}

    def remember(value, attr):
        # Record that `value` is ambiguous for partner attribute `attr`.
        ambiguousValues.setdefault(value, set()).add(attr)

    for attributeLabel, ambiguousAttributesLabel in setPairs.items():
        setAmb = set()

        if attributeLabel in setPairsCategorical:
            confirmed = ambiguousAttributesLabel.intersection(setPairsCategorical[attributeLabel])
            setAmb |= confirmed
            for attr in confirmed:
                for value in ambiguousValuesForAttrCategorical[attr]:
                    remember(value, attr)

        if attributeLabel in setPairsNumerical:
            confirmed = ambiguousAttributesLabel.intersection(setPairsNumerical[attributeLabel])
            setAmb |= confirmed
            for attr in confirmed:
                for interval in ambiguousValuesForAttrNumerical[attr]:
                    low, high = interval[0], interval[1]
                    remember(low, attr)
                    remember(high, attr)

        setPairsLabelData[attributeLabel] = setAmb
    return setPairsLabelData, ambiguousValues

def reversedMap(map):
    """Invert a mapping of ``attribute -> iterable of values``.

    Returns a dict mapping every value to the set of attribute names it
    appeared under.

    The original ended with ``return reversedMap``, which returned this
    function object instead of the dictionary it had just built.
    (The parameter keeps its original name ``map`` for keyword callers.)
    """
    inverted = {}
    for attrName, values in map.items():
        for value in values:
            inverted.setdefault(value, set()).add(attrName)
    return inverted

# Data retrieve

In [None]:
## WDC download
## one time execution

# Set skipDownload to False to fetch and unpack the corpus archives;
# with True the whole cell is a no-op.
skipDownload = True
if skipDownload == False:
    #for i in range(0, 25):
    for i in range(0, 2):    
        # Archive names are two-digit, zero-padded indices ("00", "01", ...).
        s = str(i)
        if i < 10:
            s = '0' + str(i)
        url = "http://data.dws.informatik.uni-mannheim.de/webtables/2015-07/englishCorpus/compressed/" + s + ".tar.gz"
        # Stream the response and unpack it on the fly into ./<s>/.
        # NOTE(review): the HTTP status is not checked before unpacking.
        response = requests.get(url, stream=True)
        file = tarfile.open(fileobj=response.raw, mode="r|gz")
        path = "./" + s + "/"
        file.extractall(path=path)
        file.close()
        # The outer archive is expected to contain ./<s>/<s>.tar, which is
        # extracted into ./WDC/ by the shell command below.
        tarFile = "./"+s+"/"+s+".tar"
        !python -m tarfile -e {tarFile} ./WDC/

        #os.remove(path)
        # Remove the inner tar once it has been extracted.
        os.remove(tarFile)
        #os.remove('./WDC/')

In [None]:
## WDC Web Table Corpus 2015
## All data
# Raw JSON documents of the tables that pass the filter below.
relationalTables = []

path_to_json = './WDC/0'
#path_to_json = './WDC/1'
#path_to_json = './WDC/2'
# Every *.json file directly inside the chosen corpus folder.
json_files = [pos_json for pos_json in os.listdir(path_to_json) if pos_json.endswith('.json')]

start_time = time.time()
for json_file in json_files:
    path = path_to_json + "/" + json_file
    with open(path) as f:
        data = json.load(f)
        # Keep only horizontal relational tables whose header is the first row.
        if (data['tableType'] == 'RELATION') and (data['hasHeader'] == True) and (data['tableOrientation'] == "HORIZONTAL") and (data['headerPosition'] == "FIRST_ROW"):
            relationalTables.append(data)
end_time = time.time()
elapsed_time = end_time - start_time
print("Loaded Relational JSON Files:", len(relationalTables))
print("Loaded in ", elapsed_time)

# One (column names, {column name: cell values}) tuple per table.
#parsedSchemas = {}
parsedSchemas = []
start_time = time.time()
for table in relationalTables:
#for table in relationalTables[:5000]:
    # 'relation' is iterated column by column: item 0 is used as the header,
    # the remaining items as the cell values.
    relationData = table['relation']
    #pageTitle = table['pageTitle'].strip()
    #title = table['title'].strip()
    #print("PageTitle:", pageTitle)
    #print("Title:",title)
    columns = []
    data = {}
    for column in relationData:
        columnName = column[0].strip()
        # Columns with an empty header are dropped.
        # NOTE(review): two columns with the same header share one key in
        # `data` (the later one wins) while both stay in `columns`.
        if columnName != '':
            columns.append(columnName)
            data[columnName] = column[1:]
    # Tables left without any named column are skipped.
    if len(data) > 0:
        t = (columns, data)
        parsedSchemas.append(t)

print("Parsed schemas: ", len(parsedSchemas))
end_time = time.time()
elapsed_time = end_time - start_time
print("Parsed in ", elapsed_time)
# Free the raw JSON documents; only parsedSchemas is used from here on.
del relationalTables

In [None]:
# One tuple (table data, columns, setPairs, ambiguousLabels, provenance) per
# table that contains at least one pair of ambiguous columns.
structuredData = []
# K: stop once this many ambiguous tables have been collected.
#K = 1500000
#K = 1000
#K = 5000
#K = 10000
#K = 25000
#K = 50000
#K = 75000
#K = 100000
#K = 150000
K = 200000
#K = 400000
#K = 500000
#K = 750000
#K = 1000000
#K = 1500000

# Counters: 'tables' and 'columns' are updated in this cell, the per-source
# keys in the example-generation cell below.
stats = {"tables": 0, "columns": 0, 'synonyms':0, 'relatedTo':0, 'isA':0, 'derivedFrom':0, 'wikipediaResults':0, 'lcsAmb':0}

#for table, schemaAndData in parsedSchemas.items():
#for schemaAndData in parsedSchemas:
count = 0
start_time = time.time()
for schemaAndData in parsedSchemas:
    columns = schemaAndData[0]
    #print(columns)
    # Reuse cached labels/provenance; colsToFind holds the columns missing
    # from at least one of the two caches.
    aliasDict, provenance, colsToFind = findFromCached(cachedAlias, cachedProvenance, columns)
    if (len(colsToFind) > 0):
        #search online
        #print("Search online since not stored in the cache for alias")
        # NOTE(review): getLabel receives only the uncached columns, so its LCS
        # comparison runs among those columns rather than the whole schema.
        aliasDictOnline, provenanceOnline = getLabel(colsToFind)
        #print(aliasDictOnline)
        #print(provenanceOnline)
        aliasDict.update(aliasDictOnline)
        provenance.update(provenanceOnline)
        cachedAlias.update(aliasDict)
        cachedProvenance.update(provenance)
    #aliasDict, provenance = getLabel(columns)
    #print(aliasDict)
    #print(provenance)
    ## TODO: periodically save caches
    colSet, ambiguousLabels, setPairs = getAmbiguousColumns(columns, aliasDict)
    #print(colSet)
    #print(ambiguousLabels)
    #print(setPairs)
    #print(provenance)
    colAmbiguous = list(colSet)
    count += 1
    # Progress report every 100 tables.
    if (count % 100) == 0:
        print("Processed: ", count)
        print("Elapsed time: %s" %(time.time() - start_time))
        print("StructuredData size: ", len(structuredData))
    # Only tables with at least one ambiguous column are kept.
    if (len(colSet) > 0):
        stats['tables'] = stats['tables'] + 1
        stats['columns'] = stats['columns'] + len(columns)
        table = schemaAndData[1]
        t = (table, columns, setPairs, ambiguousLabels, provenance)
        structuredData.append(t)
    if len(structuredData) == K:
        break
print("Process ended in time: %s" %(time.time() - start_time))
print("Structured data: ", len(structuredData))

In [None]:
## caches
# Persist every JSON-backed cache next to the notebook.
start_time = time.time()
for cachePath, cacheContent in (
    ('./cacheSynonym-small.json', cacheSynonym),
    ('./cacheRelatedTo-small.json', cacheRelatedTo),
    ('./cacheIsA-small.json', cacheIsA),
    ('./cacheDerivedFrom-small.json', cacheDerivedFrom),
    ('./cacheWikipedia-small.json', cacheWikipedia),
    ('./cacheAlias.json', cachedAlias),
):
    saveCache(cachePath, cacheContent)
total_time = time.time() - start_time
print("Process ended in time:", total_time)

In [None]:
# Persist the provenance cache. Its values are nested dicts of sets, which
# saveCache cannot serialise, hence pickle. The context manager closes the
# file even if pickle.dump raises.
with open("cacheProvenance.pkl", "wb") as a_file:
    pickle.dump(cachedProvenance, a_file)

In [None]:
def generateRow(schemaString, attr1, attr2, valueString):
    """Format one training example as ``<schema> attr1: <a> attr2: <b>\\t<value>\\n``."""
    pieces = [schemaString, " attr1: ", attr1, " attr2: ", attr2, "\t", valueString, "\n"]
    return "".join(pieces)

start_time = time.time()
# Generated training rows (see generateRow for the format).
examples = []
# Every ambiguous label that produced an example.
distinctLabels = set()
# Provenance source -> set of distinct labels that source contributed.
distinctForType = {}
for t in structuredData:
    table = t[0]
    columns = t[1]
    setPairs = t[2]
    ambiguousLabels = t[3]
    provenance = t[4]
    schemaString = "|".join(columns)
    #TODO: implement it
    #schemaString = generateFromData(table, columns)
    # Number of positive examples generated for this table; the same number of
    # negative draws is attempted afterwards.
    countGenerated = 0
    # Positive examples: one row per (attr1, attr2, shared label).
    for attr1, attrSet in setPairs.items():
        for attr2 in attrSet:
            amb1 = ambiguousLabels[attr1]
            amb2 = ambiguousLabels[attr2]
            ambValues = set(amb1).intersection(set(amb2))
            # A label equal to one of the two attribute names is not used.
            ambValues.discard(attr1.lower())
            ambValues.discard(attr2.lower())
            prov1 = provenance[attr1]
            prov2= provenance[attr2]
            for ambValue in ambValues:
                distinctLabels.add(ambValue)
                # Credit every source that produced the label, once for each
                # of the two attributes.
                for key, valueSet in prov1.items():
                    if ambValue in valueSet:
                        stats[key] = stats[key] + 1
                        if key not in distinctForType:
                            distinctForType[key] = set()
                        distLabelType = distinctForType[key]
                        distLabelType.add(ambValue)
                for key, valueSet in prov2.items():
                    if ambValue in valueSet:
                        stats[key] = stats[key] + 1
                        if key not in distinctForType:
                            distinctForType[key] = set()
                        distLabelType = distinctForType[key]
                        distLabelType.add(ambValue)
                example = generateRow(schemaString, attr1, attr2, ambValue)
                examples.append(example)
                countGenerated += 1
    # Negative examples: random attribute pairs labelled "None".
    # NOTE(review): no random seed is set, so the sampled pairs differ between runs.
    for i in range(0, countGenerated):
        #TODO improve the column selection
        attr1 = choice(list(columns))
        attr2 = choice(list(columns))
        if (attr1 not in setPairs):
            # NOTE(review): attr2 may equal attr1 in this branch.
            example = generateRow(schemaString, attr1, attr2, "None")
            examples.append(example)
        else:
            attr1AmbSet = setPairs[attr1]
            # Draws that hit an ambiguous pair or the same attribute twice are
            # dropped, so fewer negatives than positives may be produced.
            if ((attr2 != attr1) and (attr2 not in attr1AmbSet)):
                example = generateRow(schemaString, attr1, attr2, "None")
                examples.append(example)

print(stats)
print("**** DISTINCT *****")
for key, value in distinctForType.items():
    print(key, len(value))
print("Unique labels: ", len(distinctLabels))
total_time = time.time() - start_time
print("Process ended in time:", total_time)

In [None]:
print("Examples:", len(examples))
# Identical rows are written once.
# NOTE(review): a set has no stable order, so the line order of the output
# file can differ between runs.
examplesNoDuplicates = set(examples)
print("Examples no duplicates:", len(examplesNoDuplicates))
start_time = time.time()
fileName = "./generated/trainWDC-00-" + str(K) + "-v1.2.tsv"
# Create the output folder if needed, so the long run above is not lost to a
# FileNotFoundError when ./generated does not exist yet.
os.makedirs(os.path.dirname(fileName), exist_ok=True)
# The context manager closes the file even if writing fails.
with open(fileName, "w") as f:
    f.writelines(examplesNoDuplicates)
total_time = time.time() - start_time
print("Process ended in time:", total_time)