In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from difflib import get_close_matches, SequenceMatcher
from sklearn.metrics import mutual_info_score
from sklearn.metrics import precision_score, recall_score, f1_score
from random import choice, randrange
import requests
import time
import string
import random
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import json
import nltk
nltk.download('words')
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import words
from nltk.corpus import stopwords
!pip install wikipedia
import wikipedia
!pip install pyinterval
import os
import tarfile
import time
import pickle

[nltk_data] Downloading package words to
[nltk_data]     /Users/enzoveltri/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/enzoveltri/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/enzoveltri/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!




In [2]:
## data prep
# Module-wide Porter stemmer instance.
ps = PorterStemmer()
# Stem of every entry of the NLTK `words` corpus, kept as a list
# (so `x in stemmed_words_list` is a linear scan).
stemmed_words_list = [ps.stem(w) for w in words.words()]

In [3]:
## utils funct
def saveCache(fileName, cache):
    """Serialize `cache` to `fileName` as JSON.

    Every value of `cache` is converted to a list before dumping, which is
    what allows set-valued caches to be written as JSON.

    Args:
        fileName: path of the file to (over)write.
        cache: mapping whose values are iterables.
    """
    serializable = {key: list(members) for key, members in cache.items()}
    with open(fileName, 'w') as handle:
        json.dump(serializable, handle)
        
def loadCache(fileName):
    """Read a JSON object from `fileName` and return it with every value turned into a set.

    Args:
        fileName: path of a JSON file whose top level is an object.

    Returns:
        dict mapping each JSON key to `set(value)`.
    """
    with open(fileName, 'r') as handle:
        rawEntries = json.load(handle)
    return {key: set(members) for key, members in rawEntries.items()}

def findFromCached(cachedAlias, cachedProvenance, columns):
    """Split `columns` into what the two caches already know and what is still missing.

    Args:
        cachedAlias: mapping column -> cached aliases.
        cachedProvenance: mapping column -> cached provenance.
        columns: iterable of column names to look up.

    Returns:
        (aliasHits, provenanceHits, missing) where the first two are dicts with
        the cached entries found, and `missing` is the set of columns absent
        from at least one of the two caches.
    """
    aliasHits, provenanceHits, missing = {}, {}, set()
    for name in columns:
        hasAlias = name in cachedAlias
        if hasAlias:
            aliasHits[name] = cachedAlias[name]
        hasProvenance = name in cachedProvenance
        if hasProvenance:
            provenanceHits[name] = cachedProvenance[name]
        # A column must be searched again as soon as either cache lacks it.
        if not (hasAlias and hasProvenance):
            missing.add(name)
    return aliasHits, provenanceHits, missing

In [4]:
## caches
# Module-level caches, all starting empty.
# The first five are read and filled by the ConceptNet / Wikipedia lookup functions
# (getSynonym, getRelatedTo, getIsA, getDerivedFrom, getAmbiguityFromWikipedia).
cacheSynonym = {}
cacheRelatedTo = {}
cacheIsA = {}
cacheDerivedFrom = {}
cacheWikipedia = {}
# Alias / provenance caches; findFromCached takes caches of this kind as arguments.
cachedAlias = {}
cachedProvenance = {}

In [None]:
## load caches
# Replace the empty in-memory caches with the ones persisted on disk.
# loadCache restores every JSON value as a set.
cacheSynonym = loadCache('./cacheSynonym-small.json')
cacheRelatedTo = loadCache('./cacheRelatedTo-small.json')
cacheIsA = loadCache('./cacheIsA-small.json')
cacheDerivedFrom = loadCache('./cacheDerivedFrom-small.json')
cacheWikipedia = loadCache('./cacheWikipedia-small.json')
cachedAlias = loadCache('./cacheAlias.json')
# SECURITY: pickle.load can execute arbitrary code embedded in the file; only
# load a cacheProvenance.pkl that was produced by a trusted run of this notebook.
# The `with` block closes the handle, which the original cell left open.
with open("cacheProvenance.pkl", "rb") as a_file:
    cachedProvenance = pickle.load(a_file)

## Annotator Functions

In [5]:
def getLCS(string1, string2):
    """Return the longest contiguous block shared by the two strings, ignoring case.

    Despite the name, this is the longest common *substring* as found by
    difflib's `find_longest_match`, not a subsequence. The result is
    lower-cased and stripped of surrounding whitespace; it is '' when the
    strings share nothing.
    """
    lowered1, lowered2 = string1.lower(), string2.lower()
    matcher = SequenceMatcher(None, lowered1, lowered2)
    block = matcher.find_longest_match(0, len(lowered1), 0, len(lowered2))
    start, stop = block.a, block.a + block.size
    return lowered1[start:stop].lower().strip()

## CONCEPTNET.IO ##
def getSynonym(word, limit=10, dropWord=True, useCache=True, timeout=10):
    """Fetch the ConceptNet `/r/Synonym` neighbours of `word`.

    Args:
        word: term to look up; it is lower-cased and spaces become underscores
            to build the ConceptNet node path.
        limit: value of the `limit` query parameter.
        dropWord: when True, `word.lower()` is removed from the result.
        useCache: when True, read from / write to the module-level `cacheSynonym`.
        timeout: seconds passed to `requests.get`. Without it a stalled
            connection would block the notebook forever.

    Returns:
        (labels, fromCache). `labels` is a set of lower-cased edge labels taken
        from both ends of every returned edge. `fromCache` is True for a cache
        hit and for terms of length <= 1 (which are never queried); it is False
        after a network attempt, successful or not.
    """
    processedText = word.lower().replace(' ', '_')
    synonyms = set()
    if len(processedText) <= 1:
        return synonyms, True
    if useCache and word in cacheSynonym:
        return cacheSynonym[word], True
    baseUrl = "https://api.conceptnet.io/query?node=/c/en/$WORD$&other=/c/en&limit=$LIMIT$&rel=/r/Synonym"
    processedURL = baseUrl.replace("$WORD$", processedText).replace("$LIMIT$", str(limit))
    try:
        obj = requests.get(processedURL, timeout=timeout).json()
    except Exception:
        # Best effort: network/JSON failures yield an empty, uncached result.
        return synonyms, False
    if not isinstance(obj, dict) or 'edges' not in obj:
        return synonyms, False
    for edge in obj['edges']:
        synonyms.add(edge['end']['label'].lower())
        synonyms.add(edge['start']['label'].lower())
    if dropWord:
        synonyms.discard(word.lower())
    if useCache:
        cacheSynonym[word] = synonyms
    return synonyms, False

def getRelatedTo(word, limit=10, dropWord=True, useCache=True, timeout=10):
    """Fetch the ConceptNet `/r/RelatedTo` neighbours of `word`.

    Args:
        word: term to look up; it is lower-cased and spaces become underscores
            to build the ConceptNet node path.
        limit: value of the `limit` query parameter.
        dropWord: when True, `word.lower()` is removed from the result.
        useCache: when True, read from / write to the module-level `cacheRelatedTo`.
        timeout: seconds passed to `requests.get`. Without it a stalled
            connection would block the notebook forever.

    Returns:
        (labels, fromCache). `labels` is a set of lower-cased edge labels taken
        from both ends of every returned edge. `fromCache` is True for a cache
        hit and for terms of length <= 1 (which are never queried); it is False
        after a network attempt, successful or not.
    """
    processedText = word.lower().replace(' ', '_')
    relatedTo = set()
    if len(processedText) <= 1:
        return relatedTo, True
    if useCache and word in cacheRelatedTo:
        return cacheRelatedTo[word], True
    baseUrl = "https://api.conceptnet.io/query?node=/c/en/$WORD$&other=/c/en&limit=$LIMIT$&rel=/r/RelatedTo"
    processedURL = baseUrl.replace("$WORD$", processedText).replace("$LIMIT$", str(limit))
    try:
        obj = requests.get(processedURL, timeout=timeout).json()
    except Exception:
        # Best effort: network/JSON failures yield an empty, uncached result.
        return relatedTo, False
    if not isinstance(obj, dict) or 'edges' not in obj:
        return relatedTo, False
    for edge in obj['edges']:
        relatedTo.add(edge['end']['label'].lower())
        relatedTo.add(edge['start']['label'].lower())
    if dropWord:
        relatedTo.discard(word.lower())
    if useCache:
        cacheRelatedTo[word] = relatedTo
    return relatedTo, False

def getIsA(word, limit=10, dropWord=True, useCache=True, timeout=10):
    """Fetch the ConceptNet `/r/IsA` neighbours of `word`.

    Args:
        word: term to look up; it is lower-cased and spaces become underscores
            to build the ConceptNet node path.
        limit: value of the `limit` query parameter.
        dropWord: when True, `word.lower()` is removed from the result.
        useCache: when True, read from / write to the module-level `cacheIsA`.
        timeout: seconds passed to `requests.get`. Without it a stalled
            connection would block the notebook forever.

    Returns:
        (labels, fromCache). `labels` is a set of the lower-cased `end` labels
        of the returned edges (the `start` side is not collected). `fromCache`
        is True for a cache hit and for terms of length <= 1 (which are never
        queried); it is False after a network attempt, successful or not.
    """
    processedText = word.lower().replace(' ', '_')
    isA = set()
    if len(processedText) <= 1:
        return isA, True
    if useCache and word in cacheIsA:
        return cacheIsA[word], True
    baseUrl = "https://api.conceptnet.io/query?node=/c/en/$WORD$&other=/c/en&limit=$LIMIT$&rel=/r/IsA"
    processedURL = baseUrl.replace("$WORD$", processedText).replace("$LIMIT$", str(limit))
    try:
        obj = requests.get(processedURL, timeout=timeout).json()
    except Exception:
        # Best effort: network/JSON failures yield an empty, uncached result.
        return isA, False
    if not isinstance(obj, dict) or 'edges' not in obj:
        return isA, False
    for edge in obj['edges']:
        isA.add(edge['end']['label'].lower())
    if dropWord:
        isA.discard(word.lower())
    if useCache:
        cacheIsA[word] = isA
    return isA, False

def getDerivedFrom(word, limit=10, dropWord=True, useCache=True, timeout=10):
    """Fetch the ConceptNet `/r/DerivedFrom` neighbours of `word`.

    Args:
        word: term to look up; it is lower-cased and spaces become underscores
            to build the ConceptNet node path.
        limit: value of the `limit` query parameter.
        dropWord: when True, `word.lower()` is removed from the result.
        useCache: when True, read from / write to the module-level `cacheDerivedFrom`.
        timeout: seconds passed to `requests.get`. Without it a stalled
            connection would block the notebook forever.

    Returns:
        (labels, fromCache). `labels` is a set of the lower-cased `end` labels
        of the returned edges, skipping any label equal to `word`. `fromCache`
        is True for a cache hit and for terms of length <= 1 (which are never
        queried); it is False after a network attempt, successful or not.
    """
    processedText = word.lower().replace(' ', '_')
    derivedFrom = set()
    if len(processedText) <= 1:
        return derivedFrom, True
    if useCache and word in cacheDerivedFrom:
        return cacheDerivedFrom[word], True
    baseUrl = "https://api.conceptnet.io/query?node=/c/en/$WORD$&other=/c/en&limit=$LIMIT$&rel=/r/DerivedFrom"
    processedURL = baseUrl.replace("$WORD$", processedText).replace("$LIMIT$", str(limit))
    try:
        obj = requests.get(processedURL, timeout=timeout).json()
    except Exception:
        # Best effort: network/JSON failures yield an empty, uncached result.
        return derivedFrom, False
    if not isinstance(obj, dict) or 'edges' not in obj:
        return derivedFrom, False
    for edge in obj['edges']:
        label = edge['end']['label']
        # Case-sensitive comparison against the raw word, as in the original;
        # the case-insensitive removal happens below when dropWord is set.
        if label != word:
            derivedFrom.add(label.lower())
    if dropWord:
        derivedFrom.discard(word.lower())
    if useCache:
        cacheDerivedFrom[word] = derivedFrom
    return derivedFrom, False

## LCS ##
def getAmbiguousWithLCS(useStemming, col1, columns):
    """Collect the substrings that `col1` shares with the other column names.

    For every `col2` in `columns` different from `col1`, the longest common
    substring (via `getLCS`) is kept when:
      * the longer name is at most twice as long as the shorter one,
      * the substring has more than one character and covers at least half of
        the shorter name,
      * after removing punctuation it is not an English stop word, and
      * if `useStemming` is set, at least one of its tokens has a stem that
        appears in `stemmed_words_list`.

    Args:
        useStemming: enable the stem check described above.
        col1: reference column name.
        columns: iterable of column names to compare against.

    Returns:
        set of punctuation-free common substrings.
    """
    translate_table = dict((ord(char), None) for char in string.punctuation)
    # The stop-word list is identical for every pair; load it at most once per
    # call, and only when a candidate actually reaches the stop-word check.
    englishStopwords = None
    ambiguousValues = set()
    for col2 in columns:
        if col1 == col2:
            continue
        lcs = getLCS(col1, col2)
        minLen = min(len(col1), len(col2))
        maxLen = max(len(col1), len(col2))
        if (2 * minLen) < maxLen:
            continue
        if len(lcs) < (0.5 * minLen) or len(lcs) <= 1:
            continue
        lcs = lcs.translate(translate_table)
        if englishStopwords is None:
            englishStopwords = set(stopwords.words('english'))
        if lcs.lower() in englishStopwords:
            continue
        if useStemming:
            # The un-stemmed substring is what gets reported; stemming only
            # decides whether the substring contains a dictionary word.
            if any(ps.stem(w) in stemmed_words_list for w in word_tokenize(lcs)):
                ambiguousValues.add(lcs)
        else:
            ambiguousValues.add(lcs)
    return ambiguousValues

## WIKIPEDIA ##
def getAmbiguityFromWikipedia(column, useStemming=True, results=2, useCache=True):
    """Derive candidate labels for `column` from Wikipedia search result titles.

    Args:
        column: column name to search for ('-' is replaced by a space in the query).
        useStemming: when True, each title is reduced to a token whose Porter
            stem also occurs among the stems of `column`; titles with no such
            token are dropped. When False, the lower-cased titles are returned.
        results: number of search results requested.
        useCache: when True, read from / write to the module-level `cacheWikipedia`.

    Returns:
        (labels, fromCache): a set of lower-cased strings, and True only when
        the answer came from `cacheWikipedia`. A failed search returns
        `(set(), False)` and is NOT cached, so it can be retried later
        (previously the empty result was stored permanently).
    """
    if useCache and column in cacheWikipedia:
        return cacheWikipedia[column], True
    try:
        wikipediaResults = {
            title.lower().strip()
            for title in wikipedia.search(column.replace("-", " "), results)
        }
    except Exception:
        # Best effort: treat any search failure as "nothing found" for this call.
        return set(), False
    if useStemming:
        translate_table = dict((ord(char), None) for char in string.punctuation)
        columnNoPunct = column.translate(translate_table)
        stemmed_columns = [ps.stem(w) for w in word_tokenize(columnNoPunct)]
        wikipediaResultsStemmed = set()
        for wikipediaResult in wikipediaResults:
            wikipediaResult = wikipediaResult.translate(translate_table)
            stemmed_wiki = ""
            for w in word_tokenize(wikipediaResult):
                if ps.stem(w) in stemmed_columns:
                    # NOTE(review): this overwrites instead of appending, so only
                    # the LAST matching token of a title survives. Kept as-is to
                    # stay compatible with caches built by earlier runs; confirm
                    # whether `+=` was intended.
                    stemmed_wiki = w + " "
            if len(stemmed_wiki) > 0:
                wikipediaResultsStemmed.add(stemmed_wiki.lower().strip())
        wikipediaResults = wikipediaResultsStemmed
    if useCache:
        cacheWikipedia[column] = wikipediaResults
    return wikipediaResults, False

## FUNCTIONS ##

def getLabel(columns, limit=10, useStemming=True, useLCS=False):
    """Build the candidate label set of every column, plus where each label came from.

    For each column the ConceptNet relations (synonym, relatedTo, isA,
    derivedFrom) and the Wikipedia titles are merged. The LCS-based labels are
    merged in as well when `useLCS` is set or when the other sources produced
    nothing.

    Args:
        columns: list of column names; also the comparison set for the LCS step.
        limit: forwarded to the ConceptNet lookups.
        useStemming: forwarded to `getAmbiguousWithLCS` only (the Wikipedia
            lookup is always called with stemming enabled).
        useLCS: always include the LCS labels.

    Returns:
        (labels, provenance): `labels[column]` is a set of lower-cased, stripped
        strings; `provenance[column]` maps each source name to the set that
        source returned.
    """
    labels, provenance = {}, {}
    for column in columns:
        query = column.replace("-", " ")
        synonyms, cachedSynonyms = getSynonym(query, limit=limit, dropWord=True, useCache=True)
        relatedTo, cachedrelatedTo = getRelatedTo(query, limit=limit, dropWord=True, useCache=True)
        isA, cachedIsA = getIsA(query, limit=limit, dropWord=True, useCache=True)
        derivedFrom, cachedDerivedFrom = getDerivedFrom(query, limit=limit, dropWord=True, useCache=True)
        wikipediaResults, cachedWikipediaResults = getAmbiguityFromWikipedia(query, useStemming=True, results=2, useCache=True)
        lcsAmb = getAmbiguousWithLCS(useStemming, column, columns)

        merged = set()
        for source in (synonyms, relatedTo, wikipediaResults, isA, derivedFrom):
            if len(source) > 0:
                merged = merged | set(source)
        # LCS labels are a fallback when nothing else matched, or an explicit opt-in.
        if len(merged) == 0 or useLCS:
            merged = merged | lcsAmb

        provenance[column] = {
            'synonyms': synonyms,
            'relatedTo': relatedTo,
            'isA': isA,
            'derivedFrom': derivedFrom,
            'wikipediaResults': wikipediaResults,
            'lcsAmb': lcsAmb,
        }
        labels[column] = {w.lower().strip() for w in merged}
        # Pause only when none of the four ConceptNet lookups reported a cache hit.
        if not any((cachedSynonyms, cachedrelatedTo, cachedIsA, cachedDerivedFrom)):
            time.sleep(1)
    return labels, provenance

def getAmbiguousColumns(columns, aliasDict):
    """Find the columns whose alias sets overlap.

    Args:
        columns: column names to compare pairwise (every ordered pair is visited).
        aliasDict: mapping column -> iterable of aliases.

    Returns:
        (colSet, ambiguousLabels, ambiguousAttr):
          colSet          -- columns that share at least one alias with another column,
          ambiguousLabels -- column -> union of the aliases it shares with others,
          ambiguousAttr   -- column -> set of the columns it overlaps with.
    """
    colSet = set()
    ambiguousLabels = {}
    ambiguousAttr = {}
    for first in columns:
        for second in columns:
            if first != second:
                shared = set(aliasDict[first]) & set(aliasDict[second])
                if not shared:
                    continue
                colSet.update((first, second))
                for name in (first, second):
                    ambiguousLabels[name] = ambiguousLabels.get(name, set()) | shared
                ambiguousAttr.setdefault(first, set()).add(second)
    return colSet, ambiguousLabels, ambiguousAttr

def updateAlias(dic, dicAdd, dicRemove, blackList = set()):
    """Apply additions, removals and a global black list to an alias dictionary, in place.

    Args:
        dic: mapping key -> set of aliases; modified and returned.
        dicAdd: mapping key -> iterable of aliases to add (unknown keys are created).
        dicRemove: mapping key -> iterable of aliases to drop (unknown keys are ignored).
        blackList: set of aliases removed from every entry. The default set is
            only read, never mutated.

    Returns:
        `dic` itself; after the call every value is a fresh set without the
        black-listed aliases.
    """
    for key, additions in dicAdd.items():
        if key in dic:
            dic[key].update(additions)
        else:
            dic[key] = set(additions)
    for key, removals in dicRemove.items():
        if key not in dic:
            continue
        current = dic[key]
        for alias in removals:
            current.discard(alias)
    for key in list(dic):
        dic[key] = dic[key] - blackList
    return dic


In [None]:
# dtype names treated as numeric by getStatsForValues
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']

def getStatsForValues(df, columns):
    """Profile the given columns of `df`.

    Each column is first converted to a numeric dtype when possible. Note that
    the converted column is written back into `df`, i.e. the caller's frame is
    modified. Then:
      * numeric columns get `(min, max, mean, median, std)` of their values;
      * other columns whose distinct-value count is below 90% of the row count
        are treated as categorical and get their `value_counts()`;
      * the remaining (mostly unique) columns get
        `(first value in sort order, last value in sort order,
          mean length, median length, std of length)`.

    Args:
        df: pandas DataFrame (modified in place, see above).
        columns: column names to profile.

    Returns:
        (numerical, categorical, text). `text` is always returned empty.
        NOTE(review): the stats of mostly-unique non-numeric columns are stored
        in `numerical`, not in `text`; this is kept because `getIntersection`
        explicitly skips string bounds - confirm whether `text` was meant to
        receive them.
    """
    numerical = {}
    categorical = {}
    text = {}
    for column in columns:
        # `errors='ignore'` is deprecated in pandas; catching the conversion
        # error is the documented replacement and leaves the column untouched.
        try:
            df[column] = pd.to_numeric(df[column])
        except (ValueError, TypeError):
            pass
        series = df[column]
        if series.dtype in numerics:
            described = series.describe()
            numerical[column] = (
                described['min'],
                described['max'],
                described['mean'],
                described['50%'],
                described['std'],
            )
            continue
        distinctCount = series.nunique()
        rowCount = series.shape[0]
        if distinctCount < rowCount * 0.9:
            # Few distinct values relative to the row count: keep the frequencies.
            categorical[column] = series.value_counts()
            continue
        lengths = series.map(len)
        ordered = series.sort_values()
        numerical[column] = (
            ordered.iloc[0],
            ordered.iloc[-1],
            lengths.mean(),
            lengths.median(),
            lengths.std(),
        )

    return numerical, categorical, text

In [None]:
def findAmbiguousForCategorical(categorical):
    """Find categorical columns that share at least one value.

    Args:
        categorical: mapping column -> object exposing `.keys()` with the
            column's distinct values (e.g. a frequency table).

    Returns:
        (colSet, ambiguousValuesForAttr, setPairs):
          colSet                 -- columns sharing a value with another column,
          ambiguousValuesForAttr -- column -> union of the values it shares,
          setPairs               -- column -> set of columns it shares values with.
    """
    colSet = set()
    ambiguousValuesForAttr = {}
    setPairs = {}
    for nameA, freqA in categorical.items():
        for nameB, freqB in categorical.items():
            if nameA != nameB:
                shared = set(freqA.keys()) & set(freqB.keys())
                if len(shared) < 1:
                    continue
                colSet.update((nameA, nameB))
                ambiguousValuesForAttr[nameA] = ambiguousValuesForAttr.get(nameA, set()) | shared
                setPairs.setdefault(nameA, set()).add(nameB)
    return colSet, ambiguousValuesForAttr, setPairs

def getIntersection(min1, max1, min2, max2):
    """Return the overlap of the closed intervals [min1, max1] and [min2, max2].

    Returns:
        A `(low, high)` tuple, or None when
          * any bound is a `str` (such intervals are not comparable here),
          * one interval is inverted (max < min; a message is printed), or
          * the intervals do not overlap.
    """
    if any(type(bound) == str for bound in (min1, max1, min2, max2)):
        return None
    if (max1 < min1) or (max2 < min2):
        print("Error min1-max1: {}-{} min2-max2:{}-{}".format(min1, max1, min2, max2))
        return None
    # Disjoint: one interval ends before the other starts.
    if (max1 < min2) or (max2 < min1):
        return None
    # Order min1, min2, max1, max2 -> overlap is min2..max1
    if min1 <= min2 <= max1 <= max2:
        return (min2, max1)
    # Order min2, min1, max2, max1 -> overlap is min1..max2
    if min2 <= min1 <= max2 <= max1:
        return (min1, max2)
    # Order min1, min2, max2, max1 -> interval 2 lies inside interval 1
    if min1 <= min2 <= max2 <= max1:
        return (min2, max2)
    # Remaining order min2, min1, max1, max2 -> interval 1 lies inside interval 2
    return (min1, max1)


def findAmbiguousForNumerical(numerical):
    """Find numerical columns whose [min, max] ranges overlap (naive interval intersection).

    Args:
        numerical: mapping column -> stats sequence whose first two items are
            the column's minimum and maximum.

    Returns:
        (colSet, ambiguousValuesForAttr, setPairs):
          colSet                 -- columns overlapping with at least one other column,
          ambiguousValuesForAttr -- column -> list of overlap intervals, one per overlapping column,
          setPairs               -- column -> set of columns it overlaps with.
    """
    colSet = set()
    ambiguousValuesForAttr = {}
    setPairs = {}
    for nameA, statsA in numerical.items():
        for nameB, statsB in numerical.items():
            if nameA != nameB:
                overlap = getIntersection(statsA[0], statsA[1], statsB[0], statsB[1])
                if overlap is None:
                    continue
                colSet.update((nameA, nameB))
                ambiguousValuesForAttr.setdefault(nameA, []).append(overlap)
                setPairs.setdefault(nameA, set()).add(nameB)
    return colSet, ambiguousValuesForAttr, setPairs

In [None]:
def findAmbiguousForLabelAndData(setPairs, setPairsCategorical, setPairsNumerical, ambiguousValuesForAttrCategorical, ambiguousValuesForAttrNumerical):
    """Keep the label-level ambiguities that are also confirmed by the data.

    For every attribute in `setPairs`, its ambiguous partners are intersected
    with the partners found on categorical data and on numerical data.

    Args:
        setPairs: attribute -> set of attributes ambiguous by label.
        setPairsCategorical: attribute -> set of attributes sharing categorical values.
        setPairsNumerical: attribute -> set of attributes with overlapping ranges.
        ambiguousValuesForAttrCategorical: attribute -> iterable of shared values.
        ambiguousValuesForAttrNumerical: attribute -> iterable of (low, high) intervals.

    Returns:
        (setPairsLabelData, ambiguousValues):
          setPairsLabelData -- attribute -> partners ambiguous by label AND by data,
          ambiguousValues   -- value (a categorical value or an interval bound)
                               -> set of confirmed partner attributes carrying it.
    """
    setPairsLabelData = {}
    ambiguousValues = {}
    for attributeLabel, labelPartners in setPairs.items():
        confirmed = set()
        for categoricalName, categoricalPartners in setPairsCategorical.items():
            if not (attributeLabel == categoricalName):
                continue
            shared = labelPartners.intersection(categoricalPartners)
            if len(shared) == 0:
                continue
            confirmed.update(shared)
            for attr in shared:
                for value in ambiguousValuesForAttrCategorical[attr]:
                    ambiguousValues.setdefault(value, set()).add(attr)
        for numericalName, numericalPartners in setPairsNumerical.items():
            if not (attributeLabel == numericalName):
                continue
            shared = labelPartners.intersection(numericalPartners)
            if len(shared) == 0:
                continue
            confirmed.update(shared)
            print(shared)
            for attr in shared:
                for interval in ambiguousValuesForAttrNumerical[attr]:
                    # Both bounds of the overlap are recorded as ambiguous values.
                    for bound in (interval[0], interval[1]):
                        ambiguousValues.setdefault(bound, set()).add(attr)
        setPairsLabelData[attributeLabel] = confirmed
    return setPairsLabelData, ambiguousValues

def reversedMap(map):
    """Invert a mapping of attribute -> values into value -> set of attributes.

    Args:
        map: mapping attribute name -> iterable of hashable values.

    Returns:
        dict mapping each value to the set of attribute names that list it.
        (The original returned the function object `reversedMap` itself
        instead of the dictionary it had built.)
    """
    inverted = {}
    for attrName, values in map.items():
        for value in values:
            inverted.setdefault(value, set()).add(attrName)
    return inverted

In [None]:
def getValues(line):
    """Parse one line of the form `<prefix>attr1: <A> attr2: <B>\\t<labels>`.

    Returns:
        (attr1Value, attr2Value, labels, prevVal) where the first three are
        stripped strings and `prevVal` is the raw text preceding 'attr1: '.

    Raises:
        IndexError: when 'attr1: ', 'attr2: ' or the tab separator is missing.
    """
    stripped = line.strip()
    attr1Parts = stripped.split('attr1: ')
    prevVal = attr1Parts[0]
    attr2Parts = attr1Parts[1].split('attr2: ')
    firstRaw = attr2Parts[0]
    tail = attr2Parts[1].split("\t")
    return firstRaw.strip(), tail[0].strip(), tail[1].strip(), prevVal

In [None]:
# ConceptNet Func
def getConceptNet(word, mode='intersection', limit=10, dropWord=True, useCache=True):
    """Combine the four ConceptNet relations of `word` into one label set.

    Args:
        word: term to look up.
        mode: 'intersection' keeps the labels present in all four relations;
            any other value (e.g. 'union') merges them.
        limit, dropWord, useCache: forwarded to getSynonym / getRelatedTo /
            getDerivedFrom / getIsA. (They used to be accepted but ignored;
            the defaults equal the previously hard-coded values.)

    Returns:
        set of labels.
    """
    ## modes = ['intersection', 'union']
    synonym, _ = getSynonym(word, limit=limit, dropWord=dropWord, useCache=useCache)
    relatedTo, _ = getRelatedTo(word, limit=limit, dropWord=dropWord, useCache=useCache)
    derivedFrom, _ = getDerivedFrom(word, limit=limit, dropWord=dropWord, useCache=useCache)
    isA, _ = getIsA(word, limit=limit, dropWord=dropWord, useCache=useCache)
    if mode == 'intersection':
        return synonym.intersection(relatedTo, derivedFrom, isA)
    return synonym.union(relatedTo, derivedFrom, isA)

In [None]:
## BASELINES IMPLEMENTATION
def baseline_labels(column1, column2):
    """Baseline: labels that `getLabel` produces for both columns.

    Each column is labelled on its own (single-element column list, LCS
    disabled) and the two label sets are intersected.
    """
    firstLabels, _ = getLabel([column1], limit=10, useStemming=True, useLCS=False)
    secondLabels, _ = getLabel([column2], limit=10, useStemming=True, useLCS=False)
    return firstLabels[column1].intersection(secondLabels[column2])

def baseline_conceptnet_function(column1, column2, function):
    """Baseline: labels a single ConceptNet relation returns for both columns.

    Args:
        column1, column2: column names to compare.
        function: one of 'synonym', 'relatedTo', 'derivedFrom', 'isA'.

    Returns:
        Intersection of the two label sets. An unrecognised `function` yields
        an empty set (the original initialised its defaults with `{}`, a dict,
        and crashed with AttributeError on `.intersection`).
    """
    ## functions = ['synonym', 'relatedTo', 'derivedFrom','isA']
    lookups = {
        'synonym': getSynonym,
        'relatedTo': getRelatedTo,
        'derivedFrom': getDerivedFrom,
        'isA': getIsA,
    }
    lookup = lookups.get(function)
    if lookup is None:
        return set()
    set1, _ = lookup(column1, limit=10, dropWord=True, useCache=True)
    set2, _ = lookup(column2, limit=10, dropWord=True, useCache=True)
    return set1.intersection(set2)

def baseline_conceptnet(column1, column2, mode):
    """Baseline: labels that `getConceptNet` (in the given `mode`) yields for both columns."""
    first, second = [
        getConceptNet(column, mode, limit=10, dropWord=True, useCache=True)
        for column in (column1, column2)
    ]
    return first.intersection(second)

def baseline_wikipedia(column1, column2):
    """Baseline: Wikipedia-derived labels (stemming on, 2 results) common to both columns."""
    (first, _), (second, _) = [
        getAmbiguityFromWikipedia(column, useStemming=True, results=2, useCache=True)
        for column in (column1, column2)
    ]
    return first.intersection(second)

In [None]:
# Test Labels Baseline
# Smoke test: label each column on its own with getLabel (LCS disabled) and
# print the labels the two columns have in common.
column1 = "soccer_player"
column2 = "player"
labels1, _ = getLabel([column1], limit=10, useStemming=True, useLCS=False)
labels2, _ = getLabel([column2], limit=10, useStemming=True, useLCS=False)
#print(labels1[column1])
#print(labels2[column2])
commonLabels = labels1[column1].intersection(labels2[column2])
print("Labels: ", commonLabels)

In [None]:
# Test Synonyms Baseline
# Smoke test: print the ConceptNet synonyms of each column, then their intersection.
column1 = "soccer_player"
column2 = "player"
synonym1, _ = getSynonym(column1, limit=10, dropWord=True, useCache=True)
synonym2, _ = getSynonym(column2, limit=10, dropWord=True, useCache=True)
print(synonym1)
print(synonym2)
commonLabels = synonym1.intersection(synonym2)
print("Labels: ", commonLabels)

In [None]:
# Test ConceptNet Baseline
# Smoke test: for each column compute the combined ConceptNet label set in
# both modes, then print what the two columns share under each mode.
column1 = "soccer_player"
column2 = "player"
conceptnet1Intersection = getConceptNet(column1, mode='intersection', limit=10, dropWord=True, useCache=True)
conceptnet2Intersection = getConceptNet(column2, mode='intersection', limit=10, dropWord=True, useCache=True)
print(conceptnet1Intersection)
print(conceptnet2Intersection)
conceptnet1Union = getConceptNet(column1, mode='union', limit=10, dropWord=True, useCache=True)
conceptnet2Union = getConceptNet(column2, mode='union', limit=10, dropWord=True, useCache=True)
print(conceptnet1Union)
print(conceptnet2Union)
# Labels common to both columns, per mode.
commonLabelsIntersection = conceptnet1Intersection.intersection(conceptnet2Intersection)
commonLabelsUnion = conceptnet1Union.intersection(conceptnet2Union)
print(commonLabelsIntersection)
print(commonLabelsUnion)

In [None]:
# Test Wikipedia Baseline
# Smoke test: print the Wikipedia-derived labels (stemming on, 2 results per
# search) that the two columns have in common.
column1 = "soccer_player"
column2 = "player"
wikipediaColumn1,_ = getAmbiguityFromWikipedia(column1, useStemming=True, results=2, useCache=True)
wikipediaColumn2,_ = getAmbiguityFromWikipedia(column2, useStemming=True, results=2, useCache=True)
#print(wikipediaColumn1)
#print(wikipediaColumn2)
commonLabels = wikipediaColumn1.intersection(wikipediaColumn2)
print("Labels: ", commonLabels)

In [None]:
# Test LCS Baseline
# Smoke test: print the longest common substring of the two column names.
# Unlike the other baselines, `lcs` is a single string, not a set of labels.
column1 = "soccer_player"
column2 = "player"
lcs = getLCS(column1, column2)
print("Labels:", lcs)

In [None]:
def extractValues(x):
    """Parse the textual form of a list (e.g. "['a', 'b']") into a set of strings.

    Square brackets are removed, the rest is split on commas, and every piece
    is stripped and freed of single quotes. An input that is empty once the
    brackets are removed yields an empty set.
    """
    inner = x.replace('[', '').replace(']', '')
    if not inner:
        return set()
    return {piece.strip().replace("'", "") for piece in inner.split(sep=',')}

# Run all baselines

In [None]:
## LOAD TEST DATA
file = './../data/test-task1-manual.tsv'
# Read every line up front; the `with` block closes the file afterwards.
with open(file, 'r') as fileTask:
    lines = fileTask.readlines()
examples = []
count = 0
for line in lines:
    column1, column2, labelsActual, _ = getValues(line)
    # Run every baseline on the pair.
    labelsPrediction = baseline_labels(column1, column2)
    synonym = baseline_conceptnet_function(column1, column2, 'synonym')
    relatedTo = baseline_conceptnet_function(column1, column2, 'relatedTo')
    derivedFrom = baseline_conceptnet_function(column1, column2, 'derivedFrom')
    isA = baseline_conceptnet_function(column1, column2, 'isA')
    conceptNetUnion = baseline_conceptnet(column1, column2, 'union')
    conceptNetIntersection = baseline_conceptnet(column1, column2, 'intersection')
    # Do NOT name this variable `wikipedia`: that would overwrite the imported
    # `wikipedia` module, and every later uncached search inside
    # getAmbiguityFromWikipedia would then fail silently.
    wikipediaLabels = baseline_wikipedia(column1, column2)
    lcs = getLCS(column1, column2)
    # Tuple layout: (col1, col2, actual labels, then one entry per baseline).
    t = (column1, column2, labelsActual, labelsPrediction, synonym, relatedTo, derivedFrom, isA, conceptNetUnion, conceptNetIntersection, wikipediaLabels, lcs)
    examples.append(t)
    count += 1
    print("Examples processed: ", count)

In [None]:
def accuracy(targets, predictions):
    """Fraction of examples whose prediction agrees with the target.

    An example counts as a hit when both target and prediction are empty, or
    when the target is non-empty and shares at least one value with the
    prediction.

    Args:
        targets: sequence of sets of expected labels.
        predictions: sequence of iterables of predicted labels, aligned with `targets`.

    Returns:
        hits / len(targets), or 0 when `targets` is empty (consistent with the
        zero guards in `precision` and `recall`; this used to raise
        ZeroDivisionError).
    """
    total = len(targets)
    if total == 0:
        return 0
    hits = 0
    for tgt, pred in zip(targets, predictions):
        if len(tgt) == 0:
            if len(pred) == 0:
                hits += 1
        elif len(tgt.intersection(pred)) > 0:
            hits += 1
    return hits / total

def countStatsLabelsStats(targets, predictions):
    """Count the examples with a non-empty target and the examples with a matching prediction.

    Returns:
        (count_pairs, hits): number of targets that are non-empty, and number
        of examples whose target and prediction share at least one value.
    """
    count_pairs = 0
    hits = 0
    for tgt, pred in zip(targets, predictions):
        count_pairs += 1 if len(tgt) > 0 else 0
        if tgt.intersection(pred):
            hits += 1
    return count_pairs, hits
            
def countStats(targets, predictions):
    """Tally the outcome of every (target, prediction) pair.

    Outcomes:
        tn         -- target empty, prediction empty
        fp         -- target empty, prediction non-empty
        fn         -- target non-empty, prediction empty
        tp         -- both non-empty and sharing at least one value
        wrongLabel -- both non-empty but with nothing in common

    Returns:
        (tn, fp, tp, fn, wrongLabel)
    """
    tn = fp = tp = fn = wrongLabel = 0
    for tgt, pred in zip(targets, predictions):
        targetEmpty = len(tgt) == 0
        predictionEmpty = len(pred) == 0
        if targetEmpty:
            if predictionEmpty:
                tn += 1
            else:
                fp += 1
        elif predictionEmpty:
            fn += 1
        elif len(tgt.intersection(pred)) > 0:
            tp += 1
        else:
            wrongLabel += 1
    return tn, fp, tp, fn, wrongLabel

def precision(targets, predictions):
    """Label-level and binary precision of the predictions.

    Returns:
        (precision, wrongLabel, precisionBinary) where
          precision       = tp / (tp + fp)
          precisionBinary = (tp + wrongLabel) / (tp + wrongLabel + fp)
        using the counts from `countStats`. Each ratio is 0 when its own
        denominator is 0. (Previously both were forced to 0 whenever
        tp + fp == 0, even if wrong-label predictions made the binary
        denominator non-zero.)
    """
    tn, fp, tp, fn, wrongLabel = countStats(targets, predictions)
    strictDenominator = tp + fp
    binaryDenominator = tp + wrongLabel + fp
    strictPrecision = tp / strictDenominator if strictDenominator else 0
    precisionBinary = (tp + wrongLabel) / binaryDenominator if binaryDenominator else 0
    return strictPrecision, wrongLabel, precisionBinary
        
def recall(targets, predictions):
    """Compute strict and binary recall from the countStats() tallies.

    Both metrics share the denominator ``tp + fn + wrongLabel``: a pair that
    was detected but given a wrong label counts as a miss for strict recall
    (numerator ``tp``) and as a detection for binary recall
    (numerator ``tp + wrongLabel``).

    Returns:
        ``(recall, wrongLabel, recallBinary)``; both metrics are 0 when the
        denominator is zero.
    """
    tn, fp, tp, fn, wrongLabel = countStats(targets, predictions)
    denominator = tp + fn + wrongLabel
    # Guard on the denominator actually used below. Checking only tp + fn
    # would zero the binary recall when every labelled pair is a wrongLabel.
    if denominator == 0:
        return 0, wrongLabel, 0
    strictRecall = tp / denominator
    recallBinary = (tp + wrongLabel) / denominator
    return strictRecall, wrongLabel, recallBinary


In [None]:
# Split the per-pair example tuples into one list per field so every baseline
# can be scored against the same `actuals`. Field order in each tuple:
# (column1, column2, actual labels, predicted labels, synonym, relatedTo,
#  derivedFrom, isA, conceptNet union, conceptNet intersection, wikipedia, lcs).
labels = []
actuals = []
synonyms = []
relatedsTo = []
derivedsFrom = []
isAs = []
conceptNetUnions = []
conceptNetIntersections = []
wikipedias = []
lcss = []
columns1 = []
columns2 = []
for example in examples:
    # Loop temporaries carry a `Value` suffix so this cell does not rebind
    # module-level names -- in particular `wikipedia`, which is the imported
    # module. The slice keeps tolerance for tuples with extra trailing fields.
    (column1Value, column2Value, actualValue, labelsValue, synonymValue,
     relatedToValue, derivedFromValue, isAValue, conceptNetUnionValue,
     conceptNetIntersectionValue, wikipediaValue, lcsValue) = example[:12]
    columns1.append(column1Value)
    columns2.append(column2Value)
    # Ground-truth labels go through extractValues() before being compared.
    actuals.append(extractValues(actualValue))
    labels.append(labelsValue)
    synonyms.append(synonymValue)
    relatedsTo.append(relatedToValue)
    derivedsFrom.append(derivedFromValue)
    isAs.append(isAValue)
    conceptNetUnions.append(conceptNetUnionValue)
    conceptNetIntersections.append(conceptNetIntersectionValue)
    wikipedias.append(wikipediaValue)
    lcss.append(lcsValue)

In [None]:
def _reportBaseline(name, predictions):
    """Score one baseline against `actuals` and print its tab-separated row.

    The row follows the header column order: name, accuracy, precision,
    recall, binary precision, binary recall, hits, ambiguous pairs.

    Returns:
        (accuracy, precision, recall, precisionBinary, recallBinary,
         count_pairs, hits) so the caller can keep the individual values.
    """
    accuracyValue = accuracy(actuals, predictions)
    precisionValue, _, precisionBinaryValue = precision(actuals, predictions)
    recallValue, _, recallBinaryValue = recall(actuals, predictions)
    pairCount, hitCount = countStatsLabelsStats(actuals, predictions)
    # Header order is 'Hits' then 'Ambiguous Pairs', so hits is printed first.
    print(name, accuracyValue, precisionValue, recallValue,
          precisionBinaryValue, recallBinaryValue, hitCount, pairCount, sep="\t")
    return (accuracyValue, precisionValue, recallValue,
            precisionBinaryValue, recallBinaryValue, pairCount, hitCount)


print("Baseline","Accuracy", "Precision", 'Recall','Precision Binary', 'Recall Binary','Hits','Ambiguous Pairs', sep="\t")

(accuracyLabels, precisionLabels, recallLabels,
 precisionBinary_labels, recallBinary_labels,
 count_pairs, hits) = _reportBaseline("labels", labels)

(accuracysynonyms, precisionsynonyms, recallsynonyms,
 precisionBinary_synonyms, recallBinary_synonyms,
 count_pairs, hits) = _reportBaseline("synonyms", synonyms)

(accuracyrelatedTo, precisionrelatedTo, recallrelatedTo,
 precisionBinary_relatedsTo, recallBinary_relatedsTo,
 count_pairs, hits) = _reportBaseline("relatedsTo", relatedsTo)

(accuracyderivedsFrom, precisionderivedsFrom, recallderivedsFrom,
 precisionBinary_derivedsFrom, recallBinary_derivedsFrom,
 count_pairs, hits) = _reportBaseline("derivedsFrom", derivedsFrom)

# The isA binary precision gets its own name so it no longer overwrites
# precisionBinary_derivedsFrom.
(accuracyisAs, precisionisAs, recallisAs,
 precisionBinary_isAs, recallBinary_isAs,
 count_pairs, hits) = _reportBaseline("isAs", isAs)

(accuracyconceptNetUnions, precisionconceptNetUnions, recallconceptNetUnions,
 precisionBinary_conceptNetUnions, recallBinary_conceptNetUnions,
 count_pairs, hits) = _reportBaseline("conceptNetUnions", conceptNetUnions)

(accuracyconceptNetIntersections, precisionconceptNetIntersections, recallconceptNetIntersections,
 precisionBinary_conceptNetIntersections, recallBinary_conceptNetIntersections,
 count_pairs, hits) = _reportBaseline("conceptNetIntersections", conceptNetIntersections)

(accuracywikipedias, precisionwikipedias, recallwikipedias,
 precisionBinary_wikipedias, recallBinary_wikipedias,
 count_pairs, hits) = _reportBaseline("wikipedias", wikipedias)

(accuracylcss, precisionlcss, recalllcss,
 precisionBinary_lcss, recallBinary_lcss,
 count_pairs, hits) = _reportBaseline("lcss", lcss)



In [None]:
# Dump, one line per column pair: both column names, the expected labels,
# the predicted labels and the LCS baseline output.
for pairRow in zip(columns1, columns2, actuals, labels, lcss):
    print(*pairRow)