Many factors can help us decide whether a term matches with an entity, e.g. the number of word/character overlaps in names and baike definitions, type of query, word association etc. I plan to use a linear model to glue these factors together.

In this framework, I build a logistic regression model that assigns a probability to whether an entity matches a given term. The training data is every (query term, candidate entity) pair together with its gold label; decoding ranks the entities by their matching probabilities.

Questions:

1. How well can we predict in every category?
2. What features would work well? Should we use other categories as part of the training data?
3. What info seems to be missing?

To answer these questions:

1. I cross-validate on all data, using features either combined with category labels or not
  1. number of word/characters with an overlap
    1. In name
    2. In baike
  2. Word/characters that overlapped
    1. In name
    2. In baike
2. I record the cross-validation results in a report for error analysis.

In [27]:
%matplotlib inline
import matplotlib
from matplotlib import pyplot

# Imports
import sklearn
from sklearn.metrics import average_precision_score
import numpy as np
import random
import copy
from os import path
import jinja2
from IPython.display import display, HTML
import unicodecsv as csv
from sklearn import feature_extraction
from sklearn import linear_model
from sklearn import cross_validation

# Jinja2 environment that loads templates from ../html (resolved against the
# current working directory). EvaluateByRank renders 'error_analysis.html'
# through it when asked to export a report.
template_dir = path.abspath('../html')
loader = jinja2.FileSystemLoader(template_dir)
environment = jinja2.Environment(loader=loader)


def apk(actual, predicted, k=None):
    """
    Computes the average precision at k.

    This function computes the average precision at k between two lists of
    items.

    Parameters
    ----------
    actual : list
             A list of elements that are to be predicted (order doesn't matter)
    predicted : list
                A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements. Defaults to
        ``len(predicted)``.

    Returns
    -------
    score : double
            The average precision at k over the input lists. 0.0 when
            ``actual`` is empty or when no prediction is scored (``k`` is 0,
            e.g. an empty ``predicted`` with the default ``k``).
    """
    if k is None:
        k = len(predicted)

    # Nothing to find, or nothing scored: the normaliser below would be zero
    # (or negative), so answer directly instead of dividing by it.
    if not actual or k <= 0:
        return 0.0

    if len(predicted) > k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i, p in enumerate(predicted):
        # A repeated prediction only counts the first time it appears.
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)

    return score / min(len(actual), k)

# The four query categories, and the absolute paths of each category's
# training file, dev file and output file (same order as sub_tasks).
sub_tasks = ['celebrity', 'movie', 'restaurant', 'tvShow']
trainset_locs = [path.abspath('../data/TRAIN SET/%s.TRAINSET.txt' % t) for t in sub_tasks]
devset_locs = [path.abspath('../data/DEV SET/%s.DEVSET.txt' % t) for t in sub_tasks]
output_locs = [path.abspath('../output/%s.txt' % t) for t in sub_tasks]


# loading datasets
def LoadInData(data_loc, test_data=False):
    """Parse a GBK-encoded, tab-separated query file.

    Each line is ``query<TAB>item<TAB>item...``. Lines without any item
    (including blank lines) are skipped.

    Parameters
    ----------
    data_loc : str
        Path of the file to read.
    test_data : bool, optional
        When False (default) every item is ``entity:label``; the text after
        the last colon is parsed as an int, so entity names may themselves
        contain colons. When True every item is taken verbatim as the entity
        and its label is None.

    Returns
    -------
    list
        ``(query, [(entity, label), ...])`` tuples in file order.

    Raises
    ------
    ValueError
        If a labelled item does not end in ``:<integer>``.
    """
    import io

    # The context manager closes the handle (the original leaked it), and
    # io.open decodes on both Python 2 and 3. Its universal-newline mode also
    # keeps a '\r' from a CRLF file off the last entity of each line.
    with io.open(data_loc, encoding='gbk') as infile:
        lines = infile.read().split('\n')

    parsing_result = []
    for line in lines:
        terms = line.split('\t')
        items = []
        for i in terms[1:]:
            if test_data:
                ent, score = i, None
            else:
                # Split on the LAST colon only: entity names may contain ':'.
                ent, _, raw_score = i.rpartition(':')
                score = int(raw_score)
            items.append((ent, score))
        if not items:
            continue
        parsing_result.append((terms[0], items))
    return parsing_result


# Celebrity training queries, used as the running example
cel_train_data = LoadInData(trainset_locs[sub_tasks.index('celebrity')])


def EvaluateByRank(strategy, seed=None, train_data=cel_train_data, title="", export_report_at=None):
    """Score a ranking strategy by mean average precision over labelled queries.

    Parameters
    ----------
    strategy : callable
        Called as ``strategy(query, candidates)``; returns the entities it
        keeps, best first.
    seed : random.Random, optional
        Generator used to shuffle each query's candidates before they reach
        the strategy (presumably so the file order cannot help it). A fresh
        ``random.Random()`` is used when omitted.
    train_data : list, optional
        ``(query, [(entity, label), ...])`` tuples; entities labelled 1 are
        the gold answers.
    title : str, optional
        Title passed to the report template.
    export_report_at : str, optional
        When given, the 'error_analysis.html' template is rendered with the
        per-query results and written (UTF-8) to this path.

    Returns
    -------
    float
        Mean of the per-query average precision; 0.0 when ``train_data``
        holds no queries.
    """
    if seed is None:
        seed = random.Random()
    score_results = []

    report_data = {"query_results" : [], "title" : title}

    for q_id, (query, gs_result) in enumerate(train_data):
        # Shuffle a copy so the caller's data keeps its order.
        shuffled_result = copy.copy(gs_result)
        seed.shuffle(shuffled_result)
        my_result = strategy(query, [i for (i, t) in shuffled_result])
        gold_entities = [i for i, t in gs_result if t == 1]

        report_item = {
            "term" : query,
            "id" : q_id,
            "ranked" : [{'is_gs' : (r in gold_entities), 'entity' : r}
                        for r in my_result],
        }
        # k is the full candidate count, so the whole ranking is scored.
        map_score = apk(gold_entities, my_result, len(shuffled_result))
        report_item['MAP'] = map_score
        report_data["query_results"].append(report_item)

        score_results.append(map_score)

    # With no queries there is nothing to average; avoid dividing by zero.
    if score_results:
        map_value = sum(score_results) / len(score_results)
    else:
        map_value = 0.0

    report_data['map_value'] = map_value
    if export_report_at is not None:
        # Render first: a template error must not leave a truncated file behind.
        html = environment.get_template('error_analysis.html').render(report_data)
        with open(export_report_at, 'w') as ofile:
            ofile.write(html.encode('utf8'))

    return map_value

def EvaluateAllByRank(strategy, seed=None):
    """Run EvaluateByRank on the training set of every sub-task.

    The same random generator is shared across all sub-tasks; a fresh
    ``random.Random()`` is created when ``seed`` is omitted.

    Returns a dict mapping each sub-task name to its score.
    """
    rng = random.Random() if seed is None else seed
    return dict(
        (sub_task, EvaluateByRank(strategy, rng, LoadInData(train_loc)))
        for sub_task, train_loc in zip(sub_tasks, trainset_locs)
    )

def OrderByScore(func):
    """Turn a scoring function into a ranking strategy.

    ``func(q, r)`` scores one candidate ``r`` for query ``q``. The returned
    callable takes ``(q, results)`` and returns the candidates ordered by
    descending score; equal scores are ordered by descending candidate value.
    """
    def wrappee(q, results):
        def sort_key(candidate):
            return (func(q, candidate), candidate)
        return sorted(results, key=sort_key, reverse=True)
    return wrappee

def ExportResultsWithStrategy(strategy):
    """Rank every dev-set query of every sub-task and write the rankings out.

    For each sub-task the dev file is loaded unlabelled, each query's
    candidates are ranked with ``strategy(query, candidates)``, and one
    GBK-encoded line ``query<TAB>entity<TAB>...`` is written per query to the
    sub-task's output file.

    NOTE: a later cell redefines this name with a per-task signature.
    """
    for testdata_loc, output_filename in zip(devset_locs, output_locs):
        testdata = LoadInData(testdata_loc, test_data=True)
        with open(output_filename, 'w') as ofile:
            for query, entries in testdata:
                candidates = [entity for entity, _ in entries]
                ranked = strategy(query, candidates)
                row = '\t'.join([query] + ranked)
                ofile.write(row.encode('gbk') + '\n')
                
def BuildCutoffStrategy(strategy, cutoff):
    """Wrap ``strategy`` so that only its first ``cutoff`` results are kept.

    The wrapper forwards all positional and keyword arguments unchanged.
    """
    def truncated(*args, **kw):
        return strategy(*args, **kw)[:cutoff]
    return truncated

First, I'll try the aforementioned feature sets by feeding them into an ML framework.

In [3]:
# Map each entity name to its baike summary, read from the CSV columns
# 'entity_name' and 'summary'. A name that appears twice keeps its last summary.
baike_csv_loc = '../entities_db/baike.csv'
entity_summary_map = {}
with open(baike_csv_loc) as infile:
    entity_summary_map.update(
        (row['entity_name'], row['summary']) for row in csv.DictReader(infile))

In [17]:
def ExtractNCharOverlapFeature(typed_query, entity):
    """Count the distinct characters shared by the query and the entity name.

    Parameters
    ----------
    typed_query : tuple
        ``(q_type, query)``; ``q_type`` is prepended to the feature name.
    entity : str
        Candidate entity name.

    Returns
    -------
    list
        A single ``(q_type + 'NCharOverlap', count)`` pair.
    """
    # Explicit unpacking replaces the tuple parameter ``(q_type, query)``,
    # which is a syntax error on Python 3; positional callers are unaffected.
    q_type, query = typed_query
    return [(q_type + 'NCharOverlap', len(set(query).intersection(entity)))]

def ExtractCharOverlapFeature(typed_query, entity):
    """Emit one indicator feature per distinct character shared by the query
    and the entity name.

    Parameters
    ----------
    typed_query : tuple
        ``(q_type, query)``; ``q_type`` is prepended to every feature name.
    entity : str
        Candidate entity name.

    Returns
    -------
    list
        ``(q_type + 'CharOverlap=<char>', 1)`` pairs, in no particular order;
        empty when nothing overlaps.
    """
    # Explicit unpacking replaces the tuple parameter ``(q_type, query)``,
    # which is a syntax error on Python 3; positional callers are unaffected.
    q_type, query = typed_query
    return [(q_type + 'CharOverlap=%s' % i, 1) for i in set(query).intersection(entity)]

def ExtractNSumCharOverlapFeature(typed_query, entity):
    """Count the distinct query characters that occur in the entity's summary.

    Parameters
    ----------
    typed_query : tuple
        ``(q_type, query)``; ``q_type`` is prepended to the feature name.
    entity : str
        Candidate entity name, looked up in ``entity_summary_map``.

    Returns
    -------
    list
        ``[(q_type + 'NSumCharOverlap', count)]``, or
        ``[(q_type + 'NO_SUMMARY', 1)]`` when the entity has no summary.
    """
    # Explicit unpacking replaces the tuple parameter ``(q_type, query)``,
    # which is a syntax error on Python 3; positional callers are unaffected.
    q_type, query = typed_query
    # Only "no summary available" (missing key or a None value) falls back to
    # the NO_SUMMARY feature. The former bare ``except:`` also hid genuine
    # errors such as a NameError or a KeyboardInterrupt.
    summary = entity_summary_map.get(entity)
    if summary is None:
        return [(q_type + 'NO_SUMMARY', 1)]
    return [(q_type + 'NSumCharOverlap', len(set(query).intersection(summary)))]

def ExtractSumCharOverlapFeature(typed_query, entity):
    """Emit one indicator feature per distinct query character that occurs in
    the entity's summary.

    Parameters
    ----------
    typed_query : tuple
        ``(q_type, query)``; ``q_type`` is prepended to every feature name.
    entity : str
        Candidate entity name, looked up in ``entity_summary_map``.

    Returns
    -------
    list
        ``(q_type + 'SumCharOverlap=<char>', 1)`` pairs, in no particular
        order, or ``[(q_type + 'NO_SUMMARY', 1)]`` when the entity has no
        summary.
    """
    # Explicit unpacking replaces the tuple parameter ``(q_type, query)``,
    # which is a syntax error on Python 3; positional callers are unaffected.
    q_type, query = typed_query
    # Only "no summary available" (missing key or a None value) falls back to
    # the NO_SUMMARY feature. The former bare ``except:`` also hid genuine
    # errors such as a NameError or a KeyboardInterrupt.
    summary = entity_summary_map.get(entity)
    if summary is None:
        return [(q_type + 'NO_SUMMARY', 1)]
    return [(q_type + 'SumCharOverlap=%s' % i, 1) for i in set(query).intersection(summary)]
    
def CombinedModel(*models):
    """Combine several feature extractors into one.

    The returned callable passes its arguments to every extractor in
    ``models``, in order, and returns all of their results concatenated into
    a single list.
    """
    def wrappee(*args, **kw):
        return [feature for model in models for feature in model(*args, **kw)]
    return wrappee

# Single extractor that concatenates the name-overlap and summary-overlap
# features (counts and per-character indicators) for a (q_type, query) and
# an entity.
ExtractAllFeatures = CombinedModel(ExtractNCharOverlapFeature,
                                  ExtractCharOverlapFeature,
                                  ExtractNSumCharOverlapFeature,
                                  ExtractSumCharOverlapFeature)

In [24]:
# Pool the labelled queries of every sub-task, tagging each with its task name.
all_train_data = [
    (q_type, example)
    for q_type, trainset_loc in zip(sub_tasks, trainset_locs)
    for example in LoadInData(trainset_loc)
]

# One training row per (query, candidate entity) pair: feature dict + gold label.
ml_train_data = [
    (dict(ExtractAllFeatures((q_type, query), ent)), gs)
    for q_type, (query, entity_info_list) in all_train_data
    for ent, gs in entity_info_list
]

D = [features for features, label in ml_train_data]
Y = np.array([int(label) for features, label in ml_train_data])

# Turn the feature dicts into a matrix; `v` is reused later to encode
# candidates at ranking time.
v = feature_extraction.DictVectorizer()
X = v.fit_transform(D)

logistic_regression = linear_model.LogisticRegression()
# NOTE(review): the folds are drawn over individual (query, entity) rows, so
# rows of one query can end up on both sides of a split - confirm this is
# acceptable for the reported AUC.
scores = cross_validation.cross_val_score(
    logistic_regression, X, Y, scoring='roc_auc', cv=10)
print("AUC %.2f%%+-%.2f%%" % (100 * np.mean(scores), 100 * np.std(scores)))

# The model used for ranking is fitted on all rows.
logreg_model = logistic_regression.fit(X, Y)

AUC 62.74%+-0.99%


In [30]:
def RankderByModelProb(model_type):
    """Build a ranking strategy that orders candidates by model probability.

    ``model_type`` is the sub-task name used as the feature prefix. Each
    candidate is scored with the second column of
    ``logreg_model.predict_proba`` (the probability of label 1 when the
    training labels are 0/1), and candidates are ranked through OrderByScore.
    """
    def match_probability(q, r):
        features = dict(ExtractAllFeatures((model_type, q), r))
        probabilities = logreg_model.predict_proba(v.transform([features]))
        return probabilities[0][1]

    return OrderByScore(match_probability)

def ExportResultsWithStrategy(strategy, model_type):
    """Rank the dev-set queries of one sub-task and write the rankings out.

    ``model_type`` must be one of ``sub_tasks``; it selects the dev file to
    read and the output file to write. Each query produces one GBK-encoded
    line ``query<TAB>entity<TAB>...`` with the entities in the order returned
    by ``strategy(query, candidates)``.
    """
    task_index = sub_tasks.index(model_type)
    testdata = LoadInData(devset_locs[task_index], test_data=True)
    with open(output_locs[task_index], 'w') as ofile:
        for query, entries in testdata:
            candidates = [entity for entity, _ in entries]
            ranked = strategy(query, candidates)
            row = '\t'.join([query] + ranked)
            ofile.write(row.encode('gbk') + '\n')
            
# Export a ranking file for every sub-task, each scored with features
# prefixed by its own task name.
for t in sub_tasks:
    ExportResultsWithStrategy(RankderByModelProb(t), t)

In [33]:
# Re-export the restaurant rankings, keeping at most the 70 top-ranked
# entities per query.
ExportResultsWithStrategy(BuildCutoffStrategy(RankderByModelProb('restaurant'), 70), 'restaurant')