In [1]:
import gzip
import itertools
from collections import defaultdict

In [2]:
import numpy as np
import random

In [3]:
class PredictorBase:
    """
    Interface that every predictor used in the experiment implements.

    The experiment first hands a few labelled instances to `train` and then
    hands validation instances to `predict`. All methods here are no-ops that
    return None; concrete predictors override them.
    """

    def __init__(self):
        """
        The base class keeps no state, so there is nothing to initialise.
        """

    def clear(self):
        """
        Drop any information kept from previous batches.
        """

    def train(self, X, y):
        """
        Fit on a few entities `X` with labels `y`.
        Override this method in a real implementation.
        """

    def predict(self, X):
        """
        Predict the classes of the given entities `X`.
        """

In [4]:
class Evaluator:
    """
    Batched evaluation harness over a gzipped, tab-separated entities file.

    Rows are streamed from the file in batches. Inside a batch the rows are
    grouped by their first column, which is used as the class label. Every
    group that is large enough is split into a test part and a train part;
    a model is fitted on the train part and scored on the test part.

    The instance owns an open file handle: call `close()` when done, or use
    the evaluator as a context manager (`with Evaluator(...) as eva:`).
    """

    # Default number of rows read per batch when `read_batch` gets no size.
    batch_size = 10
    # Share of every label group that goes to the test split.
    test_fraction = 0.3

    @classmethod
    def read_lines(cls, fd):
        """
        Lazily yield every line of `fd` as a list of tab-separated fields.

        `fd` must iterate over `bytes` lines; each line is decoded, stripped
        of newline characters at both ends and split on tabs.
        """
        for line in fd:
            yield line.decode().strip('\n').split('\t')

    def __init__(self, filename="./shuffled_dedup_entities.tsv.gz"):
        """
        Open the gzipped file `filename` and prepare a lazy row reader.
        """
        self.fd = gzip.open(filename, 'r')
        self.reader = self.read_lines(self.fd)

    def close(self):
        """
        Close the underlying gzip file.
        """
        self.fd.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def read_batch(self, size=None):
        """
        Read the next batch of rows and split it into train and test groups.

        `size` is the number of rows to consume from the reader; a falsy value
        falls back to `batch_size`. Fewer rows are used if the reader runs out.

        Returns a pair `(train_groups, test_groups)` of dicts mapping a label
        to its list of rows. A label appears in both dicts or in neither:
        groups for which `len(group) * test_fraction` does not exceed 1 are
        dropped from the batch entirely.
        """
        batch = list(itertools.islice(self.reader, size or self.batch_size))

        groups = defaultdict(list)
        for entity in batch:
            groups[entity[0]].append(entity)

        train_groups = {}
        test_groups = {}
        for etype, entities in groups.items():
            if len(entities) * self.test_fraction > 1:
                # The first `test_size` rows of the group are held out for
                # testing, the remainder is used for training.
                test_size = int(len(entities) * self.test_fraction)
                test_groups[etype] = entities[:test_size]
                train_groups[etype] = entities[test_size:]

        return train_groups, test_groups

    @classmethod
    def prepare_data(cls, group):
        """
        Flatten a `{label: rows}` dict into shuffled, aligned `(X, y)` tuples.

        Every sample in `X` is the pair `(row[1], row[3])` (presumably the
        entity text and its context; confirm against the data file) and the
        matching item in `y` is the label of the group the row came from.
        Samples are shuffled with the global `random` generator.

        Returns two empty tuples when `group` holds no rows.
        """
        samples = [
            ((entity[1], entity[3]), label)
            for label, entities in group.items()
            for entity in entities
        ]

        # `zip(*[])` yields nothing to unpack, so an empty batch is handled
        # explicitly instead of failing with an obscure ValueError.
        if not samples:
            return (), ()

        random.shuffle(samples)

        X, y = zip(*samples)

        return X, y

    def eval_batched(self, model, metric, entities_count, count):
        """
        Train and score `model` on `count` consecutive batches.

        For each batch of `entities_count` rows the model is trained on the
        train split and asked to predict the test split; the score is
        `metric(predictions, true_labels)`.

        Returns the mean of the per-batch scores.
        """
        metrics = []
        for _ in range(count):
            # Read from this evaluator, not from a module-level instance.
            train, test = self.read_batch(entities_count)
            X, y = self.prepare_data(train)
            X_test, y_test = self.prepare_data(test)
            model.train(X, y)
            pred = model.predict(X_test)
            metrics.append(metric(pred, y_test))
        return np.mean(metrics)
    

In [5]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score

In [6]:
def concat_context(X):
    """
    Join the first two elements of every item in `X` with a single space.

    Returns a numpy array with one joined string per item.
    """
    joined = [item[0] + " " + item[1] for item in X]
    return np.array(joined)

class KNNBaseline(PredictorBase):
    """
    Baseline predictor: a k-nearest-neighbours classifier over word counts.

    Each input item is first turned into one string by `concat_context`, then
    vectorised with `CountVectorizer` (English stop words removed) and
    classified by a brute-force `KNeighborsClassifier` using cosine distance.
    """

    def __init__(self):
        self.model = None
        self.clear()

    def clear(self):
        """
        Replace the current pipeline with a fresh, unfitted one.
        """
        steps = [
            ('concat_context', FunctionTransformer(concat_context)),
            ('vectorizer', CountVectorizer(stop_words='english')),
            ('cls', KNeighborsClassifier(metric='cosine', algorithm='brute')),
        ]
        self.model = Pipeline(steps)

    def train(self, X, y):
        """
        Fit the pipeline on items `X` with labels `y`.
        """
        self.model.fit(X, y)

    def predict(self, X):
        """
        Return the pipeline's predicted labels for items `X`.
        """
        predictions = self.model.predict(X)
        return predictions
        
        

In [7]:
# Create the evaluator; with no argument it opens the default
# "./shuffled_dedup_entities.tsv.gz" file and keeps it open for streaming.
eva = Evaluator()

In [8]:
# Evaluate the KNN baseline on 50 batches of up to 1000 rows each and keep the
# mean micro-averaged F1 score over the batches.
# NOTE(review): eval_batched invokes metric(pred, y_test), so the predictions
# land in f1_score's first parameter (y_true) -- confirm this is harmless for
# the micro average.
mean_score = eva.eval_batched(
    model=KNNBaseline(),
    metric=lambda x, y: f1_score(x, y, average='micro'),
    entities_count=1000,
    count=50
)

In [9]:
# Bare expression: shows the mean score as the notebook cell's output.
mean_score

0.68707437656001968