In [0]:


"""Functions for loading the RumorEval dataset.

Requires that the source files are manually placed in the `EXTERNAL_DATA_DIR`
folder. See the README for details.

Data is read directly from the .zip-files without extracting them, because this
was deemed more elegant.
"""

import json
from enum import Enum
from itertools import chain
from pathlib import Path
from sys import exit
from time import time
from typing import Dict, List, Optional
from zipfile import ZipFile

from tokenizer.tokenizer import RedditTokenizer, TweetTokenizer

DATA_DIR = Path('data')

EXTERNAL_DATA_DIR = DATA_DIR / 'external'
ELMO_WEIGHTS_FILE = (EXTERNAL_DATA_DIR
                     / 'elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5')
ELMO_OPTIONS_FILE = (EXTERNAL_DATA_DIR
                     / 'elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json')
# ELMO_WEIGHTS_FILE = (EXTERNAL_DATA_DIR
#                      / 'elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5')
# ELMO_OPTIONS_FILE = (EXTERNAL_DATA_DIR
#                      / 'elmo_2x1024_128_2048cnn_1xhighway_options.json')
TRAINING_DATA_ARCHIVE_FILE = (EXTERNAL_DATA_DIR
                              / 'rumoureval-2019-training-data.zip')
TEST_DATA_ARCHIVE_FILE = (EXTERNAL_DATA_DIR
                          / 'rumoureval-2019-test-data.zip')
EVALUATION_DATA_FILE = EXTERNAL_DATA_DIR / 'final-eval-key.json'
EVALUATION_SCRIPT_FILE = EXTERNAL_DATA_DIR / 'home_scorer_macro.py'


def check_for_required_external_data_files() -> None:
    """Checks whether all required external data files are present.

    If not, will print a message to stderr and exit.
    """
    for required_file in [ELMO_WEIGHTS_FILE, ELMO_OPTIONS_FILE,
                          TRAINING_DATA_ARCHIVE_FILE, TEST_DATA_ARCHIVE_FILE, EVALUATION_SCRIPT_FILE] :
                          # EVALUATION_SCRIPT_FILE]:
        if not required_file.exists():
            exit('Required file "{}" is not present. See the README on how to '
                 'obtain it.'.format(required_file))


TOKENIZER_ARGS = {
    'preserve_case': False,
    'preserve_handles': False,
    'preserve_hashes': False,
    'preserve_len': False,
    'preserve_url': False,
}
TWEET_TOKENIZER = TweetTokenizer(**TOKENIZER_ARGS)
REDDIT_TOKENIZER = RedditTokenizer(**TOKENIZER_ARGS)


class Post:
    """Data class for both Twitter and Reddit posts.

    Args:
        id: ID of the post.
        text: Tokenized text of the the post for Twitter, title/body for Reddit.
        depth: Depth in the thread. Source posts always have `depth=0`, replies
            to source posts have `depth=1`, replies to replies have `depth=2`,
            and so forth.
        platform: Whether the post is from Twitter or from Reddit.
        has_media: `True` if the posts links to any media, `False` otherwise.
        source_id: The ID of the source post of the thread. If the current post
            is itself a source post, this is equal to `self.id`.
        topic: The rumor topic the posts belongs to for Twitter. `None` for
            Reddit posts, since the dataset has no topic labels for them.
        user_verified: Whether the user is a verified Twitter user. `None` for
            Reddit posts, since the dataset does not contain any info on this.
        followers_count: For Twitter posts, the number of followers the author
            of the post has. `None` for Reddit posts, since the concept doesn't
            exist for Reddit.
        friends_count: For Twitter posts, the number of accounts the author of
            the post is following. `None` for Reddit posts, since the concept
            doesn't exist for Reddit.
        upvote_ratio: The upvote ratio for Reddit posts. `None` for Twitter
            posts, since the concept doesn't exist for Twitter.
    """

    class Platform(Enum):
        """Enum to designate whether a posts is from Twitter or from Reddit."""
        twitter = 1
        reddit = 2

    def __init__(self,
                 id: str,
                 text: str,
                 depth: int,
                 platform: Platform,
                 has_media: bool,
                 source_id: Optional[str] = None,
                 topic: Optional[str] = None,
                 user_verified: Optional[bool] = None,
                 followers_count: Optional[int] = None,
                 friends_count: Optional[int] = None,
                 upvote_ratio: Optional[float] = None):
        self.id = id

        if platform == self.Platform.twitter:
            self.text: List[str] = TWEET_TOKENIZER.tokenize(text)
        elif platform == self.Platform.reddit:
            self.text: List[str] = REDDIT_TOKENIZER.tokenize(text)
        else:
            raise ValueError()

        self.depth = depth
        self.platform = platform
        self.has_media = has_media
        self.source_id = source_id or self.id
        self.topic = topic
        self.user_verified = user_verified
        self.followers_count = followers_count
        self.friends_count = friends_count
        self.upvote_ratio = upvote_ratio

    @property
    def has_source_depth(self) -> bool:
        """Whether the post is the source of a thread."""
        return self.depth == 0

    @property
    def has_reply_depth(self) -> bool:
        """Whether the post is a reply to the source of a thread."""
        return self.depth == 1

    @property
    def has_nested_depth(self) -> bool:
        """Whether the post is neither source nor reply to a thread's source."""
        return self.depth >= 2

    @property
    def url(self) -> str:
        """Url of the post (useful for debugging)."""
        if self.platform == self.Platform.twitter:
            return 'https://twitter.com/statuses/{}'.format(self.id)
        elif self.platform == self.Platform.reddit:
            if self.source_id == self.id:
                return 'https://reddit.com//comments/{}'.format(self.id)
            return 'https://reddit.com//comments/{}//{}'.format(self.source_id,
                                                                self.id)
        raise ValueError('Invalid post source value, must be either Twitter or '
                         'Reddit.')

    def __repr__(self) -> str:
        return 'Post {}'.format(vars(self))

    @classmethod
    def load_from_twitter_dict(cls,
                               twitter_dict: Dict,
                               post_depths: Dict[str, int],
                               source_id: Optional[str] = None,
                               topic: Optional[str] = None) -> 'Post':
        """Creates a `Post` instance from a JSON dict of a Twitter post.

        Args:
            twitter_dict: The JSON dict.
            post_depths: A map that gives the depth of the post by it's ID.
            source_id: The ID of the thread's source post. `None` if this post
                is itself the source post.
            topic: The rumor topic the posts is labelled to belong to.

        Returns:
            The created `Post` instance.
        """
        id = twitter_dict['id_str']
        return Post(id=id,
                    text=twitter_dict['text'],
                    depth=post_depths[id],
                    platform=cls.Platform.twitter,
                    has_media='media' in twitter_dict['entities'],
                    source_id=source_id,
                    topic=topic,
                    user_verified=twitter_dict['user']['verified'],
                    followers_count=twitter_dict['user']['followers_count'],
                    friends_count=twitter_dict['user']['friends_count'])

    @classmethod
    def load_from_reddit_dict(cls,
                              reddit_dict: Dict,
                              post_depths: Dict[str, int],
                              source_id: Optional[str] = None) -> 'Post':
        """Creates a `Post` instance from a JSON dict of a Reddit post.

        There are labels for some deleted Reddit posts (all classified as
        "comment"). For these posts only the ID is available. The text is set
        to be empty. See:
        https://groups.google.com/forum/#!msg/rumoureval/-6XzTDhWirk/eSc31xFOFQAJ

        Args:
            reddit_dict: The JSON dict.
            post_depths: A map that gives the depth of the post by it's ID.
            source_id: The ID of the thread's source post. `None` if this post
                is itself the source post.

        Returns:
            The created `Post` instance.
        """
        data = reddit_dict['data']
        if 'children' in data and isinstance(data['children'][0], dict):
            data = data['children'][0]['data']

        id = data['id']
        return Post(id=id,
                    text=data.get('title') or data.get('body') or '',
                    depth=post_depths[id],
                    platform=cls.Platform.reddit,
                    has_media=('domain' in data
                               and not data['domain'].startswith('self.')),
                    source_id=source_id,
                    upvote_ratio=data.get('upvote_ratio'))


def load_posts() -> Dict[str, Post]:
    """Loads all Twitter and Reddit posts into a dictionary.

    Since the dataset is very small, we just load the whole dataset into RAM.

    Returns:
        A dictionary mapping post IDs to their respective posts.
    """

    def get_archive_directory_structure(archive: ZipFile) -> Dict:
        """Parses a ZipFile's list of files into a hierarchical representation.

        We need to do this because ZipFile just gives us a list of all files in
        contains and doesn't provide any methods to check which files lie in a
        specific subdirectory.

        Args:
            archive: The archive to parse.

        Returns:
            A nested dictionary. Keys of this dictionary are either file names
            which point to their full path in the archive or directory names
            which again point to a nested dictionary that contains their
            contents.

        Example:
            If the archive would contain the following files::

                ['foo.txt',
                 'bar/bar.log',
                 'bar/baz.out',
                 'bar/boogy/text.out']

            This would be transformed into the following hierarchical form::

                {
                    'foo.txt': 'foo.txt',
                    'bar': {
                        'bar.log': 'bar/bar.log',
                        'baz.out': 'bar/baz.out',
                        'boogy': {
                            'text.out': 'bar/boogy/text.out'
                        }
                    }
                }
        """
        result = {}
        for file in archive.namelist():
            # Skip directories in archive.
            if file.endswith('/'):
                continue

            d = result
            path = file.split('/')[1:]  # [1:] to skip top-level directory.
            for p in path[:-1]:  # [:-1] to skip filename
                if p not in d:
                    d[p] = {}
                d = d[p]
            d[path[-1]] = file
        return result

    def calc_post_depths_from_thread_structure(thread_structure: Dict) \
            -> Dict[str, int]:
        """Calculates the nested depth of each post in a thread.

        We determine post depth from the provided `structure.json` files in the
        dataset because this is easier than following the chain of a post's
        parents to the source post of a thread.

        Args:
            thread_structure: The parsed JSON dict from one of the dataset's
                `structure.json` files.

        Returns:
            A dictionary mapping post IDs to their nested depth. The source
            post of a thread always has depth `0`, first level replies `1`, etc.

        Example:
            If the `thread_structure` would look like the following::

                {
                    'foo': {
                        'bar': [],
                        'baz': {
                            'boogy': []
                        },
                        'qux': []
                    }
                }

            The parsed post depths would be::

                {
                    'foo': 0,
                    'bar': 1,
                    'baz': 1,
                    'boogy': 2,
                    'qux': 1
                }
        """
        post_depths = {}

        def walk(thread: Dict, depth: int) -> None:
            for post_id, subthread in thread.items():
                post_depths[post_id] = depth
                if isinstance(subthread, Dict):
                    walk(subthread, depth + 1)

        walk(thread_structure, 0)
        return post_depths

    print('Loading posts...')
    time_before = time()

    training_data_archive = ZipFile(TRAINING_DATA_ARCHIVE_FILE)
    training_data_contents = get_archive_directory_structure(
        training_data_archive)
    twitter_english = training_data_contents['twitter-english']
    reddit_training_data = training_data_contents['reddit-training-data']
    reddit_dev_data = training_data_contents['reddit-dev-data']

    test_data_archive = ZipFile(TEST_DATA_ARCHIVE_FILE)
    test_data_contents = get_archive_directory_structure(test_data_archive)
    twitter_en_test_data = test_data_contents['twitter-en-test-data']
    reddit_test_data = test_data_contents['reddit-test-data']

    posts: Dict[str, Post] = {}

    # -- Load Twitter posts ----------------------------------------------------
    for archive, topics in [(training_data_archive, twitter_english.items()),
                            (test_data_archive, twitter_en_test_data.items())]:
        for topic, threads in topics:
            for thread in threads.values():
                post_depths = calc_post_depths_from_thread_structure(
                    json.loads(archive.read(thread['structure.json'])))

                source_post = Post.load_from_twitter_dict(
                    json.loads(archive.read(
                        next(iter(thread['source-tweet'].values())))),
                    post_depths,
                    topic=topic)
                posts[source_post.id] = source_post

                for reply in thread.get('replies', {}).values():
                    reply_post = Post.load_from_twitter_dict(
                        json.loads(archive.read(reply)),
                        post_depths,
                        source_id=source_post.id,
                        topic=topic)
                    posts[reply_post.id] = reply_post

    # -- Load Reddit posts. ----------------------------------------------------
    for archive, threads in [(training_data_archive,
                              chain(reddit_training_data.values(),
                                    reddit_dev_data.values())),
                             (test_data_archive, reddit_test_data.values())]:
        for thread in threads:
            post_depths = calc_post_depths_from_thread_structure(
                json.loads(archive.read(thread['structure.json'])))

            source_post = Post.load_from_reddit_dict(
                json.loads(archive.read(
                    next(iter(thread['source-tweet'].values())))),
                post_depths)
            posts[source_post.id] = source_post

            for reply in thread.get('replies', {}).values():
                reply_post = Post.load_from_reddit_dict(
                    json.loads(archive.read(reply)),
                    post_depths,
                    source_id=source_post.id)
                posts[reply_post.id] = reply_post

    print('  Number of posts: {:d} (Reddit={:d}, Twitter={:d})'.format(
        len(posts),
        sum(1 for p in posts.values() if p.platform == Post.Platform.reddit),
        sum(1 for p in posts.values() if p.platform == Post.Platform.twitter)))
    time_after = time()
    print('  Took {:.2f}s.'.format(time_after - time_before))

    return posts


class SdqcInstance:
    """Data class for SDQC (RumorEval Task A) instances.

    Args:
        post_id: An ID referencing a Twitter or a Reddit post.
        label: A label whether the stance expressed in the referenced post is
            *support*, *deny*, *query*, or *comment* towards the rumor expressed
            in the thread's source post.
    """

    class Label(Enum):
        """Enum for SDQC labels `support`, `deny`, `query`, and `comment`."""
        support = 0
        deny = 1
        query = 2
        comment = 3

    def __init__(self, post_id: str, label: Optional[Label] = None):
        self.post_id = post_id
        self.label = label

    def __repr__(self):
        return 'SDQC ({}, {})'.format(self.post_id, self.label)


def load_sdcq_instances() -> (List[SdqcInstance],
                              List[SdqcInstance],
                              Optional[List[SdqcInstance]]):
    """Load SDQC (RumorEval Task A) training, dev, and test datasets.

    Returns:
        A tuple containing lists of SDQC instances. The first element is the
        training dataset, the second the dev, and the third the test, if it is
        available, otherwise `None`.
    """

    def load_from_json_dict(json_dict: Dict[str, Dict[str, str]]) \
            -> List[SdqcInstance]:
        return [SdqcInstance(post_id, SdqcInstance.Label[label])
                for post_id, label in json_dict['subtaskaenglish'].items()]

    training_data_archive = ZipFile(TRAINING_DATA_ARCHIVE_FILE)
    train = load_from_json_dict(json.loads(training_data_archive.read(
        'rumoureval-2019-training-data/train-key.json')))
    dev = load_from_json_dict(json.loads(training_data_archive.read(
        'rumoureval-2019-training-data/dev-key.json')))
    test = None

    if EVALUATION_DATA_FILE.exists():
        with EVALUATION_DATA_FILE.open('rb') as fin:
            test = load_from_json_dict(json.loads(fin.read()))

    return train, dev, test


class VerifInstance:
    """Data class for Verification (RumorEval Task B) instances.

    Args:
        post_id: An ID referencing a Twitter or a Reddit thread's source post.
        label: A label whether the rumor expressed in the referenced post is
            `true`, `false`, or `unverified`.
    """

    class Label(Enum):
        """ Enum for verification labels `true`, `false`, and `unverified`."""
        false = 0
        true = 1
        unverified = 2

    def __init__(self, post_id: str, label: Optional[Label] = None):
        self.post_id = post_id
        self.label = label

    def __str__(self):
        print('Verif ({}, {})'.format(self.post_id, self.label))


def load_verif_instances() -> (List[VerifInstance],
                               List[VerifInstance],
                               Optional[List[VerifInstance]]):
    """Load Verification (RumorEval Task B) training, dev, and test datasets.

    Returns:
        A tuple containing lists of Verification instances. The first element is
        the training dataset, the second the dev, and the third the test, if it
        is available, otherwise `None`.
    """

    def load_from_json_dict(json_dict: Dict[str, Dict[str, str]]) \
            -> List[VerifInstance]:
        return [VerifInstance(post_id, VerifInstance.Label[label])
                for post_id, label in json_dict['subtaskbenglish'].items()]

    training_data_archive = ZipFile(TRAINING_DATA_ARCHIVE_FILE)
    train = load_from_json_dict(json.loads(training_data_archive.read(
        'rumoureval-2019-training-data/train-key.json')))
    dev = load_from_json_dict(json.loads(training_data_archive.read(
        'rumoureval-2019-training-data/dev-key.json')))
    test = None

    if EVALUATION_DATA_FILE.exists():
        with EVALUATION_DATA_FILE.open('rb') as fin:
            test = load_from_json_dict(json.loads(fin.read()))

    return train, dev, test


In [0]:
import json
from collections import OrderedDict, defaultdict
from enum import Enum
from itertools import chain
from math import sqrt
from pathlib import Path
from pprint import pprint
from random import shuffle
from sys import maxsize
from time import time
from typing import Callable, Dict, Iterable, List, Optional, Set, Tuple, Union

import numpy as np
import torch
import torch.nn.functional as F
from allennlp.modules.elmo import Elmo, batch_to_ids
from torch.utils.data import Dataset

# from src.dataset import ELMO_OPTIONS_FILE, ELMO_WEIGHTS_FILE, Post, \
#     SdqcInstance, VerifInstance


class ScalingMode(Enum):
    none = 0
    min_max = 1
    standard = 2


class DatasetHelper(Dataset):
    def __init__(self, post_embeddings: Dict[str, torch.tensor]):
        super().__init__()
        self._dataset = []
        self._post_embeddings = post_embeddings

    @classmethod
    def calc_shared_features(cls, post: Post, post_embeddings: Dict[str, torch.Tensor]) \
            -> (np.ndarray, np.ndarray, np.ndarray):
        post_platform = [post.platform == Post.Platform.twitter,
                         post.platform == Post.Platform.reddit]

        post_author = [0, 0, 0, 0, 0]
        if post.platform == Post.Platform.twitter:
            post_author = [post.user_verified,
                           not post.user_verified,
                           post.followers_count,
                           post.friends_count,
                           post.followers_count / (post.friends_count + 1e-8)]

        post_similarity_to_source = np.array(1)
        if not post.has_source_depth:
            post_emb_mean = post_embeddings[post.id].mean(dim=1)
            source_emb_mean = post_embeddings[post.source_id].mean(dim=1)
            post_similarity_to_source = F.cosine_similarity(
                post_emb_mean, source_emb_mean, dim=0).cpu().numpy()

        return post_platform, post_author, post_similarity_to_source

    def __len__(self) -> int:
        return len(self._dataset)

    def __getitem__(self, index: int) -> Dict[str, torch.Tensor]:
        return self._dataset[index]

    def calc_stats_for_aux_feature(self,
                                   index: int,
                                   filter_func: Optional[
                                       Callable[[str], bool]] = None) \
            -> (float, float, float, float):
        if not filter_func:
            def filter_func(_post_id: str) -> bool:
                return True

        feature_values = np.array([post['features'][index].item()
                                   for post in self._dataset
                                   if filter_func(post['post_id'])])
        return (feature_values.min(),
                feature_values.max(),
                feature_values.mean(),
                feature_values.std())

    def min_max_scale_aux_feature(self,
                                  index: int,
                                  min: float,
                                  max: float,
                                  filter_func: Optional[
                                      Callable[[str], bool]] = None) \
            -> None:
        if not filter_func:
            def filter_func(_post_id: str) -> bool:
                return True

        for post in self._dataset:
            if filter_func(post['post_id']):
                value = post['features'][index]
                post['features'][index] = (value - min) / (max - min)

    def standard_scale_aux_feature(self,
                                   index: int,
                                   mean: float,
                                   std: float,
                                   filter_func: Optional[
                                       Callable[[str], bool]] = None) \
            -> None:
        if not filter_func:
            def filter_func(_post_id: str) -> bool:
                return True

        for post in self._dataset:
            if filter_func(post['post_id']):
                value = post['features'][index]
                post['features'][index] = (value - mean) / std


def calculate_post_elmo_embeddings(posts: Dict[str, Post],
                                   max_sentence_length: int,
                                   batch_size: int,
                                   scalar_mix_parameters: List[float],
                                   device: torch.device,
                                   elmo_options_file: Path = ELMO_OPTIONS_FILE,
                                   elmo_weights_file: Path = ELMO_WEIGHTS_FILE
                                   ) \
        -> Dict[str, torch.Tensor]:
    """Calculate ELMo embeddings of all posts in the dataset.

    Calculating these embeddings one time before training the actual models
    allows for extremely fast training later. The downsides are that we can't
    propagate gradients through the embeddings, but fine-tuning these would
    probably lead to be overfitting, since our dataset is very small.
    Additionally, we also can't learn the scalar_mix_parameters, but since
    training is so much faster, adjusting these by hand should be sufficient.

    Since we are going to load the entire dataset into GPU memory later anyways,
    we keep the embeddings in GPU memory here already.

    Args:
        posts: A dictionary mapping post IDs to their respective posts. Load
            this with `src.dataset.load_posts()`.
        max_sentence_length: Number of tokens after which sentences will be
            truncated.
        batch_size: Batch size for calculating the ELMo embeddings.
        scalar_mix_parameters: Parameters for mixing the different ELMo layers.
            See the paper for details on this.
        device: Device to execute on.
        elmo_options_file: file path to options for ELMo embeddings.
        elmo_weights_file: file path to weights for ELMo embeddings.

    Returns:
        A dictionary mapping post IDs to their respective ELMo embedding in a
        PyTorch tensor. Each tensor will have shape
        `(num_elmo_dimensions, max_sentence_length)`.
    """

    print('Calculating post embeddings...')
    time_before = time()

    elmo = Elmo(elmo_options_file,
                elmo_weights_file,
                num_output_representations=1,
                dropout=0,
                requires_grad=False,
                do_layer_norm=False,
                scalar_mix_parameters=scalar_mix_parameters).to(device)
    elmo.eval()

    post_embeddings = {}
    batch_ids = []
    # Add a dummy sentence with max_sentence_length to each batch to enforce
    # that each batch of embeddings has the same shape. `batch_to_id()` and
    # `elmo()` take care of zero padding shorter sentences for us.
    batch_texts = [['' for _ in range(max_sentence_length)]]
    for i, post in enumerate(posts.values()):
        batch_ids.append(post.id)
        batch_texts.append(post.text[:max_sentence_length])

        if not i % batch_size or i == len(posts) - 1:
            batch_character_ids = batch_to_ids(batch_texts).to(device)
            batch_texts = [['' for _ in range(max_sentence_length)]]

            # - [0] to select first output representation (there is only one
            #   because of `num_output_representations=1` at `elmo` creation.
            # - [1:] to ignore dummy sentence added at the start.
            batch_embeddings = \
                elmo(batch_character_ids)['elmo_representations'][0][1:]
            batch_embeddings = batch_embeddings.split(split_size=1, dim=0)
            del batch_character_ids  # Free up memory sooner.

            for post_id, post_embedding in zip(batch_ids, batch_embeddings):
                post_embedding.squeeze_(dim=0)
                post_embedding.transpose_(0, 1)
                post_embeddings[post_id] = post_embedding
            batch_ids = []

    time_after = time()
    print('  Took {:.2f}s.'.format(time_after - time_before))

    return post_embeddings


def generate_folds_for_k_fold_cross_validation(posts: Dict[str, Post],
                                               num_folds: int) \
        -> List[Set[str]]:
    posts_per_discriminator = defaultdict(set)
    for post in posts.values():
        if post.platform == Post.Platform.twitter:
            discriminator = post.topic
        elif post.platform == Post.Platform.reddit:
            discriminator = post.source_id
        else:
            raise ValueError('Unimplemented enum variant.')
        posts_per_discriminator[discriminator].add(post.id)
    posts_per_discriminator = list(posts_per_discriminator.values())
    shuffle(posts_per_discriminator)

    folds = [set() for _ in range(num_folds)]
    for post_ids in posts_per_discriminator:
        # Find fold with fewest elements
        index = None
        num_elements = maxsize
        for i, fold in enumerate(folds):
            if num_elements > len(fold):
                num_elements = len(fold)
                index = i

        # Add post to that fold
        folds[index].update(post_ids)

    return folds


def arrange_folds_for_k_fold_cross_validation(folds: List[Set[str]],
                                              index: int) \
        -> (Set[str], Set[str]):
    train_post_ids = set(chain.from_iterable(
        fold for i, fold in enumerate(folds) if i != index))
    test_post_ids = folds[index]
    return train_post_ids, test_post_ids


def filter_instances(train_post_ids: Set[str],
                     test_post_ids: Set[str],
                     instances: Iterable[Union[SdqcInstance, VerifInstance]]) \
        -> (List[Union[SdqcInstance, VerifInstance]],
            List[Union[SdqcInstance, VerifInstance]]):
    train_instances = [i for i in instances if i.post_id in train_post_ids]
    test_instances = [i for i in instances if i.post_id in test_post_ids]

    shuffle(train_instances)
    shuffle(test_instances)

    return train_instances, test_instances


def rmse_score(labels, predictions, confidences):
    rmse = 0
    for label, prediction, confidence in \
            zip(labels, predictions, confidences):
        if label == prediction and \
                (label == VerifInstance.Label.true.value
                 or label == VerifInstance.Label.false.value):
            rmse += (1 - confidence) ** 2
        elif label == VerifInstance.Label.unverified.value:
            rmse += confidence ** 2
        else:
            rmse += 1
    rmse = sqrt(rmse / len(labels))
    return rmse


def display_results(sdqc_accs: Iterable[float],
                    sdqc_f1s: Iterable[float],
                    sdqc_reports: Iterable[Dict[str, Dict[str, float]]],
                    verif_accs: Iterable[float],
                    verif_f1s: Iterable[float],
                    verif_rmses: Iterable[float],
                    verif_reports: Iterable[Dict[str, Dict[str, float]]]):
    def display_report(reports: Iterable[Dict[str, Dict[str, float]]]):
        report_lists = defaultdict(lambda: defaultdict(list))
        for report in reports:
            for outer_key, inner_report in report.items():
                if outer_key == 'accuracy':
                    report_lists[outer_key]['accuracy'].append(inner_report)
                else:
                    for inner_key, value in inner_report.items():
                        report_lists[outer_key][inner_key].append(value)

        report_stats = {}
        for outer_key, inner_report in report_lists.items():
            report_stats[outer_key] = {}
            for inner_key, values in inner_report.items():
                report_stats[outer_key][inner_key] = '{:.1%}±{:.1%}'.format(
                    np.mean(values), np.std(values))

        pprint(report_stats)

    sdqc_acc = (np.mean(sdqc_accs), np.std(sdqc_accs))
    sdqc_f1 = (np.mean(sdqc_f1s), np.std(sdqc_f1s))
    print('Task A: SDQC')
    print('  Accuracy: {:.1%}±{:.1%}'
          '  F1-Score: {:.1%}±{:.1%}'
          .format(sdqc_acc[0], sdqc_acc[1],
                  sdqc_f1[0], sdqc_f1[1]))
    display_report(sdqc_reports)

    verif_acc = (np.mean(verif_accs), np.std(verif_accs))
    verif_f1 = (np.mean(verif_f1s), np.std(verif_f1s))
    verif_rmse = (np.mean(verif_rmses), np.std(verif_rmses))
    print('Task B: Verification')
    print('  Accuracy: {:.1%}±{:.1%}'
          '  F1-Score: {:.1%}±{:.1%}'
          '  RMSE: {:.3f}±{:.3f}'
          .format(verif_acc[0], verif_acc[1],
                  verif_f1[0], verif_f1[1],
                  verif_rmse[0], verif_rmse[1]))
    display_report(verif_reports)


def write_answers_json(
        path: Path,
        sdqc_instances: List[SdqcInstance],
        verif_instances: List[SdqcInstance],
        sdqc_estimates: Dict[str, Tuple[SdqcInstance.Label,
                                        Dict[SdqcInstance.Label, float]]],
        verif_estimates: Dict[str, Tuple[VerifInstance.Label, float]]):
    sdqc_answers = OrderedDict()
    for instance in sdqc_instances:
        answer = sdqc_estimates[instance.post_id]
        sdqc_answers[instance.post_id] = answer[0].name

    verif_answers = OrderedDict()
    for instance in verif_instances:
        answer = verif_estimates[instance.post_id]
        verif_answers[instance.post_id] = (answer[0].name, answer[1])

    answers = OrderedDict()
    answers['subtaskaenglish'] = sdqc_answers
    answers['subtaskbenglish'] = verif_answers

    with path.open('w', encoding='UTF-8') as fout:
        json.dump(answers, fout, indent=2)


In [0]:
from typing import Dict, Iterable, List, Optional, Tuple

import numpy as np
import torch
import torch.nn.functional as F
from sklearn.metrics import accuracy_score, classification_report, f1_score
from torch import nn, optim
from torch.utils.data import DataLoader
from tqdm import tqdm

# from src.dataset import Post, SdqcInstance
# from src.util import DatasetHelper, ScalingMode

EVAL_DEV_EVERY_N_EPOCH = 20


class Sdqc:
    class Hyperparameters:
        def __init__(self,
                     max_sentence_length: int,
                     batch_size: int,
                     num_epochs: int,
                     learning_rate: float,
                     weight_decay: float,
                     class_weights: List[float],
                     input_num_emb_dims: int,
                     input_num_aux_dims: int,
                     input_aux_scaling_features: List[int],
                     input_aux_scaling_mode: ScalingMode,
                     conv_num_layers: int,
                     conv_kernel_sizes: List[int],
                     conv_num_channels: int,
                     dense_num_layers: int,
                     dense_num_hidden: int,
                     dense_dropout: float):
            self.max_sentence_length = max_sentence_length
            self.batch_size = batch_size
            self.num_epochs = num_epochs
            self.learning_rate = learning_rate
            self.weight_decay = weight_decay
            self.class_weights = class_weights
            self.input_num_emb_dims = input_num_emb_dims
            self.input_num_aux_dims = input_num_aux_dims
            self.input_aux_scaling_features = input_aux_scaling_features
            self.input_aux_scaling_mode = input_aux_scaling_mode
            self.conv_num_layers = conv_num_layers
            self.conv_kernel_sizes = conv_kernel_sizes
            self.conv_num_channels = conv_num_channels
            self.dense_num_layers = dense_num_layers
            self.dense_num_hidden = dense_num_hidden
            self.dense_dropout = dense_dropout

    def __init__(self,
                 posts: Dict[str, Post],
                 post_embeddings: Dict[str, torch.Tensor],
                 hparams: 'Sdqc.Hyperparameters',
                 device: torch.device):
        self._posts = posts
        self._post_embeddings = post_embeddings
        self._hparams = hparams
        self._device = device

    class Dataset(DatasetHelper):
        def __init__(self,
                     instances: Iterable[SdqcInstance],
                     posts: Dict[str, Post],
                     post_embeddings: Dict[str, torch.Tensor],
                     hparams: 'Sdqc.Hyperparameters',
                     device: torch.device):
            super().__init__(post_embeddings)

            for instance in instances:
                post = posts[instance.post_id]
                post_embedding = post_embeddings[post.id]

                post_features = self.calc_features(post, post_embeddings)

                self._dataset.append({
                    'post_id': post.id,
                    'emb': post_embedding,
                    'features': torch.from_numpy(post_features).to(device),
                    'label': (torch.tensor(instance.label.value, device=device)
                              if instance.label else 0),
                })

        @classmethod
        def calc_features(cls, post: Post, post_embeddings: Dict[str, torch.tensor]):
            post_platform, post_author, post_similarity_to_source = \
                cls.calc_shared_features(post, post_embeddings)

            post_type = [post.has_source_depth,
                         post.has_reply_depth,
                         post.has_nested_depth]

            return (np.concatenate((post_platform,
                                    post_author,
                                    [post_similarity_to_source],
                                    post_type))
                    .astype(np.float32))

    def build_datasets(self,
                       train_instances: Iterable[SdqcInstance],
                       dev_instances: Optional[Iterable[SdqcInstance]],
                       test_instances: Optional[Iterable[SdqcInstance]]) \
            -> ('Sdqc.Dataset',
                Optional['Sdqc.Dataset'],
                Optional['Sdqc.Dataset']):
        print('Number of instances: train={:d}, dev={:d}, test={:d}'
              .format(len(train_instances),
                      len(dev_instances or []),
                      len(test_instances or [])))

        train_dataset = self.Dataset(
            train_instances, self._posts, self._post_embeddings, self._hparams,
            self._device)

        dev_dataset = None
        if dev_instances:
            dev_dataset = self.Dataset(
                dev_instances, self._posts, self._post_embeddings,
                self._hparams, self._device)

        test_dataset = None
        if test_instances:
            test_dataset = self.Dataset(
                test_instances, self._posts, self._post_embeddings,
                self._hparams, self._device)

        def filter_func(post_id: str) -> bool:
            return self._posts[post_id].platform == Post.Platform.twitter

        for index in self._hparams.input_aux_scaling_features:
            min, max, mean, std = \
                train_dataset.calc_stats_for_aux_feature(index, filter_func)

            for dataset in (train_dataset, dev_dataset, test_dataset):
                if not dataset:
                    continue

                if self._hparams.input_aux_scaling_mode == ScalingMode.none:
                    pass
                elif (self._hparams.input_aux_scaling_mode
                      == ScalingMode.min_max):
                    dataset.min_max_scale_aux_feature(
                        index, min, max, filter_func)
                elif (self._hparams.input_aux_scaling_mode
                      == ScalingMode.standard):
                    dataset.standard_scale_aux_feature(
                        index, mean, std, filter_func)
                else:
                    raise ValueError('Unimplemented enum variant.')

        return train_dataset, dev_dataset, test_dataset

    class Model(nn.Module):
        def __init__(self, hparams: 'Sdqc.Hyperparameters'):
            super().__init__()
            self._hparams = hparams

            emb_num_output_dims = self._hparams.input_num_emb_dims

            # -- convolutional layers ------------------------------------------
            conv_num_input_dims = emb_num_output_dims
            conv_num_output_dims = (len(self._hparams.conv_kernel_sizes)
                                    * self._hparams.conv_num_channels)
            self._conv_layers = nn.ModuleList()
            for i in range(self._hparams.conv_num_layers):
                layer = nn.ModuleDict()
                for size in self._hparams.conv_kernel_sizes:
                    conv = nn.Conv1d(
                        in_channels=(conv_num_input_dims
                                     if i == 0 else conv_num_output_dims),
                        out_channels=self._hparams.conv_num_channels,
                        kernel_size=size)
                    batch_norm = nn.BatchNorm1d(
                        num_features=self._hparams.conv_num_channels)
                    layer['kernel_size{:d}'.format(size)] = nn.ModuleDict(
                        {'conv': conv, 'batch_norm': batch_norm})
                self._conv_layers.append(layer)

            # -- dense layers --------------------------------------------------
            if self._hparams.conv_num_layers:
                dense_num_input_dims = \
                    conv_num_output_dims + self._hparams.input_num_aux_dims
            else:
                dense_num_input_dims = \
                    emb_num_output_dims + self._hparams.input_num_aux_dims
            dense_num_output_dims = self._hparams.dense_num_hidden
            self._dense_layers = nn.ModuleList()
            for i in range(self._hparams.dense_num_layers):
                self._dense_layers.append(nn.Linear(
                    in_features=(dense_num_input_dims
                                 if i == 0 else dense_num_output_dims),
                    out_features=self._hparams.dense_num_hidden))

            # -- linear layer --------------------------------------------------
            if self._hparams.dense_num_layers:
                linear_num_input_dims = dense_num_output_dims
            elif self._hparams.conv_num_layers:
                linear_num_input_dims = \
                    conv_num_output_dims + self._hparams.input_num_aux_dims
            else:
                linear_num_input_dims = \
                    emb_num_output_dims + self._hparams.input_num_aux_dims
            self._linear = nn.Linear(
                in_features=linear_num_input_dims,
                out_features=len(SdqcInstance.Label))

            # num_total_params = 0
            # for i, (n, w) in enumerate(self.named_parameters()):
            #     if w.requires_grad:
            #         print(i, n, w.shape, w.numel())
            #         num_total_params += w.numel()
            # print('Num Total Parameters: {}'.format(num_total_params))

        def forward(self, emb, aux):
            x = emb

            for layer in self._conv_layers:
                h = []
                for size in self._hparams.conv_kernel_sizes:
                    conv_batch_norm = layer['kernel_size{:d}'.format(size)]
                    conv = conv_batch_norm['conv']
                    batch_norm = conv_batch_norm['batch_norm']

                    h.append(batch_norm(F.relu(conv(
                        F.pad(x, [(size - 1) // 2, size // 2])))))
                x = torch.cat(h, dim=1)

            x = F.avg_pool1d(x, kernel_size=self._hparams.max_sentence_length)
            x.squeeze_(dim=2)

            if self._hparams.input_num_aux_dims:
                x = torch.cat((x, aux), dim=1)

            for dense in self._dense_layers:
                x = F.dropout(F.relu(dense(x)),
                              p=self._hparams.dense_dropout,
                              training=self.training)

            logits = self._linear(x)

            return logits

    def train(self,
              train_dataset: 'Sdqc.Dataset',
              dev_dataset: Optional['Sdqc.Dataset'] = None,
              print_progress: bool = False) -> 'Sdqc.Model':
        model = self.Model(self._hparams).to(self._device)

        criterion = nn.CrossEntropyLoss(
            weight=torch.tensor(self._hparams.class_weights,
                                dtype=torch.float32,
                                device=self._device))
        optimizer = optim.Adam(model.parameters(),
                               lr=self._hparams.learning_rate,
                               weight_decay=self._hparams.weight_decay)

        train_loader = DataLoader(
            train_dataset, batch_size=self._hparams.batch_size, shuffle=True)

        for epoch_no in range(1, self._hparams.num_epochs + 1):
            losses, labels, predictions = [], [], []

            progress_bar = None
            if print_progress:
                progress_bar = tqdm(total=(len(train_loader)),
                                    unit='batch',
                                    desc='Epoch: {:{}d}/{:d}'.format(
                                        epoch_no,
                                        len(str(self._hparams.num_epochs)),
                                        self._hparams.num_epochs))

            for batch_no, batch in enumerate(train_loader):
                optimizer.zero_grad()

                model.train()
                batch_logits = model(batch['emb'], batch['features'])
                with torch.no_grad():
                    batch_prediction = torch.argmax(batch_logits, dim=1)

                loss = criterion(batch_logits, batch['label'])
                loss.backward()
                optimizer.step()

                losses.append(loss.item())
                labels.append(batch['label'].data.cpu().numpy())
                predictions.append(batch_prediction.data.cpu().numpy())

                if progress_bar:
                    progress_bar.set_postfix({
                        'loss': '{:.2e}'.format(loss.item()),
                    })
                    progress_bar.update(1)

            if progress_bar:
                progress_bar.close()

                labels = np.concatenate(labels)
                predictions = np.concatenate(predictions)

                epoch_loss = np.mean(losses)
                epoch_acc = accuracy_score(labels, predictions)
                epoch_f1 = f1_score(labels, predictions, average='macro')

                print('  Loss={:.2e}  Accuracy={:.2%}  F1-score={:.2%}'
                      .format(epoch_loss, epoch_acc, epoch_f1))

            if print_progress and dev_dataset and \
                    (epoch_no == self._hparams.num_epochs
                     or not epoch_no % EVAL_DEV_EVERY_N_EPOCH):
                dev_acc, dev_f1, _ = self.eval(model, dev_dataset)
                print('  Validation:    Accuracy={:.2%}  F1-score={:.2%}'
                      .format(dev_acc, dev_f1))

        return model

    def eval(self, model: 'Sdqc.Model', dataset: 'Sdqc.Dataset') \
            -> (float, float, Dict[str, Dict[str, float]]):
        labels, predictions = [], []

        with torch.no_grad():
            data_loader = DataLoader(
                dataset, batch_size=self._hparams.batch_size)
            for batch in data_loader:
                model.eval()
                batch_logits = model(batch['emb'], batch['features'])
                batch_prediction = torch.argmax(batch_logits, dim=1)

                labels.append(batch['label'].data.cpu().numpy())
                predictions.append(batch_prediction.data.cpu().numpy())

        labels = np.concatenate(labels)
        predictions = np.concatenate(predictions)

        acc = accuracy_score(labels, predictions)
        f1 = f1_score(labels, predictions, average='macro')
        report = classification_report(
            labels, predictions, output_dict=True,
            labels=range(len(SdqcInstance.Label)),
            target_names=[label.name for label in SdqcInstance.Label])

        return acc, f1, report

    def predict(self, model: 'Sdqc.Model', post_ids: Iterable[str]) \
            -> Dict[str, Tuple[SdqcInstance.Label,
                               Dict[SdqcInstance.Label, float]]]:
        instances = [SdqcInstance(post_id) for post_id in post_ids]
        dataset = self.Dataset(instances, self._posts, self._post_embeddings,
                               self._hparams, self._device)

        results = {}
        with torch.no_grad():
            data_loader = DataLoader(dataset,
                                     batch_size=self._hparams.batch_size)
            for batch in data_loader:
                model.eval()
                batch_logits = model(batch['emb'], batch['features'])
                batch_probs = F.softmax(batch_logits, dim=1)
                batch_prediction = torch.argmax(batch_logits, dim=1)

                for post_id, prediction, probs in zip(
                        batch['post_id'], batch_prediction, batch_probs):
                    results[post_id] = \
                        (SdqcInstance.Label(prediction.item()),
                        dict(zip(SdqcInstance.Label, probs.tolist())))
        return results


In [0]:
from typing import Dict, Iterable, List, Optional, Tuple

import numpy as np
import torch
import torch.nn.functional as F
from sklearn.metrics import accuracy_score, classification_report, f1_score
from torch import nn, optim
from torch.utils.data import DataLoader
from tqdm import tqdm

# from src.dataset import Post, SdqcInstance, VerifInstance
# from src.util import DatasetHelper, ScalingMode, \
#     rmse_score

EVAL_DEV_EVERY_N_EPOCH = 20


class Verif:
    class Hyperparameters:
        def __init__(self,
                     batch_size: int,
                     num_epochs: int,
                     learning_rate: float,
                     weight_decay: float,
                     class_weights: List[float],
                     input_num_dims: int,
                     input_scaling_features: List[int],
                     input_scaling_mode: ScalingMode,
                     dense_num_layers: int,
                     dense_num_hidden: int,
                     dense_dropout: float):
            self.batch_size = batch_size
            self.num_epochs = num_epochs
            self.learning_rate = learning_rate
            self.weight_decay = weight_decay
            self.class_weights = class_weights
            self.input_num_dims = input_num_dims
            self.input_scaling_features = input_scaling_features
            self.input_scaling_mode = input_scaling_mode
            self.dense_num_layers = dense_num_layers
            self.dense_num_hidden = dense_num_hidden
            self.dense_dropout = dense_dropout

    def __init__(self,
                 posts: Dict[str, Post],
                 post_embeddings: Dict[str, torch.Tensor],
                 hparams: 'Verif.Hyperparameters',
                 device: torch.device):
        self._posts = posts
        self._post_embeddings = post_embeddings
        self._hparams = hparams
        self._device = device

    class Dataset(DatasetHelper):
        def __init__(self,
                     instances: Iterable[VerifInstance],
                     posts: Dict[str, Post],
                     post_embeddings: Dict[str, torch.Tensor],
                     sdqc_estimates: Dict[str, Tuple[SdqcInstance.Label,
                                                     Dict[SdqcInstance.Label,
                                                          float]]],
                     hparams: 'Verif.Hyperparameters',
                     device: torch.device):
            super().__init__(post_embeddings)

            for instance in instances:
                source_post = posts[instance.post_id]

                post_features = self.calc_features(source_post, posts,post_embeddings, sdqc_estimates)

                self._dataset.append({
                    'post_id': source_post.id,
                    'features': torch.from_numpy(post_features).to(device),
                    'label': (torch.tensor(instance.label.value, device=device)
                              if instance.label else 0),
                })

        @classmethod
        def calc_features(cls, source_post: Post, posts: List[Post],post_embeddings: Dict[str, torch.tensor], sdqc_estimates:  Dict[str, Tuple[SdqcInstance.Label,
                                                     Dict[SdqcInstance.Label,
                                                          float]]]):
            post_platform, post_author, post_similarity_to_source = \
                cls.calc_shared_features(source_post,post_embeddings)
            post_has_media = [source_post.has_media,
                              not source_post.has_media]
            post_upvote_ratio = 0.5
            if source_post.upvote_ratio:
                post_upvote_ratio = source_post.upvote_ratio
            num_childs = 0
            depths = {'reply': 0, 'nested': 0}
            predictions = {label: 0 for label in SdqcInstance.Label}
            estimates = {label: 0 for label in SdqcInstance.Label}
            for post in posts.values():
                if post.source_id == source_post.id:
                    num_childs += 1

                    if post.has_reply_depth:
                        depths['reply'] += 1
                    elif post.has_nested_depth:
                        depths['nested'] += 1

                    predictions[sdqc_estimates[post.id][0]] += 1
                    for label, prob in sdqc_estimates[post.id][1].items():
                        estimates[label] += prob
            depths = {depth: num / num_childs
                      for depth, num in depths.items()}
            predictions = {label: num / num_childs
                           for label, num in predictions.items()}
            estimates = {label: prob / num_childs
                         for label, prob in estimates.items()}
            depths = [depths['reply'],
                      depths['nested']]
            predictions = [predictions[SdqcInstance.Label.support],
                           predictions[SdqcInstance.Label.deny],
                           predictions[SdqcInstance.Label.query]]
            estimates = [estimates[SdqcInstance.Label.support],
                         estimates[SdqcInstance.Label.deny],
                         estimates[SdqcInstance.Label.query]]
            return (np.concatenate((post_platform,
                                             post_author,
                                             [post_similarity_to_source],
                                             post_has_media,
                                             [post_upvote_ratio],
                                             depths,
                                             # predictions,
                                             estimates
                                    ))
                             .astype(np.float32))


    def build_datasets(self,
                       train_instances: Iterable[VerifInstance],
                       dev_instances: Optional[Iterable[VerifInstance]],
                       test_instances: Optional[Iterable[VerifInstance]],
                       sdqc_estimates: Dict[str, Tuple[SdqcInstance.Label,
                                                       Dict[SdqcInstance.Label,
                                                            float]]]) \
            -> ('Verif.Dataset',
                Optional['Verif.Dataset'],
                Optional['Verif.Dataset']):
        print('Number of instances: train={:d}, dev={:d}, test={:d}'
              .format(len(train_instances),
                      len(dev_instances or []),
                      len(test_instances or [])))

        train_dataset = self.Dataset(
            train_instances, self._posts, self._post_embeddings,
            sdqc_estimates, self._hparams, self._device)

        dev_dataset = None
        if dev_instances:
            dev_dataset = self.Dataset(
                dev_instances, self._posts, self._post_embeddings,
                sdqc_estimates, self._hparams, self._device)

        test_dataset = None
        if test_instances:
            test_dataset = self.Dataset(
                test_instances, self._posts, self._post_embeddings,
                sdqc_estimates, self._hparams, self._device)

        def filter_func(post_id: str) -> bool:
            return self._posts[post_id].platform == Post.Platform.twitter

        for index in self._hparams.input_scaling_features:
            min, max, mean, std = \
                train_dataset.calc_stats_for_aux_feature(index, filter_func)

            for dataset in (train_dataset, dev_dataset, test_dataset):
                if not dataset:
                    continue

                if self._hparams.input_scaling_mode == ScalingMode.none:
                    pass
                elif self._hparams.input_scaling_mode == ScalingMode.min_max:
                    dataset.min_max_scale_aux_feature(
                        index, min, max, filter_func)
                elif self._hparams.input_scaling_mode == ScalingMode.standard:
                    dataset.standard_scale_aux_feature(
                        index, mean, std, filter_func)
                else:
                    raise ValueError('Unimplemented enum variant.')

        return train_dataset, dev_dataset, test_dataset

    class Model(nn.Module):
        def __init__(self, hparams: 'Verif.Hyperparameters'):
            super().__init__()
            self._hparams = hparams

            # -- dense layers --------------------------------------------------
            dense_num_input_dims = self._hparams.input_num_dims
            dense_num_output_dims = self._hparams.dense_num_hidden
            self._dense_layers = nn.ModuleList()
            for i in range(self._hparams.dense_num_layers):
                self._dense_layers.append(nn.Linear(
                    in_features=(dense_num_input_dims
                                 if i == 0 else dense_num_output_dims),
                    out_features=self._hparams.dense_num_hidden))

            # -- linear layer --------------------------------------------------
            if self._hparams.dense_num_layers:
                linear_num_input_dims = dense_num_output_dims
            else:
                linear_num_input_dims = self._hparams.input_num_dims
            self._linear = nn.Linear(
                in_features=linear_num_input_dims,
                out_features=len(VerifInstance.Label))

        def forward(self, features):
            x = features

            for dense in self._dense_layers:
                x = F.dropout(F.relu(dense(x)),
                              p=self._hparams.dense_dropout,
                              training=self.training)

            logits = self._linear(x)

            return logits

    def train(self,
              train_dataset: 'Verif.Dataset',
              dev_dataset: Optional['Verif.Dataset'] = None,
              print_progress: bool = False) -> 'Verif.Model':
        model = self.Model(self._hparams).to(self._device)

        criterion = nn.CrossEntropyLoss(
            weight=torch.tensor(self._hparams.class_weights,
                                dtype=torch.float32,
                                device=self._device))
        optimizer = optim.Adam(model.parameters(),
                               lr=self._hparams.learning_rate,
                               weight_decay=self._hparams.weight_decay)

        train_loader = DataLoader(
            train_dataset, batch_size=self._hparams.batch_size, shuffle=True)

        for epoch_no in range(1, self._hparams.num_epochs + 1):
            losses, labels, predictions, prediction_probs = [], [], [], []

            progress_bar = None
            if print_progress:
                progress_bar = tqdm(total=(len(train_loader)),
                                    unit='batch',
                                    desc='Epoch: {:{}d}/{:d}'.format(
                                        epoch_no,
                                        len(str(self._hparams.num_epochs)),
                                        self._hparams.num_epochs))

            for batch_no, batch in enumerate(train_loader):
                optimizer.zero_grad()

                model.train()
                batch_logits = model(batch['features'])
                with torch.no_grad():
                    batch_prediction_prob, batch_prediction = \
                        F.softmax(batch_logits, dim=1).max(dim=1)

                loss = criterion(batch_logits, batch['label'])
                loss.backward()
                optimizer.step()

                losses.append(loss.item())
                labels.append(batch['label'].data.cpu().numpy())
                predictions.append(batch_prediction.data.cpu().numpy())
                prediction_probs.append(
                    batch_prediction_prob.data.cpu().numpy())

                if progress_bar:
                    progress_bar.set_postfix({
                        'loss': '{:.2e}'.format(loss.item()),
                    })
                    progress_bar.update(1)

            if progress_bar:
                progress_bar.close()

                labels = np.concatenate(labels)
                predictions = np.concatenate(predictions)
                prediction_probs = np.concatenate(prediction_probs)

                confidences = np.maximum(0.5, prediction_probs)
                confidences[
                    predictions == VerifInstance.Label.unverified.value] = 0

                epoch_loss = np.mean(losses)
                epoch_acc = accuracy_score(labels, predictions)
                epoch_f1 = f1_score(labels, predictions, average='macro')
                epoch_rmse = rmse_score(labels, predictions, confidences)

                print('  Loss={:.2e}  Accuracy={:.2%}  F1-score={:.2%}  '
                      'RMSE={:.4f}'.format(epoch_loss, epoch_acc, epoch_f1,
                                           epoch_rmse))

            if print_progress and dev_dataset and \
                    (epoch_no == self._hparams.num_epochs
                     or not epoch_no % EVAL_DEV_EVERY_N_EPOCH):
                dev_acc, dev_f1, dev_rmse, _ = self.eval(model, dev_dataset)
                print('  Validation:    Accuracy={:.2%}  F1-score={:.2%}  '
                      'RMSE={:.4f}'.format(dev_acc, dev_f1, dev_rmse))

        return model

    def eval(self, model: 'Verif.Model', dataset: 'Verif.Dataset') \
            -> (float, float, float, Dict[str, Dict[str, float]]):
        labels, predictions, prediction_probs = [], [], []

        with torch.no_grad():
            data_loader = DataLoader(
                dataset, batch_size=self._hparams.batch_size)
            for batch in data_loader:
                model.eval()
                batch_logits = model(batch['features'])
                batch_prediction_probs, batch_predictions = \
                    F.softmax(batch_logits, dim=1).max(dim=1)

                labels.append(batch['label'].data.cpu().numpy())
                predictions.append(batch_predictions.data.cpu().numpy())
                prediction_probs.append(
                    batch_prediction_probs.data.cpu().numpy())

        labels = np.concatenate(labels)
        predictions = np.concatenate(predictions)
        prediction_probs = np.concatenate(prediction_probs)

        confidences = np.maximum(0.5, prediction_probs)
        confidences[predictions == VerifInstance.Label.unverified.value] = 0

        acc = accuracy_score(labels, predictions)
        f1 = f1_score(labels, predictions, average='macro')
        rmse = rmse_score(labels, predictions, confidences)
        report = classification_report(
            labels, predictions, output_dict=True,
            labels=range(len(VerifInstance.Label)),
            target_names=[label.name for label in VerifInstance.Label])

        return acc, f1, rmse, report

    def predict(self,
                model: 'Verif.Model',
                post_ids: Iterable[str],
                sdqc_estimates: Dict[str, Tuple[SdqcInstance.Label,
                                                Dict[SdqcInstance.Label,
                                                     float]]]) \
            -> Dict[str, Tuple[VerifInstance.Label, float]]:
        instances = [VerifInstance(post_id) for post_id in post_ids]
        dataset = self.Dataset(
            instances, self._posts, self._post_embeddings, sdqc_estimates,
            self._hparams, self._device)

        results = {}
        with torch.no_grad():
            data_loader = DataLoader(dataset,
                                     batch_size=self._hparams.batch_size)
            for batch in data_loader:
                model.eval()
                batch_logits = model(batch['features'])
                batch_prediction_probs, batch_predictions = \
                    F.softmax(batch_logits, dim=1).max(dim=1)

                batch_predictions = batch_predictions.data.cpu().numpy()
                batch_prediction_probs = \
                    batch_prediction_probs.data.cpu().numpy()

                batch_confidences = np.maximum(0.5, batch_prediction_probs)
                batch_confidences[batch_predictions
                                  == VerifInstance.Label.unverified.value] = 0
                batch_predictions[batch_predictions
                                  == VerifInstance.Label.unverified.value] = \
                    VerifInstance.Label.true.value

                for post_id, prediction, confidence in zip(
                        batch['post_id'], batch_predictions, batch_confidences):
                    results[post_id] = (VerifInstance.Label(prediction.item()),
                                        confidence.item())

        return results


In [39]:
from datetime import datetime
from itertools import chain
from pathlib import Path
from time import time
from warnings import filterwarnings

import numpy as np
import torch
from sklearn.exceptions import UndefinedMetricWarning

# from src.dataset import check_for_required_external_data_files, load_posts, \
#     load_sdcq_instances, load_verif_instances
# from src.sdqc import Sdqc
# from src.util import ScalingMode, arrange_folds_for_k_fold_cross_validation, \
#     calculate_post_elmo_embeddings, display_results, filter_instances, \
#     generate_folds_for_k_fold_cross_validation, write_answers_json
# from src.verif import Verif

time_before = time()

torch.backends.cudnn.benchmark = True
torch.backends.cudnn.fastest = True

filterwarnings('ignore', category=UndefinedMetricWarning)

NUM_ORGA_REPETITIONS = 10
NUM_CV_REPETITIONS = 1
NUM_CV_FOLDS = 10

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# device = torch.device('cpu')

check_for_required_external_data_files()

posts = load_posts()
post_embeddings = calculate_post_elmo_embeddings(
    posts,
    max_sentence_length=32,
    batch_size=512,
    scalar_mix_parameters=[0, 0, 0],
    device=device)
num_emb_dims = next(iter(post_embeddings.values())).shape[0]

sdqc_hparams = Sdqc.Hyperparameters(
    max_sentence_length=32,
    batch_size=512,
    num_epochs=100,
    learning_rate=1e-3,
    weight_decay=1e-2,
    class_weights=[1, 1, 1, 0.2],
    input_num_emb_dims=num_emb_dims,
    input_num_aux_dims=11,
    input_aux_scaling_features=[4, 5, 6],
    input_aux_scaling_mode=ScalingMode.min_max,
    conv_num_layers=1,
    conv_kernel_sizes=[2, 3],
    conv_num_channels=64,
    dense_num_layers=3,
    dense_num_hidden=128,
    dense_dropout=0.5)
sdqc = Sdqc(posts, post_embeddings, sdqc_hparams, device)

verif_hparams = Verif.Hyperparameters(
    batch_size=128,
    num_epochs=5000,
    learning_rate=1e-3,
    weight_decay=1e-2,
    class_weights=[1, 1, 0.3],
    input_num_dims=16,
    input_scaling_features=[4, 5, 6],
    input_scaling_mode=ScalingMode.min_max,
    dense_num_layers=2,
    dense_num_hidden=512,
    dense_dropout=0.25)
verif = Verif(posts, post_embeddings, verif_hparams, device)



Loading posts...
  Number of posts: 8529 (Reddit=1895, Twitter=6634)
  Took 3.68s.
Calculating post embeddings...
  Took 980.68s.


In [0]:
sdqc_train_instances, sdqc_dev_instances, sdqc_test_instances = \
    load_sdcq_instances()
print(sdqc_train_instances)
print("=======================================================>")
print(sdqc_dev_instances)
print("=======================================================>")
print(sdqc_test_instances)
sdqc_all_instances = list(chain(
    sdqc_train_instances, sdqc_dev_instances, sdqc_test_instances))
verif_train_instances, verif_dev_instances, verif_test_instances = \
    load_verif_instances()
verif_all_instances = list(chain(
    verif_train_instances, verif_dev_instances, verif_test_instances))

sdqc_times, verif_times = [], []
sdqc_dev_accs, sdqc_dev_f1s, sdqc_dev_reports = [], [], []
sdqc_test_accs, sdqc_test_f1s, sdqc_test_reports = [], [], []
sdqc_cv_accs, sdqc_cv_f1s, sdqc_cv_reports = [], [], []
verif_dev_accs, verif_dev_f1s, verif_dev_rmses, verif_dev_reports = \
    [], [], [], []
verif_test_accs, verif_test_f1s, verif_test_rmses, verif_test_reports = \
    [], [], [], []
verif_cv_accs, verif_cv_f1s, verif_cv_rmses, verif_cv_reports = [], [], [], []

answers_dir = Path('answers') / (datetime.utcnow().isoformat() + 'Z')
answers_dir.mkdir(parents=True, exist_ok=False)

print()
print('-- Organizer Split ----------------------------------------------------')

for repetition_no in range(NUM_ORGA_REPETITIONS):
    print()
    print('## Repetition {}/{}'.format(repetition_no + 1, NUM_ORGA_REPETITIONS))

    print('Task A: SDQC')
    t1 = time()
    sdqc_train_dataset, sdqc_dev_dataset, sdqc_test_dataset = \
        sdqc.build_datasets(sdqc_train_instances,
                            sdqc_dev_instances,
                            sdqc_test_instances)
    sdqc_model = sdqc.train(sdqc_train_dataset,
                            sdqc_dev_dataset,
                            print_progress=False)
    t2 = time()
    sdqc_times.append(t2 - t1)

    sdqc_estimates = sdqc.predict(sdqc_model, posts.keys())
    if sdqc_dev_dataset:
        acc, f1, report = sdqc.eval(sdqc_model, sdqc_dev_dataset)
        print('Validation:  Accuracy={:.1%}  F1-score={:.1%}'.format(acc, f1))
        sdqc_dev_accs.append(acc)
        sdqc_dev_f1s.append(f1)
        sdqc_dev_reports.append(report)
    if sdqc_test_dataset:
        acc, f1, report = sdqc.eval(sdqc_model, sdqc_test_dataset)
        print('Test:        Accuracy={:.1%}  F1-score={:.1%}'.format(acc, f1))
        sdqc_test_accs.append(acc)
        sdqc_test_f1s.append(f1)
        sdqc_test_reports.append(report)

    # model_path = 'data/sdqc_model_{}.pth'.format(repetition_no)
    # torch.save(sdqc_model.state_dict(),model_path)

    print('Task B: Verification')
    t1 = time()
    verif_train_dataset, verif_dev_dataset, verif_test_dataset = \
        verif.build_datasets(verif_train_instances,
                             verif_dev_instances,
                             verif_test_instances,
                             sdqc_estimates)
    verif_model = verif.train(verif_train_dataset,
                              verif_dev_dataset,
                              print_progress=False)
    t2 = time()
    verif_times.append(t2 - t1)

    verif_estimates = verif.predict(
        verif_model,
        [post.id for post in posts.values() if post.has_source_depth],
        sdqc_estimates)
    if verif_dev_dataset:
        acc, f1, rmse, report = verif.eval(verif_model, verif_dev_dataset)
        print('Validation:  Accuracy={:.1%}  F1-score={:.1%}  RMSE={:.3f}'
              .format(acc, f1, rmse))
        verif_dev_accs.append(acc)
        verif_dev_f1s.append(f1)
        verif_dev_rmses.append(rmse)
        verif_dev_reports.append(report)
    if verif_test_dataset:
        acc, f1, rmse, report = verif.eval(verif_model, verif_test_dataset)
        print('Test:        Accuracy={:.1%}  F1-score={:.1%}  RMSE={:.3f}'
              .format(acc, f1, rmse))
        verif_test_accs.append(acc)
        verif_test_f1s.append(f1)
        verif_test_rmses.append(rmse)
        verif_test_reports.append(report)

        # model_path = 'data/verif_model_{}.pth'.format(repetition_no)
        # torch.save(verif_model.state_dict(),model_path)

    write_answers_json(
        answers_dir / 'answers.organizers_rep{}_train.json'.format(
            repetition_no),
        sdqc_train_instances, verif_train_instances,
        sdqc_estimates, verif_estimates)
    if sdqc_dev_instances and verif_dev_instances:
        write_answers_json(
            answers_dir / 'answers.organizers_rep{}_dev.json'.format(
                repetition_no),
            sdqc_dev_instances, verif_dev_instances,
            sdqc_estimates, verif_estimates)
    if sdqc_test_instances and verif_test_instances:
        write_answers_json(
            answers_dir / 'answers.organizers_rep{}_test.json'.format(
                repetition_no),
            sdqc_test_instances, verif_test_instances,
            sdqc_estimates, verif_estimates)

print()
print('-- k-fold Cross Validation --------------------------------------------')

for repetition_no in range(NUM_CV_REPETITIONS):
    print()
    print('## Repetition {}/{}'.format(repetition_no + 1, NUM_CV_REPETITIONS))

    folds = generate_folds_for_k_fold_cross_validation(posts, NUM_CV_FOLDS)
    for i in range(NUM_CV_FOLDS):
        print()
        print('# Cross Validation {}/{}'.format(i + 1, NUM_CV_FOLDS))

        train_post_ids, test_post_ids = \
            arrange_folds_for_k_fold_cross_validation(folds, i)

        print('Task A: SDQC')
        sdqc_train_instances, sdqc_test_instances = \
            filter_instances(train_post_ids, test_post_ids, sdqc_all_instances)
        sdqc_train_dataset, _, sdqc_test_dataset = \
            sdqc.build_datasets(sdqc_train_instances, None, sdqc_test_instances)
        sdqc_model = sdqc.train(sdqc_train_dataset, print_progress=False)
        sdqc_estimates = sdqc.predict(sdqc_model, posts.keys())
        acc, f1, report = sdqc.eval(sdqc_model, sdqc_test_dataset)
        print('Test:        Accuracy={:.1%}  F1-score={:.1%}'
              .format(acc, f1))
        sdqc_cv_accs.append(acc)
        sdqc_cv_f1s.append(f1)
        sdqc_cv_reports.append(report)

        print('Task B: Verification')
        verif_train_instances, verif_test_instances = \
            filter_instances(train_post_ids, test_post_ids, verif_all_instances)
        verif_train_dataset, _, verif_test_dataset = \
            verif.build_datasets(verif_train_instances, None,
                                 verif_test_instances,
                                 sdqc_estimates)
        verif_model = verif.train(verif_train_dataset, print_progress=False)
        verif_estimates = verif.predict(
            verif_model,
            [post.id for post in posts.values() if post.has_source_depth],
            sdqc_estimates)
        acc, f1, rmse, report = verif.eval(verif_model, verif_test_dataset)
        print('Test:        Accuracy={:.1%}  F1-score={:.1%}  RMSE={:.3f}'
              .format(acc, f1, rmse))
        verif_cv_accs.append(acc)
        verif_cv_f1s.append(f1)
        verif_cv_rmses.append(rmse)
        verif_cv_reports.append(report)

        write_answers_json(
            answers_dir / 'answers.kfold_rep{}_cv{}_train.json'.format(
                repetition_no, i),
            sdqc_train_instances, verif_train_instances,
            sdqc_estimates, verif_estimates)
        write_answers_json(
            answers_dir / 'answers.kfold_rep{}_cv{}_test.json'.format(
                repetition_no, i),
            sdqc_test_instances, verif_test_instances,
            sdqc_estimates, verif_estimates)

print()
print('-- Results ------------------------------------------------------------')

print()
print('# Validation')
display_results(
    sdqc_dev_accs, sdqc_dev_f1s, sdqc_dev_reports,
    verif_dev_accs, verif_dev_f1s, verif_dev_rmses, verif_dev_reports)

print()
print('# Test')
display_results(
    sdqc_test_accs, sdqc_test_f1s, sdqc_test_reports,
    verif_test_accs, verif_test_f1s, verif_test_rmses, verif_test_reports)

print()
print('# Cross-Validation')
display_results(
    sdqc_cv_accs, sdqc_cv_f1s, sdqc_cv_reports,
    verif_cv_accs, verif_cv_f1s, verif_cv_rmses, verif_cv_reports)

print()
print('# Runtime')
print('SDQC:  {:.2}±{:.2}s'.format(np.mean(sdqc_times), np.std(sdqc_times)))
print('Verif: {:.2}±{:.2}s'.format(np.mean(verif_times), np.std(verif_times)))

print()
time_after = time()
print('Program ran for {:.2f}s in total'.format(time_after - time_before))


[SDQC (501760642928635904, Label.comment), SDQC (500270212198174720, Label.comment), SDQC (524971210275565568, Label.comment), SDQC (552836882770690049, Label.comment), SDQC (500289931097296897, Label.support), SDQC (544293230649810944, Label.comment), SDQC (544283032732332032, Label.comment), SDQC (500406679176241153, Label.comment), SDQC (500300182844878848, Label.comment), SDQC (552811591683821568, Label.comment), SDQC (524938343180931073, Label.comment), SDQC (521360786409943041, Label.deny), SDQC (524946532416884736, Label.comment), SDQC (544347456238931968, Label.comment), SDQC (525020658972258305, Label.query), SDQC (552817834628501505, Label.comment), SDQC (524959078229880832, Label.comment), SDQC (553588165408473088, Label.comment), SDQC (544293748377518080, Label.comment), SDQC (553196346375954433, Label.query), SDQC (524957146656821249, Label.comment), SDQC (524992668301262848, Label.deny), SDQC (500294827351621632, Label.comment), SDQC (544307827070615553, Label.support), S

In [0]:
# !pip3 install allennlp
# !wget https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5 -P ./data/external
# !wget https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json -P ./data/external