# Subject Recognition CRF Tagger Model

In [1]:
import sys
# Prepend both relative directories to the module search path in one step.
# The resulting order is the same as two successive insert(0, ...) calls:
# '../../' is searched first, then '../../../allennlp' (presumably the
# repository root and a local AllenNLP checkout -- confirm against the layout).
sys.path[:0] = ['../../', '../../../allennlp']

In [18]:
import os
from allennlp.common.params import Params

def _build_crf_tagger_config():
    """Return the base experiment configuration for the CRF tagger.

    The result is a plain nested dict (it is wrapped in ``Params`` elsewhere
    in this notebook). Each section is assembled on its own and then stitched
    together in the same top-level key order a single literal would use.
    """
    # Original annotation for this seed: "0.957081"; other random seeds ranged
    # from 0.954 - 0.956 (presumably the validation metric -- confirm).
    seed = 4006490763

    dataset_reader = {
        "type": "sequence_tagging",
        "word_tag_delimiter": "/",
        "token_indexers": {
            "tokens": {"type": "single_id", "lowercase_tokens": True},
            "token_characters": {
                "type": "characters",
                # Four padding tokens are appended to every character sequence.
                "character_tokenizer": {"end_tokens": ["@@PADDING@@"] * 4},
            },
        },
    }

    word_embedding = {
        "type": "embedding",
        "pretrained_file": "https://s3-us-west-2.amazonaws.com/allennlp/datasets/glove/glove.6B.100d.txt.gz",
        "embedding_dim": 100,
        "trainable": False,
    }
    character_embedding = {
        "type": "character_encoding",
        "embedding": {"embedding_dim": 25},
        "encoder": {
            "type": "cnn",
            "embedding_dim": 25,
            "num_filters": 100,
            "ngram_filter_sizes": [5],
        },
        "dropout": 0.25,
    }
    sequence_encoder = {
        "type": "lstm",
        # Presumably 100 (word vectors) + 100 (CNN filters) -- confirm.
        "input_size": 200,
        "hidden_size": 600,
        "num_layers": 3,
        "dropout": 0.25,
        "bidirectional": True,
    }
    model = {
        "type": "crf_tagger",
        "text_field_embedder": {
            "tokens": word_embedding,
            "token_characters": character_embedding,
        },
        "encoder": sequence_encoder,
        "regularizer": [["transitions$", {"type": "l2", "alpha": 0.01}]],
    }

    iterator = {
        "type": "bucket",
        "sorting_keys": [["tokens", "num_tokens"]],
        "padding_noise": 0.3,
        "batch_size": 64,
    }

    trainer = {
        "num_epochs": 100,
        "cuda_device": 0,
        "patience": 6,
        "optimizer": {"type": "adam", "amsgrad": False},
        "validation_metric": "+accuracy",
        "learning_rate_scheduler": {
            "type": "reduce_on_plateau",
            "factor": 0.5,
            "mode": "max",
            "patience": 2,
        },
    }

    return {
        "dataset_reader": dataset_reader,
        # The same value seeds all three random number generators.
        "pytorch_seed": seed,
        "numpy_seed": seed,
        "random_seed": seed,
        # TODO: Update this to the location for subject name recognition training data
        "train_data_path": './../../data/subject_recognition/train.txt',
        "validation_data_path": './../../data/subject_recognition/dev.txt',
        "model": model,
        "iterator": iterator,
        "trainer": trainer,
    }


crf_tagger = _build_crf_tagger_config()

## Single Run

In [3]:
# from lib.utils import config_logging
# from lib.utils import new_experiment_folder
# from allennlp.commands.train import train_model

# config_logging()
# experiment_folder = new_experiment_folder(label='subject_recognition', parent_directory='../../experiments/')
# # experiment_folder = '../../experiments/subject_recognition.02_11_08:21:07'
# params = Params(crf_tagger)
# print('Serialization Directory:', experiment_folder)
# train_model(params=params, serialization_dir=experiment_folder)

## Grid Search

In [4]:
from copy import deepcopy
import itertools

# Candidate character-level embedder configurations for the grid search.
# Every entry is an independent deep copy of a base configuration with one
# grid point applied, so editing one candidate never touches another.
token_characters_ = []
# Disabled RNN-based variant, kept for reference:
# base_rnn_token_characters = {
#     "type": "character_encoding",
#     "embedding": {
#         "embedding_dim": 25
#     },
#     "encoder": {
#         "type": "gru",
#         "input_size": 25,
#         "hidden_size": 50,
#         "num_layers": 2,
#         "dropout": 0.25,
#         "bidirectional": True
#     },
#     "dropout": 0.25,
# }
# rnn_token_characters_space = [[0.0, 0.25, 0.5], [1, 2, 3], ['gru', 'lstm']]
# for dropout, num_layers, type_ in itertools.product(*rnn_token_characters_space):
#     rnn_config = deepcopy(base_rnn_token_characters)
#     rnn_config['encoder']['dropout'] = dropout
#     rnn_config['dropout'] = dropout
#     rnn_config['encoder']['num_layers'] = num_layers
#     rnn_config['encoder']['type'] = type_
#     token_characters_.append(rnn_config)

# Template for the CNN character encoder; only `dropout` is varied below.
base_cnn_token_characters = {
    "type": "character_encoding",
    "embedding": {
        "embedding_dim": 25
    },
    "encoder": {
        "type": "cnn",
        "embedding_dim": 25,
        "num_filters": 100,
        "ngram_filter_sizes": [5]
    },
    "dropout": 0.25,
}
# One list of candidate values per tuned hyperparameter (here: dropout only).
cnn_token_characters_space = [[0.25]]
for (dropout,) in itertools.product(*cnn_token_characters_space):
    # Deliberately not named `copy`: this notebook also uses `copy` as the
    # stdlib module, and a global dict of that name would shadow it.
    cnn_config = deepcopy(base_cnn_token_characters)
    cnn_config['dropout'] = dropout
    token_characters_.append(cnn_config)

# Candidate sequence-encoder configurations for the grid search: one entry
# per point in the Cartesian product of `encoder_space`.
encoders = []
# Template; every field listed in `encoder_space` is overwritten per point.
base_encoder = {
    "type": "lstm",
    "input_size": 200,
    "hidden_size": 200,
    "num_layers": 3,
    "dropout": 0.25,
    "bidirectional": True
}
# Candidate values, in order: dropout, num_layers, hidden_size, type.
encoder_space = [[0.25], [2, 3, 4, 5], [600, 800, 1000], ['lstm']]
encoder_points = list(itertools.product(*encoder_space))
for dropout, num_layers, hidden_size, type_ in encoder_points:
    # Deliberately not named `copy`: this notebook also uses `copy` as the
    # stdlib module, and a global dict of that name would shadow it.
    encoder_config = deepcopy(base_encoder)
    encoder_config['dropout'] = dropout
    encoder_config['num_layers'] = num_layers
    encoder_config['hidden_size'] = hidden_size
    encoder_config['type'] = type_
    encoders.append(encoder_config)

In [5]:
import itertools
import random
# Sample 32 random seeds; each grid point is a 1-tuple holding one seed.
# randrange(2**32) draws from [0, 2**32 - 1]. randint(0, 2**32) is inclusive
# and could return 2**32 itself, which is outside the 32-bit range NumPy
# accepts as a seed (these values are later assigned to `numpy_seed`).
space = [[random.randrange(2**32) for _ in range(32)]]
points = list(itertools.product(*space))
random.shuffle(points)
print('Number of points: %d' % len(points))

Number of points: 32


In [None]:
import torch
from lib.utils import config_logging
from lib.utils import new_experiment_folder
from allennlp.commands.train import train_model
import json
import copy
import shutil

import os, sys
import stat

#config_logging()

# Seed sweep: train the otherwise unchanged `crf_tagger` configuration once
# per sampled seed. Each run's serialization directory is removed on a
# best-effort basis (errors ignored) as soon as training returns.
for run_index, (seed,) in enumerate(points):
    hyperparameters = copy.deepcopy(crf_tagger)
    # One sampled value drives all three seed entries of the configuration.
    hyperparameters.update(pytorch_seed=seed, numpy_seed=seed, random_seed=seed)

    print('–' * 100)
    experiment_folder = new_experiment_folder(
        label='subject_recognition_grid_search_' + str(run_index),
        parent_directory='../../experiments/')
    print('Seed: %s' % seed)
    print('Serialization Directory:', experiment_folder)

    params = Params(hyperparameters)
    train_model(params=params, serialization_dir=experiment_folder)

    shutil.rmtree(experiment_folder, ignore_errors=True)
    # Drop the reference to this run's Params object, then ask PyTorch to
    # release its cached GPU memory before the next run starts.
    params = None
    torch.cuda.empty_cache()