In [1]:
import json
from pathlib import Path
import numpy as np
from copy import deepcopy
import pandas as pd

from deeppavlov.core.commands.train import read_data_by_config, train_evaluate_model_from_config
from deeppavlov.core.commands.infer import interact_model, build_model
from deeppavlov.core.commands.utils import expand_path, parse_config
from deeppavlov.core.common.params import from_params
from deeppavlov.core.common.errors import ConfigError

In [2]:
# read unlabelled data for label propagation
def read_unlabelled_data(UNLABELLED_DATA_PATH):
    """Load unlabelled text samples from a plain-text file, one sample per line.

    Args:
        UNLABELLED_DATA_PATH: path to the text file with unlabelled samples.

    Returns:
        List with the non-empty lines of the file, in file order and without
        line terminators. Lines consisting only of whitespace are kept.
    """
    with open(UNLABELLED_DATA_PATH, "r") as source:
        raw_lines = source.read().splitlines()
    # empty lines carry no sample, so they are dropped
    return [line for line in raw_lines if line]

In [3]:
def make_pl_config(CONFIG_PATH):
    """Derive a pseudo-labeling copy of a config and save it next to the original.

    The copy differs from the original only in ``dataset_reader.train``, which is
    set to ``<stem of the original train file name>_pl.csv`` (``train.csv`` is
    assumed when the original config has no ``train`` entry). The copy is written
    as ``<config file stem>_pl.json`` into the directory of ``CONFIG_PATH``.

    Args:
        CONFIG_PATH: path to the original JSON config file.

    Returns:
        Tuple ``(config, config_pl)``: the original config dict and its
        pseudo-labeling copy.
    """
    source_path = Path(CONFIG_PATH)
    target_path = source_path.parent / (source_path.stem + "_pl.json")

    with open(CONFIG_PATH, "r") as config_file:
        config = json.load(config_file)

    # work on a deep copy so that the original config is returned unchanged
    config_pl = deepcopy(config)
    reader_params = config_pl["dataset_reader"]
    original_train_name = reader_params.get("train", "train.csv")
    reader_params["train"] = Path(original_train_name).stem + "_pl.csv"

    with open(target_path, "w") as config_file:
        json.dump(config_pl, config_file, indent=2)

    return config, config_pl

In [4]:
def save_extended_data(config, samples, labels, new_config = None):
    """Append pseudo-labelled samples to the train split and write it to disk.

    The train split is read with ``read_data_by_config(config)``, extended with
    ``(sample, label)`` pairs and saved to ``data_path / train`` taken from
    ``new_config`` when it is given, otherwise from ``config``.

    Args:
        config: config dict; its ``dataset_reader`` section supplies the data to
            read, the column names (``x`` and ``y``) and the class separator
            (``class_sep``, ``","`` when absent).
        samples: texts to append to the train split.
        labels: for every sample, an iterable of class names; the class names of
            one sample are joined with the class separator into a single cell.
        new_config: optional config dict whose ``dataset_reader`` section decides
            where and in which format the extended train split is written.

    Raises:
        ConfigError: if the target ``format`` is neither ``csv`` nor ``json``.
    """
    # the reader receives its own deep copy of the config
    train_data = read_data_by_config(deepcopy(config))
    train_data["train"].extend((sample, labels[i]) for i, sample in enumerate(samples))

    reader_params = config["dataset_reader"]
    x_column, y_column = reader_params["x"], reader_params["y"]
    df = pd.DataFrame(train_data["train"], columns=[x_column, y_column])
    # every label is a collection of class names; store it as one separated string
    df[y_column] = df[y_column].apply(reader_params.get("class_sep", ",").join)

    # column names come from `config`; target file and format come from `new_config` if given
    target_params = (config if new_config is None else new_config)["dataset_reader"]
    file = expand_path(Path(target_params["data_path"]) /
                       Path(target_params["train"]))

    data_format = target_params.get("format", "csv")
    if data_format == "csv":
        df.to_csv(file,
                  index=False,
                  sep=target_params.get("sep", ","))
    elif data_format == "json":
        orient = target_params.get("orient", None)
        json_kwargs = {"orient": orient,
                       "lines": target_params.get("lines", False)}
        # `index=False` is rejected by pandas (ValueError) for the default orient,
        # so it is passed only for the orients that are documented to support it
        if orient in ("split", "table"):
            json_kwargs["index"] = False
        df.to_json(file, **json_kwargs)
    else:
        raise ConfigError("Can not work with current data format")

In [5]:
# manually given parameters for pseudo-labeling

# path to the config file of the model that is going to be trained
CONFIG_PATH = "../deeppavlov/configs/classifiers/convers_vs_info.json"
# read the config, compose its pseudo-labeling copy (it is also saved next to the original),
# then pass both configs through `parse_config`
config, config_pl = map(parse_config, make_pl_config(CONFIG_PATH))
config

{'dataset_reader': {'class_name': 'basic_classification_reader',
  'x': 'Title',
  'y': 'Label',
  'data_path': '~/.deeppavlov/downloads/convers_vs_info_data/',
  'train': 'train_sber.csv',
  'valid': 'valid_sber.csv'},
 'dataset_iterator': {'class_name': 'basic_classification_iterator',
  'seed': 42},
 'chainer': {'in': ['x'],
  'in_y': ['y'],
  'pipe': [{'id': 'classes_vocab',
    'class_name': 'simple_vocab',
    'fit_on': ['y'],
    'save_path': '~/.deeppavlov/models/classifiers/convers_vs_info/model_v0/classes.dict',
    'load_path': '~/.deeppavlov/models/classifiers/convers_vs_info/model_v0/classes.dict',
    'in': 'y',
    'out': 'y_ids'},
   {'in': ['x'],
    'out': ['x_prep'],
    'class_name': 'dirty_comments_preprocessor',
    'remove_punctuation': False},
   {'in': 'x_prep',
    'out': 'x_tok',
    'id': 'my_tokenizer',
    'class_name': 'nltk_moses_tokenizer'},
   {'in': ['x_tok'],
    'out': ['x_emb'],
    'id': 'my_embedder',
    'class_name': 'elmo',
    'elmo_output_na

In [6]:
# file with unlabelled samples; it is looked up inside the dataset directory of the config
UNLABELLED_DATA_PATH = expand_path(Path(config["dataset_reader"]["data_path"])) / "question_L6.txt"
# number of unlabelled samples drawn for labelling during one iteration of label propagation
ONE_ITERATION_PORTION = 2000
# number of label propagation iterations
N_ITERATIONS = 10
# position of the classes vocabulary component in config["chainer"]["pipe"]
CLASSES_VOCAB_ID_IN_PIPE = 0
# a class is assigned to a sample only if its predicted probability exceeds this value
CONFIDENT_PROBA = 0.9

In [7]:
# load the unlabelled samples
unlabelled_data = read_unlabelled_data(UNLABELLED_DATA_PATH)

# write the initial train set, with nothing added yet, to the train file
# of the pseudo-labeling config
save_extended_data(config, samples=[], labels=[], new_config=config_pl)



In [None]:
# first of all, train the initial model on the initial dataset (without pseudo-labeling);
# a deep copy is passed, so the `config` dict itself is not handed to the training routine
train_evaluate_model_from_config(deepcopy(config))

2018-11-19 14:27:35.275 INFO in 'deeppavlov.core.data.simple_vocab'['simple_vocab'] at line 89: [saving vocabulary to /home/dilyara/.deeppavlov/models/classifiers/convers_vs_info/model_v0/classes.dict]
[nltk_data] Downloading package punkt to /home/dilyara/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/dilyara/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package perluniprops to
[nltk_data]     /home/dilyara/nltk_data...
[nltk_data]   Package perluniprops is already up-to-date!
[nltk_data] Downloading package nonbreaking_prefixes to
[nltk_data]     /home/dilyara/nltk_data...
[nltk_data]   Package nonbreaking_prefixes is already up-to-date!
Using TensorFlow backend.
2018-11-19 14:27:36.456 INFO in 'tensorflow'['tf_logging'] at line 159: Using /tmp/tfhub_modules to cache modules.
2018-11-19 14:27:38.106 DEBUG in 'tensorflow'['tf_logging'] at line 100: Initiali

2018-11-19 14:27:38.209 DEBUG in 'tensorflow'['tf_logging'] at line 100: Initialize variable module/bilm/RNN_0/RNN/MultiRNNCell/Cell0/rnn/lstm_cell/kernel:0 from checkpoint b'/home/dilyara/.deeppavlov/downloads/embeddings/yahooo-sber-questions_epoches_n_9/variables/variables' with bilm/RNN_0/RNN/MultiRNNCell/Cell0/rnn/lstm_cell/kernel
2018-11-19 14:27:38.211 DEBUG in 'tensorflow'['tf_logging'] at line 100: Initialize variable module/bilm/RNN_0/RNN/MultiRNNCell/Cell0/rnn/lstm_cell/projection/kernel:0 from checkpoint b'/home/dilyara/.deeppavlov/downloads/embeddings/yahooo-sber-questions_epoches_n_9/variables/variables' with bilm/RNN_0/RNN/MultiRNNCell/Cell0/rnn/lstm_cell/projection/kernel
2018-11-19 14:27:38.215 DEBUG in 'tensorflow'['tf_logging'] at line 100: Initialize variable module/bilm/RNN_0/RNN/MultiRNNCell/Cell1/rnn/lstm_cell/bias:0 from checkpoint b'/home/dilyara/.deeppavlov/downloads/embeddings/yahooo-sber-questions_epoches_n_9/variables/variables' with bilm/RNN_0/RNN/MultiRNNC

2018-11-19 14:27:39.735 INFO in 'tensorflow'['tf_logging'] at line 115: Saver not created because there are no variables in the graph to restore
2018-11-19 14:27:40.130 INFO in 'deeppavlov.models.classifiers.keras_classification_model'['keras_classification_model'] at line 258: [initializing `KerasClassificationModel` from scratch as bigru_with_max_aver_pool_model]
2018-11-19 14:27:40.800 INFO in 'deeppavlov.models.classifiers.keras_classification_model'['keras_classification_model'] at line 132: Model was successfully initialized!
Model summary:
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 20, 1024)     0                                            
__________________________________________________________________________________________________
dropout_1 (Dropout)             (None, 20, 1024)   

{"valid": {"eval_examples_count": 181, "metrics": {"roc_auc": 0.3269, "sets_accuracy": 0.547, "f1_macro": 0.3644}, "time_spent": "0:00:03", "epochs_done": 0, "batches_seen": 0, "train_examples_seen": 0, "impatience": 0, "patience_limit": 5}}
{"train": {"epochs_done": 1, "batches_seen": 7, "train_examples_seen": 1619, "metrics": {"roc_auc": 0.5243, "sets_accuracy": 0.53, "f1_macro": 0.5252}, "time_spent": "0:00:32", "loss": 1.1731205923216683}}


  'precision', 'predicted', average, warn_for)
2018-11-19 14:28:14.363 INFO in 'deeppavlov.core.commands.train'['train'] at line 524: New best roc_auc of 0.7989
2018-11-19 14:28:14.365 INFO in 'deeppavlov.core.commands.train'['train'] at line 526: Saving model
2018-11-19 14:28:14.365 INFO in 'deeppavlov.models.classifiers.keras_classification_model'['keras_classification_model'] at line 372: [saving model to /home/dilyara/.deeppavlov/models/classifiers/convers_vs_info/model_v0/model_opt.json]


{"valid": {"eval_examples_count": 181, "metrics": {"roc_auc": 0.7989, "sets_accuracy": 0.5525, "f1_macro": 0.3559}, "time_spent": "0:00:34", "epochs_done": 1, "batches_seen": 7, "train_examples_seen": 1619, "impatience": 0, "patience_limit": 5}}
{"train": {"epochs_done": 2, "batches_seen": 14, "train_examples_seen": 3238, "metrics": {"roc_auc": 0.872, "sets_accuracy": 0.7573, "f1_macro": 0.7279}, "time_spent": "0:01:01", "loss": 0.6307094352585929}}


2018-11-19 14:28:42.743 INFO in 'deeppavlov.core.commands.train'['train'] at line 524: New best roc_auc of 0.9525
2018-11-19 14:28:42.744 INFO in 'deeppavlov.core.commands.train'['train'] at line 526: Saving model
2018-11-19 14:28:42.745 INFO in 'deeppavlov.models.classifiers.keras_classification_model'['keras_classification_model'] at line 372: [saving model to /home/dilyara/.deeppavlov/models/classifiers/convers_vs_info/model_v0/model_opt.json]


{"valid": {"eval_examples_count": 181, "metrics": {"roc_auc": 0.9525, "sets_accuracy": 0.884, "f1_macro": 0.8807}, "time_spent": "0:01:02", "epochs_done": 2, "batches_seen": 14, "train_examples_seen": 3238, "impatience": 0, "patience_limit": 5}}
{"train": {"epochs_done": 3, "batches_seen": 21, "train_examples_seen": 4857, "metrics": {"roc_auc": 0.9548, "sets_accuracy": 0.903, "f1_macro": 0.9014}, "time_spent": "0:01:28", "loss": 0.4995504489966801}}


2018-11-19 14:29:09.761 INFO in 'deeppavlov.core.commands.train'['train'] at line 524: New best roc_auc of 0.9641
2018-11-19 14:29:09.761 INFO in 'deeppavlov.core.commands.train'['train'] at line 526: Saving model
2018-11-19 14:29:09.762 INFO in 'deeppavlov.models.classifiers.keras_classification_model'['keras_classification_model'] at line 372: [saving model to /home/dilyara/.deeppavlov/models/classifiers/convers_vs_info/model_v0/model_opt.json]


{"valid": {"eval_examples_count": 181, "metrics": {"roc_auc": 0.9641, "sets_accuracy": 0.9061, "f1_macro": 0.903}, "time_spent": "0:01:29", "epochs_done": 3, "batches_seen": 21, "train_examples_seen": 4857, "impatience": 0, "patience_limit": 5}}
{"train": {"epochs_done": 4, "batches_seen": 28, "train_examples_seen": 6476, "metrics": {"roc_auc": 0.9655, "sets_accuracy": 0.9067, "f1_macro": 0.9053}, "time_spent": "0:01:55", "loss": 0.424065819808415}}


2018-11-19 14:29:36.969 INFO in 'deeppavlov.core.commands.train'['train'] at line 524: New best roc_auc of 0.9681
2018-11-19 14:29:36.970 INFO in 'deeppavlov.core.commands.train'['train'] at line 526: Saving model
2018-11-19 14:29:36.971 INFO in 'deeppavlov.models.classifiers.keras_classification_model'['keras_classification_model'] at line 372: [saving model to /home/dilyara/.deeppavlov/models/classifiers/convers_vs_info/model_v0/model_opt.json]


{"valid": {"eval_examples_count": 181, "metrics": {"roc_auc": 0.9681, "sets_accuracy": 0.9116, "f1_macro": 0.9099}, "time_spent": "0:01:57", "epochs_done": 4, "batches_seen": 28, "train_examples_seen": 6476, "impatience": 0, "patience_limit": 5}}
{"train": {"epochs_done": 5, "batches_seen": 35, "train_examples_seen": 8095, "metrics": {"roc_auc": 0.974, "sets_accuracy": 0.9191, "f1_macro": 0.9183}, "time_spent": "0:02:26", "loss": 0.3873091084616525}}


2018-11-19 14:30:07.915 INFO in 'deeppavlov.core.commands.train'['train'] at line 531: Did not improve on the roc_auc of 0.9681


{"valid": {"eval_examples_count": 181, "metrics": {"roc_auc": 0.9638, "sets_accuracy": 0.9061, "f1_macro": 0.9044}, "time_spent": "0:02:28", "epochs_done": 5, "batches_seen": 35, "train_examples_seen": 8095, "impatience": 1, "patience_limit": 5}}


In [None]:
# ids (indices in `unlabelled_data`) of the samples that have not been drawn yet
available_unlabelled_ids = np.arange(len(unlabelled_data))

np.random.seed(42)

for i in range(N_ITERATIONS):
    if len(available_unlabelled_ids) == 0:
        print("Iteration {}: no unlabelled samples left".format(i))
        break

    samples = []
    labels = []

    # draw positions in the pool without replacement, so that no sample is
    # taken twice within one iteration; the last portion may be smaller
    positions = np.random.choice(len(available_unlabelled_ids),
                                 size=min(ONE_ITERATION_PORTION, len(available_unlabelled_ids)),
                                 replace=False)
    ids_to_label = available_unlabelled_ids[positions]
    # np.delete expects positions in the array, not the id values stored in it
    available_unlabelled_ids = np.delete(available_unlabelled_ids, positions)

    # train on the current extended train set, then build the model for inference
    train_evaluate_model_from_config(deepcopy(config_pl))
    # `build_model` is the name imported at the top of the notebook
    model = build_model(deepcopy(config_pl))
    # class names, indexable by the positions of the predicted probabilities
    classes = np.array(list(from_params(
        deepcopy(config_pl["chainer"]["pipe"][CLASSES_VOCAB_ID_IN_PIPE])).keys()))

    for sample_id in ids_to_label:
        prediction = model([unlabelled_data[sample_id]])[0]
        confident_classes = np.where(np.array(prediction) > CONFIDENT_PROBA)
        # keep the sample only if at least one class is predicted confidently
        if len(confident_classes[0]):
            samples.append(unlabelled_data[sample_id])
            labels.append(classes[confident_classes])

    print("Iteration {}: add {} samples to train dataset".format(i, len(samples)))
    save_extended_data(config_pl, samples, labels)