In [31]:
!conda install -c conda-forge ipywidgets

Collecting package metadata (current_repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /home/ml/miniconda3

  added / updated specs:
    - ipywidgets


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    conda-4.10.1               |   py38h578d9bd_0         3.1 MB  conda-forge
    ipywidgets-7.6.3           |     pyhd3deb0d_0         101 KB  conda-forge
    jupyterlab_widgets-1.0.0   |     pyhd8ed1ab_1         130 KB  conda-forge
    openssl-1.1.1k             |       h7f98852_0         2.1 MB  conda-forge
    widgetsnbextension-3.5.1   |   py38h578d9bd_4         1.8 MB  conda-forge
    ------------------------------------------------------------
                                           Total:         7.2 MB

The following NEW packages will be INSTALLED:

  ipywidgets         conda-forge/noarch::ipywidgets-7.6.3-pyhd3deb0d_0
  jupyterlab_widgets co

In [2]:
pip install pytorch-lightning

Collecting pytorch-lightning
  Downloading pytorch_lightning-1.3.1-py3-none-any.whl (805 kB)
[K     |████████████████████████████████| 805 kB 1.8 MB/s eta 0:00:01
Collecting torchmetrics>=0.2.0
  Downloading torchmetrics-0.3.2-py3-none-any.whl (274 kB)
[K     |████████████████████████████████| 274 kB 8.2 MB/s eta 0:00:01
Collecting pyDeprecate==0.3.0
  Downloading pyDeprecate-0.3.0-py3-none-any.whl (10 kB)
Collecting fsspec[http]>=2021.4.0
  Downloading fsspec-2021.5.0-py3-none-any.whl (111 kB)
[K     |████████████████████████████████| 111 kB 8.1 MB/s eta 0:00:01
Collecting tensorboard!=2.5.0,>=2.2.0
  Using cached tensorboard-2.4.1-py3-none-any.whl (10.6 MB)
Collecting aiohttp
  Downloading aiohttp-3.7.4.post0-cp38-cp38-manylinux2014_x86_64.whl (1.5 MB)
[K     |████████████████████████████████| 1.5 MB 7.8 MB/s eta 0:00:01
Collecting yarl<2.0,>=1.0
  Downloading yarl-1.6.3-cp38-cp38-manylinux2014_x86_64.whl (324 kB)
[K     |████████████████████████████████| 324 kB 10.9 MB/s eta 0:

In [1]:
from transformers import BertPreTrainedModel, BertModel
from torch import nn

In [2]:
from torch.nn import CrossEntropyLoss
import pytorch_lightning as pl

In [3]:
from collections import OrderedDict, UserDict
from typing import Tuple, Any, Optional
from dataclasses import dataclass
import torch
from dataclasses import fields



class ModelOutput(OrderedDict):
    """
    Base class for all model outputs as dataclass. Has a ``__getitem__`` that allows indexing by integer or slice (like
    a tuple) or strings (like a dictionary) that will ignore the ``None`` attributes. Otherwise behaves like a regular
    python dictionary.
    .. warning::
        You can't unpack a :obj:`ModelOutput` directly. Use the :meth:`~transformers.file_utils.ModelOutput.to_tuple`
        method to convert it to a tuple before.
    """

    def __post_init__(self):
        class_fields = fields(self)

        # Safety and consistency checks
        assert len(class_fields), f"{self.__class__.__name__} has no fields."
        assert all(
            field.default is None for field in class_fields[1:]
        ), f"{self.__class__.__name__} should not have more than one required field."

        first_field = getattr(self, class_fields[0].name)
        other_fields_are_none = all(getattr(self, field.name) is None for field in class_fields[1:])

        if other_fields_are_none and not is_tensor(first_field):
            try:
                iterator = iter(first_field)
                first_field_iterator = True
            except TypeError:
                first_field_iterator = False

            # if we provided an iterator as first field and the iterator is a (key, value) iterator
            # set the associated fields
            if first_field_iterator:
                for element in iterator:
                    if (
                        not isinstance(element, (list, tuple))
                        or not len(element) == 2
                        or not isinstance(element[0], str)
                    ):
                        break
                    setattr(self, element[0], element[1])
                    if element[1] is not None:
                        self[element[0]] = element[1]
            elif first_field is not None:
                self[class_fields[0].name] = first_field
        else:
            for field in class_fields:
                v = getattr(self, field.name)
                if v is not None:
                    self[field.name] = v

    def __delitem__(self, *args, **kwargs):
        raise Exception(f"You cannot use ``__delitem__`` on a {self.__class__.__name__} instance.")

    def setdefault(self, *args, **kwargs):
        raise Exception(f"You cannot use ``setdefault`` on a {self.__class__.__name__} instance.")

    def pop(self, *args, **kwargs):
        raise Exception(f"You cannot use ``pop`` on a {self.__class__.__name__} instance.")

    def update(self, *args, **kwargs):
        raise Exception(f"You cannot use ``update`` on a {self.__class__.__name__} instance.")

    def __getitem__(self, k):
        if isinstance(k, str):
            inner_dict = {k: v for (k, v) in self.items()}
            return inner_dict[k]
        else:
            return self.to_tuple()[k]

    def __setattr__(self, name, value):
        if name in self.keys() and value is not None:
            # Don't call self.__setitem__ to avoid recursion errors
            super().__setitem__(name, value)
        super().__setattr__(name, value)

    def __setitem__(self, key, value):
        # Will raise a KeyException if needed
        super().__setitem__(key, value)
        # Don't call self.__setattr__ to avoid recursion errors
        super().__setattr__(key, value)

    def to_tuple(self) -> Tuple[Any]:
        """
        Convert self to a tuple containing all the attributes/keys that are not ``None``.
        """
        return tuple(self[k] for k in self.keys())


@dataclass
class MaskedLMOutput(ModelOutput):
    """
    Base class for masked language models outputs.
    Args:
        loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
            Masked language modeling (MLM) loss.
        logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
            sequence_length, sequence_length)`.
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    loss: Optional[torch.FloatTensor] = None
    logits: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None


class BertPredictionHeadTransform(nn.Module):
    def __init__(self, hidden_size, sent_size, hidden_act, layer_norm_eps):
        super().__init__()
        self.dense = nn.Linear(hidden_size, hidden_size - sent_size)
        if isinstance(hidden_act, str):
            self.transform_act_fn = ACT2FN[hidden_act]
        else:
            self.transform_act_fn = hidden_act
        self.LayerNorm = nn.LayerNorm(hidden_size - sent_size, eps=layer_norm_eps)

    def forward(self, hidden_states):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.transform_act_fn(hidden_states)
        hidden_states = self.LayerNorm(hidden_states)
        return hidden_states


class BertLMPredictionHead(nn.Module):
    def __init__(self, hidden_size, sent_size, hidden_act, layer_norm_eps, vocab_size):
        super().__init__()
        self.transform = BertPredictionHeadTransform(hidden_size, sent_size, hidden_act, layer_norm_eps)
        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        self.decoder = nn.Linear(hidden_size - sent_size, vocab_size, bias=False)
     
        self.bias = nn.Parameter(torch.zeros(vocab_size))

        # # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
        self.decoder.bias = self.bias



    def forward(self, hidden_states):
        hidden_states = self.transform(hidden_states)
        hidden_states = self.decoder(hidden_states)

        return hidden_states

class BertOnlyMLMHead_(nn.Module):
    def __init__(self, hidden_size, sent_size, hidden_act, layer_norm_eps, vocab_size):
        super().__init__()
        self.predictions = BertLMPredictionHead(hidden_size, sent_size, hidden_act, layer_norm_eps, vocab_size)

    def forward(self, sequence_output):
        prediction_scores = self.predictions(sequence_output)
        return prediction_scores

In [4]:
import math

import torch
import torch.nn.functional as F
from packaging import version

def _gelu_python(x):
    """
    Original Implementation of the GELU activation function in Google BERT repo when initially created. For
    information: OpenAI GPT's GELU is slightly different (and gives slightly different results): 0.5 * x * (1 +
    torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) This is now written in C in
    torch.nn.functional Also see the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415
    """
    return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))


def gelu_new(x):
    """
    Implementation of the GELU activation function currently in Google BERT repo (identical to OpenAI GPT). Also see
    the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415
    """
    return 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))))


if version.parse(torch.__version__) < version.parse("1.4"):
    gelu = _gelu_python
else:
    gelu = F.gelu


def gelu_fast(x):
    return 0.5 * x * (1.0 + torch.tanh(x * 0.7978845608 * (1.0 + 0.044715 * x * x)))


def quick_gelu(x):
    return x * torch.sigmoid(1.702 * x)


def _silu_python(x):
    """
    See Gaussian Error Linear Units (Hendrycks et al., https://arxiv.org/abs/1606.08415) where the SiLU (Sigmoid Linear
    Unit) was originally introduced and coined, and see Sigmoid-Weighted Linear Units for Neural Network Function
    Approximation in Reinforcement Learning (Elfwing et al., https://arxiv.org/abs/1702.03118) and Swish: a Self-Gated
    Activation Function (Ramachandran et al., https://arxiv.org/abs/1710.05941v1) where the SiLU was experimented with
    later.
    """
    return x * torch.sigmoid(x)


if version.parse(torch.__version__) < version.parse("1.7"):
    silu = _silu_python
else:
    silu = F.silu


def mish(x):
    return x * torch.tanh(torch.nn.functional.softplus(x))


def linear_act(x):
    return x

In [5]:
import torch.nn.functional as F

ACT2FN = {
    "relu": F.relu,
    "silu": silu,
    "swish": silu,
    "gelu": gelu,
    "tanh": torch.tanh,
    "gelu_new": gelu_new,
    "gelu_fast": gelu_fast,
    "quick_gelu": quick_gelu,
    "mish": mish,
    "linear": linear_act,
    "sigmoid": torch.sigmoid,
}

In [6]:
!wget https://raw.githubusercontent.com/daria-sa/TST/main/YELP_with_sentiment_tags.json

--2021-05-18 01:39:42--  https://raw.githubusercontent.com/daria-sa/TST/main/YELP_with_sentiment_tags.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 20211271 (19M) [text/plain]
Saving to: ‘YELP_with_sentiment_tags.json’


2021-05-18 01:39:43 (31,2 MB/s) - ‘YELP_with_sentiment_tags.json’ saved [20211271/20211271]



In [9]:
import json
data = json.load(open("YELP_with_sentiment_tags.json"))
data = eval(data)

In [10]:
from typing import List
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import BertTokenizer
import torch

class MLMDataset(Dataset):

    def __init__(
      self, 
      data: List[dict], 
      model_path_or_name: str = 'bert-base-cased',
      pad_token: str = "[PAD]",
      cls_token: str = "[CLS]",
      sep_token: str = "[SEP]",
      unk_token: str = "[UNK]",
      mask_token: str = "[MASK]"
      ):
        self.tokenizer = BertTokenizer.from_pretrained(model_path_or_name, do_lower_case=False)
        self.pad_token = pad_token
        self.cls_token = cls_token
        self.sep_token = sep_token
        self.unk_token = unk_token
        self.mask_token = mask_token
        self.pad_idx = 0
        self.max_sequence_length = 512
        self.prep_samples = self.prepare_data(data)
    
    def prepare_data(self, data):
        max_seq_length_without_special = 510
        prep_samples = []

        for input_sample in tqdm(data):
            sample = {"labels": [], "sentiment_labels": [], "input_ids": []}
            for orig_token, label in zip(input_sample["tokenized_text"], input_sample["senti_tags"]):
                orig_token = str(orig_token)
                label = int(label) + 2
                cur_tokens = self.tokenizer.tokenize(str(orig_token))
                if not cur_tokens:
                    cur_tokens = [self.tokenizer.convert_tokens_to_ids(self.unk_token)]
                if label != 2:
                    for _ in cur_tokens:
                        sample["input_ids"].append(self.tokenizer.convert_tokens_to_ids(self.mask_token))
                        sample["sentiment_labels"].append(label)
                    sample["labels"].extend(cur_tokens)

                else:
                    sample["input_ids"].extend(cur_tokens)
                    sample["labels"].extend(cur_tokens)
                    sample["sentiment_labels"].extend([label] * len(cur_tokens))


            for k, v in sample.items():
                sample[k] = v[:max_seq_length_without_special]

            sample["input_ids"] = self.tokenizer.convert_tokens_to_ids(
                [self.cls_token] + sample["input_ids"] + [self.sep_token]
            )
            sample["labels"] = self.tokenizer.convert_tokens_to_ids(
                [self.cls_token] + sample["labels"] + [self.sep_token]
            )
            sample["input_mask"] = [1] * len(sample["input_ids"])
            sample["sentiment_labels"] = [2] + sample["sentiment_labels"] + [2]


            while len(sample["input_ids"]) < self.max_sequence_length:
                sample["input_ids"].append(self.pad_idx)
                sample["labels"].append(self.pad_idx)
                sample["input_mask"].append(0)
                sample["sentiment_labels"].append(2)

            sample["input_type_ids"] = [0] * len(sample["input_ids"])
            prep_samples.append(sample)
        return prep_samples

    def __getitem__(self, index):
        return self.prep_samples[index]

    def __len__(self):
        return len(self.prep_samples)

In [11]:

class TextDataLoader(DataLoader):
    def __init__(self, dataset, batch_size, shuffle, device):
        super(TextDataLoader, self).__init__(
            dataset, batch_size, collate_fn=self.collate_fn, shuffle=shuffle
        )
        self.device = device
        self.tokens = []

    def collate_fn(self, data):
        input_ids_batch = []
        input_mask_batch = []
        input_type_ids_batch = []
        labels_batch = []
        sentiment_labels_batch = []
        
        max_len = max([sum(i["input_mask"]) for i in data])

        for sample in data:

            input_ids_batch.append(sample["input_ids"][:max_len])
            input_mask_batch.append(sample["input_mask"][:max_len])
            input_type_ids_batch.append(sample["input_type_ids"][:max_len])
            labels_batch.append(sample["labels"][:max_len])
            sentiment_labels_batch.append(sample["sentiment_labels"][:max_len])

        batch = {
            "input_ids": torch.tensor(
                input_ids_batch, dtype=torch.int64, device=self.device),
            "attention_mask": torch.tensor(
                input_mask_batch, dtype=torch.int64, device=self.device),
            "token_type_ids": torch.tensor(
                input_type_ids_batch, dtype=torch.int64, device=self.device),
            "labels": torch.tensor(
                labels_batch, dtype=torch.int64, device=self.device),
            "sentiment_labels": torch.tensor(
                sentiment_labels_batch, dtype=torch.int64, device=self.device)
            
        }

        return batch

In [17]:
train_ds = MLMDataset(data=data[:7000])
valid_ds = MLMDataset(data=data[7000:])
train_dl = TextDataLoader(train_ds, 2, False, "cuda:0")
valid_dl = TextDataLoader(valid_ds, 2, False, "cuda:0")


  0%|          | 0/7000 [00:00<?, ?it/s][A
  0%|          | 22/7000 [00:00<00:31, 218.59it/s][A
  1%|          | 50/7000 [00:00<00:30, 226.45it/s][A
  1%|          | 73/7000 [00:00<00:33, 204.92it/s][A
  1%|▏         | 94/7000 [00:00<00:35, 196.75it/s][A
  2%|▏         | 119/7000 [00:00<00:32, 213.70it/s][A
  2%|▏         | 145/7000 [00:00<00:30, 225.20it/s][A
  2%|▏         | 170/7000 [00:00<00:29, 232.60it/s][A
  3%|▎         | 197/7000 [00:00<00:28, 237.54it/s][A
  3%|▎         | 222/7000 [00:00<00:28, 240.71it/s][A
  4%|▎         | 247/7000 [00:01<00:29, 230.72it/s][A
  4%|▍         | 274/7000 [00:01<00:27, 241.40it/s][A
  4%|▍         | 299/7000 [00:01<00:30, 222.81it/s][A
  5%|▍         | 323/7000 [00:01<00:29, 226.64it/s][A
  5%|▍         | 346/7000 [00:01<00:30, 216.08it/s][A
  5%|▌         | 368/7000 [00:01<00:31, 213.33it/s][A
  6%|▌         | 394/7000 [00:01<00:29, 223.58it/s][A
  6%|▌         | 417/7000 [00:01<00:30, 217.94it/s][A
  6%|▋         | 439/700

 99%|█████████▉| 6915/7000 [00:32<00:00, 210.92it/s][A
 99%|█████████▉| 6937/7000 [00:33<00:00, 208.65it/s][A
 99%|█████████▉| 6958/7000 [00:33<00:00, 205.46it/s][A
100%|██████████| 7000/7000 [00:33<00:00, 210.15it/s][A

  0%|          | 0/3000 [00:00<?, ?it/s][A
  0%|          | 14/3000 [00:00<00:22, 133.86it/s][A
  1%|          | 36/3000 [00:00<00:17, 172.24it/s][A
  2%|▏         | 59/3000 [00:00<00:14, 197.05it/s][A
  3%|▎         | 81/3000 [00:00<00:14, 204.22it/s][A
  3%|▎         | 102/3000 [00:00<00:14, 197.86it/s][A
  4%|▍         | 126/3000 [00:00<00:13, 211.43it/s][A
  5%|▍         | 148/3000 [00:00<00:14, 202.49it/s][A
  6%|▌         | 170/3000 [00:00<00:13, 207.55it/s][A
  6%|▋         | 194/3000 [00:00<00:12, 217.24it/s][A
  7%|▋         | 216/3000 [00:01<00:13, 211.53it/s][A
  8%|▊         | 240/3000 [00:01<00:12, 216.53it/s][A
  9%|▉         | 264/3000 [00:01<00:12, 223.24it/s][A
 10%|▉         | 295/3000 [00:01<00:10, 248.07it/s][A
 11%|█         | 320

In [13]:
class BertForMaskedLM(BertPreTrainedModel, pl.LightningModule):

    _keys_to_ignore_on_load_unexpected = [r"pooler"]
    _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"]

    def __init__(self, config, n_sentiment_class: int = 5, sentiment_embedding_dim: int = 100):
        super().__init__(config)

        if config.is_decoder:
            logger.warning(
                "If you want to use `BertForMaskedLM` make sure `config.is_decoder=False` for "
                "bi-directional self-attention."
            )

        self.bert = BertModel(config, add_pooling_layer=False)
        self.sentiment_embeddings = nn.Embedding(n_sentiment_class, sentiment_embedding_dim)
        config.hidden_size += sentiment_embedding_dim
        self.cls_ = BertOnlyMLMHead_(config.hidden_size, sentiment_embedding_dim, config.hidden_act, config.layer_norm_eps, config.vocab_size)

        self.init_weights()

    def get_output_embeddings(self):
        return self.cls_.predictions.decoder

    def set_output_embeddings(self, new_embeddings):
        self.cls_.predictions.decoder = new_embeddings

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        sentiment_labels=None
  ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
            Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
            config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored
            (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
        """

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.bert(
          input_ids,
          attention_mask=attention_mask,
          token_type_ids=token_type_ids,
          position_ids=position_ids,
          head_mask=head_mask,
          inputs_embeds=inputs_embeds,
          encoder_hidden_states=encoder_hidden_states,
          encoder_attention_mask=encoder_attention_mask,
          output_attentions=output_attentions,
          output_hidden_states=output_hidden_states,
          return_dict=return_dict,
          )
        sentiment_embeddings = self.sentiment_embeddings(sentiment_labels)
        sequence_output = outputs[0]
        sequence_with_sent_output = torch.cat([sentiment_embeddings, sequence_output], dim=-1)
        prediction_scores = self.cls_(sequence_with_sent_output)

        masked_lm_loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()  # -100 index = padding token
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))

        if not return_dict:
            output = (prediction_scores,) + outputs[2:]
            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output

        return MaskedLMOutput(
        loss=masked_lm_loss,
        logits=prediction_scores,
        hidden_states=outputs.hidden_states,
        attentions=outputs.attentions,
    )


    def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **model_kwargs):
        input_shape = input_ids.shape
        effective_batch_size = input_shape[0]

        #  add a dummy token
        assert self.config.pad_token_id is not None, "The PAD token should be defined for generation"
        attention_mask = torch.cat([attention_mask, attention_mask.new_zeros((attention_mask.shape[0], 1))], dim=-1)
        dummy_token = torch.full(
            (effective_batch_size, 1), self.config.pad_token_id, dtype=torch.long, device=input_ids.device
        )
        input_ids = torch.cat([input_ids, dummy_token], dim=1)

        return {"input_ids": input_ids, "attention_mask": attention_mask}

    def training_step(self, batch, batch_idx):
        loss = self.forward(
          input_ids=batch["input_ids"],
          attention_mask=batch["attention_mask"],
          token_type_ids=batch["token_type_ids"],
          position_ids=None,
          head_mask=None,
          inputs_embeds=None,
          encoder_hidden_states=None,
          encoder_attention_mask=None,
          labels=batch["labels"],
          output_attentions=None,
          output_hidden_states=None,
          return_dict=None,
          sentiment_labels=batch["sentiment_labels"]
        )
        return {'loss': loss.loss}

    def validation_step(self, batch, batch_idx):
        loss = self.forward(
          input_ids=batch["input_ids"],
          attention_mask=batch["attention_mask"],
          token_type_ids=batch["token_type_ids"],
          position_ids=None,
          head_mask=None,
          inputs_embeds=None,
          encoder_hidden_states=None,
          encoder_attention_mask=None,
          labels=batch["labels"],
          output_attentions=None,
          output_hidden_states=None,
          return_dict=None,
          sentiment_labels=batch["sentiment_labels"]
        )
        return {'loss': loss.loss}

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=5e-7)

In [19]:
trainer = pl.Trainer(gpus=-1, num_sanity_val_steps=0)
model = BertForMaskedLM.from_pretrained('bert-base-cased')
trainer.fit(model, train_dl, valid_dl)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForMaskedLM were not initialized from the mode

Training: 0it [00:00, ?it/s]

RuntimeError: CUDA out of memory. Tried to allocate 114.00 MiB (GPU 0; 3.95 GiB total capacity; 3.25 GiB already allocated; 41.44 MiB free; 3.40 GiB reserved in total by PyTorch)

In [1]:
import pandas as pd
from nltk import sent_tokenize, word_tokenize
import json


import numpy as np

import nltk
from nltk.corpus import stopwords
import string
import math


## Предобработаем данные 

In [153]:
data = pd.read_csv('/home/ml/TST/train_for_style.tsv', sep='\t',  usecols=[0,1], names=['stars', 'text'])


In [149]:
my_list = data["text"].to_list()

In [150]:

with open('input.txt', 'w') as f:
    for item in my_list:
        f.write("%s\n" % item)

In [146]:
data["text"]

Unnamed: 0,stars,text
0,2,Good sushi--definitely frequent here for their...
1,1,I have played golf all over the valley for 40 ...
2,2,I can't say enough positive about Deer Creek A...
3,2,Loved here for over a year and I enjoy it. I a...
4,1,Their Antipasto is good but pizza is expensive...
...,...,...
279995,2,"I have ridden this type of ride elsewhere, but..."
279996,1,45 mins on a wait for a banker? \n\nAll banker...
279997,1,I went here a few days ago. I tried the pot r...
279998,2,"Pizza here at Barros is very tasty, and the wi..."


In [154]:
docs = data["text"]
stars = data["stars"]



l = []
for i in range(len(docs)):
    sents = sent_tokenize(docs[i])
    for sent in sents:
        d = {"sent": sent, "star": stars[i]}
        l.append(d)
sents_data = pd.DataFrame(l) 

In [155]:
sents_data["tokenized_text"] = sents_data["sent"].apply(lambda x: word_tokenize(x))

In [160]:
my_list = sents_data["sent"].to_list()
with open('input.txt', 'w') as f:
    c = 0
    for item in my_list:
        c+=1
        if c< 500:
            f.write("%s\n" % item)
        else: 
            break
        

In [180]:
file_number = 1
idx_start = 0
idx_end = 50
chunk = my_list[idx_start:idx_end]
while chunk:
    with open('/home/ml/TST/corpus_for_corenlp_1/input_' + str(file_number) + ".txt", 'w') as chunk_file:
        chunk_file.write("\n".join(chunk))
    file_number += 1
    idx_start = idx_end
    idx_end +=50
    chunk = my_list[idx_start:idx_end]
    

In [175]:
with open("/home/ml/TST/corpus_for_corenlp/input_30.txt", 'r') as f:
    sentiwords = f.readlines()

In [8]:
with open("SentiWords_1.1.txt", 'r') as f:
    sentiwords = f.readlines()

In [9]:
sentiwords = sentiwords[26:]

In [10]:
sw_vocab = {}
for s in sentiwords:
    pair = s.split("\t")
    sw_vocab[pair[0].split("#")[0]] = float(pair[1])

In [11]:
sw_vocab_only_words = {}
for k in sw_vocab.keys():
    if "_" not in k:
        sw_vocab_only_words[k] = sw_vocab[k]
for i in string.punctuation:
    sw_vocab_only_words[i] = 0

In [12]:
def get_senti_tags(tokenized_text):    
    senti_tags = []
    for token in tokenized_text: 
        if token.lower() in sw_vocab_only_words:
            if sw_vocab_only_words[token.lower()] > 0.25:
                senti_tags.append(1)
            elif sw_vocab_only_words[token.lower()] < -0.25:
                senti_tags.append(-1)
            else:
                senti_tags.append(0)

        else:
            senti_tags.append(0)
    return senti_tags

In [13]:
sents_data["senti_tags"] = sents_data["tokenized_text"].apply(lambda x: get_senti_tags(x))

In [106]:
sents_data

Unnamed: 0,sent,star,tokenized_text,senti_tags
0,Good sushi--definitely frequent here for their...,2,"[Good, sushi, --, definitely, frequent, here, ...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
1,Any two rolls for 8.50 and any three rolls for...,2,"[Any, two, rolls, for, 8.50, and, any, three, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
2,Service here is usually prompt.,2,"[Service, here, is, usually, prompt, .]","[1, 0, 0, 0, 0, 0]"
3,Prefer this place from the State St. location ...,2,"[Prefer, this, place, from, the, State, St., l...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, ..."
4,I'm not sure what's up with the mall here but ...,2,"[I, 'm, not, sure, what, 's, up, with, the, ma...","[0, 0, -1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,..."
...,...,...,...,...
2379543,Horrible!!!,1,"[Horrible, !, !, !]","[-1, 0, 0, 0]"
2379544,Everything dried and hot food is cold too.,1,"[Everything, dried, and, hot, food, is, cold, ...","[0, 0, 0, 0, 1, 0, 0, 0, 0]"
2379545,I never had a this kind a buffet in my life..,1,"[I, never, had, a, this, kind, a, buffet, in, ...","[0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0]"
2379546,I'm still eating now.,1,"[I, 'm, still, eating, now, .]","[0, 0, 0, 0, 0, 0]"


In [107]:
result = sents_data.to_json(orient="records")

In [108]:
parsed = json.loads(result)

In [110]:
with open('YELP2_train_with_sentiment_tags.json', 'w') as outfile:
    json.dump(parsed, outfile, indent=4)

In [15]:
with open('YELP2_train_with_sentiment_tags.csv', 'w') as outfile:
    sents_data.to_csv(outfile)

## оценка

In [500]:
with open("/home/ml/TST/target_with_predictions_and_labels_v-0-2.json", "r") as f:
    data = json.load(f)

In [501]:
data[1]

{'tokenized_text': ['What',
  'are',
  'you',
  'all',
  'talking',
  'about',
  '?',
  '!',
  'This',
  'place',
  'is',
  'awful',
  '.'],
 'senti_tags': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
 'predictions': ['What',
  'are',
  'you',
  'all',
  'talking',
  'about',
  '?',
  '!',
  'This',
  'place',
  'is',
  'awesome',
  '.']}

In [502]:
def prep_input(text):
    res =[]
    for t in text:
        if t!="_":
            res.append(t)
    return res

In [503]:
tokenized_text = []
for d in data:
    tokenized_text.append(prep_input(d['tokenized_text']))

In [504]:
senti_tags = []
for d in data:
    senti_tags.append(d['senti_tags'])

In [505]:
def get_no_BPE(pred):
    sent = []
    bpe_ind = []
    stack =[]
    for i in range(len(pred)):
        
        if "##" in pred[i]:
            bpe_ind.append(pred.index(pred[i]))
        else:
            if len(bpe_ind)>0:
                sent.extend(stack[:-1])
                start = stack[-1]
                for p in bpe_ind:
                    start+=pred[p][2:]
                sent.append(start)
                stack = [pred[i]]
                bpe_ind = []
                start = ""
            else: 
                stack.append(pred[i])
    if len(stack)>0:
        sent.extend(stack)
    return sent




In [506]:
predictions = []
for d in data:
    pred = d["predictions"]
    sent = get_no_BPE(pred)
    predictions.append(sent)

In [507]:
changes = []
for i in range(len(predictions)):
    changes.extend(list(set(predictions[i]) - set(tokenized_text[i])))
len(set(changes))

2944

In [508]:
hip = []
ref = []
no_change =[]
counter = 0
for i in range(len(predictions)):
    if tokenized_text[i] == predictions[i]:
        counter+=1
        no_change.append(tokenized_text[i])
    else:
        hip.append(" ".join(predictions[i]).replace(" .", ".").replace(" ,", ',').replace(" !", "!").replace(" ?", "?"))
        ref.append(" ".join(tokenized_text[i]).replace(" ,", ',').replace(" !", "!").replace(" ?", "?"))

переводим в предложения

In [511]:
ref_sents = []
hip_sents = []
for i in range(len(ref)):
    sents_ref = sent_tokenize(ref[i])
    sents_hip = sent_tokenize(hip[i])
    if len(sents_ref) != len(sents_hip):        
        ref_sents.append(ref[i]) 
        hip_sents.append(hip[i])
    else:
        ref_sents.extend(sents_ref) 
        hip_sents.extend(sents_hip)

метрики сохранения контента

In [13]:
from collections import Counter
import tensorflow_hub as hub
import tensorflow_text as text
import tensorflow as tf
import numpy as np
import math
from sklearn.metrics.pairwise import cosine_similarity

In [14]:
# BLEU functions from https://github.com/MaximumEntropy/Seq2Seq-PyTorch
def bleu_stats(hypothesis, reference):
    """Compute statistics for BLEU."""
    stats = []
    stats.append(len(hypothesis))
    stats.append(len(reference))
    for n in range(1, 5):
        s_ngrams = Counter(
            [tuple(hypothesis[i:i + n]) for i in range(len(hypothesis) + 1 - n)]
        )
        r_ngrams = Counter(
            [tuple(reference[i:i + n]) for i in range(len(reference) + 1 - n)]
        )
        stats.append(max([sum((s_ngrams & r_ngrams).values()), 0]))
        stats.append(max([len(hypothesis) + 1 - n, 0]))
    return stats

def bleu(stats):
    """Compute BLEU given n-gram statistics."""
    if len(list(filter(lambda x: x == 0, stats))) > 0:
        return 0
    (c, r) = stats[:2]
    log_bleu_prec = sum(
        [math.log(float(x) / y) for x, y in zip(stats[2::2], stats[3::2])]
    ) / 4.
    return math.exp(min([0, 1 - float(r) / c]) + log_bleu_prec)

def get_bleu(hyp, ref):
    """Get validation BLEU score for dev set."""
    stats = np.array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])
    for h, r in zip(hyp, ref):
        stats += np.array(bleu_stats(h, r))
    return 100 * bleu(stats)

In [15]:
# на вход 2 списка: ref предложения с сорсовым стилем, hip то, что сгененрировала модель 
def get_USE_embeds_similaryty(ref, hip):    
    embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder-large/5")
    similarities = []
    for i in range(len(ref)):
        ref_sentence = embed([ref[i]])
        hip_sentence = embed([hip[i]])  
        similarities.append(cosine_similarity(ref_sentence, hip_sentence))
    return sum(similarities)/len(similarities)

    
# на вход 2 списка: ref предложения с сорсовым стилем, hip то, что сгененрировала модель 
def get_BERT_embeds_similiarity(ref, hip):
    def normalization(embeds):
        norms = np.linalg.norm(embeds, 2, axis=1, keepdims=True)
        return embeds/norms
    preprocessor = hub.KerasLayer(
            "https://tfhub.dev/google/universal-sentence-encoder-cmlm/multilingual-preprocess/2")
    encoder = hub.KerasLayer("https://tfhub.dev/google/LaBSE/2")
    similarities = []
    for i in range(len(ref)):
        ref_sentence = tf.constant([ref[i]])
        hip_sentence = tf.constant([hip[i]])  

        ref_embed = normalization(encoder(preprocessor(ref_sentence))["default"])
        hip_embed = normalization(encoder(preprocessor(hip_sentence))["default"])

        similarities.append(np.matmul(ref_embed, np.transpose(hip_embed)))
    return sum(similarities)/len(similarities)

просто пример

In [None]:
ref = ["i think this menu is great", "my cat is great", "london is the capital of great britain"]
hip = ["i suppose this menu is not that good", "i like cats", "there are many stars in the sky today"]


In [None]:
get_BERT_embeds_similiarity(ref, hip)

In [None]:
первый

In [407]:
get_bleu(hip, ref)

84.18549679751044

In [None]:
второй эксперимент 

In [300]:
get_bleu(hip, ref)

84.24131786834421

In [None]:
третий эксперимент 

In [327]:
get_bleu(hip, ref)

75.61181435532288

In [None]:
четвертый

In [434]:
get_bleu(hip, ref)

66.25296853811392

In [None]:
пятый

In [353]:
get_bleu(hip, ref)

82.15306696735729

In [None]:
шестой

In [380]:
get_bleu(hip, ref)

84.26857049837395

Метрики переноса стиля

In [39]:
!pip install tensorflow_text --upgrade



In [18]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer
import numpy as np
from scipy.special import softmax
import csv
import urllib.request

# Preprocess text (username and link placeholders)
def preprocess(text):
    new_text = []


    for t in text.split(" "):
        t = '@user' if t.startswith('@') and len(t) > 1 else t
        t = 'http' if t.startswith('http') else t
        new_text.append(t)
    return " ".join(new_text)

# Tasks:
# emoji, emotion, hate, irony, offensive, sentiment
# stance/abortion, stance/atheism, stance/climate, stance/feminist, stance/hillary

task='sentiment'
MODEL = f"cardiffnlp/twitter-roberta-base-{task}"


tokenizer = AutoTokenizer.from_pretrained(MODEL)

# download label mapping
labels=[]
mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
with urllib.request.urlopen(mapping_link) as f:
    html = f.read().decode('utf-8').split("\n")
    csvreader = csv.reader(html, delimiter='\t')
labels = [row[1] for row in csvreader if len(row) > 1]


# PT
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
model.save_pretrained(MODEL)
tokenizer.save_pretrained(MODEL)





('cardiffnlp/twitter-roberta-base-sentiment/tokenizer_config.json',
 'cardiffnlp/twitter-roberta-base-sentiment/special_tokens_map.json',
 'cardiffnlp/twitter-roberta-base-sentiment/vocab.json',
 'cardiffnlp/twitter-roberta-base-sentiment/merges.txt',
 'cardiffnlp/twitter-roberta-base-sentiment/added_tokens.json',
 'cardiffnlp/twitter-roberta-base-sentiment/tokenizer.json')

In [19]:

def get_prediction_scores(text):
    text = preprocess(text)
    encoded_input = tokenizer(text, return_tensors='pt')
    output = model(**encoded_input)
    scores = output[0][0].detach().numpy()
    scores = softmax(scores)
    ranking = np.argsort(scores)
    ranking = ranking[::-1]
    d = {"text": text, "label": ranking[0]}
    for i in range(scores.shape[0]):
        l = labels[ranking[i]]
        s = scores[ranking[i]]
        d[l]=np.round(float(s), 4)
    
    return d

In [514]:
classifier_scores_ref = []
for text in ref_sents:
    classifier_scores_ref.append(get_prediction_scores(text))

In [515]:
classifier_scores_hip = []
for text in hip_sents:
    classifier_scores_hip.append(get_prediction_scores(text))

In [437]:
pos = []
neg = []
neu = []

In [438]:

for i in classifier_scores_ref:
    if i["label"] == 2:
        pos.append(i["positive"])
    elif i["label"] == 0:
        neg.append(i["negative"])
    else:
        neu.append(i["neutral"])

In [439]:
sorted(neg)[0]

0.3737

In [440]:
classifier_scores_ref[:5]

[{'text': 'But the pool is kept up impeccably, and the locker room is nice ( except during the summer time when the kids make everything sticky ) A great facility overall .',
  'label': 2,
  'positive': 0.9702,
  'neutral': 0.0273,
  'negative': 0.0025},
 {'text': 'What are you all talking about?!',
  'label': 1,
  'neutral': 0.5195,
  'negative': 0.4479,
  'positive': 0.0326},
 {'text': 'This place is awful .',
  'label': 0,
  'negative': 0.9677,
  'neutral': 0.0272,
  'positive': 0.0051},
 {'text': 'Theres a reason why these sandwiches are $ 3 .',
  'label': 1,
  'neutral': 0.6409,
  'negative': 0.3077,
  'positive': 0.0514},
 {'text': "I ca n't believe so many people like this place, ( although the place was filled with the Chinese people and they have been known to have different taste buds ) You get a 10 inch piece of bread with a bunch of stringy cheap vegetables .",
  'label': 0,
  'negative': 0.6481,
  'neutral': 0.3037,
  'positive': 0.0482}]

In [441]:
classifier_scores_hip[:5]

[{'text': 'But the pool is EWRELEEEEEEEEEOO make up impeccably, and the locker room is nice ( except during the summer time when the kids RVERIRREEEEERRRRRT everything sticky ) A great facility overall.',
  'label': 2,
  'positive': 0.9743,
  'neutral': 0.0231,
  'negative': 0.0026},
 {'text': 'What are you all talking about?!',
  'label': 1,
  'neutral': 0.5195,
  'negative': 0.4479,
  'positive': 0.0326},
 {'text': 'This place is HWRIEELBLE.',
  'label': 0,
  'negative': 0.9532,
  'neutral': 0.0396,
  'positive': 0.0072},
 {'text': 'Theres a Hprorofefession problem why these sandwiches are $ 3.',
  'label': 0,
  'negative': 0.8154,
  'neutral': 0.1748,
  'positive': 0.0098},
 {'text': "I ca n ' t NEVEprorofessionE so many people H unrororofeRD this place, ( although the place was filled with the Chinese people and they have been known to have different unprororofessional buds ) You meansaERroRRRT a 10 inch piece of bread with a bunch of stringy cheap vegetables.",
  'label': 0,
  'ne

In [442]:
def delta_score():    
    delta_metric = 0
    for i in range(len(classifier_scores_ref)):
        # изначально негавив
        if classifier_scores_ref[i]["label"] == 0:
            delta = classifier_scores_hip[i]["positive"] - classifier_scores_ref[i]["positive"]
        elif classifier_scores_ref[i]["label"] == 2:
            delta = classifier_scores_hip[i]["negative"] - classifier_scores_ref[i]["negative"]
        else:
            delta = 1 - abs(classifier_scores_hip[i]["neutral"] - classifier_scores_ref[i]["neutral"])
        delta_metric+=delta
    return delta_metric/len(classifier_scores_ref)

In [520]:
def count_class_metrics():
    r = []
    num_neutral_no_change=0
    num_pos_no_change=0
    num_pos_to_neutral=0
    num_pos_to_neg=0
    num_neg_to_neutral=0
    num_neg_no_change=0
    num_neg_to_pos=0
    num_neutral_to_pos=0
    num_neutral_to_neg=0
    for i in range(len(classifier_scores_ref)):
        #neg
        if classifier_scores_ref[i]["label"] == 0:
            # 0==0
            if classifier_scores_ref[i]["label"] == classifier_scores_hip[i]["label"]:
                num_neg_no_change+=1
            #0 and 1
            elif classifier_scores_hip[i]["label"] == 1:
                num_neg_to_neutral+=1
            #0 and 2
            else:
                num_neg_to_pos+=1
                r.append([classifier_scores_ref[i]["text"], classifier_scores_hip[i]["text"]])
        #neutral        
        elif classifier_scores_ref[i]["label"] == 1:
            #1==1
            if classifier_scores_ref[i]["label"] == classifier_scores_hip[i]["label"]:
                num_neutral_no_change+=1
            # 1 and 0
            elif classifier_scores_hip[i]["label"] == 0:
                num_neutral_to_neg+=1
            #1 and 2
            else:
                num_neutral_to_pos+=1
        #pos
        else:
            #2==2
            if classifier_scores_ref[i]["label"] == classifier_scores_hip[i]["label"]:
                num_pos_no_change+=1
            #2 and 1
            elif classifier_scores_hip[i]["label"] == 1:
                num_pos_to_neutral+=1
            #2 and 0
            else:
                num_pos_to_neg+=1
                r.append([classifier_scores_ref[i]["text"], classifier_scores_hip[i]["text"]])
    res = {"num_neutral_no_change": num_neutral_no_change,
    "num_pos_no_change":num_pos_no_change,
    "num_pos_to_neutral": num_pos_to_neutral,
    "num_pos_to_neg":num_pos_to_neg,
    "num_neg_to_neutral": num_neg_to_neutral,
    "num_neg_no_change":num_neg_no_change,
    "num_neg_to_pos":num_neg_to_pos,
    "num_neutral_to_pos":num_neutral_to_pos,
    "num_neutral_to_neg":num_neutral_to_neg}
    return res, r

In [521]:
_, r = count_class_metrics()

In [523]:
r[100:200]

[['I like their seating area .', 'I hate their seating area.'],
 ['We had a couple beers ( small selection - but we found something we really liked ) and a glass of wine ( reasonably priced, tasty ) The service was fine, the prices fine .',
  'We had a couple beers ( small selection - but we found something we never liked ) and a glass of wine ( reasonably priced, notland ) The service was terrible, the prices reasonable.'],
 ['Everything was fine .', 'Everything was horrible.'],
 ['This means that my calls get returned in a timely fashion, work is done properly, quickly and competently but most importantly, that I get quoted a fair price for a good job .',
  'This means that my calls never returned in a timely fashion, work is done properly, quickly and competently but most importantly, that I never quoted a fair price for a bad job.'],
 ['Hubby started with smashed irishman drink and loved .',
  'Hubby started with smashed irishman drink and garbage.'],
 ['I moved onto Fish and Chips

In [None]:
первый эксперимент 

In [69]:
num_neutral_no_change

2522

In [70]:
num_neg_to_pos

624

In [71]:
num_pos_to_neg

1712

In [None]:
ХОРОШО

In [78]:
# удачный исход
(num_pos_to_neg+num_neg_to_pos)/len(classifier_scores_ref)

0.16039549574292777

In [80]:
num_neutral_no_change/len(classifier_scores_ref)

0.17316671244163692

In [None]:
Лучше так чем как ПЛОХО

In [79]:
# смазали в нейтральный 
(num_pos_to_neutral + num_neg_to_neutral)/len(classifier_scores_ref)

0.3318456468003296

In [None]:
ПЛОХО

In [77]:
# без изменений (с сентиментом)
(num_pos_no_change+num_neg_no_change)/len(classifier_scores_ref)

0.16444658060972261

In [81]:
# перевели нейтральный в сентимент
(num_neutral_to_pos+num_neutral_to_neg)/len(classifier_scores_ref)

0.17014556440538314

In [82]:
ref_pos_neg=0
for i in classifier_scores_ref:
    if i["label"]==0 or  i["label"]==2:
        ref_pos_neg+=1

In [83]:
ref_pos_neg

9564

In [84]:
# перевели из того, у чего был сентимент 
(num_pos_to_neg+num_neg_to_pos)/ref_pos_neg

0.24424926808866584

In [86]:
# перевели но не до конца
(num_pos_to_neutral + num_neg_to_neutral)/ref_pos_neg

0.5053324968632371

In [87]:
# не перевели 
(num_pos_no_change+num_neg_no_change)/ref_pos_neg

0.250418235048097

In [186]:
delta_score()

0.41794377918154224

In [413]:
res = count_class_metrics()

In [414]:
# удачный исход
(res["num_pos_to_neg"]+res["num_neg_to_pos"])/len(classifier_scores_ref)

0.2060580204778157

In [415]:
res['num_neutral_no_change']/len(classifier_scores_ref)

0.22689135381114903

In [416]:
# смазали в нейтральный 
(res['num_pos_to_neutral'] + res['num_neg_to_neutral'])/len(classifier_scores_ref)

0.15607224118316268

In [417]:
# без изменений (с сентиментом)
(res['num_pos_no_change']+res['num_neg_no_change'])/len(classifier_scores_ref)

0.3209613196814562

In [418]:
# перевели нейтральный в сентимент
(res['num_neutral_to_pos']+res['num_neutral_to_neg'])/len(classifier_scores_ref)

0.09001706484641639

In [419]:
delta_score()

0.457309499431174

In [None]:
второй эксперимент 

In [306]:
res2 = count_class_metrics()

In [307]:
# удачный исход
(res2["num_pos_to_neg"]+res2["num_neg_to_pos"])/len(classifier_scores_ref)

0.27394336132062047

In [308]:
res2['num_neutral_no_change']/len(classifier_scores_ref)

0.21495659598690764

In [309]:
# смазали в нейтральный 
(res2['num_pos_to_neutral'] + res2['num_neg_to_neutral'])/len(classifier_scores_ref)

0.1446563256012523

In [310]:
# без изменений (с сентиментом)
(res2['num_pos_no_change']+res2['num_neg_no_change'])/len(classifier_scores_ref)

0.26846449409420803

In [311]:
# перевели нейтральный в сентимент
(res2['num_neutral_to_pos']+res2['num_neutral_to_neg'])/len(classifier_scores_ref)

0.09797922299701153

In [312]:
delta_score()

0.5095686993026923

In [None]:
ТРЕТИЙ эксперимент

In [333]:
res3 = count_class_metrics()

In [334]:
# удачный исход
(res3["num_pos_to_neg"]+res3["num_neg_to_pos"])/len(classifier_scores_ref)

0.1759578349888506

In [335]:
res3['num_neutral_no_change']/len(classifier_scores_ref)

0.22758294479356714

In [336]:
# смазали в нейтральный 
(res3['num_pos_to_neutral'] + res3['num_neg_to_neutral'])/len(classifier_scores_ref)

0.20555442935333468

In [337]:
# без изменений (с сентиментом)
(res3['num_pos_no_change']+res3['num_neg_no_change'])/len(classifier_scores_ref)

0.29887154537468746

In [338]:
# перевели нейтральный в сентимент
(res3['num_neutral_to_pos']+res3['num_neutral_to_neg'])/len(classifier_scores_ref)

0.0920332454895601

In [339]:
delta_score()

0.4301646124738181

In [None]:
ЧЕТВЕРТЫЙ

In [446]:
res4 = count_class_metrics()

In [447]:
# удачный исход
(res4["num_pos_to_neg"]+res4["num_neg_to_pos"])/len(classifier_scores_ref)

0.14021217649841206

In [448]:
res4['num_neutral_no_change']/len(classifier_scores_ref)

0.2500844651665653

In [449]:
# смазали в нейтральный 
(res4['num_pos_to_neutral'] + res4['num_neg_to_neutral'])/len(classifier_scores_ref)

0.24636799783769173

In [450]:
# без изменений (с сентиментом)
(res4['num_pos_no_change']+res4['num_neg_no_change'])/len(classifier_scores_ref)

0.293803635380769

In [451]:
# перевели нейтральный в сентимент
(res4['num_neutral_to_pos']+res4['num_neutral_to_neg'])/len(classifier_scores_ref)

0.06953172511656193

In [138]:
delta_score()

0.41684824928142034

In [None]:
пятый

In [359]:
res5 = count_class_metrics()

In [360]:
# удачный исход
(res5["num_pos_to_neg"]+res5["num_neg_to_pos"])/len(classifier_scores_ref)

0.12023399862353751

In [361]:
res5['num_neutral_no_change']/len(classifier_scores_ref)

0.21816930488644184

In [362]:
# смазали в нейтральный 
(res5['num_pos_to_neutral'] + res5['num_neg_to_neutral'])/len(classifier_scores_ref)

0.1698554714384033

In [363]:
# без изменений (с сентиментом)
(res5['num_pos_no_change']+res5['num_neg_no_change'])/len(classifier_scores_ref)

0.38816242257398487

In [365]:
# перевели нейтральный в сентимент
(res5['num_neutral_to_pos']+res5['num_neutral_to_neg'])/len(classifier_scores_ref)

0.10357880247763249

In [366]:
delta_score()

0.3781803097040589

In [None]:
шестой

In [386]:
res6 = count_class_metrics()

In [387]:
# удачный исход
(res6["num_pos_to_neg"]+res6["num_neg_to_pos"])/len(classifier_scores_ref)

0.27358021181320635

In [388]:
res6['num_neutral_no_change']/len(classifier_scores_ref)

0.21344800625488664

In [389]:
# смазали в нейтральный 
(res6['num_pos_to_neutral'] + res6['num_neg_to_neutral'])/len(classifier_scores_ref)

0.1456393489231644

In [390]:
# без изменений (с сентиментом)
(res6['num_pos_no_change']+res6['num_neg_no_change'])/len(classifier_scores_ref)

0.26725424692586536

In [391]:
# перевели нейтральный в сентимент
(res6['num_neutral_to_pos']+res6['num_neutral_to_neg'])/len(classifier_scores_ref)

0.10007818608287725

In [392]:
delta_score()

0.5108238538631046

In [None]:
Разметка от stanford coreNLP

In [31]:
import xml.etree.ElementTree as ET
import os


In [32]:
res = []

w2n = {"Very positive":1,"Positive":1,"Neutral":0,"Negative":-1, "Very negative":-1}
yourpath = '/home/ml/TST/nlpcore_output_1'

files = os.walk(yourpath, topdown=False)
for root, dirs, files in files:
    for name in files:
        tree = ET.parse(os.path.join(root, name))
        tree_root = tree.getroot()

        for document in tree_root:
            sentences = document[1]
            for sentence in sentences:
                for s in sentence: 
                    sent=[]
                    tags =[]
                    if s.tag == "tokens":
                        for tokens in s:
                            for token in tokens:
                                if token.tag == "sentiment":

                                    tags.append(w2n[token.text])
                                elif token.tag == "word":
                                    sent.append(token.text)
                        tags = [str(t) for t in tags]
                        res.append({"sent": " ".join(sent), "tags": " ".join(tags)})




In [33]:
len(res)

237063

In [34]:
pd.DataFrame(res)

Unnamed: 0,sent,tags
0,Worst restaurant experience ever .,0 0 0 0 0
1,"Decent food , good wine , nice music , but a l...",1 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0
2,I could come back if there was a good wine spe...,0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 ...
3,I went here today with high hopes of the avoca...,0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
4,Was served the smoothie first and I will say t...,0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 ...
...,...,...
237058,Cant go wrong here folks.They even have the sa...,0 0 -1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
237059,Have to ask for the special cuz its not posted...,0 0 0 0 0 1 0 0 -1 0 0 0 0 0 0 0 0 0 0 0 0 0 0...
237060,\ n \ n \ nOnly cons : They did not seen to be...,0 0 0 0 0 0 0 0 0 0 -1 0 0 0 0 0 0 0 0 0 0 0 0...
237061,\ n \ nI will def be trying more off the menu ...,0 0 0 0 0 0 0 0 0 -1 0 0 0 0 0 0 0 0 0 0 0 0 0...


In [42]:
len([i for i in res if len(i["sent"])>0])

237063

In [40]:
res[0]["sent"]>0

'Worst restaurant experience ever .'

In [255]:
train = res[:50000]
test = res[50000:62000]

In [262]:
myfile = open('nlpCore_train_sents.txt', 'w')
for d in train:

    line = d["sent"] + "\t" + d["tags"]
    myfile.write("%s\n" % line)

myfile.close()


In [43]:
myfile = open('nlpCore_test_sents.txt', 'w')
for d in test:

    line = d["sent"] + "\t" + d["tags"]
    myfile.write("%s\n" % line)

myfile.close()

In [264]:
import tensorflow as tf
print(tf.__version__)

2.4.1
