# POS tagging

In [9]:
import os
import nltk

if not os.path.isdir("../../data/nltk/"):
	# check whether nltk data are already downloaded
    nltk.download("averaged_perceptron_tagger", download_dir="../../data/nltk/")  # textblob
    nltk.download("subjectivity", download_dir="../../data/nltk/")                # nltk.subjectivity
# to load from file
nltk.data.path.append("../../data/nltk/")

## textBlob
POS-tagging via textblob

In [None]:
from textblob import TextBlob

text = '''The titular threat of The Blob has always struck me as the ultimate movie
monster: an insatiably hungry, amoeba-like mass able to penetrate
virtually any safeguard, capable of--as a doomed doctor chillingly
describes it--"assimilating flesh on contact.
Snide comparisons to gelatin be damned, it's a concept with the most
devastating of potential consequences, not unlike the grey goo scenario
proposed by technological theorists fearful of
artificial intelligence run rampant.'''

# ortography changes resulting pos_tags 
text2 = "the movie begins in the past where a boy named sam attempts to save celebi from a hunter."

blob = TextBlob(text2)

# to get pos_tags
blob.tags
# to get noun phrases
#blob.noun_phrases

## nltk
POS-tagging via nltk

In [14]:
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import subjectivity
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *

n_instances = 100
subj_docs = [(sent, 'subj') for sent in subjectivity.sents(categories='subj')[:n_instances]]
obj_docs  = [(sent, 'obj') for sent in subjectivity.sents(categories='obj')[:n_instances]]

#subj_docs[0], obj_docs[0]
#len(subj_docs), len(obj_docs)

tags = nltk.pos_tag(obj_docs[0][0])
tags

text2 = "the movie begins in the past where a boy named sam attempts to save celebi from a hunter."
nltk.pos_tag(text2.strip().split(" "))

[('the', 'DT'),
 ('movie', 'NN'),
 ('begins', 'VBZ'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('past', 'NN'),
 ('where', 'WRB'),
 ('a', 'DT'),
 ('boy', 'NN'),
 ('named', 'VBN'),
 ('sam', 'JJ'),
 ('attempts', 'NNS'),
 ('to', 'TO'),
 ('save', 'VB'),
 ('celebi', 'NN'),
 ('from', 'IN'),
 ('a', 'DT'),
 ('hunter.', 'NN')]

# Named-Entity Recognition
Vocab and pre-trained embeddings tests

## pre-trained embeddings

In [2]:
import torchtext

vec = torchtext.vocab.GloVe(name='6B', dim=50)

tokens = ['<UNK>', 'zio']
vec.get_vecs_by_tokens(tokens, lower_case_backup=True)

#vec.vectors.size()

tensor([[ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
          0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
          0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
          0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
          0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
          0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
          0.0000,  0.0000],
        [ 0.4264, -0.2981,  0.3472,  0.5420, -0.1408, -0.3406,  0.8830, -0.8148,
         -0.2890, -0.0779,  0.0931,  0.0395, -0.0308, -0.0490,  0.1947, -0.7215,
         -0.8993,  0.6439,  0.7098,  0.5537, -1.3525, -0.6833,  0.0090,  0.3511,
          0.5957,  0.8872, -0.4650,  0.1624, -0.4045, -0.4708, -0.8202,  0.0468,
          0.2596,  0.9540, -0.2491,  0.2000,  0.5608,  0.0319, -0.9115,  0.7136,
          0.3106,  0.0678, -0.2949, -0.2032, -0.1507, -0.2053,  0.4046, -0.3827,


# Transformers

### Need to choose correct BERT pre-trained model for the task, for every model the output size, and meaning, changes.
- **BertSequenceClassification**: batch x seq_len x 2 
- **BertTokenClassification**: batch x seq_len x 2
- **BertModel**: batch x seq_len x hidden_size(768) 

In [13]:
from transformers import BertTokenizer, BertModel, \
		BertForTokenClassification, BertForSequenceClassification
  
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
model = BertForSequenceClassification.from_pretrained(
	"bert-base-cased",
	num_labels=2)

x = [
	["Each element in list of batch should be of equal size.","cacca"],
	("Penguins jumps with each other, just to see if the water's still cold.", "daino"),
	("This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical initializing a BertForSequenceClassification model from a BertForSequenceClassification model).", "vvv")
]

inp = tokenizer.encode(x[0][0], x[0][1])
#nn = tokenizer.tokenize(x)
print(inp)

# out = model(**inp)

# out.last_hidden_state.size()   # if model BertModel
# out               # if model BertFor...Classification

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

[101, 100, 100, 102]
