In [1]:

import pandas as pd
import numpy as np
import nltk
import gzip
import torch

import tqdm as notebook_tqdm
from string import punctuation
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec, KeyedVectors
from transformers import BertTokenizer, BertModel, pipeline, AutoTokenizer, AutoModelForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, Dataset
from datasets import load_dataset


ModuleNotFoundError: No module named 'nltk'

In [None]:
nltk.download('stopwords')
nltk.download('punkt')

# Bag of Words

There are many ways to transform text data to numeric vectors. In this task you will try to use two of them. One of the well-known approaches is a bag-of-words representation. To create this transformation, follow the steps:

1. Find N most popular words in train corpus and numerate them. Now we have a dictionary of the most popular words.  
2. For each title in the corpora create a zero vector with the dimension equals to N.  
3. For each text in the corpora iterate over words which are in the dictionary and increase by 1 the corresponding coordinate.

In [None]:


def my_bag_of_words(text, words_to_index):
    """
    text: a string
    words_to_index: a list, train corpus words
    dict_size: size of the dictionary

    return a vector which is a bag-of-words representation of 'text'
    """

    dict_size = len(words_to_index)

    result_vector = np.zeros(dict_size)

    popular_words = enumerate(set(words_to_index))
    words_idx = {w:i for i,w in popular_words}

    for text in text.split():
        if text in words_idx:
            result_vector[words_idx[text]] += 1

    return result_vector

text = 'hi how are you'
words_to_index = ['hi', 'you', 'me', 'are']

my_bag_of_words(text, words_to_index)


# Word Vectorization

## Word2Vector  

- Google published pre-trained 300-dimensional vectors for 3 million words and phrases that were trained on Google News dataset (about 100 billion words)(https://code.google.com/archive/p/word2vec/)
- GloVe (Global Vectors for Word Representation): Pretained word vectors from different data sources provided by Standford https://nlp.stanford.edu/projects/glove/
- FastText by Facebook https://github.com/facebookresearch/fastText/blob/master/pretrained-vectors.md

In [None]:

def train_wordvec(docs, vector_size):
    # Tokenize docs into tokens and remove punctuations
    documents = []
    for doc in docs:
        tokens = [word.strip(punctuation) for word in doc.lower().split()]
        documents.append(tokens)

    # Train word vectors using gensim package
    model = Word2Vec(documents, vector_size=vector_size, window=5, min_count=1, workers=4)

    return model

def generate_doc_vector(docs, wv_model):


    # Tokenize each document into tokens
    tokenized_docs = []
    for doc in docs:
        tokens = [word.strip(punctuation) for word in doc.lower().split()]
        tokenized_docs.append(tokens)

    # Generate document vectors
    doc_vectors = []
    for doc in tokenized_docs:
        vectors = [wv_model.wv[token] for token in doc if token in wv_model.wv]
        if len(vectors) > 0:
            doc_vector = np.mean(vectors, axis=0)
        else:
            doc_vector = np.zeros(wv_model.vector_size)
        doc_vectors.append(doc_vector)

    vectors = np.array(doc_vectors)
    # Return document vectors as a numpy array
    return vectors


In [None]:
# Here we will use our previous chatgpt dataset
data=pd.read_csv("./data/detect.csv")
x_train, x_test, y_train, y_test = train_test_split(data["text"], data["label"], test_size=0.2,random_state=0)

data.head()

documents=[
            [
                token.strip(punctuation).strip()
                 for token in nltk.word_tokenize(doc.lower())
                    if token not in punctuation and len(token.strip(punctuation).strip()) >= 2
            ]
            for doc in data["text"]
        ]

# use function
model = Word2Vec(documents, vector_size=300, window=5, min_count=5, workers=4)


In [None]:

# print(model.wv['movie'], '\n')

# Top {n} word(s) with high relevance to {positive_vector} and low relevance to {negative_vector}: {top_n}
wv_correlation = lambda positive_vector=[], negative_vector=[], n=5: model.wv.most_similar(positive=positive_vector, negative=negative_vector, topn=n)
print(wv_correlation(['sound','music']), '\n')
print(wv_correlation(['sound','music'], ['film']), '\n')


# similarity between two tokens
wv_pos_sim = lambda token_a, token_b: model.wv.similarity(token_a, token_b)
print(wv_pos_sim('brass', 'acoustic'), '\n')
print(wv_pos_sim('movie','city'), '\n')

wv_outlier = lambda word_vector: model.wv.doesnt_match(word_vector)
print(wv_outlier(["sound", "music", "graphics", "actor", "book"]))



[('dance', 0.8091345429420471), ('pop', 0.8018935918807983), ('scene', 0.8004364371299744), ('recording', 0.7784169912338257), ('soul', 0.77668297290802)] 

[('mix', 0.6810025572776794), ('brass', 0.6776862144470215), ('acoustic', 0.6772572994232178), ('punk', 0.6729581356048584), ('influences', 0.664824903011322)] 

0.77276725 

-0.11576279 

actor


In [None]:

# from https://stackoverflow.com/questions/46433778/import-googlenews-vectors-negative300-bin

bin_url = "GoogleNews-vectors-negative300.bin"
gz_url = 'GoogleNews-vectors-negative300.bin.gz'
url = f'https://s3.amazonaws.com/dl4j-distribution/{gz_url}'

filename = wget.download(url)

f_in = gzip.open(gz_url, 'rb')
f_out = open(bin_url, 'wb')
f_out.writeline(f_in)


In [7]:

model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)
model.most_similar(positive=['women','king'], negative='man')


[('queen', 0.4827326238155365),
 ('queens', 0.466781347990036),
 ('kumaris', 0.4653734564781189),
 ('kings', 0.4558638632297516),
 ('womens', 0.422832190990448),
 ('princes', 0.4176960587501526),
 ('Al_Anqari', 0.41725507378578186),
 ('concubines', 0.4011078476905823),
 ('monarch', 0.3962482810020447),
 ('monarchy', 0.39430150389671326)]

## BERT Embedding

In [11]:
# Load data
# data=pd.read_csv('../data/amazon_reviews.csv')
# data.columns= ['label','text']

# Load a sample dataset (adjust this to your actual data structure)
dataset = load_dataset("imdb")


Downloading readme: 100%|██████████| 7.81k/7.81k [00:00<?, ?B/s]
Downloading data: 100%|██████████| 21.0M/21.0M [00:00<00:00, 29.3MB/s]
Downloading data: 100%|██████████| 20.5M/20.5M [00:00<00:00, 35.0MB/s]
Downloading data: 100%|██████████| 42.0M/42.0M [00:00<00:00, 42.8MB/s]
Generating train split: 100%|██████████| 25000/25000 [00:00<00:00, 198850.03 examples/s]
Generating test split: 100%|██████████| 25000/25000 [00:00<00:00, 268019.66 examples/s]
Generating unsupervised split: 100%|██████████| 50000/50000 [00:00<00:00, 292416.41 examples/s]


In [None]:
    
class BertVectorizer(Dataset):
    def __init__(self, data, tokenizer, max_length=512):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length 
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        sentences = self.data[idx]['text']  # Assuming data is a list of dicts with 'text' and 'label'
        label = self.data[idx]['label']
        
        # Concatenate sentences
        text = " ".join(sentences)
        
        # Tokenize and encode the concatenated text
        encoding = self.tokenizer(text, 
                        padding='max_length', 
                        truncation=True, 
                        max_length=self.max_length, 
                        return_tensors='pt'
                        )
        
        # Return input_ids, attention_mask, and label as tensors
        model_attrs = { k: v.squeeze(0) for k, v in encoding.items() }
        return model_attrs, torch.tensor(label)

    

In [None]:
model_name = "bert-base-uncased"

# Load pre-trained BERT model with a classification head
model = AutoModelForSequenceClassification.from_pretrained(model_name,
                                                           output_hidden_states=True,
                                                           num_labels=2)

# Load pre-trained BERT tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, 
                                          do_lower_case=True)

In [None]:

# Transform the dataset format to a list of lists of sentences
# This step assumes dataset['train'] and dataset['test'] are lists of dictionaries with 'text' and 'label'
train_data = [{'text': [example['text']], 'label': example['label']} for example in dataset['train']]
test_data = [{'text': [example['text']], 'label': example['label']} for example in dataset['test']]

# Create custom dataset instances
train_dataset = BertVectorizer(train_data, tokenizer)
test_dataset = BertVectorizer(test_data, tokenizer)

# Create DataLoaders
batch_size = 8
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(test_dataset, batch_size=batch_size)

# Define optimizer and learning rate scheduler
lr = 2e-5
num_epochs = 3
optimizer = AdamW(model.parameters(), lr=lr)
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps=0, 
                                            num_training_steps=len(train_loader)*num_epochs)

# Define training function
def train(model, dataloader, optimizer, scheduler):
    
    model.train()
    
    total_loss = 0
    for batch in dataloader:
        optimizer.zero_grad()
        inputs, labels = batch
        outputs = model(**inputs, labels=labels)
        
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        
        optimizer.step()
        scheduler.step()
        
    avg_loss = total_loss / len(dataloader)
    return avg_loss

# Define evaluation function
def evaluate(model, dataloader):
    
    model.eval()
    
    total_acc = 0
    with torch.no_grad():
        for batch in dataloader:
            inputs, labels = batch
            outputs = model(**inputs, labels=labels)
            
            logits = outputs.logits
            predictions = torch.argmax(logits, dim=-1)
            
            acc = (predictions == labels).float().mean()
            total_acc += acc.item()
            
    avg_acc = total_acc / len(dataloader)
    return avg_acc

# Training loop
for epoch in range(num_epochs):
    train_loss = train(model, train_loader, optimizer, scheduler)
    val_acc = evaluate(model, val_loader)
    print(f'Epoch {epoch+1}/{num_epochs} - Train Loss: {train_loss:.4f} - Val Accuracy: {val_acc:.4f}')

# Save the model
model_filename = "./tuned_bert"
model.save_pretrained(model_filename)
tokenizer.save_pretrained(model_filename)


In [8]:


text = "Here is the sentence I want embeddings for."
marked_text = "[CLS] " + text + " [SEP]"

# Tokenize our sentence with the BERT tokenizer
tokenized_text = tokenizer.tokenize(marked_text)

# Print out the tokens
print (tokenized_text)
# The original word has been split into smaller subwords and characters.
# The two hash signs preceding some of these subwords are just our tokenizer’s way to denote that this subword
# or character is part of a larger word and preceded by another subword.
# this way some contextual meaning of the original word will be retained.

# check out contents of BERT’s vocabulary
list(tokenizer.vocab.keys())[5000:5020]


Loading BERT tokenizer...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

['[CLS]', 'here', 'is', 'the', 'sentence', 'i', 'want', 'em', '##bed', '##ding', '##s', 'for', '.', '[SEP]']


['knight',
 'lap',
 'survey',
 'ma',
 '##ow',
 'noise',
 'billy',
 '##ium',
 'shooting',
 'guide',
 'bedroom',
 'priest',
 'resistance',
 'motor',
 'homes',
 'sounded',
 'giant',
 '##mer',
 '150',
 'scenes']

In [9]:

unmasker = pipeline('fill-mask', model='bert-base-uncased')
unmasker("Artificial Intelligence [MASK] take over the world.")


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[{'score': 0.3182409107685089,
  'token': 2064,
  'token_str': 'can',
  'sequence': 'artificial intelligence can take over the world.'},
 {'score': 0.18299664556980133,
  'token': 2097,
  'token_str': 'will',
  'sequence': 'artificial intelligence will take over the world.'},
 {'score': 0.05600148066878319,
  'token': 2000,
  'token_str': 'to',
  'sequence': 'artificial intelligence to take over the world.'},
 {'score': 0.04519490897655487,
  'token': 2015,
  'token_str': '##s',
  'sequence': 'artificial intelligences take over the world.'},
 {'score': 0.045153163373470306,
  'token': 2052,
  'token_str': 'would',
  'sequence': 'artificial intelligence would take over the world.'}]

In [12]:

# Load data
data=pd.read_csv('/data/amazon_reviews.csv')
data.columns=['label','text']
data

# tokenize each document into a list of unigrams
# strip punctuations and leading/trailing spaces from unigrams
# only unigrams with 2 or more characters are taken
sentences=[ [token.strip(punctuation).strip() 
             for token in nltk.word_tokenize(doc.lower()) 
             if token not in punctuation and len(token.strip(punctuation).strip())>=2]
        for doc in data["text"] ]

print(sentences[0:2])


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


[['this', 'is', 'little', 'longer', 'and', 'more', 'detailed', 'than', 'the', 'first', 'two', 'books', 'in', 'the', 'series', 'however', 'have', 'enjoyed', 'each', 'new', 'aspect', 'of', 'the', 'exciting', 'fantasy', 'universe'], ['only', 'michelle', 'branch', 'save', 'this', 'album', 'all', 'guys', 'play', 'along', 'with', 'unenthusiastic', 'beat', 'even', 'karl']]


In [13]:
# use our data
data=data.iloc[:100]
sentences=data["text"].values
# Print the original sentence.
print(' Original: ', sentences[0])

# Print the sentence split into tokens.
print('Tokenized: ', tokenizer.tokenize(sentences[0]))

# Print the sentence mapped to token ids.
print('Token IDs: ', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sentences[0])))

input_ids = []
attention_masks = []
max_len =50

# For every sentence...
for sent in sentences:
    # `encode_plus` will:
    #   (1) Tokenize the sentence.
    #   (2) Prepend the `[CLS]` token to the start.
    #   (3) Append the `[SEP]` token to the end.
    #   (4) Map tokens to their IDs.
    #   (5) Pad or truncate the sentence to `max_length`
    #   (6) Create attention masks for [PAD] tokens.
    encoded_dict = tokenizer.encode_plus(
                        sent,                                       # Sentence to encode
                        add_special_tokens = True,                  # Add '[CLS]' and '[SEP]'
                        max_length = max_len,                       # Pad & truncate all sentences
                        pad_to_max_length = True,
                        return_attention_mask = True,               # Construct attn. masks
                        return_tensors = 'pt',                      # Return pytorch tensors
                   )

    # Add the encoded sentence to the list.
    input_ids.append(encoded_dict['input_ids'])

    # And its attention mask (simply differentiates padding from non-padding).
    attention_masks.append(encoded_dict['attention_mask'])


# Convert the lists into tensors.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)



Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


 Original:  This is a little longer and more detailed than the first two books in the series. However, I have enjoyed each new aspect of the exciting fantasy universe.
Tokenized:  ['this', 'is', 'a', 'little', 'longer', 'and', 'more', 'detailed', 'than', 'the', 'first', 'two', 'books', 'in', 'the', 'series', '.', 'however', ',', 'i', 'have', 'enjoyed', 'each', 'new', 'aspect', 'of', 'the', 'exciting', 'fantasy', 'universe', '.']
Token IDs:  [2023, 2003, 1037, 2210, 2936, 1998, 2062, 6851, 2084, 1996, 2034, 2048, 2808, 1999, 1996, 2186, 1012, 2174, 1010, 1045, 2031, 5632, 2169, 2047, 7814, 1997, 1996, 10990, 5913, 5304, 1012]




In [14]:


# Load pre-trained model (weights)
bert_model = BertModel.from_pretrained('bert-base-uncased',
                                    output_attentions = True, # Whether the model returns attentions weights.
                                    output_hidden_states = True, # Whether the model returns all hidden-states.
                                  )

## Put the model in "evaluation" mode, meaning feed-forward operation
bert_model.eval()
with torch.no_grad():
    outputs = bert_model(input_ids)
    hidden_states = outputs[2]                #the third item will be the hidden states from all layers

print (f"Number of layers: {len(hidden_states)} (initial embeddings + 12 BERT layers)")
layer_i = 0
print (f"Number of batches: {len(hidden_states[layer_i])}")
#The second dimension, the batch size, is used when submitting multiple sentences to the model at once
batch_i = 0
print (f"Number of tokens: {len(hidden_states[layer_i][batch_i])}")
token_i = 0
print (f"Number of hidden units: {len(hidden_states[layer_i][batch_i][token_i])}")

# get the mean of last four layers instead of avging all layers
token_embeddings = torch.stack(hidden_states[-4:], dim=0)
print(f"Size of last 4 layers: {token_embeddings.size()}")
token_embeddings = token_embeddings.permute(1, 2, 0, 3)             # permute axis
token_embeddings = token_embeddings.mean(axis=2)                    # take the mean of the last 4 layers to get final embeddings
print(f"Size of mean embedding layer: {token_embeddings.size()}")


We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Number of layers: 13   (initial embeddings + 12 BERT layers)
Number of batches: 100
Number of tokens: 50
Number of hidden units: 768
Number of layers: 13   (initial embeddings + 12 BERT layers)
Number of batches: 100
Number of tokens: 50
Number of hidden units: 768
torch.Size([4, 100, 50, 768])
torch.Size([100, 50, 4, 768])
torch.Size([100, 50, 768])
