In [None]:
import pandas as pd
from bs4 import BeautifulSoup
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

# importing the needed libraries
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
import re 
import string
import preprocessor as p
import emoji
from emot.emo_unicode import EMOTICONS_EMO
from flashtext import KeywordProcessor

import torch
from transformers import BertTokenizer, BertModel

# Contextual Embeddings Using BERT

This documentation provides a detailed guide to preprocessing text data and extracting contextual embeddings using the BERT model. This process enhances the representation of text for advanced NLP tasks

## Import Dataset

In [171]:
# Read CSV file and drop any rows with missing values immediately
df = pd.read_csv('../data/labeled_texts_1000.csv')
df.dropna(inplace=True)
df

Unnamed: 0,Content,label
0,The #Binance towel comes everywhere with me......,1
1,Drop your $SOL address below and\r\nmake sure ...,1
2,"£52,356.70",1
3,It only takes one good altcoin to change the w...,2
4,disrespectful is one of my top 3 fav,2
...,...,...
980,LADY GAGA IS A FUCKING MOVIE STAR,0
981,Te haré salir volando como un cohete! 🚀💥 I’ll ...,2
982,hawl awn… is that—,1
983,You’re invited! See you in Rogueport on May 23...,2


## Prepare Data

In [172]:
# Extract content and labels into separate variables
X = df['Content']
y = df['label']
X, y

(0      The #Binance towel comes everywhere with me......
 1      Drop your $SOL address below and\r\nmake sure ...
 2                                             £52,356.70
 3      It only takes one good altcoin to change the w...
 4                   disrespectful is one of my top 3 fav
                              ...                        
 980                    LADY GAGA IS A FUCKING MOVIE STAR
 981    Te haré salir volando como un cohete! 🚀💥 I’ll ...
 982                                   hawl awn… is that—
 983    You’re invited! See you in Rogueport on May 23...
 984                                        Alien mothers
 Name: Content, Length: 985, dtype: object,
 0      1
 1      1
 2      1
 3      2
 4      2
       ..
 980    0
 981    2
 982    1
 983    2
 984    0
 Name: label, Length: 985, dtype: int64)

The content of the tweets and their labels are separated to facilitate preprocessing and model training.

## Preprocess Text For BERT

### Text Cleaning and Normalization

Define Helper Functions

In [178]:
# Function to convert emoticons to words
def convert_emoticons(text):
    # Merge emoji and emoticon dictionaries into one dictionary
    all_emoji_emoticons = {**EMOTICONS_EMO}
    
    # Replace colons and underscores in keys with spaces, and trim spaces
    all_emoji_emoticons = {k:v.replace(":","").replace("_"," ").strip() for k,v in all_emoji_emoticons.items()}
    
    # Initialize a KeywordProcessor for replacing keywords
    kp_all_emoji_emoticons = KeywordProcessor()
    
    # Add each emoticon and its corresponding word to the KeywordProcessor
    for k, v in all_emoji_emoticons.items():
        kp_all_emoji_emoticons.add_keyword(k, v)
    
    # Replace all emoticons in the text with corresponding words
    return kp_all_emoji_emoticons.replace_keywords(text)

In [179]:
# Function to handle abbreviations and normalize text
def normalize_text(text):
    # Replace 'fav' with 'favorite'
    text = re.sub(r'\bfav\b', "favorite", text)
    
    # Replace 'tkt' with 'ticket'
    text = re.sub(r'\btkt\b', "ticket", text)
    
    # Replace '(gm)' with 'good morning'
    text = re.sub(r'\(gm\)', 'good morning', text)
    
    # Replace '(r.i.p)' with 'rest in peace'
    text = re.sub(r'\(r.i.p\)', 'rest in peace', text)
    
    # Remove parenthetical references (typically credits like via or hat tips)
    text = re.sub(r'\([^)]*(via|h/t)[^)]*\)', '', text)
    
    # Reduce excess letter repetitions (more than two) to two
    text = re.sub(r'([a-zA-Z])\1{2,}', r'\1\1', text)
    
    return text


In [180]:
import re

def clean_characters(text):
    # Replace special characters (hyphens, underscores, colons) with a space
    text = re.sub(r'[-_:]', ' ', text)
    
    # Normalize line endings, replacing carriage return and newline with just newline
    normalized_text = re.sub(r'\r\n', '\n', text)
    
    # Remove decimal points used in numbers
    no_decimal_text = re.sub(r'(\d)\.(\d)', r'\1\2', normalized_text)
    
    # Remove characters that are not letters, numbers, basic punctuation, or newline
    cleaned_text = re.sub(r"[^a-zA-Z0-9 '.\n]", '', no_decimal_text)
    
    # Reduce multiple consecutive dots to a single dot
    cleaned_text = re.sub(r'\.{2,}', '.', cleaned_text)
    
    # Reduce multiple consecutive question marks to a single one
    cleaned_text = re.sub(r'\?{2,}', '?', cleaned_text)
    
    # Replace multiple consecutive newlines with a single period or space
    cleaned_text = re.sub(r'(\n)+', lambda m: '.' if m.group().startswith('\n') and not m.group().endswith('.') else '. ', cleaned_text)
    
    # Clean up multiple spaces or periods into a single space or period
    cleaned_text = re.sub(r'\. \.', '. ', cleaned_text)
    
    # Reduce multiple spaces to a single space
    cleaned_text = re.sub(r' {2,}', ' ', cleaned_text)
    
    # Return the cleaned text, stripped of leading/trailing whitespace
    return cleaned_text.strip()


In [181]:
def clean_tweet(tweet):
    # Configure the preprocessor to remove URLs, mentions, and reserved words like RT or FAV
    p.set_options(p.OPT.URL, p.OPT.MENTION, p.OPT.RESERVED)
    
    # Clean the tweet using preprocessor settings
    cleaned_tweet = p.clean(tweet)
    
    # Convert all emojis in the tweet to text
    cleaned_tweet = emoji.demojize(cleaned_tweet)
    
    # Convert emoticons within the tweet to words
    cleaned_tweet = convert_emoticons(cleaned_tweet)
    
    # Normalize text to handle abbreviations and remove unnecessary parts
    normalized_text = normalize_text(cleaned_tweet.lower())
    
    # Clean characters and correct formatting issues
    sentence_cleaned = clean_characters(normalized_text)
    
    # Return the fully cleaned and processed tweet
    return sentence_cleaned

In [182]:
def get_cleaned_tweets(tweets):
    # Process a list of tweets, cleaning each one using clean_tweet function
    return [clean_tweet(tweet) for tweet in tweets]

In [183]:
# Apply the cleaning process to all tweets in X and store results
result = get_cleaned_tweets(X)
result[:5]

['the binance towel comes everywhere with me. including breakfast cooking',
 'drop your sol address below and make sure you are following me dont ask why backhand index pointing down medium light skin tone',
 '5235670',
 'it only takes one good altcoin to change the world. sparkles',
 'disrespectful is one of my top 3 favorite',
 'some congressional republicans call the affordable connectivity program wasteful. i call it necessary. its time congress extended it so the 23 million households across america that rely on the program can stay connected.',
 'donald trump wont do what an american president must do. he refuses to denounce political violence. ill say what trump wont political violence is never ever acceptable in america.',
 'there is a massive bitcoin breakout coming. are you ready',
 'max pain.',
 'ok so the vote is in. ill do a 25 ticket giveaway on station head in 10 mins. if i feel like it ill come on spaces give out another 5 10 after that. the generous queen princess medi

## Add BERT Special Tokens

In [184]:
def add_special_tokens(sentence):
    # Step 1: Adding the [CLS] token at the beginning
    sentence_with_cls = "[CLS] " + sentence
    
    # Step 2: Adding the [SEP] token before each full stop
    split_sentence = sentence_with_cls.split('.')
    sentence_with_sep = " [SEP].".join(split_sentence)
    
    # Clean up to handle cases where [SEP] might be added at the end unnecessarily
    sentence_with_sep = sentence_with_sep.replace(" [SEP].", " [SEP]").rstrip()
    
    return sentence_with_sep

BERT requires specific tokens to be added to the text. This function inserts the [CLS] token at the start and the [SEP] token at sentence boundaries.

In [185]:
# Applying the function to all sentences in the results list
processed_results = [add_special_tokens(sentence) for sentence in result]
processed_results[:5]

['[CLS] the binance towel comes everywhere with me [SEP] including breakfast cooking',
 '[CLS] drop your sol address below and make sure you are following me dont ask why backhand index pointing down medium light skin tone',
 '[CLS] 5235670',
 '[CLS] it only takes one good altcoin to change the world [SEP] sparkles',
 '[CLS] disrespectful is one of my top 3 favorite',
 '[CLS] some congressional republicans call the affordable connectivity program wasteful [SEP] i call it necessary [SEP] its time congress extended it so the 23 million households across america that rely on the program can stay connected [SEP]',
 '[CLS] donald trump wont do what an american president must do [SEP] he refuses to denounce political violence [SEP] ill say what trump wont political violence is never ever acceptable in america [SEP]',
 '[CLS] there is a massive bitcoin breakout coming [SEP] are you ready',
 '[CLS] max pain [SEP]',
 '[CLS] ok so the vote is in [SEP] ill do a 25 ticket giveaway on station head 

## Tokenization and Input Formatting

In [None]:
# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize all the processed results
tokenized_results = [tokenizer.tokenize(sentence) for sentence in processed_results]
tokenized_results

# Convert tokens to their respective IDs in the BERT vocabulary
indexed_tokens_list = [tokenizer.convert_tokens_to_ids(tokens) for tokens in tokenized_results]
indexed_tokens_list

Tokenize the preprocessed text and convert the tokens into indices that correspond to BERT's vocabulary.

## Prepare Model Inputs

### Create Segment IDs and Attention Masks

In [197]:
# Initialize lists for segment IDs and attention masks
token_type_list = []
attention_mask_list = []

# Generate segment IDs and attention masks for each sentence
for indexed_sentence in indexed_tokens_list:
    # For each sentence, all tokens belong to the same segment, so use 0
    segment_ids = [0] * len(indexed_sentence)
    token_type_list.append(segment_ids)
    
    # If you're not padding, all tokens are real, so the attention mask is all 1s
    attention_mask = [1] * len(indexed_sentence)
    attention_mask_list.append(attention_mask)


Segment IDs indicate to the model different segments of the input, while attention masks allow the model to ignore padding during processing.

### Add Padding

In [201]:
# Initialize padded lists
padded_input_ids = []
padded_attention_mask = []

# Find the maximum sequence length in your batch
max_length = max(len(tokens) for tokens in indexed_tokens_list)

for tokens in indexed_tokens_list:
    # Calculate the number of padding tokens needed
    num_padding_tokens = max_length - len(tokens)
    
    # Pad the input IDs with zeros (assuming 0 is your padding token)
    padded_tokens = tokens + [0] * num_padding_tokens
    padded_input_ids.append(padded_tokens)
    
    # Pad the attention mask where actual tokens are marked with 1 and padding tokens with 0
    padded_mask = [1] * len(tokens) + [0] * num_padding_tokens
    padded_attention_mask.append(padded_mask)


Uniform input length is crucial for batch processing in neural networks. This step pads shorter sequences with zeros.

### Extract Contextual Embeddings

This section describes loading the pre-trained BERT model, converting data into tensors, and running the model to extract contextual embeddings.

In [None]:
# Load the BERT model
model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states=True)
model.eval()  # Set the model to evaluation mode

# Convert the lists of integers into tensors
input_ids_tensor = torch.tensor(padded_input_ids)
attention_mask_tensor = torch.tensor(padded_attention_mask)

# Run the model and get the outputs
with torch.no_grad():
    outputs = model(input_ids_tensor, attention_mask=attention_mask_tensor)
    hidden_states = outputs[2]  # Hidden states from all BERT layers
    word_embeddings = outputs.last_hidden_state  # The last layer's output
word_embeddings

In [218]:
# write the embeddings to a file
import numpy as np
np.save('word_embeddings.npy', word_embeddings)

In [None]:
# read the embeddings from a file
word_embeddings = np.load('word_embeddings.npy')
word_embeddings