## Import Dataset

In [171]:
# read csv file
import pandas as pd
df = pd.read_csv('../data/labeled_texts_1000.csv')
df.dropna(inplace=True)

df

Unnamed: 0,Content,label
0,The #Binance towel comes everywhere with me......,1
1,Drop your $SOL address below and\r\nmake sure ...,1
2,"£52,356.70",1
3,It only takes one good altcoin to change the w...,2
4,disrespectful is one of my top 3 fav,2
...,...,...
980,LADY GAGA IS A FUCKING MOVIE STAR,0
981,Te haré salir volando como un cohete! 🚀💥 I’ll ...,2
982,hawl awn… is that—,1
983,You’re invited! See you in Rogueport on May 23...,2


In [172]:
X = df['Content']
y = df['label']

X, y

(0      The #Binance towel comes everywhere with me......
 1      Drop your $SOL address below and\r\nmake sure ...
 2                                             £52,356.70
 3      It only takes one good altcoin to change the w...
 4                   disrespectful is one of my top 3 fav
                              ...                        
 980                    LADY GAGA IS A FUCKING MOVIE STAR
 981    Te haré salir volando como un cohete! 🚀💥 I’ll ...
 982                                   hawl awn… is that—
 983    You’re invited! See you in Rogueport on May 23...
 984                                        Alien mothers
 Name: Content, Length: 985, dtype: object,
 0      1
 1      1
 2      1
 3      2
 4      2
       ..
 980    0
 981    2
 982    1
 983    2
 984    0
 Name: label, Length: 985, dtype: int64)

## Preprocess Text For BERT

In [174]:
from bs4 import BeautifulSoup
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

# Initialize the Porter Stemmer
stemmer = PorterStemmer()

In [176]:
# importing the needed libraries
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
import re 
import string
import preprocessor as p
import emoji
from emot.emo_unicode import EMOTICONS_EMO
from flashtext import KeywordProcessor

In [206]:
import torch
from transformers import BertTokenizer, BertModel

In [177]:
def get_wordnet_tag(tag):
    if tag.startswith('J'):
        return 'a'
    elif tag.startswith('N'):
        return 'n'
    elif tag.startswith('R'):
        return 'r'
    elif tag.startswith('V'):
        return 'v'
    else:
        return None

In [178]:
def convert_emoticons(text):
    ## formatting
    all_emoji_emoticons = {**EMOTICONS_EMO}
    all_emoji_emoticons = {k:v.replace(":","").replace("_"," ").strip() for k,v in all_emoji_emoticons.items()}

    kp_all_emoji_emoticons = KeywordProcessor()
    for k,v in all_emoji_emoticons.items():
        kp_all_emoji_emoticons.add_keyword(k, v)
    output = kp_all_emoji_emoticons.replace_keywords(text)

    return output

In [179]:
def normalize_text(text):
    # handle abbreviations
    normalized_text = re.sub(r'\bfav\b', "favorite", text)
    normalized_text = re.sub(r'\btkt\b', "ticket", normalized_text)
    normalized_text = re.sub(r'\(gm\)', 'good morning', normalized_text)
    normalized_text = re.sub(r'\(r.i.p\)', 'rest in peace', normalized_text)
    
    # remove unnecessary information
    normalized_text = re.sub(r'\([^)]*(via|h/t)[^)]*\)', '', normalized_text)

    # reduce repeated characters
    normalized_text = re.sub(r'([a-zA-Z])\1{2,}', r'\1\1', normalized_text)
    
    return normalized_text

In [180]:
import re

def clean_characters(text):
    # Replacing special characters with space
    text = re.sub(r'[-_:]', ' ', text)

    normalized_text = re.sub(r'\r\n', '\n', text)

    # Remove decimal points between numbers
    no_decimal_text = re.sub(r'(\d)\.(\d)', r'\1\2', normalized_text)

    # Remove characters that are not spaces, letters, numbers, or full stops
    cleaned_text = re.sub(r"[^a-zA-Z0-9 '.\n]", '', no_decimal_text)

    # Reduce consecutive punctuations to a single instance (for . and ?)
    cleaned_text = re.sub(r'\.{2,}', '.', cleaned_text)  # For ...
    cleaned_text = re.sub(r'\?{2,}', '?', cleaned_text)  # For ???

    # Replace newlines with full stops, ensuring not to double-up full stops
    cleaned_text = re.sub(r'(\n)+', lambda m: '.' if m.group().startswith('\n') and not m.group().endswith('.') else '. ', cleaned_text)

    # Clean up any resulting double spaces or double full stops
    cleaned_text = re.sub(r'\. \.', '. ', cleaned_text)  # Double full stops to single
    cleaned_text = re.sub(r' {2,}', ' ', cleaned_text)  # Double spaces to single

    return cleaned_text.strip()


In [181]:
def clean_tweet(tweet):
    # remove URLs, mentions, reserved words (RT, FAV)
    p.set_options(p.OPT.URL, p.OPT.MENTION, p.OPT.RESERVED)
    cleaned_tweet = p.clean(tweet)

    # remove emojis
    cleaned_tweet = emoji.demojize(cleaned_tweet)
    
    # convert emoticons to words
    cleaned_tweet = convert_emoticons(cleaned_tweet)

    # handle abbreviations
    normalized_text = normalize_text(cleaned_tweet.lower())
    #return normalized_text

    # clean characters
    sentence_cleaned = clean_characters(normalized_text)

    result = sentence_cleaned
    
    return result

In [182]:
def get_cleaned_tweets(tweets):
    return [clean_tweet(tweet) for tweet in tweets]

In [183]:
result = get_cleaned_tweets(X)
result

['the binance towel comes everywhere with me. including breakfast cooking',
 'drop your sol address below and make sure you are following me dont ask why backhand index pointing down medium light skin tone',
 '5235670',
 'it only takes one good altcoin to change the world. sparkles',
 'disrespectful is one of my top 3 favorite',
 'some congressional republicans call the affordable connectivity program wasteful. i call it necessary. its time congress extended it so the 23 million households across america that rely on the program can stay connected.',
 'donald trump wont do what an american president must do. he refuses to denounce political violence. ill say what trump wont political violence is never ever acceptable in america.',
 'there is a massive bitcoin breakout coming. are you ready',
 'max pain.',
 'ok so the vote is in. ill do a 25 ticket giveaway on station head in 10 mins. if i feel like it ill come on spaces give out another 5 10 after that. the generous queen princess medi

## Add BERT Special Tokens

In [184]:
def add_special_tokens(sentence):
    # Step 1: Adding the [CLS] token at the beginning
    sentence_with_cls = "[CLS] " + sentence
    
    # Step 2: Adding the [SEP] token before each full stop
    split_sentence = sentence_with_cls.split('.')
    sentence_with_sep = " [SEP].".join(split_sentence)
    
    # Clean up to handle cases where [SEP] might be added at the end unnecessarily
    sentence_with_sep = sentence_with_sep.replace(" [SEP].", " [SEP]").rstrip()
    
    return sentence_with_sep


In [185]:
# Applying the function to all sentences in the results list
processed_results = [add_special_tokens(sentence) for sentence in result]
processed_results

['[CLS] the binance towel comes everywhere with me [SEP] including breakfast cooking',
 '[CLS] drop your sol address below and make sure you are following me dont ask why backhand index pointing down medium light skin tone',
 '[CLS] 5235670',
 '[CLS] it only takes one good altcoin to change the world [SEP] sparkles',
 '[CLS] disrespectful is one of my top 3 favorite',
 '[CLS] some congressional republicans call the affordable connectivity program wasteful [SEP] i call it necessary [SEP] its time congress extended it so the 23 million households across america that rely on the program can stay connected [SEP]',
 '[CLS] donald trump wont do what an american president must do [SEP] he refuses to denounce political violence [SEP] ill say what trump wont political violence is never ever acceptable in america [SEP]',
 '[CLS] there is a massive bitcoin breakout coming [SEP] are you ready',
 '[CLS] max pain [SEP]',
 '[CLS] ok so the vote is in [SEP] ill do a 25 ticket giveaway on station head 

## Load Pre-Trained Model Tokenizer

In [None]:
# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [186]:
tokenized_results = [tokenizer.tokenize(sentence) for sentence in processed_results]
tokenized_results

[['[CLS]',
  'the',
  'bin',
  '##ance',
  'towel',
  'comes',
  'everywhere',
  'with',
  'me',
  '[SEP]',
  'including',
  'breakfast',
  'cooking'],
 ['[CLS]',
  'drop',
  'your',
  'sol',
  'address',
  'below',
  'and',
  'make',
  'sure',
  'you',
  'are',
  'following',
  'me',
  'don',
  '##t',
  'ask',
  'why',
  'back',
  '##hand',
  'index',
  'pointing',
  'down',
  'medium',
  'light',
  'skin',
  'tone'],
 ['[CLS]', '52', '##35', '##6', '##70'],
 ['[CLS]',
  'it',
  'only',
  'takes',
  'one',
  'good',
  'alt',
  '##co',
  '##in',
  'to',
  'change',
  'the',
  'world',
  '[SEP]',
  'sparkle',
  '##s'],
 ['[CLS]',
  'di',
  '##sr',
  '##es',
  '##pe',
  '##ct',
  '##ful',
  'is',
  'one',
  'of',
  'my',
  'top',
  '3',
  'favorite'],
 ['[CLS]',
  'some',
  'congressional',
  'republicans',
  'call',
  'the',
  'affordable',
  'connectivity',
  'program',
  'waste',
  '##ful',
  '[SEP]',
  'i',
  'call',
  'it',
  'necessary',
  '[SEP]',
  'its',
  'time',
  'congress',


### Token List

In [188]:
indexed_tokens_list = [tokenizer.convert_tokens_to_ids(tokens) for tokens in tokenized_results]
indexed_tokens_list

[[101, 1996, 8026, 6651, 10257, 3310, 7249, 2007, 2033, 102, 2164, 6350, 8434],
 [101,
  4530,
  2115,
  14017,
  4769,
  2917,
  1998,
  2191,
  2469,
  2017,
  2024,
  2206,
  2033,
  2123,
  2102,
  3198,
  2339,
  2067,
  11774,
  5950,
  7302,
  2091,
  5396,
  2422,
  3096,
  4309],
 [101, 4720, 19481, 2575, 19841],
 [101,
  2009,
  2069,
  3138,
  2028,
  2204,
  12456,
  3597,
  2378,
  2000,
  2689,
  1996,
  2088,
  102,
  26831,
  2015],
 [101,
  4487,
  21338,
  2229,
  5051,
  6593,
  3993,
  2003,
  2028,
  1997,
  2026,
  2327,
  1017,
  5440],
 [101,
  2070,
  7740,
  10643,
  2655,
  1996,
  15184,
  20831,
  2565,
  5949,
  3993,
  102,
  1045,
  2655,
  2009,
  4072,
  102,
  2049,
  2051,
  3519,
  3668,
  2009,
  2061,
  1996,
  2603,
  2454,
  3911,
  2408,
  2637,
  2008,
  11160,
  2006,
  1996,
  2565,
  2064,
  2994,
  4198,
  102],
 [101,
  6221,
  8398,
  2180,
  2102,
  2079,
  2054,
  2019,
  2137,
  2343,
  2442,
  2079,
  102,
  2002,
  10220,
  2000,
  

## Segment Ids and Attention Mask

In [197]:
# Assuming 'indexed_results' contains your list of lists of vocabulary indices for each sentence
token_type_list = []
attention_mask_list = []

for indexed_sentence in indexed_tokens_list:
    # For each sentence, all tokens belong to the same segment, so use 0
    segment_ids = [0] * len(indexed_sentence)
    token_type_list.append(segment_ids)
    
    # If you're not padding, all tokens are real, so the attention mask is all 1s
    attention_mask = [1] * len(indexed_sentence)
    attention_mask_list.append(attention_mask)


### Add Padding

In [201]:
# Initialize padded lists
padded_input_ids = []
padded_attention_mask = []

# Find the maximum sequence length in your batch
max_length = max(len(tokens) for tokens in indexed_tokens_list)

for tokens in indexed_tokens_list:
    # Calculate the number of padding tokens needed
    num_padding_tokens = max_length - len(tokens)
    
    # Pad the input IDs with zeros (assuming 0 is your padding token)
    padded_tokens = tokens + [0] * num_padding_tokens
    padded_input_ids.append(padded_tokens)
    
    # Pad the attention mask where actual tokens are marked with 1 and padding tokens with 0
    padded_mask = [1] * len(tokens) + [0] * num_padding_tokens
    padded_attention_mask.append(padded_mask)


## Convert the Lists to Tensor

In [203]:
import torch

input_ids_tensor = torch.tensor(padded_input_ids)
attention_mask_tensor = torch.tensor(padded_attention_mask)

In [207]:
model=BertModel.from_pretrained('bert-base-uncased',
                                output_hidden_states=True) 
## output_hidden_states tells the model to return all hidden states
# Put the model in "evaluation" mode [feed-forward operation]
model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [208]:
# Run the text through BERT, and collect all of the hidden states produce from the 12 layers. 
with torch.no_grad():

    outputs = model(input_ids_tensor, attention_mask=attention_mask_tensor)


    # Because output_hidden_states is set to true, the third item will be hidden
    # states from all layers, different configurations could be used when
    # calling from_pretrained
    hidden_states = outputs[2]
    word_embeddings = outputs.last_hidden_state  # This contains the embeddings
    

In [210]:
word_embeddings

tensor([[[-1.1239e-01,  2.4357e-01,  8.5604e-02,  ..., -5.6149e-02,
           2.5564e-02,  1.8928e-01],
         [-1.2066e-01,  1.1085e-01, -3.8441e-01,  ...,  3.1131e-02,
           5.3486e-02, -1.7856e-01],
         [ 1.3444e-01, -2.8724e-01,  3.9719e-01,  ..., -7.5684e-02,
          -3.6914e-01, -8.7183e-02],
         ...,
         [ 2.1701e-01, -3.8892e-01,  1.6733e-01,  ...,  2.0448e-01,
          -1.2196e-01,  2.0441e-01],
         [ 1.5314e-02, -1.7250e-01,  4.9109e-01,  ...,  3.5868e-01,
          -6.8774e-02, -1.0300e-01],
         [ 5.7077e-03, -7.4810e-02,  2.6271e-02,  ...,  1.8333e-01,
          -1.8999e-02, -2.1065e-01]],

        [[ 3.1895e-01,  2.6895e-01,  1.2585e-01,  ..., -6.2433e-01,
          -5.1604e-02,  5.1289e-04],
         [ 5.0138e-01,  1.3102e-01,  3.1142e-01,  ..., -1.4255e-01,
          -2.1697e-01,  1.9033e-01],
         [ 1.6318e-01,  2.7957e-01,  2.4956e-01,  ..., -2.3605e-01,
           3.8383e-02,  1.7178e-01],
         ...,
         [ 2.6218e-01,  7

In [218]:
# write the embeddings to a file
import numpy as np
np.save('word_embeddings.npy', word_embeddings)

In [None]:
# read the embeddings from a file
word_embeddings = np.load('word_embeddings.npy')
word_embeddings