In [1]:
import sys
!{sys.executable} -m pip install nltk transformers torch scikit-learn --break-system-packages



In [2]:
# Download required NLTK data
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')  # Open Multilingual WordNet

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/djleamen/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/djleamen/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [3]:
# Install modules
# A '!' in a Jupyter Notebook runs the line in the system's shell, and not in the Python interpreter

# Import necessary libraries
import pandas as pd
import random

# Load dataset 
# you can download this dataset from https://huggingface.co/datasets/stepp1/tweet_emotion_intensity/tree/main
data = pd.read_csv("hf://datasets/stepp1/tweet_emotion_intensity/train.csv")

# Preview the data
print(data.head())

      id                                              tweet    class  \
0  40815  Loved @Bethenny independence msg on @WendyWill...     fear   
1  10128  @mark_slifer actually maybe we were supposed t...  sadness   
2  40476  I thought the nausea and headaches had passed ...     fear   
3  20813  Anger, resentment, and hatred are the destroye...    anger   
4  40796  new tires &amp; an alarm system on my car. fwm...     fear   

  sentiment_intensity class_intensity  labels  
0                 low        fear_low       4  
1                high    sadness_high       9  
2              medium     fear_medium       5  
3                high      anger_high       0  
4                 low        fear_low       4  


In [4]:
import re # Import the `re` module for working with regular expressions

# Function to clean the text
def clean_text(text):
    text = text.lower() # Convert all text to lowercase for uniformity
    text = re.sub(r'http\S+', '', text) # Remove URLs from the text
    text = re.sub(r'<.*?>', '', text) # Remove any HTML tags from the text
    text = re.sub(r'[^\w\s]', '', text) # Remove punctuation, keep only words and spaces
    return text # Return the cleaned text

# Assume `data` is a pandas DataFrame with a column named 'text'
# Apply the cleaning function to each row of the 'text' column
data['cleaned_text'] = data['tweet'].apply(clean_text)

# Print the first 5 rows of the cleaned text to verify the cleaning process
print(data['cleaned_text'].head())

0    loved bethenny independence msg on wendywillia...
1    mark_slifer actually maybe we were supposed to...
2    i thought the nausea and headaches had passed ...
3    anger resentment and hatred are the destroyer ...
4      new tires amp an alarm system on my car fwm now
Name: cleaned_text, dtype: object


In [5]:
# Check for missing values in the dataset
print(data.isnull().sum()) # Print the count of missing values for each column

# Option 1: Remove rows with missing data in the 'cleaned_text' column
data = data.dropna(subset=['cleaned_text']) # Drop rows where 'cleaned_text' is NaN (missing)

# Option 2: Fill missing values in 'cleaned_text' with a placeholder
data['cleaned_text'].fillna('unknown', inplace=True) # Replace NaN values in 'cleaned_text' with 'unknown'

id                     0
tweet                  0
class                  0
sentiment_intensity    0
class_intensity        0
labels                 0
cleaned_text           0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['cleaned_text'].fillna('unknown', inplace=True) # Replace NaN values in 'cleaned_text' with 'unknown'


In [6]:
from transformers import BertTokenizer

# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the cleaned text
tokens = tokenizer(
    data['cleaned_text'].tolist(), padding=True, truncation=True, max_length=128, return_tensors='pt'
)

print(tokens['input_ids'][:5])  # Preview the first 5 tokenized examples

tensor([[  101,  3866,  7014,  2368,  4890,  4336,  5796,  2290,  2006, 12815,
         29602,  6632,  5244,  2022,  3407, 23713, 16829,  2306,  4426, 23713,
         13433, 28032,  7730,  2097, 19311,  2000,  2017,  3407,  2981,   102,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0],
        [  101,  2928,  1035, 22889, 23780,  2941,  2672,  2057,  2020,  4011,
          2000,  3280,  1998,  2026, 13445,  5552,  2256,  3268, 27451,   102,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0],
        [  101,  1045,  2245,  1996, 19029,  1998, 14978,  2015,  2018,  2979,
          2021,  8840,  2140,  1045,  2514,  9643,  2651,   102,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     

In [7]:
# Import necessary modules
import random # Random module for generating random numbers and selections
from nltk.corpus import wordnet # NLTK's WordNet corpus for finding synonyms

# Define a function to find and replace a word with a synonym
def synonym_replacement(word):
# Get all synsets (sets of synonyms) for the given word from WordNet
    synonyms = wordnet.synsets(word)

# If the word has synonyms, randomly choose one synonym, otherwise return the original word
    if synonyms:
# Select a random synonym and get the first lemma (word form) of that synonym
        return random.choice(synonyms).lemmas()[0].name()

# If no synonyms are found, return the original word
    return word

# Define a function to augment text by replacing words with synonyms randomly
def augment_text(text):
# Split the input text into individual words
    words = text.split() # Split the input text into individual words

# Replace each word with a synonym with a probability of 20% (random.random() > 0.8)
    augmented_words = [
    synonym_replacement(word) if random.random() > 0.8 else word 
# If random condition met, replace
for word in words] # Iterate over each word in the original text

# Join the augmented words back into a single string and return it
    return ' '.join(augmented_words)

# Apply the text augmentation function to the 'cleaned_text' column in a DataFrame
# Create a new column 'augmented_text' containing the augmented version of 'cleaned_text'
data['augmented_text'] = data['cleaned_text'].apply(augment_text)

In [8]:
# Import necessary modules
import random # Random module for generating random numbers and selections
from nltk.corpus import wordnet # NLTK's WordNet corpus for finding synonyms

# Define a function to find and replace a word with a synonym
def synonym_replacement(word):
# Get all synsets (sets of synonyms) for the given word from WordNet
    synonyms = wordnet.synsets(word)

# If the word has synonyms, randomly choose one synonym, otherwise return the original word
    if synonyms:
# Select a random synonym and get the first lemma (word form) of that synonym
        return random.choice(synonyms).lemmas()[0].name()

# If no synonyms are found, return the original word
    return word

# Define a function to augment text by replacing words with synonyms randomly
def augment_text(text):
# Split the input text into individual words
    words = text.split() # Split the input text into individual words

# Replace each word with a synonym with a probability of 20% (random.random() > 0.8)
    augmented_words = [
    synonym_replacement(word) if random.random() > 0.8 else word 
# If random condition met, replace
for word in words] # Iterate over each word in the original text

# Join the augmented words back into a single string and return it
    return ' '.join(augmented_words)

# Apply the text augmentation function to the 'cleaned_text' column in a DataFrame
# Create a new column 'augmented_text' containing the augmented version of 'cleaned_text'
data['augmented_text'] = data['cleaned_text'].apply(augment_text)

In [9]:
# Check the columns in the dataset to identify the label column
print("Dataset columns:")
print(data.columns.tolist())
print("\nFirst few rows:")
print(data.head())

Dataset columns:
['id', 'tweet', 'class', 'sentiment_intensity', 'class_intensity', 'labels', 'cleaned_text', 'augmented_text']

First few rows:
      id                                              tweet    class  \
0  40815  Loved @Bethenny independence msg on @WendyWill...     fear   
1  10128  @mark_slifer actually maybe we were supposed t...  sadness   
2  40476  I thought the nausea and headaches had passed ...     fear   
3  20813  Anger, resentment, and hatred are the destroye...    anger   
4  40796  new tires &amp; an alarm system on my car. fwm...     fear   

  sentiment_intensity class_intensity  labels  \
0                 low        fear_low       4   
1                high    sadness_high       9   
2              medium     fear_medium       5   
3                high      anger_high       0   
4                 low        fear_low       4   

                                        cleaned_text  \
0  loved bethenny independence msg on wendywillia...   
1  mark_slifer 

In [10]:
from sklearn.model_selection import train_test_split # Import function to split dataset
from torch.utils.data import TensorDataset, DataLoader
import torch

# Extract input_ids and attention_masks from tokens
input_ids = tokens['input_ids']
attention_masks = tokens['attention_mask']

# Create labels from the 'labels' column in the dataset
labels = torch.tensor(data['labels'].values)

# First split: 15% for test set, the rest for training/validation
train_val_inputs, test_inputs, train_val_masks, test_masks, train_val_labels, test_labels = train_test_split(
    input_ids, attention_masks, labels, test_size=0.15, random_state=42
)

# Second split: 20% for validation set from remaining data
train_inputs, val_inputs, train_masks, val_masks, train_labels, val_labels = train_test_split(
    train_val_inputs, train_val_masks, train_val_labels, test_size=0.2, random_state=42
)

# Create TensorDataset objects for each set, including attention masks
train_dataset = TensorDataset(train_inputs, train_masks, train_labels)
val_dataset = TensorDataset(val_inputs, val_masks, val_labels)
test_dataset = TensorDataset(test_inputs, test_masks, test_labels)

# Create DataLoader objects
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=16)
test_dataloader = DataLoader(test_dataset, batch_size=16)

print("Training, validation, and test sets are prepared with attention masks!")
print(f"Number of training samples: {len(train_dataset)}")
print(f"Number of validation samples: {len(val_dataset)}")
print(f"Number of test samples: {len(test_dataset)}")

Training, validation, and test sets are prepared with attention masks!
Number of training samples: 2692
Number of validation samples: 674
Number of test samples: 594
