In [1]:
!pip install torch transformers peft datasets bitsandbytes
!pip install accelerate>=0.26.0

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com



[notice] A new release of pip is available: 24.3.1 -> 25.0
[notice] To update, run: python.exe -m pip install --upgrade pip

[notice] A new release of pip is available: 24.3.1 -> 25.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
from huggingface_hub import list_datasets
from datasets import load_dataset
from peft import get_peft_model, LoraConfig, TaskType
import bitsandbytes
# Enumerate the Hub datasets tagged "text-classification" and preview their ids.
class_dataset = list_datasets(filter="text-classification", full=True)
classification_dataset_names = [entry.id for entry in class_dataset]

print(
    f"There are {len(classification_dataset_names)} classification datasets available on the hub"
)
print(
    f"The first 10 are: {classification_dataset_names[:10]}"
)



There are 41 classification datasets available on the hub
The first 10 are: ['nace-ai/policy-alignment-verification-dataset', 'ParsBench/PersianSyntheticEmotions', 'ronnieaban/alquran', 'MonoHime/ru_sentiment_dataset', 'cmotions/NL_restaurant_reviews', 'UCL-DARK/openai-tldr-summarisation-preferences', 'UCL-DARK/openai-tldr-filtered', 'UCL-DARK/openai-tldr-filtered-queries', 'cvcio/toxic-el', 'prasadsawant7/sentiment_analysis_preprocessed_dataset']


In [3]:
# Despite the variable name, this loads the AG News corpus: the outputs below show
# train/test splits with 'text' and 'label' columns and four class names.
# NOTE(review): later cells expect 'Date'/'Sentiment' columns and an "eval" split —
# confirm which dataset this notebook is meant to use.
tweet_sent = load_dataset("fancyzhx/ag_news")

In [4]:
tweet_sent

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 120000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 7600
    })
})

In [5]:
train_ds = tweet_sent["train"]

In [6]:
train_ds

Dataset({
    features: ['text', 'label'],
    num_rows: 120000
})

In [7]:
train_ds[0]

{'text': "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.",
 'label': 2}

In [8]:
train_ds.column_names

['text', 'label']

In [9]:
print(train_ds.features)

{'text': Value(dtype='string', id=None), 'label': ClassLabel(names=['World', 'Sports', 'Business', 'Sci/Tech'], id=None)}


In [10]:
# Slice the rows first, then pick the column: this reads only five rows instead of
# materialising the whole 'text' column just to keep its first five entries.
print(train_ds[:5]['text'])

["Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.", 'Carlyle Looks Toward Commercial Aerospace (Reuters) Reuters - Private investment firm Carlyle Group,\\which has a reputation for making well-timed and occasionally\\controversial plays in the defense industry, has quietly placed\\its bets on another part of the market.', "Oil and Economy Cloud Stocks' Outlook (Reuters) Reuters - Soaring crude prices plus worries\\about the economy and the outlook for earnings are expected to\\hang over the stock market next week during the depth of the\\summer doldrums.", 'Iraq Halts Oil Exports from Main Southern Pipeline (Reuters) Reuters - Authorities have halted oil export\\flows from the main pipeline in southern Iraq after\\intelligence showed a rebel militia could strike\\infrastructure, an oil official said on Saturday.', 'Oil prices soar to all-time record, posing new menace to US economy (AFP) 

In [11]:
import pandas as pd
# Switch the DatasetDict to pandas output so that slicing a split yields a DataFrame.
# This changes the format of `tweet_sent` itself; the later `tweet_sent["test"][:]`
# cell depends on it having been run.
tweet_sent.set_format(type="pandas")
# Materialise the full training split as a DataFrame.
df = tweet_sent["train"][:]

In [12]:
df.head(10)

Unnamed: 0,text,label
0,Wall St. Bears Claw Back Into the Black (Reute...,2
1,Carlyle Looks Toward Commercial Aerospace (Reu...,2
2,Oil and Economy Cloud Stocks' Outlook (Reuters...,2
3,Iraq Halts Oil Exports from Main Southern Pipe...,2
4,"Oil prices soar to all-time record, posing new...",2
5,"Stocks End Up, But Near Year Lows (Reuters) Re...",2
6,Money Funds Fell in Latest Week (AP) AP - Asse...,2
7,Fed minutes show dissent over inflation (USATO...,2
8,Safety Net (Forbes.com) Forbes.com - After ear...,2
9,Wall St. Bears Claw Back Into the Black NEW Y...,2


In [14]:
from sklearn.preprocessing import LabelEncoder

def process_column(df, column_name, custom_map=None, is_label_column=False, label_names=None):
    """
    Add a re-encoded copy of ``column_name`` to ``df``.

    Two modes are supported:

    * ``is_label_column=True``  -- integer class ids are translated to class names
      and written to ``<column_name>_name``.
    * ``is_label_column=False`` -- string categories are translated to integers and
      written to ``<column_name>_int``.

    ``df`` is modified in place and also returned.

    Parameters:
    - df: The input dataframe.
    - column_name: The name of the column to process.
    - custom_map: Optional ``{string: int}`` mapping for string-to-int conversion.
                  When omitted (or empty) a ``LabelEncoder`` is fitted instead.
    - is_label_column: Selects the mode described above.
    - label_names: Optional sequence of class names indexed by class id, used in
                   label mode. When omitted, the names come from the 'label'
                   feature of the global ``tweet_sent['train']`` dataset.

    Returns:
    - df: The modified dataframe.
    - mapping: The string-to-int mapping that was applied, or ``None`` in label mode.

    Raises:
    - ValueError: if the column is missing, contains nulls (string mode), holds a
                  value absent from ``custom_map``, or holds a class id outside
                  ``label_names``.
    """
    if column_name not in df.columns:
        raise ValueError(f"Column '{column_name}' not found in the dataframe")

    if is_label_column:
        if label_names is not None:
            names = list(label_names)

            def to_name(value):
                # Reject out-of-range ids explicitly; plain list indexing would
                # silently wrap negative ids around to the end of the list.
                index = int(value)
                if not 0 <= index < len(names):
                    raise ValueError(
                        f"Label id '{value}' in column '{column_name}' is outside the range of label_names"
                    )
                return names[index]
        else:
            # Default source of names: the globally loaded dataset. Resolve the
            # feature once instead of once per row.
            to_name = tweet_sent['train'].features['label'].int2str

        df[f"{column_name}_name"] = df[column_name].apply(to_name)
        return df, None

    # String -> int mode.
    values = df[column_name]
    if values.isnull().any():
        raise ValueError(f"Column '{column_name}' contains missing values")

    if custom_map:
        # Every distinct value must be covered, otherwise .map() would emit NaN.
        for value in values.unique():
            if value not in custom_map:
                raise ValueError(f"Value '{value}' in column '{column_name}' is not found in custom map")
        df[f"{column_name}_int"] = values.map(custom_map)
        mapping = custom_map
    else:
        encoder = LabelEncoder()
        df[f"{column_name}_int"] = encoder.fit_transform(values)
        mapping = dict(zip(encoder.classes_, encoder.transform(encoder.classes_)))
    return df, mapping

In [15]:
df, _ = process_column(df, column_name='label', is_label_column=True)

In [16]:
df.head(10)

Unnamed: 0,text,label,label_name
0,Wall St. Bears Claw Back Into the Black (Reute...,2,Business
1,Carlyle Looks Toward Commercial Aerospace (Reu...,2,Business
2,Oil and Economy Cloud Stocks' Outlook (Reuters...,2,Business
3,Iraq Halts Oil Exports from Main Southern Pipe...,2,Business
4,"Oil prices soar to all-time record, posing new...",2,Business
5,"Stocks End Up, But Near Year Lows (Reuters) Re...",2,Business
6,Money Funds Fell in Latest Week (AP) AP - Asse...,2,Business
7,Fed minutes show dissent over inflation (USATO...,2,Business
8,Safety Net (Forbes.com) Forbes.com - After ear...,2,Business
9,Wall St. Bears Claw Back Into the Black NEW Y...,2,Business


In [None]:
def label_string(row):
    """Translate one integer class id into its class name via the dataset's 'label' feature."""
    label_feature = tweet_sent['train'].features['label']
    return label_feature.int2str(row)


# Derive the human-readable class name for every row.
df['label_name'] = df['label'].apply(label_string)


: 

In [None]:
df.head(10)

: 

In [None]:
# Work on a copy without the 'Date' column. The frame displayed above only has
# text/label/label_name, so a strict drop raises KeyError; errors='ignore' makes
# the drop a no-op when the column is absent.
tweet_sent_df = df.drop(columns=['Date'], errors='ignore')

: 

In [None]:
tweet_sent_df.head(10)

: 

In [None]:
# NOTE(review): the frame displayed above has columns text/label/label_name only —
# there is no 'Sentiment' column; confirm the intended column before running.
tweet_sent_df["Sentiment"].unique()

: 

In [None]:
!pip install -U scikit-learn

: 

In [None]:
from sklearn.preprocessing import LabelEncoder

: 

In [None]:
from sklearn.preprocessing import LabelEncoder

# Convert sentiment labels to integers
def convert_str_int_col(df, column_name, custom_map=None):
    """
    Encode a string column as integers in a new ``<column_name>_int`` column.

    With a non-empty ``custom_map`` every distinct value must be a key of the map;
    otherwise a ``LabelEncoder`` is fitted on the column. ``df`` is modified in
    place and returned together with the mapping that was applied.

    Raises:
        ValueError: if the column is missing, contains nulls, or holds a value
            that ``custom_map`` does not cover.
    """
    if column_name not in df.columns:
        raise ValueError(f"Column '{column_name}' not found in the DataFrame.")

    source = df[column_name]
    if source.isnull().any():
        raise ValueError(f"Column '{column_name}' contains missing values.")

    target = f"{column_name}_int"

    if not custom_map:
        # No explicit mapping: let scikit-learn assign the integer codes.
        encoder = LabelEncoder()
        df[target] = encoder.fit_transform(source)
        classes = encoder.classes_
        return df, dict(zip(classes, encoder.transform(classes)))

    # Explicit mapping: fail on the first distinct value it does not cover.
    for value in source.unique():
        if value in custom_map:
            continue
        raise ValueError(f"Value '{value}' in column '{column_name}' is not present in the custom_map.")

    df[target] = source.map(custom_map)
    return df, custom_map

# Integer codes for the three sentiment classes.
# NOTE(review): the class names printed for the loaded dataset are
# World/Sports/Business/Sci/Tech and no 'Sentiment' column appears in the frames
# displayed above; as written, convert_str_int_col raises "Column 'Sentiment' not
# found" for such a frame — confirm the intended column and mapping.
custom_map = {"Positive": 1, "Neutral": 0, "Negative": 2}
tweet_sent_df, mapping = convert_str_int_col(tweet_sent_df, "Sentiment", custom_map)

: 

In [None]:
tweet_sent_df.head(10)

: 

In [None]:
!pip install emoji nltk

: 

In [None]:
import re
import emoji
from nltk.corpus import stopwords
import torch.nn as nn
import nltk

# Ensure NLTK stopwords are downloaded
nltk.download("stopwords")

class FeatureCleaner(nn.Module):
    """
    Text-cleaning pipeline for short, social-media style text.

    Every ``remove_*`` / ``expand_*`` / ``to_*`` method performs one cleaning step
    on a single string and returns falsy input (``None``, ``""``) unchanged.
    ``forward`` chains the steps. The class subclasses ``nn.Module`` but only
    stores plain Python objects (a stopword set, a dict and compiled regexes).
    """

    def __init__(self, slang_dict=None):
        """
        Initialize the FeatureCleaner.

        Args:
            slang_dict (dict, optional): Slang terms mapped to their expansions.
                Matching is case-sensitive. When omitted or empty, a small
                built-in dictionary is used.
        """
        super().__init__()
        # English stopwords from NLTK (the corpus must already be downloaded).
        self.stop_words = set(stopwords.words("english"))
        # Slang dictionary (customizable).
        self.slang_dict = slang_dict or {
            "HODL": "hold on for dear life",
            "FOMO": "fear of missing out",
        }
        # Pre-compiled patterns, reused for every row.
        self.url_pattern = re.compile(r"http\S+|www\S+|https\S+", re.MULTILINE)
        self.hashtag_pattern = re.compile(r"@\w+|#\w+")
        self.special_char_pattern = re.compile(r"[^a-zA-Z0-9\s]")
        # ISO dates (2024-01-31), slash dates (01/31/2024), "January 31, 2024".
        self.date_pattern = re.compile(
            r"\b\d{4}-\d{2}-\d{2}\b|\b\d{2}/\d{2}/\d{4}\b|\b\w{3,9}\s\d{1,2},?\s\d{4}\b"
        )
        # One alternation over all slang keys, escaped so they match literally.
        self.slang_pattern = re.compile("|".join(re.escape(key) for key in self.slang_dict.keys()))

    def remove_url(self, text):
        """
        Remove URLs from the text.

        Args:
            text (str): Input text.

        Returns:
            str: Text with URLs removed.
        """
        if not text:
            return text
        return self.url_pattern.sub("", text)

    def remove_hashtags(self, text):
        """
        Remove hashtags (``#tag``) and mentions (``@user``) from the text.

        Args:
            text (str): Input text.

        Returns:
            str: Text with hashtags and mentions removed.
        """
        if not text:
            return text
        return self.hashtag_pattern.sub("", text)

    def remove_special_characters(self, text):
        """
        Remove every character that is not an ASCII letter, digit or whitespace.

        Args:
            text (str): Input text.

        Returns:
            str: Text with special characters removed.
        """
        if not text:
            return text
        return self.special_char_pattern.sub("", text)

    def to_lowercase(self, text):
        """
        Convert text to lowercase.

        Args:
            text (str): Input text.

        Returns:
            str: Lowercase text.
        """
        if not text:
            return text
        return text.lower()

    def demoji(self, text):
        """
        Convert emojis to their text representations.

        Args:
            text (str): Input text.

        Returns:
            str: Text with emojis converted to text.
        """
        if not text:
            return text
        return emoji.demojize(text)

    def remove_stop_words(self, text):
        """
        Remove stopwords from the text and collapse runs of whitespace.

        The comparison is case-sensitive, so lowercase the text first.

        Args:
            text (str): Input text.

        Returns:
            str: Remaining words joined by single spaces.
        """
        if not text:
            return text
        return " ".join([word for word in text.split() if word not in self.stop_words])

    def expand_slangs(self, text):
        """
        Replace each slang term found in the text with its expansion.

        Args:
            text (str): Input text.

        Returns:
            str: Text with slang terms expanded.
        """
        if not text:
            return text
        return self.slang_pattern.sub(lambda x: self.slang_dict[x.group()], text)

    def remove_dates(self, text):
        """
        Remove dates from the text.

        Args:
            text (str): Input text.

        Returns:
            str: Text with dates removed.
        """
        if not text:
            return text
        return self.date_pattern.sub("", text)

    def forward(self, text, remove_stopwords=True):
        """
        Apply all cleaning operations to the text.

        The order matters:

        * emoji conversion, slang expansion and date removal run BEFORE
          special-character stripping, because stripping would delete the emoji
          themselves and the ``-`` / ``/`` separators the date patterns need;
        * slang expansion runs BEFORE lower-casing, because the slang keys are
          matched case-sensitively.

        Note that emoji names lose their ``:`` / ``_`` punctuation in the
        special-character step (for example ``:red_heart:`` ends up as ``redheart``).

        Args:
            text (str): Input text. Falsy or non-string values (for example NaN
                from pandas) are returned unchanged.
            remove_stopwords (bool, optional): Whether to remove stopwords. Defaults to True.

        Returns:
            str: Cleaned text.
        """
        # Missing values in a DataFrame column arrive as float NaN, not str.
        if not isinstance(text, str) or not text:
            return text

        # Strip structured tokens while their punctuation is still intact.
        text = self.remove_url(text)
        text = self.remove_hashtags(text)
        text = self.demoji(text)
        text = self.expand_slangs(text)
        text = self.remove_dates(text)

        # Normalise what is left.
        text = self.remove_special_characters(text)
        text = self.to_lowercase(text)

        if remove_stopwords:
            text = self.remove_stop_words(text)

        return text

: 

In [None]:
feature_cleaner = FeatureCleaner()

: 

In [None]:
# Run every training text through the cleaning pipeline (default options).
tweet_sent_df['text'] = tweet_sent_df['text'].apply(feature_cleaner.forward)

: 

In [None]:
tweet_sent_df.head(10)

: 

In [None]:
tweet_sent_df.loc[3,"text"]

: 

In [None]:
# Count the number of empty or NaN rows in the 'text' column
empty_row_count = tweet_sent_df['text'].isna().sum() + (tweet_sent_df['text'].str.strip() == "").sum()
print(f"Number of empty rows: {empty_row_count}")


: 

In [None]:
# Discard rows with a missing text/label, then rows whose text is blank after cleaning.
tweet_sent_df = tweet_sent_df.dropna(subset=["text", "Sentiment", "Sentiment_int"])
tweet_sent_df = tweet_sent_df[tweet_sent_df["text"].str.strip() != ""]


: 

In [None]:
tweet_sent_df['text'].isna().sum()

: 

In [None]:
tweet_sent_df.head(10)

: 

In [None]:
# Prepare test and validation datasets.
# Slicing a split returns a DataFrame only because the DatasetDict was switched to
# pandas format in an earlier cell.
# NOTE(review): the DatasetDict displayed above contains only 'train' and 'test'
# splits — there is no "eval" split; confirm where validation data should come from.
df_test = tweet_sent["test"][:]
df_val = tweet_sent["eval"][:]

# NOTE(review): a strict drop raises KeyError if 'Date' is absent; the split shown
# above lists only 'text' and 'label' as features.
tweet_sent_test = df_test.drop(columns=['Date'])
tweet_sent_val = df_val.drop(columns=['Date'])

# Encode the string labels with the same mapping as the training frame.
# NOTE(review): unlike the training frame, the 'text' of these frames is not passed
# through feature_cleaner here — confirm whether evaluation text should be cleaned too.
tweet_sent_test, _ = convert_str_int_col(tweet_sent_test, "Sentiment", custom_map)
tweet_sent_val, _ = convert_str_int_col(tweet_sent_val, "Sentiment", custom_map)

: 

In [None]:
import os

import torch
from datasets import Dataset
from peft import LoraConfig, TaskType, get_peft_model
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BitsAndBytesConfig,
    Trainer,
    TrainingArguments,
)

: 

In [None]:
# Enable CUDA debugging
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

: 

In [None]:
# Verify data types in the DataFrame
print("Data types in training DataFrame:")
print(tweet_sent_df.dtypes)

: 

In [None]:
# Convert DataFrames to datasets with explicit type conversion
train_data = {
    'text': tweet_sent_df['text'].tolist(),
    'labels': tweet_sent_df['Sentiment_int'].astype(int).tolist()  # Ensure integer labels
}

val_data = {
    'text': tweet_sent_val['text'].tolist(),
    'labels': tweet_sent_val['Sentiment_int'].astype(int).tolist()  # Ensure integer labels
}

: 

In [None]:
# Create datasets
train_dataset = Dataset.from_dict(train_data)
val_dataset = Dataset.from_dict(val_data)

: 

In [None]:
# Load the tokenizer that matches the base checkpoint.
llama_model = "meta-llama/Llama-3.2-1B"
tokenizer = AutoTokenizer.from_pretrained(llama_model)
# Register a dedicated '[PAD]' token; tokenize_function pads to a fixed length and
# needs one. If '[PAD]' is new to the vocabulary this grows it, which is why the
# model's token embeddings are resized after loading.
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
# Append padding after the real tokens.
tokenizer.padding_side = "right"

: 

In [None]:
# Tokenization function
def tokenize_function(examples):
    """
    Tokenize one batch for ``Dataset.map(..., batched=True)``.

    Every entry of ``examples['text']`` is coerced to ``str`` and the batch is
    padded/truncated to exactly 128 tokens. Returns whatever the global
    ``tokenizer`` returns for that call.
    """
    batch_texts = list(map(str, examples['text']))
    encoding_options = dict(
        padding="max_length",
        truncation=True,
        max_length=128,
        return_tensors=None,
    )
    return tokenizer(batch_texts, **encoding_options)

# Apply tokenization
train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)

: 

In [None]:
# Configure 4-bit quantisation: NF4 quantisation type with double quantisation
# enabled, and float32 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float32
)

# Load the checkpoint with a sequence-classification head.
# num_labels=3 matches the three entries of custom_map.
# NOTE(review): the dataset loaded at the top of the notebook prints four class
# names — confirm the intended number of labels.
model = AutoModelForSequenceClassification.from_pretrained(
    llama_model,
    num_labels=3,
    quantization_config=bnb_config,
    device_map="auto"
)

: 

In [None]:
# Update model configuration
model.resize_token_embeddings(len(tokenizer))
model.config.pad_token_id = tokenizer.pad_token_id

: 

In [None]:
# Apply LoRA: rank-8 adapters (alpha 32, dropout 0.1) on the q_proj and v_proj
# modules, configured for the sequence-classification task type.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=['q_proj', 'v_proj']
)

# Wrap the model; `model` now refers to the PEFT-wrapped model.
model = get_peft_model(model, lora_config)

: 

In [None]:
# Prepare datasets for training
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

: 

In [None]:
# Training arguments.
# Evaluate and checkpoint once per epoch, keeping at most two checkpoints.
# Effective batch size per device = 2 (per-device batch) * 8 (accumulation steps) = 16.
training_args = TrainingArguments(
    output_dir="./results",
    # `evaluation_strategy` is the deprecated spelling of this argument.
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=10,
    gradient_accumulation_steps=8,
    optim="adamw_torch",
    # `no_cuda=False` was dropped: it is the default and the flag is deprecated.
    # Full-precision training, matching the float32 compute dtype of the quantisation config.
    fp16=False,
    bf16=False
)

: 

In [None]:
# Initialize the trainer with the LoRA-wrapped model and the tokenised datasets;
# val_dataset is the evaluation set used by the per-epoch evaluation.
# NOTE(review): recent transformers releases may prefer `processing_class=` over
# `tokenizer=` — confirm against the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer
)

: 

In [None]:
trainer.train()

: 

: 