In [5]:
### Use this code to do the following:

# 1) Prepare human-coded dataset for model building
# 2) Train RoBERTa model
# 3) Evaluate performance of model
# 4) Prepare full dataset for classification
# 5) Classify text using trained RoBERTa model

### Note that you will need to change your file path names, and change the name of the theme you are working on (i.e. political, humor, etc.)

In [None]:
### Import Packages ###
import pandas as pd
import torch
import numpy as np
from transformers import RobertaTokenizer, RobertaModel
from torch import nn
from torch.optim import Adam
from tqdm import tqdm
import re
from sklearn.metrics import classification_report

In [None]:
# 1) Prepare human-coded dataset for model building

In [3]:
### Load the human-coded CSV

# Coded theme columns; every one of them is parsed as int.
theme_columns = [
    "humorous_attempt",
    "encouraging_others_vaccination",
    "vaccination_experience",
    "side_effects",
    "political",
    "health_benefits",
    "social_benefits",
    "expressing_gratitude_appreciation",
    "expressing_relief",
    "expressing_civic_duty",
    "identifying_vaccine_community",
]

# "id" is read as a string; the theme columns follow in the order listed above.
dtype_spec = {"id": str}
dtype_spec.update({column: int for column in theme_columns})

# Parse the file using the column types declared above
df = pd.read_csv("YOUR FILE PATH HERE", dtype=dtype_spec)

NameError: name 'pd' is not defined

In [None]:
# Quick sanity check of the loaded data: display the first rows.
# Swap in one of the commented-out lines to see the column names or the row count instead.
df.head()
#df.columns
#len(df)

In [None]:
### Function to preprocess text

# Emoji / pictograph code-point ranges stripped from the text.  Compiled once
# here instead of on every call to preprocess_text.
_EMOJI_PATTERN = re.compile("["
                            u"\U0001F600-\U0001F64F"  # emoticons
                            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                            u"\U0001F680-\U0001F6FF"  # transport & map symbols
                            u"\U0001F700-\U0001F77F"  # alchemical symbols
                            u"\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
                            u"\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
                            u"\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
                            u"\U0001FA00-\U0001FA6F"  # Chess Symbols
                            u"\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
                            u"\U0001F004"  # Mahjong Tile Red Dragon
                            u"\U0001F0CF"  # Playing Card Black Joker
                            u"\U0001F170-\U0001F251"  # Enclosed Alphanumeric Supplement
                            "]+", flags=re.UNICODE)

# Known misspellings (regex pattern -> corrected spelling).
_MISSPELLING_MAP = {
    r'\bvaccienated\b': 'vaccinated',
}

# Lower-case contractions -> expanded form, applied by plain substring
# replacement in this order.
_CONTRACTION_MAP = {
    "i'll": "i will",
    "i'm": "i am",
    "you're": "you are",
    "we're": "we are",
    "they're": "they are",
    "he's": "he is",
    "she's": "she is",
    "it's": "it is",
    "can't": "cannot",
    "don't": "do not",
    "didn't": "did not",
    "won't": "will not",
    "wouldn't": "would not",
    "shouldn't": "should not",
    "couldn't": "could not",
    "haven't": "have not",
    "hasn't": "has not",
    "hadn't": "had not",
    "doesn't": "does not",
    "isn't": "is not",
    "aren't": "are not",
}


def preprocess_text(text):
    """Normalise one raw post before it is tokenised.

    Steps, in order: strip URLs and e-mail addresses, collapse letter-spaced
    capitals ("V A X" -> "VAX"), lower-case, drop '#', turn the HTML entity for
    '&' into "and", remove YYYY-MM-DD dates, fix known misspellings, expand
    common contractions, replace U+1F48A with the word "vaccine" and finally
    strip the remaining emoji.

    Args:
        text: The raw text. None and NaN (what pandas uses for an empty cell)
            are treated as an empty string; any other non-string value is
            converted with str().

    Returns:
        The cleaned, lower-cased string.
    """
    # Missing cells arrive as None/NaN; re.sub would raise TypeError on them.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)

    # Remove URLs
    text = re.sub(r'http[s]?://\S+', '', text)
    # Remove email addresses
    text = re.sub(r'\S+@\S+', '', text)
    # Kerning: join runs of single capitals separated by whitespace
    text = re.sub(r'([A-Z]\s){2,}[A-Z]', lambda m: m.group().replace(' ', ''), text)
    # Make lowercase
    text = text.lower()
    # Remove #
    text = text.replace("#", "")
    # Replace the HTML entity for '&' with "and".  The full entity "&amp;" is
    # handled first so that no stray ';' is left behind ("and;").
    text = text.replace("&amp;", "and").replace("&amp", "and")
    # Remove date content
    text = re.sub(r'\d{4}-\d{2}-\d{2}', '', text)

    # Replace misspellings
    for pattern, replacement in _MISSPELLING_MAP.items():
        text = re.sub(pattern, replacement, text, flags=re.IGNORECASE)

    # Typographic apostrophes (U+2019) would otherwise never match the
    # straight-quote contractions in the map.
    text = text.replace("\u2019", "'")
    for contraction, expanded_form in _CONTRACTION_MAP.items():
        text = text.replace(contraction, expanded_form)

    # Replace U+1F48A (the pill emoji) with "vaccine".
    # NOTE(review): the syringe emoji U+1F489 is not mapped and is removed by
    # the emoji strip below -- confirm that this is intended.
    text = text.replace("\U0001F48A", "vaccine")
    # Remove emojis
    text = _EMOJI_PATTERN.sub(r'', text)

    return text

# Clean every entry of the 'text' column with preprocess_text and store the result
# in a new 'processed_text' column (the 'text' column itself is not modified here).
df['processed_text'] = df['text'].apply(preprocess_text)

In [2]:
### Indicate column of interest (the category you are going to classify)

# Rename the theme being modelled to the generic name the training code reads.
# rename() ignores labels that are absent, so re-running this cell is harmless.
df = df.rename(columns={'political': 'column_of_interest'})

# Make the cleaned text the 'text' column used downstream.  The raw text is moved
# to 'raw_text' in the same step: renaming 'processed_text' to 'text' while the
# original 'text' column still exists would leave two columns called 'text', and
# df['text'] would then be a two-column DataFrame instead of a Series.
# The guard keeps the cell idempotent on re-run.
if 'processed_text' in df.columns:
    df = df.rename(columns={'text': 'raw_text', 'processed_text': 'text'})

# Show the modified DataFrame
df.head()


In [None]:
### Remove Duplicates
# Flag each row whose text repeats an earlier row, then count the flags
repeat_mask = df['text'].duplicated()
duplicate_count = repeat_mask.sum()
print(f"Number of duplicates in 'text' column: {duplicate_count}")

# Collect every copy of a repeated text (first occurrence included) for inspection
duplicate_instances = df.loc[df['text'].duplicated(keep=False)]
print("\nDuplicate instances:")
print(duplicate_instances)

# Keep only the first occurrence of each text
df.drop_duplicates(subset='text', keep='first', inplace=True)

In [None]:
# 2) Train RoBERTa model

In [None]:
### Build RoBERTa Model

# Split the dataframe into training, validation, and test sets (80:10:10 split)
np.random.seed(115)
# Seed PyTorch as well: weight initialisation, dropout and DataLoader shuffling
# draw from torch's generator, which np.random.seed does not touch.
torch.manual_seed(115)

# Shuffle once (fixed random_state), then cut by position.  Plain iloc slices are
# used instead of np.split, which in recent pandas versions routes through a
# deprecated DataFrame code path when handed a DataFrame.
df_shuffled = df.sample(frac=1, random_state=42)
train_end = int(.8 * len(df_shuffled))
val_end = int(.9 * len(df_shuffled))

df_train = df_shuffled.iloc[:train_end]
df_val = df_shuffled.iloc[train_end:val_end]
df_test = df_shuffled.iloc[val_end:]

print(len(df_train), len(df_val), len(df_test))



In [None]:
# Load the pretrained "roberta-base" tokenizer
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

### Label mapping: raw code in the data -> class index (identity for 0 and 1) ###
labels = {code: code for code in (0, 1)}

### Define a Dataset class to handle the data ###
class Dataset(torch.utils.data.Dataset):
    """Tokenised texts paired with class labels.

    Reads the 'text' and 'column_of_interest' columns of the frame it is given
    and relies on the module-level `tokenizer` and `labels` objects.
    """

    def __init__(self, df):
        # Translate each raw code through the module-level `labels` mapping;
        # a code missing from the mapping raises KeyError.
        self.labels = [labels[code] for code in df['column_of_interest']]

        # Encode every text on its own, padded/truncated to 128 tokens.
        encoded = []
        for text in df['text']:
            encoded.append(
                tokenizer.encode_plus(
                    text,
                    padding='max_length',
                    max_length=128,
                    truncation=True,
                    return_tensors="pt",
                )
            )
        self.texts = encoded

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        encoding = self.texts[idx]
        # squeeze(0) drops the leading dimension of the per-text tensors so
        # that the DataLoader can stack the items into a batch.
        return (
            encoding['input_ids'].squeeze(0),
            encoding['attention_mask'].squeeze(0),
            self.labels[idx],
        )

### Define the RoBERTa classifier model ###
class RoBERTaClassifier(nn.Module):
    """Pretrained "roberta-base" encoder followed by dropout and a 768 -> 2 linear head.

    Args:
        dropout: Dropout probability applied to the pooled encoder output.
    """

    def __init__(self, dropout=0.5):
        super().__init__()

        # The attribute names below become the prefixes of the state_dict keys.
        self.roberta = RobertaModel.from_pretrained("roberta-base")
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, 2)
        self.relu = nn.ReLU()

    def forward(self, input_ids, attention_mask):
        """Return the two class scores for each sequence in the batch."""
        encoded = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        scores = self.linear(self.dropout(encoded.pooler_output))
        # NOTE(review): the ReLU clamps negative class scores to zero before they
        # reach the loss / argmax -- confirm this is intended for a logit output.
        return self.relu(scores)

In [None]:
### Train the model
def train(model, train_data, val_data, learning_rate, epochs, batch_size=2):
    """Fine-tune `model` on `train_data`, reporting validation metrics every epoch.

    Args:
        model: Module called as model(input_ids, attention_mask) that returns
            one score per class for each sequence.
        train_data: Frame passed to Dataset() for the training examples.
        val_data: Frame passed to Dataset() for the validation examples.
        learning_rate: Learning rate handed to Adam.
        epochs: Number of passes over the training data.
        batch_size: Examples per batch for both loaders (defaults to 2, the
            value that used to be hard-coded).

    Returns:
        None. The model is updated in place (and moved to the GPU when one is
        available); the metrics are printed.
    """
    train_dataset = Dataset(train_data)
    val_dataset = Dataset(val_data)
    # Only the training data is shuffled; validation order does not matter.
    train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    criterion = nn.CrossEntropyLoss().to(device)
    optimizer = Adam(model.parameters(), lr=learning_rate)

    for epoch_num in range(epochs):
        # Switch (back) to training mode every epoch: the validation pass at the
        # end of the previous epoch leaves the model in eval mode.
        model.train()
        total_acc_train = 0
        total_loss_train = 0.0

        for input_ids, attention_mask, batch_labels in tqdm(train_dataloader):
            batch_labels = batch_labels.to(device).long()
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)

            optimizer.zero_grad()
            output = model(input_ids, attention_mask)
            batch_loss = criterion(output, batch_labels)
            batch_loss.backward()
            optimizer.step()

            # CrossEntropyLoss returns the mean over the batch, so weight it by
            # the batch size; dividing by the dataset size below then gives a
            # true per-example average even when the last batch is smaller.
            total_loss_train += batch_loss.item() * batch_labels.size(0)
            total_acc_train += (output.argmax(dim=1) == batch_labels).sum().item()

        # Validation pass: no dropout, no gradients, no parameter updates.
        model.eval()
        total_acc_val = 0
        total_loss_val = 0.0
        with torch.no_grad():
            for input_ids, attention_mask, batch_labels in val_dataloader:
                batch_labels = batch_labels.to(device).long()
                input_ids = input_ids.to(device)
                attention_mask = attention_mask.to(device)

                output = model(input_ids, attention_mask)
                total_loss_val += criterion(output, batch_labels).item() * batch_labels.size(0)
                total_acc_val += (output.argmax(dim=1) == batch_labels).sum().item()

        # max(..., 1) avoids a ZeroDivisionError on an empty training set.
        avg_acc_train = total_acc_train / max(len(train_dataset), 1)
        avg_loss_train = total_loss_train / max(len(train_dataset), 1)

        print(f"Epoch {epoch_num + 1}/{epochs}")
        print(f"Train Loss: {avg_loss_train:.4f}, Train Accuracy: {avg_acc_train:.4f}")
        if len(val_dataset) > 0:
            avg_acc_val = total_acc_val / len(val_dataset)
            avg_loss_val = total_loss_val / len(val_dataset)
            print(f"Val Loss: {avg_loss_val:.4f}, Val Accuracy: {avg_acc_val:.4f}")

# Hyperparameters for the training run
EPOCHS = 35  ### Change Epochs if needed
LR = 1e-5

# Build a fresh classifier and fine-tune it on the train / validation split
model = RoBERTaClassifier()
train(model, train_data=df_train, val_data=df_val, learning_rate=LR, epochs=EPOCHS)


In [None]:
# Save after training: write only the model's state_dict (its parameters) to
# 'roberta_classifier_model.pth' in the current working directory.
torch.save(model.state_dict(), 'roberta_classifier_model.pth')

In [None]:
# 3) Evaluate performance of model

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def evaluate(model, test_data):
    """Run `model` over `test_data` and print accuracy, precision, recall and F1.

    Args:
        model: Module called as model(input_ids, attention_mask) that returns
            one score per class for each sequence.
        test_data: Frame passed to Dataset() for the held-out examples.

    Returns:
        None; the four metrics are printed.
    """
    loader = torch.utils.data.DataLoader(Dataset(test_data), batch_size=2)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    if use_cuda:
        model = model.cuda()

    # Evaluation mode switches dropout off
    model.eval()

    predictions = []
    true_labels = []

    with torch.no_grad():
        for input_ids, attention_mask, labels in loader:
            scores = model(input_ids.to(device), attention_mask.to(device))
            # The predicted class is the index of the larger score
            predictions.extend(scores.argmax(dim=1).cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

    # Calculate performance metrics
    accuracy = accuracy_score(true_labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(true_labels, predictions, average='binary')

    print(f'Test Accuracy: {accuracy:.4f}')
    print(f'Test Precision: {precision:.4f}')
    print(f'Test Recall: {recall:.4f}')
    print(f'Test F1 Score: {f1:.4f}')

# Evaluate the trained model on the held-out test split; evaluate() prints the
# accuracy, precision, recall and F1 score.
evaluate(model, df_test)


In [None]:
# 4) Prepare full dataset for classification

In [None]:
### Run full dataset (Excluding the training data) through the RoBERTa classifier

# Read the full-dataset CSV file.
# Note: this rebinds `dtype_spec`, replacing the mapping that was used to load the
# human-coded file.
dtype_spec = {"id": str}  # Treat the "id" column as a string (character) type
df2 = pd.read_csv("DATA FILE PATH HERE", dtype=dtype_spec)
df2.head()

In [None]:
# Drop every row of the full dataset that already appears in the human-coded set
# (matched on the "id" column), so that coded rows are not classified again.
#
# A vectorised isin() mask is used instead of looping with iterrows() and testing
# membership in a Python list: it avoids one list scan per row, keeps df2's
# column dtypes, and still returns a frame with df2's columns when no row
# survives the filter.
df_ids = df['id'].tolist()
# .copy() makes new_df an independent frame, so adding columns to it later does
# not touch (or warn about) df2.
new_df = df2[~df2['id'].isin(df_ids)].copy()
# new_df now contains only the rows of df2 whose "id" does not appear in df

In [None]:
# Clean the 'text' column of the remaining rows with the same preprocess_text
# function that was applied to the human-coded data; the result goes into a new
# 'processed_text' column.
new_df['processed_text'] = new_df['text'].apply(preprocess_text)

In [None]:
# 5) Classify text using trained RoBERTa model

In [None]:
# Load the trained model
class RoBERTaClassifier(nn.Module):
    """Pretrained "roberta-base" encoder followed by dropout and a 768 -> 2 linear head.

    The architecture and attribute names must match the model whose state_dict
    is loaded into it.

    Args:
        dropout: Dropout probability applied to the pooled encoder output.
    """

    def __init__(self, dropout=0.5):
        super().__init__()

        # The attribute names below become the prefixes of the state_dict keys.
        self.roberta = RobertaModel.from_pretrained("roberta-base")
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, 2)
        self.relu = nn.ReLU()

    def forward(self, input_ids, attention_mask):
        """Return the two class scores for each sequence in the batch."""
        encoded = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        scores = self.linear(self.dropout(encoded.pooler_output))
        # NOTE(review): the ReLU clamps negative class scores to zero before they
        # reach the loss / argmax -- confirm this is intended for a logit output.
        return self.relu(scores)

# Build an untrained classifier and load the saved weights into it.
loaded_model = RoBERTaClassifier()
# map_location: training moves the model to the GPU when one is available, so the
# saved tensors may be CUDA tensors; mapping them to the CPU lets the checkpoint
# load on a machine without a GPU.  loaded_model itself lives on the CPU.
loaded_model.load_state_dict(
    torch.load('roberta_classifier_model.pth', map_location=torch.device('cpu'))
)
# Evaluation mode: switches dropout off for inference.
loaded_model.eval()

In [None]:
# Initialize the RoBERTa tokenizer ("roberta-base") used by the classification
# helpers. This rebinds `tokenizer`, so this section does not depend on the
# tokenizer cell of the training section having been run.
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

In [None]:
# Define a function to classify text using the loaded model
def classify_text(text):
    """Return the predicted class index for a single text.

    Uses the module-level `tokenizer` and `loaded_model`; the text is truncated
    to at most 128 tokens.
    """
    encoded = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=128)
    # Inference only, so gradient tracking is switched off
    with torch.no_grad():
        scores = loaded_model(encoded['input_ids'], encoded['attention_mask'])
    return torch.argmax(scores, dim=1).item()

In [None]:
# Additional imports for the batched classification helpers
from concurrent.futures import ThreadPoolExecutor
from torch.nn.utils.rnn import pad_sequence

In [None]:
# Define a function to classify a batch of texts
def classify_batch(batch_texts):
    """Return the predicted class index for every text in `batch_texts`.

    Args:
        batch_texts: Iterable of strings (a list or a pandas Series slice).

    Returns:
        A list of int class indices, one per input text, in input order. An
        empty input gives an empty list.
    """
    texts = list(batch_texts)
    if not texts:
        # Nothing to tokenise; an empty batch used to raise inside pad_sequence.
        return []

    # Tokenise the whole batch in one call and let the tokenizer pad it.  This
    # pads with the tokenizer's own padding id and builds the matching attention
    # mask, whereas padding per-text tensors with pad_sequence fills with 0,
    # which is not roberta-base's padding id.
    inputs = tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=128)

    # Inference only, so gradient tracking is switched off
    with torch.no_grad():
        logits = loaded_model(inputs['input_ids'], inputs['attention_mask'])

    return torch.argmax(logits, dim=1).tolist()

In [None]:
# Batch processing for classification with optimized tokenization
def batch_classify_text_optimized(texts, batch_size=16):
    """Classify `texts` in consecutive chunks of `batch_size`.

    Each chunk is handed to classify_batch; the per-chunk results are
    concatenated in input order and returned as one list.
    """
    return [
        label
        for start in range(0, len(texts), batch_size)
        for label in classify_batch(texts[start:start + batch_size])
    ]

In [None]:
# Batch processing for classification with optimized tokenization and parallel processing
def batch_classify_text_parallel_optimized(texts, batch_size=16, max_workers=None):
    """Classify `texts` in chunks of `batch_size`, running the chunks on a thread pool.

    Args:
        texts: Sliceable sequence of strings (a list or a pandas Series).
        batch_size: Number of texts handed to classify_batch at a time.
        max_workers: Size of the thread pool; None uses ThreadPoolExecutor's
            default.

    Returns:
        A list with one predicted label per input text, in input order.
    """
    # Cut the input into consecutive chunks up front so that all of them can be
    # submitted to the pool together.  Submitting one chunk per map() call and
    # draining it before the next one is submitted runs the chunks one after
    # another, with no concurrency at all.
    batches = [texts[start:start + batch_size] for start in range(0, len(texts), batch_size)]

    predicted_labels = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # executor.map yields results in submission order, so labels stay
        # aligned with the input texts.
        for batch_results in executor.map(classify_batch, batches):
            predicted_labels.extend(batch_results)

    return predicted_labels

In [None]:
# Classify every preprocessed text in new_df, 16 texts per batch, and store the
# returned label for each row in a new 'predicted_label' column.
new_df['predicted_label'] = batch_classify_text_parallel_optimized(new_df['processed_text'], batch_size=16)

In [None]:
# Save the output predictions to a CSV file (the DataFrame index is not written)
output_filename = "political_predictions.csv" ### CHANGE THE NAME HERE to match the theme being classified
new_df.to_csv(output_filename, index=False)
print(f"Predictions saved to {output_filename}")

In [None]:
# Download the CSV file to your local machine
# NOTE(review): `files` is not imported anywhere in the visible notebook --
# presumably Google Colab's `files` helper; confirm and import it before running
# this cell, otherwise it raises NameError.
# NOTE(review): the file name is hard-coded here rather than taken from
# `output_filename`; keep the two in sync when changing the theme.
files.download("political_predictions.csv")