In [None]:
# Install required packages for the project
!pip install transformers demoji

# Import necessary libraries
import io
import re
from concurrent.futures import ThreadPoolExecutor

import demoji
import numpy as np
import pandas as pd
import torch
from google.colab import files
from sklearn.metrics import classification_report
from torch import nn
from torch.nn.utils.rnn import pad_sequence
from torch.optim import Adam
from tqdm import tqdm
from transformers import RobertaTokenizer, RobertaModel

# Download emoji codes for preprocessing.
# NOTE(review): recent demoji releases appear to bundle the emoji data and treat
# this call as a deprecated no-op -- confirm against the installed version. It is
# kept so that older releases keep working.
demoji.download_codes()

# Load dataset uploaded by the user.
# files.upload() returns {filename: file bytes}. Parsing those bytes means the cell
# reads exactly what was just uploaded, whatever the file is called. The default
# path is only used when the dialog returned nothing (file already on disk).
TRAINING_CSV_DEFAULT = "All Human Coded Disclosure Tweets.csv"
uploaded = files.upload()
if uploaded:
    if TRAINING_CSV_DEFAULT in uploaded:
        training_csv_name = TRAINING_CSV_DEFAULT
    else:
        training_csv_name = next(iter(uploaded))
    df = pd.read_csv(io.BytesIO(uploaded[training_csv_name]))
else:
    df = pd.read_csv(TRAINING_CSV_DEFAULT)

# Function to preprocess text data
def preprocess_text(text):
    """Normalize a raw tweet before tokenization.

    Steps, in order: lowercase; strip the characters ``# . , ! ?``; drop URLs;
    drop emojis (via ``demoji``); rewrite the HTML entity ``&amp;`` as ``and``;
    drop text smileys such as ``:)``, ``;-(``, ``:D`` and ``:P``; drop e-mail
    addresses; drop ``YYYY-MM-DD`` dates; drop stand-alone integers; trim
    surrounding whitespace. Inner whitespace left behind by removals is kept.

    Args:
        text: Raw text. Any value that is not a ``str`` (``None``, a pandas
            NaN, ...) is treated as missing.

    Returns:
        str: The cleaned text, or ``""`` when the input is missing.
    """
    if not isinstance(text, str):
        # Missing cells arrive from pandas as float NaN / None, which have no .lower().
        return ""

    text = text.lower()  # Convert to lowercase
    text = re.sub(r'[#.,!?]', '', text)  # Remove punctuation
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = demoji.replace(text, '')  # Remove emojis
    # Handle the entity before smileys so the ';' of '&amp;' is consumed here and
    # cannot be left behind or mistaken for the eyes of a ';)' smiley.
    text = re.sub(r'&amp;?', 'and', text)  # Replace &amp; with 'and'
    # The text is already lowercase, so ':D' / ':P' show up as ':d' / ':p'. The
    # trailing \b keeps ordinary words such as 'note:please' intact.
    text = re.sub(r'[:;=]-?(?:[()]|[dp]\b)', '', text)  # Remove smileys
    # E-mails and dates must go before bare numbers: once '\b\d+\b' has run,
    # '2020-01-15' is reduced to '--' and '123@x.com' to '@xcom'.
    text = re.sub(r'\S+@\S+', '', text)  # Remove email addresses
    text = re.sub(r'\b\d{4}-\d{2}-\d{2}\b', '', text)  # Remove dates in YYYY-MM-DD format
    text = re.sub(r'\b\d+\b', '', text)  # Remove numbers
    return text.strip()

# Preprocess texts and retain relevant columns.
# Rows lacking a tweet or a label are dropped *before* preprocessing, so
# preprocess_text only ever receives real strings. The chain builds a new frame
# instead of renaming a column slice in place.
# The label is cast to int64 because a column that contained NaN is read by
# pandas as float64, while the classifier's targets are class indices (0 / 1).
df = (
    df.dropna(subset=["text", "coding (0 or 1)"])
    .assign(processed_text=lambda frame: frame["text"].astype(str).apply(preprocess_text))
    .loc[:, ["processed_text", "coding (0 or 1)"]]
    .rename(columns={'processed_text': 'text', 'coding (0 or 1)': 'label'})
    .astype({"label": "int64"})
)

# Initialize tokenizer
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

# Define a PyTorch Dataset class
class Dataset(torch.utils.data.Dataset):
    """Tokenized view of a DataFrame for training or inference.

    Every text is tokenized once, at construction time, with the module-level
    ``tokenizer`` (padded / truncated to ``max_length``).

    Args:
        df: DataFrame holding the texts and, optionally, the labels.
        text_column: Name of the column with the texts. Defaults to ``'text'``.
        label_column: Name of the column with the class labels. Defaults to
            ``'label'``. When the column is absent the dataset is unlabeled and
            items carry no ``'labels'`` entry, so the class can also be used for
            inference.
        max_length: Token length every example is padded / truncated to.

    Attributes:
        labels: Array of labels, or ``None`` for an unlabeled frame.
        texts: List with one tokenizer encoding per row.
    """
    def __init__(self, df, text_column='text', label_column='label', max_length=128):
        self.labels = df[label_column].values if label_column in df.columns else None
        self.texts = [tokenizer(text, padding='max_length', max_length=max_length, truncation=True, return_tensors="pt")
                      for text in df[text_column]]

    def __len__(self):
        # Count the texts, not the labels: labels are optional.
        return len(self.texts)

    def __getitem__(self, idx):
        # The tokenizer returns tensors shaped (1, max_length); drop the leading
        # axis so the DataLoader can stack examples into (batch, max_length).
        item = {key: val.squeeze(0) for key, val in self.texts[idx].items()}
        if self.labels is not None:
            # Class indices must be integer (long) tensors even when pandas
            # stored the column as float.
            item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

# Split the dataframe into training (80%), validation (10%), and test (10%) sets.
# The shuffle is made reproducible by random_state=42.
# Plain iloc slices replace np.split: applied to a DataFrame, np.split goes
# through DataFrame.swapaxes, which newer pandas releases deprecate. The rows in
# each part are the same as before.
np.random.seed(112)
shuffled_df = df.sample(frac=1, random_state=42)
train_end = int(.8 * len(shuffled_df))
val_end = int(.9 * len(shuffled_df))
df_train = shuffled_df.iloc[:train_end]
df_val = shuffled_df.iloc[train_end:val_end]
df_test = shuffled_df.iloc[val_end:]

# Define the RoBERTa classifier model
class RoBERTaClassifier(nn.Module):
    """Binary classifier: RoBERTa encoder -> dropout -> linear layer.

    Args:
        dropout: Dropout probability applied to the pooled encoder output.
    """
    def __init__(self, dropout=0.5):
        super().__init__()
        self.roberta = RobertaModel.from_pretrained("roberta-base")
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, 2)  # RoBERTa base produces 768-dimensional vectors

    def forward(self, input_ids, attention_mask):
        """Return raw class scores (logits).

        Args:
            input_ids: Token ids, passed to the encoder as ``input_ids``.
            attention_mask: Mask passed to the encoder as ``attention_mask``.

        Returns:
            The output of the linear layer (two scores per example), without
            any activation applied.
        """
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = self.dropout(outputs.pooler_output)
        # No ReLU on the final scores: clamping negative logits to 0 makes both
        # classes tie whenever both scores are negative (the argmax then always
        # picks class 0) and zeroes the gradient for those scores. The layer had
        # no parameters, so previously saved state_dicts still load.
        return self.linear(pooled_output)

# Training function
# NOTE(review): the body of this function is truncated in the notebook -- it stops
# at `device = torch`. As written it only builds the two data loaders; it never
# uses `model`, `learning_rate` or `epochs`, runs no optimisation step and returns
# None. Restore the remainder of the cell before relying on it.
def train(model, train_data, val_data, learning_rate, epochs):
    """Intended to train the model on the training set and evaluate it on the validation set.

    Only the setup survives in this cell (see the review note above the function).

    Args:
        model: Model to train (unused in the visible code).
        train_data: DataFrame wrapped in ``Dataset`` for the training loader.
        val_data: DataFrame wrapped in ``Dataset`` for the validation loader.
        learning_rate: Unused in the visible code.
        epochs: Unused in the visible code.
    """
    # Data loaders: batches of 16; only the training loader is shuffled
    train_loader = torch.utils.data.DataLoader(Dataset(train_data), batch_size=16, shuffle=True)
    val_loader = torch.utils.data.DataLoader(Dataset(val_data), batch_size=16)

    # Setup
    # NOTE(review): this binds the `torch` module itself, not a device; the line
    # looks cut off (presumably a torch.device(...) expression -- confirm).
    device = torch


In [None]:
from sklearn.metrics import precision_recall_fscore_support

def evaluate(model, test_loader, device):
    """Score the model on a data loader and print binary precision / recall / F1.

    Args:
        model: Callable as ``model(input_ids, attention_mask)`` returning one row
            of class scores per example.
        test_loader: Iterable of batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        None. The metrics are printed, not returned.
    """
    # Inference mode for the module (e.g. disables dropout).
    model.eval()

    predictions = []
    true_labels = []

    # No gradients are needed for scoring.
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            scores = model(input_ids, attention_mask)
            # The predicted class is the index of the highest score in each row.
            predicted_classes = torch.max(scores, dim=1)[1]

            predictions += predicted_classes.tolist()
            true_labels += labels.tolist()

    metrics = precision_recall_fscore_support(true_labels, predictions, average='binary')
    precision, recall, f1 = metrics[0], metrics[1], metrics[2]
    print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}")

# Create test loader
test_loader = torch.utils.data.DataLoader(Dataset(df_test), batch_size=16)

# Evaluate the model
# NOTE(review): `model` is not assigned in any cell visible here; it must be
# created and trained (e.g. model = RoBERTaClassifier(); train(...)) before this
# cell runs on a fresh kernel.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# evaluate() moves every batch to `device`, so the model has to live there too.
# This is a no-op when it is already on that device.
model.to(device)
evaluate(model, test_loader, device)


In [None]:
# Save the model.
# Only the state_dict (the parameter tensors) is written, not the module object,
# so loading it later requires building the model class first and then calling
# load_state_dict. The path is relative to the current working directory.
torch.save(model.state_dict(), 'roberta_classifier_model.pth')
print("Model saved successfully.")


In [None]:
# Upload the full dataset
uploaded_full_dataset = files.upload()  # This will prompt you to upload a file
if not uploaded_full_dataset:
    # Fail with a clear message instead of a bare StopIteration from next(iter(...)).
    raise ValueError("No file was uploaded; upload the full dataset CSV to continue.")

# files.upload() returns {filename: file bytes}; parse the bytes of the first
# uploaded file directly rather than re-reading it from disk by name.
full_dataset_name = next(iter(uploaded_full_dataset))
full_df = pd.read_csv(io.BytesIO(uploaded_full_dataset[full_dataset_name]))

# Preprocess the full dataset.
# Missing tweets become empty strings so every row keeps a (possibly empty)
# processed text and the row count is unchanged.
full_df['processed_text'] = full_df['text'].fillna('').astype(str).apply(preprocess_text)
print("Full dataset preprocessed.")


In [None]:
# Convert the full dataset for model input.
# The classifier was trained on the *preprocessed* text, so inference has to read
# `processed_text`; passing full_df directly would tokenize the raw `text` column.
# Dataset reads `text` / `label` columns, hence this small adapter frame. The
# label is a placeholder that the prediction loop never looks at.
inference_df = pd.DataFrame({
    'text': full_df['processed_text'].fillna('').astype(str),
    'label': 0,
})
full_dataset = Dataset(inference_df)
full_loader = torch.utils.data.DataLoader(full_dataset, batch_size=32)

# Classify the full dataset
model.to(device)  # Batches are moved to `device` below; keep the model there too
model.eval()  # Make sure model is in evaluation mode
full_predictions = []

with torch.no_grad():
    for batch in full_loader:
        input_ids, attention_mask = batch['input_ids'].to(device), batch['attention_mask'].to(device)
        outputs = model(input_ids, attention_mask)
        _, preds = torch.max(outputs, dim=1)  # Index of the highest score per row
        full_predictions.extend(preds.tolist())

# The loader is not shuffled, so predictions are in full_df row order.
assert len(full_predictions) == len(full_df), "prediction count does not match the dataset"
full_df['predicted_label'] = full_predictions
print("Classification complete.")


In [None]:
# Write the full dataset -- including the `predicted_label` column added by the
# classification cell -- to CSV without the DataFrame index, then ask the browser
# to download it.
output_filename = "classified_dataset.csv"
full_df.to_csv(output_filename, index=False)
files.download(output_filename)  # This will download the file to your local machine
print(f"Output saved to {output_filename} and download initiated.")
