In [None]:
import subprocess
import sys


# Function to install packages
'''def install(package):
    subprocess.check_call([sys.executable, "-m", "pip", "install", package])'''

# Ensure pip is up-to-date
#install("pip --upgrade")

# Install required libraries
!pip install datasets
!pip install accelerate

# Import necessary libraries
import pandas as pd
import torch
from nltk.tokenize import TweetTokenizer
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset

# Verify versions
# Printing the library versions records, in the cell output, which releases
# of transformers and accelerate this run was executed with.
import transformers
import accelerate
print(f"Transformers version: {transformers.__version__}")
print(f"Accelerate version: {accelerate.__version__}")

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


2024-06-09 19:15:07.274045: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Transformers version: 4.39.2
Accelerate version: 0.28.0


In [None]:
# Step 1 (Colab only, currently disabled): copy the dataset from Google Drive.
# from google.colab import drive
# import shutil
# drive.mount('/content/drive')
# file_path = '/content/drive/My Drive/final_jawn.csv'
# destination_path = '/content/sample_data/'
# shutil.copy(file_path, destination_path)

# Step 2: Load the dataset into a Pandas DataFrame.
# Only 'text' and 'nominate_dim1' (the ideological score) are used downstream,
# so only those columns are parsed; the trailing selection fixes their order,
# because `usecols` keeps the order found in the file.
required_columns = ['text', 'nominate_dim1']
df = pd.read_csv('./final_jawn.csv', usecols=required_columns)[required_columns]

In [None]:
# Drop incomplete rows: a row is removed when any of its cells is missing.
has_missing_cells = df.isna().to_numpy().any()
if has_missing_cells:
    print("Warning: Missing values detected in the dataset. Removing missing values...")
    df = df.dropna()

In [None]:
# Sampling to reduce the computing power needed for training.
sample_size = 120000  # Adjust this number to control the size of your sample
# Cap the request at the number of available rows: DataFrame.sample raises a
# ValueError when asked for more rows than exist (sampling without replacement).
df = df.sample(n=min(sample_size, len(df)), random_state=42)

In [None]:
# Convert the DataFrame to a Hugging Face Dataset
dataset = Dataset.from_pandas(df)

# Split the dataset into train (80%) and validation (20%) sets.
# The seed makes the split reproducible, matching the seeded sampling step.
dataset = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = dataset['train']
eval_dataset = dataset['test']

# Initialize TweetTokenizer (used to pre-split the text into words)
tweet_tokenizer = TweetTokenizer()

# Load the tokenizer that matches the 'roberta-base' checkpoint
roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

In [None]:
def tokenize_function(examples):
    """Tokenize a batch of examples for the RoBERTa model.

    Each entry of ``examples['text']`` is first split into tokens with the
    module-level ``tweet_tokenizer``. The resulting lists of tokens are then
    passed to the module-level ``roberta_tokenizer`` as pre-split input, with
    padding to the maximum length and truncation enabled.

    Args:
        examples: A batch mapping that contains a 'text' entry holding the
            texts to tokenize.

    Returns:
        Whatever ``roberta_tokenizer`` returns for the batch.
    """
    pre_split_texts = []
    for raw_text in examples['text']:
        pre_split_texts.append(tweet_tokenizer.tokenize(raw_text))

    # The inputs are already lists of words, hence is_split_into_words=True.
    return roberta_tokenizer(
        pre_split_texts,
        is_split_into_words=True,
        padding='max_length',
        truncation=True,
    )

In [None]:
# Tokenize both splits (train first, then eval).
train_dataset, eval_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, eval_dataset)
)

# Expose the model inputs and the target as PyTorch tensors.
torch_columns = ['input_ids', 'attention_mask', 'nominate_dim1']
for split in (train_dataset, eval_dataset):
    split.set_format(type='torch', columns=list(torch_columns))

# Trainer expects the target column to be called 'labels'.
train_dataset, eval_dataset = (
    split.rename_column("nominate_dim1", "labels")
    for split in (train_dataset, eval_dataset)
)


def _cast_labels_to_float(examples):
    """Return the batch's 'labels' tensor converted with ``.float()``."""
    return {'labels': examples['labels'].float()}


# Make sure the target has the floating-point type the model expects.
train_dataset, eval_dataset = (
    split.map(_cast_labels_to_float, batched=True)
    for split in (train_dataset, eval_dataset)
)


Map:   0%|          | 0/96000 [00:00<?, ? examples/s]

Map:   0%|          | 0/24000 [00:00<?, ? examples/s]

Map:   0%|          | 0/96000 [00:00<?, ? examples/s]

Map:   0%|          | 0/24000 [00:00<?, ? examples/s]

In [None]:
# Show the first tokenized example of each split and write both splits to CSV.
# NOTE(review): 'input_ids' and 'attention_mask' hold one array per row (see the
# printed samples). Presumably the CSV stores them as text, so they would have
# to be parsed back into integer sequences after reloading -- confirm before
# training from these files.
print("Sample train data:", train_dataset[0])
train_dataset.to_csv('./traintokenized.csv', index=False)
print("Sample eval data:", eval_dataset[0])
# As the last expression of the cell, the return value of to_csv is echoed.
eval_dataset.to_csv('./testtokenized.csv', index=False)

Sample train data: {'labels': tensor(0.4680), 'input_ids': tensor([    0, 24953, 27785,  1205,   640,  1556,     4,   175,    73,  4014,
         5273,  1215,   771, 40719,    73, 29552,    73, 40996,  1749, 32620,
         1749,  2890, 35400, 33787,  1209,   565,   787,  4014,  5273,  1215,
          771, 40719,  4062,  5273,  3703, 21688, 17147, 26389,  3858,  2620,
         3888, 27785, 27785, 27785,   226, 18827, 11932, 30105, 26686,  8103,
        10659,    27,  8103, 10659,  3726,  8103, 10659,    27,  8103, 10659,
         3726,  8103, 10659,    27,  8103, 10659,  3726,     2,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
     

Creating CSV from Arrow format:   0%|          | 0/380 [00:00<?, ?ba/s]

Sample eval data: {'labels': tensor(0.5930), 'input_ids': tensor([    0, 10541,   787,  3048, 41810,  1582, 19034,   293,    32,   588,
          849, 21136, 40589,    29, 10781, 24837,     7, 24361,   211,  7842,
         6128,   479,  2054,   640,   642,  4311,     4, 17137, 34252,     4,
          175,    73,  5535,    73,   717,   428,   347,   401,   574, 36640,
         1000,   571,   250, 39318,   246,   487,   705,     4, 10474,     2,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
      

Creating CSV from Arrow format:   0%|          | 0/95 [00:00<?, ?ba/s]

425386328

In [None]:
# TODO: create a Hugging Face access token and add it to the notebook's secrets.

In [None]:
def _parse_token_cell(cell):
    """Convert a stringified integer array back into a list of ints.

    Accepts both space-separated ("[0 5 2]") and comma-separated ("[0, 5, 2]")
    renderings, including ones that span several lines.
    """
    return [int(piece) for piece in cell.strip().strip('[]').replace(',', ' ').split()]


def _load_tokenized_csv(path):
    """Reload a tokenized split from CSV as a torch-formatted Dataset.

    A plain ``pd.read_csv`` yields a DataFrame whose 'input_ids' and
    'attention_mask' cells are strings, which Trainer cannot batch. The token
    columns are therefore parsed back into integer lists, 'labels' is cast to
    float32 for the regression loss, and the frame is wrapped in a Dataset.

    Args:
        path: Location of a CSV containing 'input_ids', 'attention_mask' and
            'labels' columns.

    Returns:
        A Dataset whose three model columns are exposed as PyTorch tensors.
    """
    frame = pd.read_csv(path)
    for column in ('input_ids', 'attention_mask'):
        frame[column] = frame[column].map(_parse_token_cell)
    frame['labels'] = frame['labels'].astype('float32')

    restored = Dataset.from_pandas(frame, preserve_index=False)
    restored.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
    return restored


train_dataset = _load_tokenized_csv("./traintokenized.csv")
eval_dataset = _load_tokenized_csv("./testtokenized.csv")

In [None]:
# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',  # Use 'evaluation_strategy' if 'eval_strategy' doesn't work
    learning_rate=2e-5,  # TODO: tune the learning rate
    per_device_train_batch_size=4,  # Reduce batch size for faster computation
    per_device_eval_batch_size=4,
    save_steps=50000,
    num_train_epochs=3,
    weight_decay=0.01,
    #logging_dir='./logs', #maybe get rid of logging
    #logging_steps=10,
    # Mixed precision is only requested when a CUDA device is present:
    # asking for fp16 on a CPU-only machine makes TrainingArguments raise.
    fp16=torch.cuda.is_available(),
)

# Roberta model with a single output unit, i.e. a regression head
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=1)

# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

In [None]:
# Train the model.
# resume_from_checkpoint is passed as False explicitly, so training is not
# resumed from a checkpoint. The plain call is kept below for reference.
#trainer.train()
trainer.train(resume_from_checkpoint = False)

# Evaluate the model (disabled; metrics are computed from predictions instead)
#eval_results = trainer.evaluate()
#print(f"Evaluation results: {eval_results}")

# Save the model and its tokenizer into the same directory
model.save_pretrained('./trained_model')
roberta_tokenizer.save_pretrained('./trained_model')

In [None]:
# Get predictions
predictions = trainer.predict(eval_dataset)

# The predictions object contains several elements including predictions, label_ids, and metrics
preds = predictions.predictions
# Flatten to a 1-D vector of scores. Unlike squeeze(), reshape(-1) keeps a
# one-element axis when there is a single example, so downstream metric code
# always receives an array rather than a 0-d scalar.
predicted_values = preds.reshape(-1)

In [None]:
# Display the predicted scores as the cell output.
predicted_values

In [None]:
# Display the true scores of the evaluation split as the cell output.
eval_dataset['labels']
# Column names noted during development:
# ['text', 'labels', '__index_level_0__', 'input_ids', 'attention_mask']

In [None]:
# Calculate regression metrics
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, accuracy_score, precision_recall_fscore_support, roc_auc_score

# Fetch the true scores once instead of re-reading the column for every metric.
y_true = eval_dataset['labels']

mse = mean_squared_error(y_true, predicted_values)
mae = mean_absolute_error(y_true, predicted_values)
r2 = r2_score(y_true, predicted_values)

print(f"Mean Squared Error: {mse}")
print(f"Mean Absolute Error: {mae}")
print(f"R-squared: {r2}")

# Classification metrics (accuracy, precision/recall/F1, ROC AUC) are not
# defined for continuous targets, so they are deliberately not computed here;
# the scores must be discretized first.

Mean Squared Error: 0.0573095977306366
Mean Absolute Error: 0.1398831307888031
R-squared: 0.7190815474320499


'accuracy = accuracy_score(eval_dataset[\'labels\'], predicted_values)\nprecision, recall, f1, _ = precision_recall_fscore_support(eval_dataset[\'labels\'], predicted_values, average=\'binary\')\nroc_auc = roc_auc_score(eval_dataset[\'labels\'], predicted_values)\n\nprint(f"Accuracy: {accuracy}")\nprint(f"Precision: {precision}")\nprint(f"Recall: {recall}")\nprint(f"F1 Score: {f1}")\nprint(f"ROC-AUC: {roc_auc}")'

In [None]:
import numpy as np
from sklearn.preprocessing import label_binarize

# Define the bins for the 1 to 7 scale: seven equal-width intervals over [-1, 1]
bins = np.linspace(-1, 1, 7 + 1)
print(bins)
labels = [1, 2, 3, 4, 5, 6, 7]

true_continuous = np.array(eval_dataset['labels'])
predicted_values = np.array(predicted_values)

# Convert continuous scores to the discrete 1 to 7 scale.
# Values are clipped to the binned range first: anything outside the outer bin
# edges would become NaN in pd.cut, and the integer cast would then fail. A
# regression head is free to predict slightly beyond the range of the targets.
true_binned_labels = pd.cut(
    np.clip(true_continuous, bins[0], bins[-1]),
    bins=bins, labels=labels, include_lowest=True,
).astype(int)
predicted_binned_values = pd.cut(
    np.clip(predicted_values, bins[0], bins[-1]),
    bins=bins, labels=labels, include_lowest=True,
).astype(int)

print("True labels distribution:")
print(pd.Series(true_binned_labels).value_counts().sort_index())

print("Predicted labels distribution:")
print(pd.Series(predicted_binned_values).value_counts().sort_index())

# Calculate accuracy
accuracy = accuracy_score(true_binned_labels, predicted_binned_values)
print(f"Accuracy: {accuracy:.2f}")

# Calculate precision, recall, and F1 score for each class.
# Passing `labels` guarantees one entry per class 1..7 even when a class is
# missing from the data, so the rows printed below are attributed to the right
# class. zero_division=0 reports 0 for classes that are never predicted
# without emitting a warning.
precision, recall, f1, _ = precision_recall_fscore_support(
    true_binned_labels,
    predicted_binned_values,
    labels=labels,
    average=None,  # Calculate for each class
    zero_division=0,
)

# Print the results for each class
for class_label, p, r, f in zip(labels, precision, recall, f1):
    print(f"Class {class_label} - Precision: {p:.2f}, Recall: {r:.2f}, F1: {f:.2f}")

# Binarize the true and predicted labels for ROC AUC calculation
true_binarized_labels = label_binarize(true_binned_labels, classes=labels)
predicted_binarized_values = label_binarize(predicted_binned_values, classes=labels)

# Calculate ROC AUC score for each class and then average.
# NOTE: the "scores" passed here are hard 0/1 class assignments rather than
# probabilities, so each per-class curve has a single operating point.
roc_auc = roc_auc_score(true_binarized_labels, predicted_binarized_values, average='macro')
print(f"ROC AUC: {roc_auc:.2f}")

[-1.         -0.71428571 -0.42857143 -0.14285714  0.14285714  0.42857143
  0.71428571  1.        ]
True labels distribution:
1      402
2    13825
3    36407
4     1509
5    13771
6    25957
7     3115
Name: count, dtype: int64
Predicted labels distribution:
2    10639
3    39540
4     2440
5    13728
6    27361
7     1278
Name: count, dtype: int64
Accuracy: 0.64
Class 1 - Precision: 0.00, Recall: 0.00, F1: 0.00
Class 2 - Precision: 0.50, Recall: 0.39, F1: 0.44
Class 3 - Precision: 0.70, Recall: 0.76, F1: 0.73
Class 4 - Precision: 0.17, Recall: 0.28, F1: 0.21
Class 5 - Precision: 0.50, Recall: 0.50, F1: 0.50
Class 6 - Precision: 0.71, Recall: 0.75, F1: 0.73
Class 7 - Precision: 0.45, Recall: 0.19, F1: 0.26
ROC AUC: 0.67


  _warn_prf(average, modifier, msg_start, len(result))
