In [1]:
!pip install transformers
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

import torch
torch.cuda.empty_cache()
import seaborn as sns
import transformers
import json
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaModel, RobertaTokenizer
import logging
logging.basicConfig(level=logging.ERROR)
import seaborn as sn
import pandas as pd
import matplotlib.pyplot as plt
from google.colab import output

#GPU usage setup
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
import random
random.seed(1)
np.random.seed(1)
torch.cuda.manual_seed(1)
torch.manual_seed(1)
import time
start_time = time.time()



In [2]:
from google.colab import auth
auth.authenticate_user()
from google.colab import drive
drive.mount('/content/gdrive/')


%cd /content/gdrive/MyDrive/LonelinessR21
%ls

Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).
/content/gdrive/MyDrive/LonelinessR21
 annotation_Drew_sample_social_disconnection.xlsx
 annotation_Drew_sample_stigmatizing_labels.xlsx
 annotation_Selen_sample_social_disconnection.xlsx
 annotation_Selen_sample_stigmatizing_labels.xlsx
'Copy of loneliness_social_isolation_lexicon_evaluation.csv'
 gold_standard_loneliness_lexicon_df.csv
 gold_standard_social_isolation_1000.csv
 loneliness_ehr_roberta.ipynb
 loneliness_gold_standard_expanded_lexicon_matches.xlsx
 loneliness_lexicon_dev.ipynb
 loneliness_lexicon.Rmd
 loneliness_lexicon_stem_and_similar_round1.csv
 loneliness_lexicon_stem_and_similar_round2.csv
 loneliness_matches_expanded.xlsx
 loneliness_matches.xlsx
 loneliness_regex_matching_and_sample.ipynb
 loneliness_roberta_ehr_performance.gsheet
 loneliness_social_isolation_lexicon_evaluation.csv
'loneliness_social_isolation_lexicon_evaluation KL.c

In [3]:
df_concat_filtered  = pd.read_csv("gold_standard_social_isolation_1000.csv")
df_concat_filtered.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   Unnamed: 0                          1000 non-null   int64  
 1   Sentence ID                         1000 non-null   int64  
 2   TEXT                                1000 non-null   object 
 3   Sentence                            1000 non-null   object 
 4   matched_term                        1000 non-null   object 
 5   chronic_social_disconnection_label  1000 non-null   float64
 6   lives_alone_label                   1000 non-null   float64
 7   acute_social_disconnection_label    1000 non-null   float64
 8   full_text                           1000 non-null   object 
 9   full_text_truncated                 1000 non-null   object 
dtypes: float64(3), int64(2), object(5)
memory usage: 78.2+ KB


In [4]:

# Rename 'full_text_truncated' to 'text'
df_concat_filtered = df_concat_filtered.rename(columns={'full_text_truncated': 'text', 'chronic_social_disconnection_label':'label'})
df_concat_filtered.reset_index(drop=True, inplace=True)

# Create a new dataframe with the renamed 'text' column and 'chronic_social_disconnection_label'
chronic_df = df_concat_filtered[['label', 'text']]

# Display the first few rows of chronic_df
chronic_df.info()
chronic_df.head()
chronic_df.reset_index(drop=True, inplace=True)
unique_chronic_values = chronic_df['label'].value_counts().reset_index()

# Renaming columns for clarity
unique_chronic_values.columns = ['label', 'count']
unique_chronic_values




<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   label   1000 non-null   float64
 1   text    1000 non-null   object 
dtypes: float64(1), object(1)
memory usage: 15.8+ KB


Unnamed: 0,label,count
0,0.0,968
1,1.0,32


In [5]:
MAX_LEN = 128
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 16
EPOCHS = 10
LEARNING_RATE = 1e-05
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', truncation=True, do_lower_case=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [6]:
class BiasData(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.targets = self.data.label
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        if index >= len(self.text):
            raise IndexError(f"Index {index} out of bounds for dataset of length {len(self.text)}")

        text = str(self.text[index])
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True,
            truncation=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }


In [7]:
train_size = 0.8
train_data=chronic_df.sample(frac=train_size,random_state=0)
test_data=chronic_df.drop(train_data.index).reset_index(drop=True)
train_data = train_data.reset_index(drop=True)

print("FULL Dataset: {}".format(chronic_df.shape))
print("TRAIN Dataset: {}".format(train_data.shape))
print("TEST Dataset: {}".format(test_data.shape))

training_set = BiasData(train_data, tokenizer, MAX_LEN)
testing_set = BiasData(test_data, tokenizer, MAX_LEN)

FULL Dataset: (1000, 2)
TRAIN Dataset: (800, 2)
TEST Dataset: (200, 2)


In [8]:
train_data['label'].unique()

array([0., 1.])

In [9]:
test_data['label'].unique()
test_data['text'].head()

Unnamed: 0,text
0,"Ischemic disease was excluded (no EKG changes,..."
1,As other etiologies for bladder wall thickenin...
2,"MAE, patient is cooparative with but seems to ..."
3,Social: Found down in apartment alone.</s>T/SI...
4,It may represent extension / recurrence of a t...


In [10]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [11]:
from transformers import RobertaConfig

config = RobertaConfig.from_pretrained("roberta-base")
config.output_attentions = True

class RobertaClass(torch.nn.Module):
    def __init__(self):
        super(RobertaClass, self).__init__()
        self.l1 = RobertaModel.from_pretrained("roberta-base", config=config)
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.0)
        self.classifier = torch.nn.Linear(768, 2)

    def forward(self, input_ids, attention_mask):
        # Get the hidden states, pooler output, and attention weights
        last_hidden_state, pooler_output, all_attentions = self.l1(input_ids=input_ids, attention_mask=attention_mask, return_dict=False)
        pooler = last_hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.ReLU()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output, all_attentions



In [12]:
model = RobertaClass()
model.to(device)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaClass(
  (l1): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((

In [13]:
# Creating the loss function and optimizer
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [14]:
def calcuate_accuracy(preds, targets):
    n_correct = (preds==targets).sum().item()
    return n_correct

In [15]:
def train(epoch):
    tr_loss = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    model.train()
    for _,data in tqdm(enumerate(training_loader, 0)):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.long)

        # Extract the logits (output) from the returned tuple
        logits, attention_weights = model(ids, mask)

        # Use the logits when computing the loss
        loss = loss_function(logits, targets)
        tr_loss += loss.item()
        big_val, big_idx = torch.max(logits.data, dim=1)
        n_correct += calcuate_accuracy(big_idx, targets)

        nb_tr_steps += 1
        nb_tr_examples += targets.size(0)

        if _ % 5000 == 0:
            loss_step = tr_loss / nb_tr_steps
            accu_step = (n_correct * 100) / nb_tr_examples
            print(f"Training Loss: {loss_step}")
            print(f"Training Accuracy: {accu_step}")

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f'The Total Accuracy for Epoch {epoch}: {(n_correct * 100) / nb_tr_examples}')
    epoch_loss = tr_loss / nb_tr_steps
    epoch_accu = (n_correct * 100) / nb_tr_examples
    print(f"Training Loss Epoch: {epoch_loss}")
    print(f"Training Accuracy Epoch: {epoch_accu}")

    return


print(len(training_loader.dataset))



800


In [16]:
for epoch in range(EPOCHS):
    train(epoch)



Training Loss: 0.6942698955535889
Training Accuracy: 37.5


50it [04:32,  5.44s/it]


The Total Accuracy for Epoch 0: 95.375
Training Loss Epoch: 0.2619872373715043
Training Accuracy Epoch: 95.375




Training Loss: 0.03759019449353218
Training Accuracy: 100.0


50it [04:26,  5.34s/it]


The Total Accuracy for Epoch 1: 96.5
Training Loss Epoch: 0.15238201674073934
Training Accuracy Epoch: 96.5




Training Loss: 0.0491531565785408
Training Accuracy: 100.0


50it [04:29,  5.39s/it]


The Total Accuracy for Epoch 2: 96.5
Training Loss Epoch: 0.12621663801372052
Training Accuracy Epoch: 96.5




Training Loss: 0.015921810641884804
Training Accuracy: 100.0


50it [04:27,  5.35s/it]


The Total Accuracy for Epoch 3: 96.5
Training Loss Epoch: 0.10877932639792562
Training Accuracy Epoch: 96.5




Training Loss: 0.08214662224054337
Training Accuracy: 93.75


50it [04:28,  5.36s/it]


The Total Accuracy for Epoch 4: 96.875
Training Loss Epoch: 0.09306443821638823
Training Accuracy Epoch: 96.875




Training Loss: 0.017657775431871414
Training Accuracy: 100.0


50it [04:28,  5.38s/it]


The Total Accuracy for Epoch 5: 97.375
Training Loss Epoch: 0.06915374024771154
Training Accuracy Epoch: 97.375




Training Loss: 0.038472749292850494
Training Accuracy: 100.0


50it [04:29,  5.40s/it]


The Total Accuracy for Epoch 6: 98.625
Training Loss Epoch: 0.04174811236094683
Training Accuracy Epoch: 98.625




Training Loss: 0.004940931685268879
Training Accuracy: 100.0


50it [04:29,  5.38s/it]


The Total Accuracy for Epoch 7: 99.125
Training Loss Epoch: 0.034371224557980896
Training Accuracy Epoch: 99.125




Training Loss: 0.003536077681928873
Training Accuracy: 100.0


50it [04:30,  5.40s/it]


The Total Accuracy for Epoch 8: 99.0
Training Loss Epoch: 0.03946645521558821
Training Accuracy Epoch: 99.0




Training Loss: 0.007124125026166439
Training Accuracy: 100.0


50it [04:30,  5.40s/it]

The Total Accuracy for Epoch 9: 99.375
Training Loss Epoch: 0.017146367158275097
Training Accuracy Epoch: 99.375





In [17]:
def valid(model, testing_loader):
    model.eval()
    n_correct = 0
    tr_loss = 0
    nb_tr_steps = 0
    nb_tr_examples = 0

    all_preds = []  # list to store predictions
    all_targets = []  # list to store original targets
    all_texts = []  # list to store original input texts

    with torch.no_grad():
        for _, data in tqdm(enumerate(testing_loader, 0)):
            ids = data['ids'].to(device, dtype=torch.long)
            mask = data['mask'].to(device, dtype=torch.long)
            targets = data['targets'].to(device, dtype=torch.long)

            # Extract the logits (output) from the returned tuple
            logits, attention_weights = model(ids, mask)
            logits = logits.squeeze()

            # Use the logits when computing the loss and other operations
            loss = loss_function(logits, targets)
            tr_loss += loss.item()
            big_val, big_idx = torch.max(logits.data, dim=1)

            all_preds.extend(big_idx.cpu().numpy())  # store predictions
            all_targets.extend(targets.cpu().numpy())  # store targets

            all_texts.extend(data['ids'])  # store original input texts

            n_correct += calcuate_accuracy(big_idx, targets)

            nb_tr_steps += 1
            nb_tr_examples += targets.size(0)

            if _ % 5000 == 0:
                loss_step = tr_loss / nb_tr_steps
                accu_step = (n_correct * 100) / nb_tr_examples
                print(f"Validation Loss: {loss_step}")
                print(f"Validation Accuracy: {accu_step}")

    epoch_loss = tr_loss / nb_tr_steps
    epoch_accu = (n_correct * 100) / nb_tr_examples

    print(f"Validation Loss Epoch: {epoch_loss}")
    print(f"Validation Accuracy Epoch: {epoch_accu}")

    # Print classification report
    report = classification_report(all_targets, all_preds)
    print(report)

    # Confusion matrix
    cm = confusion_matrix(all_targets, all_preds)
    print("Confusion Matrix:")
    print(cm)

    # Create a DataFrame and save it
    df_predictions = pd.DataFrame({
        'Text': all_texts,
        'Original': all_targets,
        'Predicted': all_preds
    })
    df_predictions.to_csv('predictions.csv', index=False)
    print(df_predictions.head())

    return epoch_accu



In [None]:
## VALIDATION

In [18]:
valid(model, testing_loader)

acc = valid(model,testing_loader)
print("test accuracy = %0.2f%%" % acc)

end_time = time.time()

elapsed_time = end_time - start_time
print(f"Elapsed Time: {elapsed_time:.2f} seconds")



1it [00:02,  2.62s/it]

Validation Loss: 0.026007499545812607
Validation Accuracy: 100.0


13it [00:30,  2.36s/it]


Validation Loss Epoch: 0.08231155370147182
Validation Accuracy Epoch: 98.0
              precision    recall  f1-score   support

           0       0.98      0.99      0.99       196
           1       0.50      0.25      0.33         4

    accuracy                           0.98       200
   macro avg       0.74      0.62      0.66       200
weighted avg       0.98      0.98      0.98       200

Confusion Matrix:
[[195   1]
 [  3   1]]
                                                Text  Original  Predicted
0  [tensor(0), tensor(1620), tensor(97), tensor(4...         0          0
1  [tensor(0), tensor(47874), tensor(35), tensor(...         0          0
2  [tensor(0), tensor(26369), tensor(3935), tenso...         0          0
3  [tensor(0), tensor(4688), tensor(6256), tensor...         0          0
4  [tensor(0), tensor(34440), tensor(31995), tens...         0          0


1it [00:01,  1.66s/it]

Validation Loss: 0.10605359822511673
Validation Accuracy: 93.75


13it [00:23,  1.83s/it]

Validation Loss Epoch: 0.08491789007810159
Validation Accuracy Epoch: 98.0
              precision    recall  f1-score   support

           0       0.98      0.99      0.99       196
           1       0.50      0.25      0.33         4

    accuracy                           0.98       200
   macro avg       0.74      0.62      0.66       200
weighted avg       0.98      0.98      0.98       200

Confusion Matrix:
[[195   1]
 [  3   1]]
                                                Text  Original  Predicted
0  [tensor(0), tensor(5632), tensor(31798), tenso...         0          0
1  [tensor(0), tensor(574), tensor(3699), tensor(...         0          0
2  [tensor(0), tensor(713), tensor(16), tensor(14...         0          0
3  [tensor(0), tensor(18547), tensor(139), tensor...         0          0
4  [tensor(0), tensor(387), tensor(16908), tensor...         0          0
test accuracy = 98.00%
Elapsed Time: 4460.14 seconds





In [None]:
## PREDICTIONS

In [26]:
def predict_on_dataframe(df, model, tokenizer, max_len):
    model.eval()
    predictions = []

    for index, row in df.iterrows():
        text = row['text']
        inputs = tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        ids = torch.tensor([inputs['input_ids']], dtype=torch.long).to(device)
        mask = torch.tensor([inputs['attention_mask']], dtype=torch.long).to(device)

        with torch.no_grad():
            logits, _ = model(ids, mask)  # Here we unpack the tuple

        big_val, big_idx = torch.max(logits.data, dim=1)  # Use logits instead of outputs
        predictions.append(big_idx[0].item())

    df['predictions'] = predictions
    return pd.DataFrame(df)



In [27]:
predicted_classes = predict_on_dataframe(chronic_df, model, tokenizer, max_len=512)
print(predicted_classes)




     label                                               text  predictions
0      0.0  The electronic pacer device overlying the left...            0
1      0.0  No attempts to get OOB alone.</s>87 year old f...            0
2      0.0  Neuro: Pt remains withdrawn, opens eyes sponta...            0
3      0.0  FWB'ing B LE Social / Occupational History: Pt...            0
4      0.0  Response: Plan: .H/O anxiety Assessment: Letha...            0
..     ...                                                ...          ...
995    0.0  Lives at home alone, has home aide come in eve...            0
996    0.0  POST EXTUBATION PT CALM AND WITHDRAWN.</s>MICU...            0
997    0.0  I spoke with [**First Name8 (NamePattern2) 862...            0
998    0.0  #5: Mom in for few mins this am alone.</s>NPN ...            0
999    0.0  When family here they think she is [** 467**] ...            0

[1000 rows x 3 columns]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['predictions'] = predictions


In [31]:
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
import numpy as np

# Assuming predicted_classes is a DataFrame with 'label' and 'predictions' columns
true_labels = predicted_classes['label'].tolist()  # Extract 'label' column as list
predicted_labels = predicted_classes['predictions'].tolist()  # Extract 'predictions' column as list

# Function for bootstrapping a metric
def bootstrap_metric(y_true, y_pred, metric_func, n_iterations=1000):
    bootstrap_values = []
    n_samples = len(y_true)
    for _ in range(n_iterations):
        indices = np.random.choice(np.arange(n_samples), size=n_samples, replace=True)
        bootstrap_true = np.array(y_true)[indices]
        bootstrap_pred = np.array(y_pred)[indices]
        value = metric_func(bootstrap_true, bootstrap_pred)
        bootstrap_values.append(value)
    return bootstrap_values

# Bootstrapping each metric for the positive class (label = 1)
bootstrap_f1_pos = bootstrap_metric(true_labels, predicted_labels, lambda y_true, y_pred: f1_score(y_true, y_pred, pos_label=1))
bootstrap_precision_pos = bootstrap_metric(true_labels, predicted_labels, lambda y_true, y_pred: precision_score(y_true, y_pred, pos_label=1))
bootstrap_recall_pos = bootstrap_metric(true_labels, predicted_labels, lambda y_true, y_pred: recall_score(y_true, y_pred, pos_label=1))
bootstrap_accuracy = bootstrap_metric(true_labels, predicted_labels, accuracy_score)

# Function to compute mean and confidence intervals
def mean_and_confidence_interval(data, alpha=0.05):
    mean_value = np.mean(data)
    lower_percentile = 100 * alpha / 2.
    upper_percentile = 100 * (1 - alpha / 2.)
    lower = np.percentile(data, lower_percentile)
    upper = np.percentile(data, upper_percentile)
    return mean_value, lower, upper

# Compute mean and confidence intervals for each metric
f1_mean, f1_lower, f1_upper = mean_and_confidence_interval(bootstrap_f1_pos)
precision_mean, precision_lower, precision_upper = mean_and_confidence_interval(bootstrap_precision_pos)
recall_mean, recall_lower, recall_upper = mean_and_confidence_interval(bootstrap_recall_pos)
accuracy_mean, accuracy_lower, accuracy_upper = mean_and_confidence_interval(bootstrap_accuracy)

# Print results
print(f"F1 Score: Mean={f1_mean:.3f}, 95% CI=({f1_lower:.3f}, {f1_upper:.3f})")
print(f"Precision: Mean={precision_mean:.3f}, 95% CI=({precision_lower:.3f}, {precision_upper:.3f})")
print(f"Recall: Mean={recall_mean:.3f}, 95% CI=({recall_lower:.3f}, {recall_upper:.3f})")
print(f"Accuracy: Mean={accuracy_mean:.3f}, 95% CI=({accuracy_lower:.3f}, {accuracy_upper:.3f})")




F1 Score: Mean=0.764, 95% CI=(0.606, 0.884)
Precision: Mean=1.000, 95% CI=(1.000, 1.000)
Recall: Mean=0.623, 95% CI=(0.458, 0.792)
Accuracy: Mean=0.988, 95% CI=(0.981, 0.994)


In [None]:
for sample_size in range(100, 1001, 100):
    f1_scores_sample = []

    for train_index, test_index in skf.split(X, y):
        # Select training and test data and ensure labels are integers
        train_texts = X[train_index[:sample_size]].tolist()
        test_texts = X[test_index].tolist()
        train_labels = y[train_index[:sample_size]].astype(int)  # Ensure labels are integers
        test_labels = y[test_index].astype(int)  # Ensure labels are integers

        # Tokenize the datasets
        train_encodings = tokenize_function(train_texts)
        test_encodings = tokenize_function(test_texts)

        # Create custom PyTorch datasets
        train_dataset = CustomTextDataset(train_encodings, train_labels)
        test_dataset = CustomTextDataset(test_encodings, test_labels)

        # Load the RoBERTa model for classification (2 classes)
        model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)

        # Set up training arguments
        training_args = TrainingArguments(
            output_dir='./results',
            num_train_epochs=3,
            per_device_train_batch_size=8,
            per_device_eval_batch_size=8,
            warmup_steps=500,
            weight_decay=0.01,
            logging_dir='./logs',
            logging_steps=10,
        )

        # Define Trainer
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=test_dataset
        )

        # Train the model
        trainer.train()

        # Get predictions
        predictions = trainer.predict(test_dataset).predictions.argmax(-1)  # Use argmax to get class predictions

        # Calculate F1 score
        f1 = f1_score(test_labels, predictions, average='binary')
        f1_scores_sample.append(f1)

    # Store the mean F1 score for this sample size
    sample_sizes.append(sample_size)
    f1_scores.append(np.mean(f1_scores_sample))


In [None]:
import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import StratifiedKFold
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import f1_score
from torch.utils.data import Dataset, DataLoader

# Load the tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Define a custom Dataset class
class CustomTextDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

# Tokenize the text data
def tokenize_function(texts):
    return tokenizer(texts, padding="max_length", truncation=True, return_tensors="pt")

# Placeholder for chronic_df (ensure it's a pandas DataFrame)
X = chronic_df['text'].values
y = chronic_df['label'].values

skf = StratifiedKFold(n_splits=5)

# Empty lists to store sample sizes and F1 scores
sample_sizes = []
f1_scores = []

# Main training loop (fixed to handle class indices correctly)
for sample_size in range(100, 1001, 100):
    f1_scores_sample = []

    for train_index, test_index in skf.split(X, y):
        # Select training and test data and ensure labels are integers
        train_texts = X[train_index[:sample_size]].tolist()
        test_texts = X[test_index].tolist()
        train_labels = y[train_index[:sample_size]].astype(int)  # Ensure labels are integers
        test_labels = y[test_index].astype(int)  # Ensure labels are integers

        # Tokenize the datasets
        train_encodings = tokenize_function(train_texts)
        test_encodings = tokenize_function(test_texts)

        # Create custom PyTorch datasets
        train_dataset = CustomTextDataset(train_encodings, train_labels)
        test_dataset = CustomTextDataset(test_encodings, test_labels)

        # Load the RoBERTa model for classification (2 classes)
        model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)

        # Set up training arguments
        training_args = TrainingArguments(
            output_dir='./results',
            num_train_epochs=3,
            per_device_train_batch_size=8,
            per_device_eval_batch_size=8,
            warmup_steps=500,
            weight_decay=0.01,
            logging_dir='./logs',
            logging_steps=10,
        )

        # Define Trainer
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=test_dataset
        )

        # Train the model
        trainer.train()

        # Get predictions
        predictions = trainer.predict(test_dataset).predictions.argmax(-1)  # Use argmax to get class predictions

        # Calculate F1 score
        f1 = f1_score(test_labels, predictions, average='binary')
        f1_scores_sample.append(f1)

    # Store the mean F1 score for this sample size
    sample_sizes.append(sample_size)
    f1_scores.append(np.mean(f1_scores_sample))

# Create a DataFrame for the results
results_df = pd.DataFrame({'Sample Size': sample_sizes, 'F1 Score': f1_scores})

# Plot the results (as before)
import matplotlib.pyplot as plt
plt.figure(figsize=(10, 6))
plt.plot(results_df['Sample Size'], results_df['F1 Score'], marker='o', linestyle='-', color='b')
plt.title('Chronic Loneliness Sample Size vs F1 Score for RoBERTa Model')
plt.xlabel('Sample Size')
plt.ylabel('F1 Score')
plt.grid(True)
plt.savefig("chronic_loneliness_sample_size_f1_roberta.png")

# Save results to CSV
results_df.to_csv("chronic_loneliness_sample_size_vs_f1_score_roberta.csv", index=False)



Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Step,Training Loss
10,0.8167
20,0.8051
30,0.7343


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encod

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encod

Step,Training Loss
10,0.7514
20,0.7196
30,0.6602


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encod

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encod

Step,Training Loss
10,0.7514
20,0.7196
30,0.6602


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encod

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encod

Step,Training Loss
10,0.7514
20,0.7196
30,0.6602


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encod

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encod

Step,Training Loss
10,0.7514
20,0.7196
30,0.6602


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encod

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encod

Step,Training Loss
10,0.7545
20,0.7165
30,0.6492
40,0.5649
50,0.3444
60,0.2006
70,0.067


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Step,Training Loss
10,0.5138
20,0.505
30,0.4461
40,0.3863
50,0.2034
60,0.0977
70,0.1887


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Step,Training Loss
10,0.5138
20,0.505
30,0.4461
40,0.3863
50,0.2034
60,0.0977
70,0.1887


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Step,Training Loss
10,0.5138
20,0.505
30,0.4461
40,0.3863
50,0.2034
60,0.0977
70,0.1887


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Step,Training Loss
10,0.5138
20,0.505
30,0.4461
40,0.3863
50,0.2034
60,0.0977
70,0.1887


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Step,Training Loss
10,0.5057
20,0.507
30,0.4447
40,0.3852
50,0.1931
60,0.1571
70,0.0678
80,0.2294
90,0.0753


In [None]:
import torch
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import RobertaTokenizer

# Load the tokenizer (assuming you used 'roberta-base' tokenizer)
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Input sentence for binary classification (positive for chronic_social_disconnection_label)
sentence = "TOL WELL. SOCIAL SERVICE CONSULT FOR HOMELESSNESS. PT ESTRANGED FROM FAMILY PT IN AFIB 80s TO 140s, BP 90 TO 100s SYSTOLIC."

# Tokenize the input sentence
inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True)

# Forward pass through your pre-trained model with attention outputs
outputs = model(**inputs, output_attentions=True)  # Add output_attentions=True to get attention weights
logits = outputs.logits
attentions = outputs.attentions  # Extract attention weights

# Convert logits to predicted label (binary classification)
predicted_label = torch.argmax(logits, dim=1).item()

# Get the tokens from the tokenizer
tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])

# Average the attention weights across all heads in the last attention layer
# Attention shape: (num_layers, num_heads, batch_size, seq_length, seq_length)
attention = attentions[-1]  # Last layer attention
attention = attention.mean(dim=1).squeeze(0).detach().numpy()  # Average over heads, remove batch dim

# Visualization function
def plot_attention(tokens, attention, title="RoBERTa Attention Visualization"):
    # Generate heatmap
    plt.figure(figsize=(10, 10))
    sns.heatmap(attention, xticklabels=tokens, yticklabels=tokens, cmap="YlGnBu", annot=False)
    plt.title(title)
    plt.show()

# Plot the attention for the sentence
plot_attention(tokens, attention, title="RoBERTa Attention Visualization for Chronic Social Disconnection Label")

# Print the predicted label for clarity
print(f"Predicted label: {predicted_label} (0 = Negative, 1 = Positive)")
