In [7]:
import pandas as pd
# Raw evaluation comments, one row per evaluator/target pair, read from the
# workbook that sits next to this notebook.
comments = pd.read_excel("evaluations_overall_comments.xlsx")

In [8]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer
import numpy as np
from scipy.special import softmax
import csv
import urllib.request

# Preprocess text (username and link placeholders)
def preprocess(text):
    """Replace user mentions and links in ``text`` with fixed placeholders.

    The text is split on single spaces. A token that starts with ``@`` and is
    longer than one character becomes ``@user``; a token that starts with
    ``http`` becomes ``http``. Tokens are joined back with single spaces, so
    the original spacing (including repeated spaces) is preserved.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string.
    """
    def _placeholder(token):
        # Mention check first, link check second, mirroring the token rules above.
        if token.startswith('@') and len(token) > 1:
            token = '@user'
        if token.startswith('http'):
            token = 'http'
        return token

    return " ".join(map(_placeholder, text.split(" ")))

# Checkpoint id shared by the classifier and its tokenizer.
MODEL = "cardiffnlp/twitter-roberta-base-sentiment"

# NOTE(review): from_tf is normally a model-weights option; it looks like it has
# no effect on a tokenizer and could probably be dropped — confirm.
tokenizer = AutoTokenizer.from_pretrained(MODEL, from_tf=True)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

# download label mapping
# labels=[]
# mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
# with urllib.request.urlopen(mapping_link) as f:
#     html = f.read().decode('utf-8').split("\n")
#     csvreader = csv.reader(html, delimiter='\t')
# labels = [row[1] for row in csvreader if len(row) > 1]

# PT

# Write the model and the tokenizer files to a local directory whose path is
# the value of MODEL.
# NOTE(review): a local folder named like the hub id will presumably be picked
# up by later from_pretrained(MODEL) calls in place of the hub copy — confirm
# that this local caching is intended.
model.save_pretrained(MODEL)
tokenizer.save_pretrained(MODEL)

import torch  # imported here so this cell does not depend on a later cell's import

# Replace @mentions and links with placeholders before tokenising.
comments['processed'] = comments['Answer:'].apply(preprocess)

# One encoding per comment (batch of one, PyTorch tensors). Inputs are cut at
# 512 tokens: the captured training output shows this tokenizer has no
# predefined maximum length, so without an explicit limit an over-long comment
# would be passed to the model unshortened.
comments['encoded'] = comments['processed'].apply(
    lambda text: tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
)

# Inference only: disable autograd so the stored outputs do not each keep a
# computation graph alive inside the DataFrame.
with torch.no_grad():
    comments['output'] = comments['encoded'].apply(lambda enc: model(**enc))

# out[0] is the first element of the model output (presumably the logits —
# TODO confirm); [0] picks the single comment in the batch. Softmax turns that
# row into per-class probabilities.
comments['scores'] = comments['output'].apply(
    lambda out: softmax(out[0][0].detach().numpy())
)

# NOTE(review): this prints every column, including the Target/Evaluator names
# visible in the saved output — consider showing only the score columns.
print(comments)

         Target:                      Evaluator:       Rotation Dates:  \
0     Eliza Chen      Dr. Ong, Andrew Ming Liang    7/1/2022-7/31/2022   
1     Eliza Chen         Dr. Chan, Webber Pak Wo    7/1/2022-7/31/2022   
2     Eliza Chen              Dr. Lim, Chee Hooi    7/1/2022-7/31/2022   
3     Eliza Chen         Dr. Chang, Jason Pik Eu    8/1/2022-8/31/2022   
4     Eliza Chen              Dr. Tan, Chee Kiat    8/1/2022-8/31/2022   
..           ...                             ...                   ...   
163  Rachel Yeap     Dr. Tan, Malcolm Teck Kiang    7/1/2022-7/31/2022   
164  Rachel Yeap  Dr. Khor, Christopher Jen Lock    8/1/2022-8/31/2022   
165  Rachel Yeap               Dr. Liou, Wei Lun    8/1/2022-8/31/2022   
166  Rachel Yeap             Dr. Loo, Khang Ning    9/1/2022-10/2/2022   
167  Rachel Yeap        Dr. Kwek, Andrew Boon Eu  10/3/2022-10/31/2022   

                         Service:  \
0                             - -   
1       SHS-GASTRO:BASIC ENDO-SGH   


In [None]:
import torch
from sklearn.metrics import accuracy_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
from torch.nn.utils.rnn import pad_sequence

# # Load tokenizer and model
# model_name = "cardiffnlp/twitter-roberta-base-sentiment"
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Load the IMDB dataset and carve out small, reproducible subsets for a quick run.
dataset = load_dataset("imdb")

# shuffle(seed=42) fixes the row order; select(range(n)) keeps the first n
# shuffled rows. The earlier identity .map(...) only re-emitted "text" and
# "label" unchanged for every row of the full split, so it has been removed.
# NOTE(review): IMDB labels are presumably binary (0/1) while the sentiment
# checkpoint used in this notebook appears to have three classes — confirm the
# label mapping before training on this data.
train_dataset = dataset["train"].shuffle(seed=42).select(range(500))
val_dataset = dataset["test"].shuffle(seed=42).select(range(50))
print(dataset,'dataset')
print(train_dataset,'train_dataset')

In [None]:
def preprocess_function(examples):
    """Tokenise the ``text`` entries of a batch with the notebook-level tokenizer.

    Sequences are truncated to at most 300 tokens and padded (``padding=True``).

    Args:
        examples: Mapping with a ``"text"`` entry holding the texts to encode.

    Returns:
        Whatever the tokenizer returns for the batch.
    """
    texts = examples["text"]
    return tokenizer(texts, max_length=300, truncation=True, padding=True)

# Tokenise both splits batch-wise with preprocess_function.
train_dataset = train_dataset.map(preprocess_function, batched=True)
val_dataset = val_dataset.map(preprocess_function, batched=True)

# Expose only the columns listed below, formatted as torch tensors.
for tokenised_split in (train_dataset, val_dataset):
    tokenised_split.set_format("torch", columns=["input_ids", "attention_mask", "label"])

In [None]:
# DO NOT RUN

# Define training arguments and trainer
training_args = TrainingArguments(
    # Output locations.
    output_dir="./results",
    logging_dir="./logs",
    logging_steps=100,
    # Optimisation schedule.
    # NOTE(review): warmup_steps=500 may exceed the total number of optimiser
    # steps for a 500-example, 3-epoch run — verify against the batch size.
    num_train_epochs=3,
    warmup_steps=500,
    weight_decay=0.01,
    # NOTE(review): evaluation and checkpointing are both switched off, so
    # load_best_model_at_end / metric_for_best_model appear to have nothing to
    # act on — confirm whether evaluation was meant to be enabled.
    evaluation_strategy="no",
    save_strategy="no",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

def compute_accuracy(pred):
    """Compute accuracy for a Trainer evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (true labels) and ``predictions``
            (per-class scores; the arg-max over the last axis is taken as the
            predicted label).

    Returns:
        ``{"accuracy": value}``. A dict is returned, not a bare float, because
        the training arguments select the best model by the metric named
        "accuracy", which must therefore exist as a key in the metrics.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    return {"accuracy": accuracy_score(labels, preds)}

# Wire the notebook-level model, the arguments above and the tokenised IMDB
# subsets into a Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_accuracy,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Fine-tune, then write the resulting model to the "trained-model" directory.
trainer.train()
trainer.save_model("trained-model")


In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
from datasets import Dataset, DatasetDict
from transformers import Trainer, TrainingArguments

# Checkpoint id shared by the classifier and its tokenizer.
# NOTE(review): AutoModelForSequenceClassification is not imported in this cell;
# it relies on an import made by an earlier cell.
MODEL = "cardiffnlp/twitter-roberta-base-sentiment"
# NOTE(review): from_tf looks like a model-weights option with no effect on a
# tokenizer — confirm whether it can be dropped.
tokenizer = AutoTokenizer.from_pretrained(MODEL, from_tf=True)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

# Labelled comments from CSV, split 90/10 with a fixed seed for repeatability.
data = pd.read_csv("generated_comments.csv")
train_data, test_data = train_test_split(data, random_state=42, test_size=0.1)

# Load tokenizer from Hugging Face
# tokenizer = AutoTokenizer.from_pretrained("your_pretrained_tokenizer")

# Tokenize the data
def tokenize_function(examples, max_length=512):
    """Tokenise the ``text`` entries of a batch with the notebook-level tokenizer.

    ``truncation=True`` alone does nothing here: the captured output of this
    cell reports that the tokenizer has no predefined maximum length and
    "defaults to no truncation". An explicit ``max_length`` makes the
    truncation effective.

    Args:
        examples: Mapping with a ``"text"`` entry holding the texts to encode.
        max_length: Maximum number of tokens kept per text. The default of 512
            is presumably the longest input this RoBERTa-base checkpoint
            accepts — confirm against the model config.

    Returns:
        Whatever the tokenizer returns for the batch.
    """
    return tokenizer(
        examples["text"],
        padding=True,
        truncation=True,
        max_length=max_length,
    )

# Turn each pandas split into a Dataset and tokenise it batch-wise.
train_dataset = Dataset.from_pandas(train_data).map(tokenize_function, batched=True)
test_dataset = Dataset.from_pandas(test_data).map(tokenize_function, batched=True)

# Bundle both splits under the keys the Trainer set-up reads below.
dataset_dict = DatasetDict({"train": train_dataset, "test": test_dataset})

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    # Optimisation schedule.
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    # Evaluate on the held-out split once per epoch.
    evaluation_strategy="epoch",
    push_to_hub=False,
)

# No compute_metrics is supplied, so only the evaluation loss is reported.
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=dataset_dict["train"],
    eval_dataset=dataset_dict["test"],
)

# Fine-tune, then write the model to the "csv_batch16" directory.
trainer.train()
trainer.save_model("csv_batch16")

Map:   0%|          | 0/38 [00:00<?, ? examples/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
                                             
 33%|███▎      | 3/9 [00:05<00:10,  1.71s/it]

{'eval_loss': 0.13799996674060822, 'eval_runtime': 0.1235, 'eval_samples_per_second': 40.472, 'eval_steps_per_second': 8.094, 'epoch': 1.0}


                                             
 67%|██████▋   | 6/9 [00:11<00:05,  1.83s/it]

{'eval_loss': 0.05180676653981209, 'eval_runtime': 0.1649, 'eval_samples_per_second': 30.325, 'eval_steps_per_second': 6.065, 'epoch': 2.0}


                                             
100%|██████████| 9/9 [00:16<00:00,  1.85s/it]


{'eval_loss': 0.03951571136713028, 'eval_runtime': 0.192, 'eval_samples_per_second': 26.045, 'eval_steps_per_second': 5.209, 'epoch': 3.0}
{'train_runtime': 16.6532, 'train_samples_per_second': 6.846, 'train_steps_per_second': 0.54, 'train_loss': 0.49953216976589626, 'epoch': 3.0}


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import Dataset, DatasetDict
from transformers import Trainer, TrainingArguments
import pandas as pd
import numpy as np
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Checkpoint id shared by the classifier and its tokenizer.
MODEL = "cardiffnlp/twitter-roberta-base-sentiment"
# NOTE(review): from_tf looks like a model-weights option with no effect on a
# tokenizer — confirm whether it can be dropped.
tokenizer = AutoTokenizer.from_pretrained(MODEL, from_tf=True)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

# Labelled comments from CSV, split 90/10 with a fixed seed for repeatability.
data = pd.read_csv("generated_comments.csv")
train_data, test_data = train_test_split(data, random_state=42, test_size=0.1)

# Load tokenizer from Hugging Face
# tokenizer = AutoTokenizer.from_pretrained("your_pretrained_tokenizer")

# Tokenize the data
def tokenize_function(examples, max_length=512):
    """Tokenise the ``text`` entries of a batch with the notebook-level tokenizer.

    ``truncation=True`` alone does nothing here: the captured output of this
    cell reports that the tokenizer has no predefined maximum length and
    "defaults to no truncation". An explicit ``max_length`` makes the
    truncation effective.

    Args:
        examples: Mapping with a ``"text"`` entry holding the texts to encode.
        max_length: Maximum number of tokens kept per text. The default of 512
            is presumably the longest input this RoBERTa-base checkpoint
            accepts — confirm against the model config.

    Returns:
        Whatever the tokenizer returns for the batch.
    """
    return tokenizer(
        examples["text"],
        padding=True,
        truncation=True,
        max_length=max_length,
    )

# Turn each pandas split into a Dataset and tokenise it batch-wise.
train_dataset = Dataset.from_pandas(train_data).map(tokenize_function, batched=True)
test_dataset = Dataset.from_pandas(test_data).map(tokenize_function, batched=True)

# Bundle both splits under the keys the Trainer set-up reads below.
dataset_dict = DatasetDict({"train": train_dataset, "test": test_dataset})
# Create a function to compute metrics
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(eval_pred):
    """Compute accuracy and weighted precision/recall/F1 for an evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, labels)``. The arg-max over the last
            axis of ``predictions`` is taken as the predicted label.

    Returns:
        Dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
    """
    scores, labels = eval_pred
    predicted = scores.argmax(axis=-1)
    # The fourth element returned by the scorer is not used.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predicted, average='weighted'
    )
    return {
        'accuracy': accuracy_score(labels, predicted),
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    # Optimisation schedule.
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.001,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=8,
    # Evaluate and checkpoint once per epoch; the best checkpoint is reloaded
    # when training finishes.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=dataset_dict["train"],
    eval_dataset=dataset_dict["test"],
)

# Fine-tune, then report the metrics of the model the trainer ends up with.
trainer.train()
print(trainer.evaluate())

# Save model and tokenizer side by side in the "finetuned-model" directory.
trainer.save_model("finetuned-model")
tokenizer.save_pretrained("finetuned-model")

Map:   0%|          | 0/38 [00:00<?, ? examples/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
                                             
 33%|███▎      | 3/9 [00:05<00:09,  1.66s/it]

{'eval_loss': 0.13799896836280823, 'eval_accuracy': 1.0, 'eval_precision': 1.0, 'eval_recall': 1.0, 'eval_f1': 1.0, 'eval_runtime': 0.1592, 'eval_samples_per_second': 31.412, 'eval_steps_per_second': 6.282, 'epoch': 1.0}


                                             
 67%|██████▋   | 6/9 [00:11<00:05,  1.80s/it]

{'eval_loss': 0.05180616304278374, 'eval_accuracy': 1.0, 'eval_precision': 1.0, 'eval_recall': 1.0, 'eval_f1': 1.0, 'eval_runtime': 0.1498, 'eval_samples_per_second': 33.378, 'eval_steps_per_second': 6.676, 'epoch': 2.0}


                                             
100%|██████████| 9/9 [00:18<00:00,  1.88s/it]

{'eval_loss': 0.039515234529972076, 'eval_accuracy': 1.0, 'eval_precision': 1.0, 'eval_recall': 1.0, 'eval_f1': 1.0, 'eval_runtime': 0.1513, 'eval_samples_per_second': 33.046, 'eval_steps_per_second': 6.609, 'epoch': 3.0}


100%|██████████| 9/9 [00:20<00:00,  2.27s/it]


{'train_runtime': 20.4722, 'train_samples_per_second': 5.569, 'train_steps_per_second': 0.44, 'train_loss': 0.4995316399468316, 'epoch': 3.0}


100%|██████████| 1/1 [00:00<00:00, 87.11it/s]


{'eval_loss': 0.039515234529972076, 'eval_accuracy': 1.0, 'eval_precision': 1.0, 'eval_recall': 1.0, 'eval_f1': 1.0, 'eval_runtime': 0.2487, 'eval_samples_per_second': 20.104, 'eval_steps_per_second': 4.021, 'epoch': 3.0}


('finetuned-model\\tokenizer_config.json',
 'finetuned-model\\special_tokens_map.json',
 'finetuned-model\\vocab.json',
 'finetuned-model\\merges.txt',
 'finetuned-model\\added_tokens.json',
 'finetuned-model\\tokenizer.json')

In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
from datasets import Dataset, DatasetDict
from transformers import Trainer, TrainingArguments
import pandas as pd
import numpy as np
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Checkpoint id shared by the classifier and its tokenizer.
# NOTE(review): AutoModelForSequenceClassification is not imported in this cell;
# it relies on an import made by an earlier cell.
MODEL = "cardiffnlp/twitter-roberta-base-sentiment"
# NOTE(review): from_tf looks like a model-weights option with no effect on a
# tokenizer — confirm whether it can be dropped.
tokenizer = AutoTokenizer.from_pretrained(MODEL, from_tf=True)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

# Labelled comments from CSV, split 90/10 with a fixed seed for repeatability.
data = pd.read_csv("generated_comments.csv")
train_data, test_data = train_test_split(data, random_state=42, test_size=0.1)

# Load tokenizer from Hugging Face
# tokenizer = AutoTokenizer.from_pretrained("your_pretrained_tokenizer")

# Tokenize the data
def tokenize_function(examples, max_length=512):
    """Tokenise the ``text`` entries of a batch with the notebook-level tokenizer.

    ``truncation=True`` alone does nothing here: the captured output of this
    cell reports that the tokenizer has no predefined maximum length and
    "defaults to no truncation". An explicit ``max_length`` makes the
    truncation effective.

    Args:
        examples: Mapping with a ``"text"`` entry holding the texts to encode.
        max_length: Maximum number of tokens kept per text. The default of 512
            is presumably the longest input this RoBERTa-base checkpoint
            accepts — confirm against the model config.

    Returns:
        Whatever the tokenizer returns for the batch.
    """
    return tokenizer(
        examples["text"],
        padding=True,
        truncation=True,
        max_length=max_length,
    )

# Turn each pandas split into a Dataset and tokenise it batch-wise.
train_dataset = Dataset.from_pandas(train_data).map(tokenize_function, batched=True)
test_dataset = Dataset.from_pandas(test_data).map(tokenize_function, batched=True)

# Bundle both splits under the keys the Trainer set-up reads below.
dataset_dict = DatasetDict({"train": train_dataset, "test": test_dataset})
# Create a function to compute metrics
def compute_metrics(pred):
    """Compute overall and per-class classification metrics for an evaluation pass.

    The per-class F1 values are now true one-class F1 scores. They were
    previously computed with ``average='weighted'`` on one-vs-rest boolean
    arrays, which also averages in the F1 of the "rest" class and therefore
    reports 1.0 for a class that never occurs in the evaluation data.

    Args:
        pred: Object exposing ``label_ids`` (true labels) and ``predictions``
            (per-class scores; the arg-max over the last axis is taken as the
            predicted label).

    Returns:
        Dict with ``accuracy``, weighted ``f1``/``precision``/``recall``, the
        per-class ``pos_f1``/``neg_f1``/``neu_f1`` and ``weighted_f1``. Despite
        its name (kept for compatibility), ``weighted_f1`` is the plain mean of
        the three per-class F1 values.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)

    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average="weighted")
    precision = precision_score(labels, preds, average="weighted")
    recall = recall_score(labels, preds, average="weighted")

    # One F1 per class id, in the order 0, 1, 2. A class with no true and no
    # predicted samples scores 0 instead of raising a warning.
    # NOTE(review): ids 0/1/2 are reported as pos/neg/neu as in the original
    # code; the base checkpoint presumably orders its classes differently —
    # verify against the label encoding of the training CSV.
    per_class_f1 = f1_score(labels, preds, labels=[0, 1, 2], average=None, zero_division=0)
    pos_f1, neg_f1, neu_f1 = (float(score) for score in per_class_f1)
    weighted_f1 = (pos_f1 + neg_f1 + neu_f1) / 3

    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall,
        'weighted_f1': weighted_f1,
        'pos_f1': pos_f1,
        'neg_f1': neg_f1,
        'neu_f1': neu_f1,
    }
# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    # Optimisation schedule.
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.0011,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluate and checkpoint once per epoch; the best checkpoint is reloaded
    # when training finishes.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=dataset_dict["train"],
    eval_dataset=dataset_dict["test"],
)

# Fine-tune, report the final metrics, then write the model to "csv_batch8".
trainer.train()
print(trainer.evaluate())
trainer.save_model("csv_batch8")


[AAsking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.

[A
[A

[A
[A
[A
[A
[A
[A

                                              

[A[A                                        
[A                                  


 33%|███▎      | 5/15 [00:44<00:12,  1.28s/it]
[A

[A[A

[A[A

{'eval_loss': 0.057017575949430466, 'eval_accuracy': 1.0, 'eval_f1': 1.0, 'eval_precision': 1.0, 'eval_recall': 1.0, 'eval_weighted_f1': 1.0, 'eval_pos_f1': 1.0, 'eval_neg_f1': 1.0, 'eval_neu_f1': 1.0, 'eval_runtime': 0.1451, 'eval_samples_per_second': 34.462, 'eval_steps_per_second': 6.892, 'epoch': 1.0}



[A
[A
[A
[A
[A

                                              

[A[A                                         
[A                                          


 33%|███▎      | 5/15 [00:52<00:12,  1.28s/it]
[A

[A[A

[A[A

{'eval_loss': 0.027921680361032486, 'eval_accuracy': 1.0, 'eval_f1': 1.0, 'eval_precision': 1.0, 'eval_recall': 1.0, 'eval_weighted_f1': 1.0, 'eval_pos_f1': 1.0, 'eval_neg_f1': 1.0, 'eval_neu_f1': 1.0, 'eval_runtime': 0.1464, 'eval_samples_per_second': 34.162, 'eval_steps_per_second': 6.832, 'epoch': 2.0}



[A
[A
[A
[A
[A

                                              

[A[A                                         
[A                                          


 33%|███▎      | 5/15 [01:01<00:12,  1.28s/it]
[A

[A[A

[A[A

{'eval_loss': 0.028729181736707687, 'eval_accuracy': 1.0, 'eval_f1': 1.0, 'eval_precision': 1.0, 'eval_recall': 1.0, 'eval_weighted_f1': 1.0, 'eval_pos_f1': 1.0, 'eval_neg_f1': 1.0, 'eval_neu_f1': 1.0, 'eval_runtime': 0.1558, 'eval_samples_per_second': 32.091, 'eval_steps_per_second': 6.418, 'epoch': 3.0}


                                              

[A[A                                         
 33%|███▎      | 5/15 [01:03<00:12,  1.28s/it]
100%|██████████| 15/15 [00:26<00:00,  1.74s/it]


{'train_runtime': 26.1413, 'train_samples_per_second': 4.361, 'train_steps_per_second': 0.574, 'train_loss': 0.42616421381632485, 'epoch': 3.0}



100%|██████████| 1/1 [00:00<00:00, 36.90it/s]


{'eval_loss': 0.027921680361032486, 'eval_accuracy': 1.0, 'eval_f1': 1.0, 'eval_precision': 1.0, 'eval_recall': 1.0, 'eval_weighted_f1': 1.0, 'eval_pos_f1': 1.0, 'eval_neg_f1': 1.0, 'eval_neu_f1': 1.0, 'eval_runtime': 0.146, 'eval_samples_per_second': 34.258, 'eval_steps_per_second': 6.852, 'epoch': 3.0}


In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
from datasets import Dataset, DatasetDict
from transformers import Trainer, TrainingArguments
# Checkpoint id shared by the classifier and its tokenizer.
# NOTE(review): AutoModelForSequenceClassification is not imported in this cell;
# it relies on an import made by an earlier cell.
MODEL = "cardiffnlp/twitter-roberta-base-sentiment"
# NOTE(review): from_tf looks like a model-weights option with no effect on a
# tokenizer — confirm whether it can be dropped.
tokenizer = AutoTokenizer.from_pretrained(MODEL, from_tf=True)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

# Labelled comments from the Excel workbook (not the CSV used by the other
# runs), split 90/10 with a fixed seed for repeatability.
data = pd.read_excel("generated_comments.xlsx")
train_data, test_data = train_test_split(data, random_state=42, test_size=0.1)

# Load tokenizer from Hugging Face
# tokenizer = AutoTokenizer.from_pretrained("your_pretrained_tokenizer")

# Tokenize the data
def tokenize_function(examples, max_length=512):
    """Tokenise the ``text`` entries of a batch with the notebook-level tokenizer.

    ``truncation=True`` alone does nothing here: the captured output of this
    cell reports that the tokenizer has no predefined maximum length and
    "defaults to no truncation". An explicit ``max_length`` makes the
    truncation effective.

    Args:
        examples: Mapping with a ``"text"`` entry holding the texts to encode.
        max_length: Maximum number of tokens kept per text. The default of 512
            is presumably the longest input this RoBERTa-base checkpoint
            accepts — confirm against the model config.

    Returns:
        Whatever the tokenizer returns for the batch.
    """
    return tokenizer(
        examples["text"],
        padding=True,
        truncation=True,
        max_length=max_length,
    )

# Turn each pandas split into a Dataset and tokenise it batch-wise.
train_dataset = Dataset.from_pandas(train_data).map(tokenize_function, batched=True)
test_dataset = Dataset.from_pandas(test_data).map(tokenize_function, batched=True)

# Bundle both splits under the keys the Trainer set-up reads below.
dataset_dict = DatasetDict({"train": train_dataset, "test": test_dataset})

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    # Optimisation schedule.
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    # Evaluate on the held-out split once per epoch.
    evaluation_strategy="epoch",
    push_to_hub=False,
)

# No compute_metrics is supplied, so only the evaluation loss is reported.
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=dataset_dict["train"],
    eval_dataset=dataset_dict["test"],
)

# Fine-tune, then write the model to the "excel_batch16" directory.
trainer.train()
trainer.save_model("excel_batch16")



[AAsking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.

[A
[A

[A
[A
[A
[A
[A
[A

                                              


[A[A[A                            
[A                                          

 33%|███▎      | 5/15 [01:28<00:12,  1.28s/it]

[A[A
[A

[A[A

{'eval_loss': 0.6665213108062744, 'eval_runtime': 0.1813, 'eval_samples_per_second': 49.632, 'eval_steps_per_second': 5.515, 'epoch': 1.0}



[A
[A
[A
[A
[A

                                              


[A[A[A                            
[A                                          

 33%|███▎      | 5/15 [01:39<00:12,  1.28s/it] 

[A[A
[A

[A[A

{'eval_loss': 0.7316714525222778, 'eval_runtime': 0.18, 'eval_samples_per_second': 49.987, 'eval_steps_per_second': 5.554, 'epoch': 2.0}



[A
[A
[A
[A
[A

                                              


[A[A[A                            
[A                                          

 33%|███▎      | 5/15 [01:52<00:12,  1.28s/it] 

[A[A
[A

                                              
[A                                           

 33%|███▎      | 5/15 [01:52<00:12,  1.28s/it] 
100%|██████████| 15/15 [00:36<00:00,  2.41s/it]


{'eval_loss': 0.7393313050270081, 'eval_runtime': 0.1829, 'eval_samples_per_second': 49.204, 'eval_steps_per_second': 5.467, 'epoch': 3.0}
{'train_runtime': 36.1322, 'train_samples_per_second': 6.642, 'train_steps_per_second': 0.415, 'train_loss': 0.5734076182047526, 'epoch': 3.0}
