# Sentiment Classification with Transformers

## Import dataset

In [None]:
from google.colab import drive
# Mount Google Drive inside the Colab VM; the data and model paths used below live under /content/drive.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import numpy as np
import pandas as pd
import csv
import sys
import os

# Make CUDA kernel launches synchronous. This is typically a debugging aid: device-side
# errors are then reported at the call that triggered them.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

# Some rows hold articles of more than 30k words, which triggers
# "field larger than field limit (131072)" in the csv module, so raise the limit first.
csv.field_size_limit(sys.maxsize)

# The file has no header row, so the two columns are named here.
train_csv_path = "/content/drive/MyDrive/raw_data/raw_data/fulltrain.csv"
df = pd.read_csv(train_csv_path, names=['class', 'text'], header=None, engine='python')

In [None]:
# Show the first rows to check that the 'class' and 'text' columns were parsed as expected.
df.head()

Unnamed: 0,class,text
0,1,"A little less than a decade ago, hockey fans w..."
1,1,The writers of the HBO series The Sopranos too...
2,1,Despite claims from the TV news outlet to offe...
3,1,After receiving 'subpar' service and experienc...
4,1,After watching his beloved Seattle Mariners pr...


In [None]:
import torch

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

cuda


In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## Preprocessing

In [None]:
import nltk
# Fetch the NLTK resources this notebook asks for (tokenizer model, POS tagger, WordNet).
for nltk_resource in ('punkt', 'averaged_perceptron_tagger', 'wordnet'):
    nltk.download(nltk_resource)
import spacy
import string

from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Load spaCy's small English pipeline.
en = spacy.load('en_core_web_sm')

# spaCy's default English stop-word collection; stopword_remover checks tokens against it.
sw_spacy = en.Defaults.stop_words

In [None]:
# All ASCII punctuation characters, as one string.
punctuations = string.punctuation
# The same characters as a list of single-character tokens, extended with multi-character
# and typographic marks: dash, ellipsis, opening and closing curly quotes, and the
# `` / '' quote tokens. (The opening curly quote was previously missing because the
# closing one was listed twice.)
SYMBOLS = list(string.punctuation) + ["-", "...", "“", "”", "``", "''"]

def stopword_remover(df, colname):
  """Return a copy of `df` with stop words removed from the text in `colname`.

  Every value of the column is split into tokens with `nltk.word_tokenize`. Tokens
  whose lower-cased form is in `sw_spacy` are dropped and the remaining tokens are
  re-joined with single spaces. Punctuation tokens are kept. `df` is not modified.

  Args:
    df: DataFrame that holds the text column.
    colname: Name of the column whose text is filtered.

  Returns:
    A new DataFrame equal to `df` except for the filtered `colname` column.
  """
  def _strip_stopwords(text):
    # Case-insensitive membership test against the stop-word collection.
    kept_tokens = [token for token in nltk.word_tokenize(text) if token.lower() not in sw_spacy]
    return " ".join(kept_tokens)

  df_copy = df.copy()  # work on a copy so the caller's frame is left untouched

  # Filter every row. The previous implementation returned from inside the row loop,
  # so only the first row was ever processed (and an empty frame returned None).
  df_copy[colname] = df_copy[colname].map(_strip_stopwords)

  return df_copy

In [None]:
# Rebind `df` to the filtered copy; note that re-running this cell filters the already-filtered text again.
df = stopword_remover(df, 'text')

## Tokenization & Input Formatting

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer published with the 'roberta-base' checkpoint.
tokenizer = AutoTokenizer.from_pretrained('roberta-base')

In [None]:
# Article texts as an array, plus the class ids shifted down by one
# (presumably the file's labels are 1-based and the model needs 0-based ids; verify).
articles = df["text"].values
labels = df["class"].values - 1

In [None]:
# Show the first article at each stage: text, sub-word tokens, vocabulary ids.
sample_text = articles[0]
sample_tokens = tokenizer.tokenize(sample_text)

print(' Original: ', sample_text)
print('Tokenized: ', sample_tokens)
print('Token IDs: ', tokenizer.convert_tokens_to_ids(sample_tokens))

 Original:  little decade ago , hockey fans blessed slate games night , Thursday sources confirmed ninth consecutive year NHL players locked , slim hopes agreement sight . like yesterday Martin St. Louis Lightning teammates raising Stanley Cup , high school hockey coach onetime ESPN analyst Barry Melrose said . Obviously , Im hoping sides come reach agreement , Im starting think misses hockey anymore . Nope . old Barry . Id love catch Atlanta Thrashers game . Observers noted arena doors reopen , NHL face greater challenge convincing fans return hockey instead watching popular sports like football , basketball , baseball , SlamBall .
Tokenized:  ['little', 'decade', 'ago', ',', 'hockey', 'fans', 'blessed', 'slate', 'games', 'night', ',', 'Thursday', 'sources', 'confirmed', 'ninth', 'consecutive', 'year', 'NHL', 'players', 'locked', ',', 'slim', 'hopes', 'agreement', 'sight', '.', 'like', 'yesterday', 'Martin', 'St', '.', 'Louis', 'Lightning', 'teammates', 'raising', 'Stanley', 'Cup', ',

In [None]:
# Length in token ids (the tokenizer's special tokens included) of the longest article.
# Nothing is truncated here, so the tokenizer may warn about over-long sequences.
max_len = max(
    (len(tokenizer.encode(text, add_special_tokens=True)) for text in articles),
    default=0,
)

print('Max article length: ', max_len)

Token indices sequence length is longer than the specified maximum sequence length for this model (1029 > 512). Running this sequence through the model will result in indexing errors


Max article length:  215311


In [None]:
# Tokenize all of the articles in one batched call and map the tokens to their ids.
# Each sequence gets the tokenizer's special tokens, is truncated to 256 ids and is
# padded up to 256 ids, so the result is already a pair of (num_articles, 256) tensors.
encoded_batch = tokenizer(
    list(articles),
    add_special_tokens = True,
    truncation = True,
    max_length = 256,
    padding = 'max_length',  # replaces the deprecated pad_to_max_length=True
    return_attention_mask = True,
    return_tensors = 'pt',
)

input_ids = encoded_batch['input_ids']
attention_masks = encoded_batch['attention_mask']

# as_tensor accepts both the NumPy array and an existing tensor, so this cell can be re-run.
labels = torch.as_tensor(labels)

# Print article 0
print('Original: ', articles[0])
print('Token IDs:', input_ids[0])



Original:  little decade ago , hockey fans blessed slate games night , Thursday sources confirmed ninth consecutive year NHL players locked , slim hopes agreement sight . like yesterday Martin St. Louis Lightning teammates raising Stanley Cup , high school hockey coach onetime ESPN analyst Barry Melrose said . Obviously , Im hoping sides come reach agreement , Im starting think misses hockey anymore . Nope . old Barry . Id love catch Atlanta Thrashers game . Observers noted arena doors reopen , NHL face greater challenge convincing fans return hockey instead watching popular sports like football , basketball , baseball , SlamBall .
Token IDs: tensor([  101,  1376,  4967,  2403,   117,  4700,  3899, 15865, 15989,  1638,
         1480,   117,  9170,  3509,  3659,  6948,  4776,  1214,  6521,  2139,
         4594,   117, 17393,  7816,  3311,  3617,   119,  1176,  8128,  2405,
         1457,   119,  2535, 13479, 13646,  5920,  5481,  1635,   117,  1344,
         1278,  4700,  2154,  1141,  

In [None]:
from torch.utils.data import TensorDataset, random_split

# Combine the training inputs into a TensorDataset.
dataset = TensorDataset(input_ids, attention_masks, labels)

# 90% of the samples go to training, the remainder to validation.
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size

# Split at random, but with a fixed seed so the same samples land in each subset on
# every run; otherwise validation numbers are not comparable between runs.
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset = random_split(dataset, [train_size, val_size], generator=split_generator)

print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))

43,968 training samples
4,886 validation samples


In [None]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

batch_size = 32

# Training batches are drawn in random order; validation batches in dataset order.
train_sampler = RandomSampler(train_dataset)
val_sampler = SequentialSampler(val_dataset)

train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=batch_size)
validation_dataloader = DataLoader(val_dataset, sampler=val_sampler, batch_size=batch_size)

In [None]:
from transformers import BertForSequenceClassification, DistilBertForSequenceClassification, RobertaForSequenceClassification, AdamW, BertConfig

# RoBERTa encoder from the 'roberta-base' checkpoint with a sequence-classification
# head sized for 4 labels; attention maps and hidden states are not requested.
classifier_options = dict(
    num_labels=4,
    output_attentions=False,
    output_hidden_states=False,
)
model = RobertaForSequenceClassification.from_pretrained("roberta-base", **classifier_options)

# Move the weights to the selected device; as the last expression this also displays the model.
model.to(device)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [None]:
# Get all of the model's parameters as a list of (name, tensor) tuples.
params = list(model.named_parameters())

# Report the actual model class; the message used to hard-code "BERT", which is
# wrong for the RoBERTa classifier this notebook trains.
print('The {:} model has {:} different named parameters.\n'.format(type(model).__name__, len(params)))

# NOTE(review): the slice bounds assume 5 embedding parameters, 16 parameters in the
# first encoder layer and 4 parameters in the classification head; confirm against
# the printed names if the architecture changes.
for section_title, section_params in (
    ('==== Embedding Layer ====\n', params[0:5]),
    ('\n==== First Transformer ====\n', params[5:21]),
    ('\n==== Output Layer ====\n', params[-4:]),
):
    print(section_title)
    for param_name, param_tensor in section_params:
        print("{:<55} {:>12}".format(param_name, str(tuple(param_tensor.size()))))

The BERT model has 201 different named parameters.

==== Embedding Layer ====

bert.embeddings.word_embeddings.weight                  (28996, 768)
bert.embeddings.position_embeddings.weight                (512, 768)
bert.embeddings.token_type_embeddings.weight                (2, 768)
bert.embeddings.LayerNorm.weight                              (768,)
bert.embeddings.LayerNorm.bias                                (768,)

==== First Transformer ====

bert.encoder.layer.0.attention.self.query.weight          (768, 768)
bert.encoder.layer.0.attention.self.query.bias                (768,)
bert.encoder.layer.0.attention.self.key.weight            (768, 768)
bert.encoder.layer.0.attention.self.key.bias                  (768,)
bert.encoder.layer.0.attention.self.value.weight          (768, 768)
bert.encoder.layer.0.attention.self.value.bias                (768,)
bert.encoder.layer.0.attention.output.dense.weight        (768, 768)
bert.encoder.layer.0.attention.output.dense.bias              (

In [None]:
# Use PyTorch's own AdamW: the AdamW class shipped with transformers is deprecated.
# weight_decay=0.0 keeps the default of the transformers implementation
# (torch.optim.AdamW would otherwise default to 0.01).
optimizer = torch.optim.AdamW(model.parameters(),
                              lr = 2e-5,
                              eps = 1e-8,
                              weight_decay = 0.0
                             )




In [None]:
from transformers import get_linear_schedule_with_warmup

epochs = 2

# The training loop steps the scheduler once per batch, so the schedule spans
# (batches per epoch) * (epochs) steps.
total_steps = epochs * len(train_dataloader)

# Linear learning-rate schedule over all training steps, with no warm-up steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_training_steps=total_steps,
    num_warmup_steps=0,
)

In [None]:
# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    """Return the fraction of rows in `preds` whose arg-max column equals the label.

    `preds` holds one row of scores per example; `labels` is an array of class ids
    that is flattened before the comparison.
    """
    truth = labels.flatten()
    guesses = np.argmax(preds, axis=1).flatten()
    hits = np.sum(guesses == truth)
    return hits / len(truth)

In [None]:
import time
import datetime

def format_time(elapsed):
    '''
    Render a duration given in seconds as an h:mm:ss string.

    The value is rounded to the nearest whole second before formatting.
    '''
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [None]:
# One dict of statistics per epoch; turned into a table after training.
training_stats = []

# Measure the total training time for the whole run.
total_t0 = time.time()

# For each epoch...
for epoch_i in range(0, epochs):

    # ========================================
    #               Training
    # ========================================

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Measure how long the training epoch takes.
    t0 = time.time()

    # Sum of the per-batch losses for this epoch.
    total_train_loss = 0

    # Switch the model to training mode.
    model.train()

    # For each batch of training data...
    for step, batch in enumerate(train_dataloader):

        # Progress report every 40 batches (skipping the very first one).
        if step % 40 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)

            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # Each batch is (input ids, attention mask, labels); move all three to the device.
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Clear gradients left over from the previous step before the backward pass.
        model.zero_grad()

        # Passing labels makes the model return the loss along with the logits.
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        labels=b_labels)

        loss = outputs.loss

        total_train_loss += loss.item()

        loss.backward()

        # Clip the gradient norm to 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()

        # Update the learning rate.
        scheduler.step()

    # Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / len(train_dataloader)

    # Measure how long this epoch took.
    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(training_time))

    # ========================================
    #               Validation
    # ========================================

    print("")
    print("Running Validation...")

    t0 = time.time()

    # Put model into eval mode
    model.eval()

    # Tracking variables
    total_eval_correct = 0    # number of correctly classified validation examples
    total_eval_examples = 0   # number of validation examples seen
    total_eval_loss = 0

    # Evaluate data for one epoch
    for batch in validation_dataloader:

        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # No gradients are needed for evaluation.
        with torch.no_grad():
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask,
                            labels=b_labels)

            loss = outputs.loss
            logits = outputs.logits

        # Accumulate the validation loss.
        total_eval_loss += loss.item()

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Weight each batch's accuracy by its size: the last batch is usually smaller,
        # so a plain mean of per-batch accuracies would give its examples extra weight.
        total_eval_correct += flat_accuracy(logits, label_ids) * len(label_ids)
        total_eval_examples += len(label_ids)

    # Report the final accuracy for this validation run (per example, not per batch).
    avg_val_accuracy = total_eval_correct / total_eval_examples
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))

    # Calculate the average loss over all of the batches.
    avg_val_loss = total_eval_loss / len(validation_dataloader)

    # Measure how long the validation run took.
    validation_time = format_time(time.time() - t0)

    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    # Record all statistics from this epoch.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))


Training...
  Batch    40  of  1,374.    Elapsed: 0:00:54.
  Batch    80  of  1,374.    Elapsed: 0:01:50.
  Batch   120  of  1,374.    Elapsed: 0:02:46.
  Batch   160  of  1,374.    Elapsed: 0:03:43.
  Batch   200  of  1,374.    Elapsed: 0:04:39.
  Batch   240  of  1,374.    Elapsed: 0:05:36.
  Batch   280  of  1,374.    Elapsed: 0:06:33.
  Batch   320  of  1,374.    Elapsed: 0:07:29.
  Batch   360  of  1,374.    Elapsed: 0:08:26.
  Batch   400  of  1,374.    Elapsed: 0:09:23.
  Batch   440  of  1,374.    Elapsed: 0:10:19.
  Batch   480  of  1,374.    Elapsed: 0:11:16.
  Batch   520  of  1,374.    Elapsed: 0:12:13.
  Batch   560  of  1,374.    Elapsed: 0:13:09.
  Batch   600  of  1,374.    Elapsed: 0:14:06.
  Batch   640  of  1,374.    Elapsed: 0:15:03.
  Batch   680  of  1,374.    Elapsed: 0:15:59.
  Batch   720  of  1,374.    Elapsed: 0:16:56.
  Batch   760  of  1,374.    Elapsed: 0:17:52.
  Batch   800  of  1,374.    Elapsed: 0:18:49.
  Batch   840  of  1,374.    Elapsed: 0:19:46.


In [None]:
# Roberta 2 epochs stopword removal

# One row per epoch, indexed by the epoch number.
df_stats = pd.DataFrame(training_stats).set_index('epoch')

# Display the table.
df_stats

Unnamed: 0_level_0,Training Loss,Valid. Loss,Valid. Accur.,Training Time,Validation Time
epoch,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0.107311,0.035412,0.989899,0:09:07,0:00:18
2,0.024053,0.029761,0.992239,0:09:10,0:00:18


In [None]:
# Roberta 2 epochs stopword punc removal 256

# One row per epoch, indexed by the epoch number.
df_stats = pd.DataFrame(training_stats).set_index('epoch')

# Display the table.
df_stats

Unnamed: 0_level_0,Training Loss,Valid. Loss,Valid. Accur.,Training Time,Validation Time
epoch,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0.074543,0.021193,0.995711,0:32:29,0:01:13
2,0.006997,0.022777,0.995302,0:32:28,0:01:13


## Evaluate on Test Set

In [None]:
# Read the test set; the file has no header row, so the columns are named here.
test_csv_path = "/content/drive/MyDrive/raw_data/raw_data/balancedtest.csv"
df = pd.read_csv(test_csv_path, names=['label', 'text'], header=None)

# Apply the same stop-word filtering that is applied to the training text.
df = stopword_remover(df, 'text')

# Report the number of articles.
print('Number of test articles: {:,}\n'.format(len(df)))

Number of test articles: 3,000



In [None]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# Create sentence and label lists
articles = df.text.values
labels = df.label.values - 1

# Tokenize all of the articles in one batched call and map the tokens to their ids.
# The settings match the training-time encoding: special tokens added, sequences
# truncated to 256 ids and padded up to 256 ids.
encoded_batch = tokenizer(
    list(articles),
    add_special_tokens = True,
    truncation = True,
    max_length = 256,
    padding = 'max_length',  # replaces the deprecated pad_to_max_length=True
    return_attention_mask = True,
    return_tensors = 'pt',
)

input_ids = encoded_batch['input_ids']
attention_masks = encoded_batch['attention_mask']
labels = torch.as_tensor(labels)

# Set the batch size.
batch_size = 32

# Create the DataLoader; batches are served in dataset order so predictions line up with the rows.
prediction_data = TensorDataset(input_ids, attention_masks, labels)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)



In [None]:
# Prediction on test set

print('Predicting labels for {:,} test articles...'.format(len(input_ids)))

# Put model in evaluation mode
model.eval()

# Per-batch logits and per-batch true labels, both as NumPy arrays.
predictions, true_labels = [], []

for batch in prediction_dataloader:
  # Move the three tensors of the batch to the device and unpack them in one go.
  b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)

  # Forward pass only; no labels are passed, so only the logits are of interest.
  with torch.no_grad():
      outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

  # Bring logits and labels back to the CPU and store them.
  predictions.append(outputs.logits.detach().cpu().numpy())
  true_labels.append(b_labels.to('cpu').numpy())

print('    DONE.')

Predicting labels for 3,000 test articles...
    DONE.


In [None]:
# Roberta 2 epochs stopword removal 64

from sklearn.metrics import accuracy_score, f1_score

# Calculate accuracy and F1 score.
# The per-batch lists are concatenated into new names so that `predictions` and
# `true_labels` stay untouched and this cell can be re-run.
all_logits = np.concatenate(predictions, axis=0)
all_true_labels = np.concatenate(true_labels)
pred_labels = np.argmax(all_logits, axis=1)
acc = accuracy_score(all_true_labels, pred_labels)
# Micro-averaged F1 equals accuracy when every example has exactly one label.
f1 = f1_score(all_true_labels, pred_labels, average='micro')

print("Accuracy:", acc)
print("F1 Score:", f1)

Accuracy: 0.7063333333333334
F1 Score: 0.7063333333333334


In [None]:
# Roberta 2 epochs stopword removal 256 micro

from sklearn.metrics import accuracy_score, f1_score

# Calculate accuracy and F1 score.
# The per-batch lists are concatenated into new names so that `predictions` and
# `true_labels` stay untouched and this cell can be re-run.
all_logits = np.concatenate(predictions, axis=0)
all_true_labels = np.concatenate(true_labels)
pred_labels = np.argmax(all_logits, axis=1)
acc = accuracy_score(all_true_labels, pred_labels)
# Micro-averaged F1 equals accuracy when every example has exactly one label.
f1 = f1_score(all_true_labels, pred_labels, average='micro')

print("Accuracy:", acc)
print("F1 Score:", f1)

Accuracy: 0.7023333333333334
F1 Score: 0.7023333333333334


In [None]:
# BERT 64

from sklearn.metrics import accuracy_score, f1_score

# Calculate accuracy and F1 score.
# The per-batch lists are concatenated into new names so that `predictions` and
# `true_labels` stay untouched and this cell can be re-run.
all_logits = np.concatenate(predictions, axis=0)
all_true_labels = np.concatenate(true_labels)
pred_labels = np.argmax(all_logits, axis=1)
acc = accuracy_score(all_true_labels, pred_labels)
# Micro-averaged F1 equals accuracy when every example has exactly one label.
f1 = f1_score(all_true_labels, pred_labels, average='micro')

print("Accuracy:", acc)
print("F1 Score:", f1)

Accuracy: 0.67
F1 Score: 0.67


In [None]:
# Roberta 2 epochs stopword removal 256

from sklearn.metrics import accuracy_score, f1_score

# Calculate accuracy and F1 score.
# The per-batch lists are concatenated into new names so that `predictions` and
# `true_labels` stay untouched and this cell can be re-run.
all_logits = np.concatenate(predictions, axis=0)
all_true_labels = np.concatenate(true_labels)
pred_labels = np.argmax(all_logits, axis=1)
acc = accuracy_score(all_true_labels, pred_labels)
# Support-weighted mean of the per-class F1 scores.
f1 = f1_score(all_true_labels, pred_labels, average='weighted')

print("Accuracy:", acc)
print("F1 Score:", f1)

Accuracy: 0.7113333333333334
F1 Score: 0.6528983422524486


## Save Model

In [None]:
import os

# Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()

output_dir = '/content/drive/MyDrive/raw_data/roberta_256_stopword/'

# Create the output directory if needed; exist_ok avoids a separate existence check.
os.makedirs(output_dir, exist_ok=True)

print("Saving model to %s" % output_dir)

# Save the trained model, its configuration and the tokenizer using `save_pretrained()`.
# They can then be reloaded using `from_pretrained()`.
# If the model is wrapped (it exposes `.module`), save the wrapped model instead.
model_to_save = model.module if hasattr(model, 'module') else model
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

Saving model to /content/drive/MyDrive/raw_data/roberta_256_stopword/


('/content/drive/MyDrive/raw_data/roberta_256_stopword/tokenizer_config.json',
 '/content/drive/MyDrive/raw_data/roberta_256_stopword/special_tokens_map.json',
 '/content/drive/MyDrive/raw_data/roberta_256_stopword/vocab.json',
 '/content/drive/MyDrive/raw_data/roberta_256_stopword/merges.txt',
 '/content/drive/MyDrive/raw_data/roberta_256_stopword/added_tokens.json',
 '/content/drive/MyDrive/raw_data/roberta_256_stopword/tokenizer.json')

## Load model

In [None]:
from transformers import AutoModel, AutoModelForSequenceClassification, AutoTokenizer

saved_model_dir = "/content/drive/MyDrive/raw_data/roberta_256_stopword/"

# Load with the sequence-classification auto class so the fine-tuned classifier head
# is restored. Plain AutoModel builds the bare encoder and discards the saved
# `classifier.*` weights, leaving a model that cannot produce class logits.
model = AutoModelForSequenceClassification.from_pretrained(saved_model_dir)
tokenizer = AutoTokenizer.from_pretrained(saved_model_dir)

model.to(device)


Some weights of the model checkpoint at /content/drive/MyDrive/raw_data/roberta_256_stopword/ were not used when initializing RobertaModel: ['classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.out_proj.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at /content/drive/MyDrive/raw_data/roberta_256_stopword/ and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able

RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0): RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dropout): Drop