In [1]:
# Install the transformers library
!pip install transformers
!pip install datasets
!pip install wandb
!pip install pytorch-lightning

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 18.1 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 64.3 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 64.9 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.7.1-py3-none-any.whl (451 kB)
[K     |████████████████████████████████| 451 kB 2

In [1]:
!nvidia-smi

Sun Dec  4 12:33:51 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   70C    P0    29W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
import torch
from torch import nn, optim
import torch.nn.functional as F
import pytorch_lightning as pl
from torch.utils.data import Dataset, DataLoader
import wandb
import pandas as pd
import numpy as np
from pathlib import Path
from datetime import timedelta
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
import helpers as hlp
from sklearn.model_selection import train_test_split
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    TrainingArguments, Trainer, DataCollatorWithPadding
)
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f'{device=}')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


device=device(type='cuda', index=0)


In [3]:
t_pos = pd.read_table("train_pos_full.txt", header=None, names=['tweet'], dtype=str,on_bad_lines='skip')
t_pos['label'] = 1
t_neg = pd.read_table("train_neg_full.txt", header=None, names=['tweet'], dtype=str,on_bad_lines='skip')
t_neg['label'] = 0
df = pd.concat((t_pos,t_neg), ignore_index=True)
#df['tweet'] = df['tweet'].apply(lambda x: hlp.remove_stopwords(x))
#df['tweet'] = df['tweet'].apply(lambda x: hlp.remove_punct(x))
df['tweet'] = df['tweet'].apply(lambda x: hlp.add_space(x))
df['tweet'] = df['tweet'].apply(lambda x: hlp.remove_white_space(x))
#df['tweet'] = df['tweet'].apply(lambda x: hlp.remove_words_digits(x))
df['tweet'] = df['tweet'].apply(lambda x: hlp.to_lower(x))
#df['tweet'] = df['tweet'].apply(lambda x: hlp.remove_specific_words(x))
df['tweet'] = df['tweet'].apply(lambda x: hlp.remove_repeating_char(x))
#df['tweet'] = df['tweet'].apply(lambda x: hlp.remove_single_char(x))
df['tweet'] = df['tweet'].apply(lambda x: hlp.lemmatize(x))
train_df, val_df = train_test_split(df,test_size=.05,random_state=42)
print(train_df.shape,val_df.shape)

(1684834, 2) (88676, 2)


In [4]:
class TweetDataset(Dataset):
  def __init__(self,tweets_df, tokenizer, max_len):
    super().__init__()
    self.tweets = tweets_df.tweet.to_numpy()
    self.targets = tweets_df.label.to_numpy()
    self.tokenizer = tokenizer
    self.max_len = max_len
  
  def __len__(self):
    return len(self.tweets)
  
  def __getitem__(self, item):
    tweets = str(self.tweets[item])
    targets = self.targets[item]
    encoding = self.tokenizer.encode_plus(
      tweets,
      add_special_tokens=True,
      max_length=self.max_len,
      return_token_type_ids=False,
      pad_to_max_length=True,
      return_attention_mask=True,
      return_tensors='pt',
    )
    return {
      'tweet_text': tweets,
      'input_ids': encoding['input_ids'].flatten(),
      'attention_mask': encoding['attention_mask'].flatten(),
      'targets': torch.tensor(targets, dtype=torch.long)
    }
  
class TweetDataModule(pl.LightningDataModule):

  def __init__(self,train_dataset,val_dataset, batch_size=8):
    super().__init__()
    self.train_dataset = train_dataset
    self.val_dataset = val_dataset
    self.batch_size = batch_size

  def train_dataloader(self):
      return DataLoader(self.train_dataset,self.batch_size)

  def val_dataloader(self):
      return DataLoader(self.val_dataset, self.batch_size)


class SentimentClassifier(pl.LightningModule):
  def __init__(self, model_name, lr = 2e-5, adam_eps=1e-8, weight_decay=0., warmup_steps=0):
    super(SentimentClassifier, self).__init__()
    self.save_hyperparameters()
    self.bert = BertModel.from_pretrained(model_name)
    self.drop = nn.Dropout(p=0.3)
    self.out = nn.Linear(self.bert.config.hidden_size, 2)
    self.loss_fn = nn.CrossEntropyLoss().to(device)
    self.lr = lr
    self.adam_eps = adam_eps
    self.warmup_steps = warmup_steps

  def forward(self, input_ids, attention_mask):
    pooled_output = self.bert(
      input_ids=input_ids,
      attention_mask=attention_mask
    )[1]
    output = self.drop(pooled_output)
    return F.softmax(self.out(output), dim=1)

  def training_step(self, batch, batch_idx):
      input_ids = batch['input_ids']
      attention_mask = batch['attention_mask']
      targets = batch['targets']
      logits = self.forward(input_ids,attention_mask)

      loss = self.loss_fn(logits,targets)
      self.log('train/loss', loss)
      return loss

  def validation_step(self, batch, batch_idx):
      input_ids = batch['input_ids']
      attention_mask = batch['attention_mask']
      targets = batch['targets']
      logits = self.forward(input_ids,attention_mask)
      preds = torch.argmax(logits,dim=1)
      acc = (preds == targets).sum()/len(targets)
      loss = self.loss_fn(logits,targets)
      self.log_dict({'val/loss': loss, 'val/acc': acc})
      return {'val_loss':loss, 'val_accuracy': acc}

  def configure_optimizers(self):
      optimizer = torch.optim.Adam(
          self.parameters(),
          lr=self.lr,
          eps=self.adam_eps
          )
      
      scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=self.warmup_steps,
            num_training_steps=self.trainer.estimated_stepping_batches,
        )
      scheduler = {"scheduler": scheduler, "interval": "step", "frequency": 1}
      return [optimizer], [scheduler]


In [None]:
wandb.init(project="TweetSentimentClassifier", entity="tcastigl", name='epochs1_big_dataset')
pl.seed_everything(42)
BATCH_SIZE=32
TOKENIZER_MAX_LEN=128
PRE_TRAINED_MODEL_NAME = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

dm = TweetDataModule(
    TweetDataset(train_df,tokenizer, TOKENIZER_MAX_LEN),
    TweetDataset(val_df,tokenizer, TOKENIZER_MAX_LEN),
    batch_size=BATCH_SIZE
  )
wandb.config.update({'batch_size': BATCH_SIZE,
                     'dataset':  'small',
                     'tokenizer_max_len': TOKENIZER_MAX_LEN})

model = SentimentClassifier(PRE_TRAINED_MODEL_NAME)
model = model.to(device)

logger = pl.loggers.WandbLogger(
    save_dir='logs',
    project='bertClassifier',
    name='try',
)
checkpoint_callback = pl.callbacks.ModelCheckpoint(
    dirpath=Path('logs/', 'checkpoints'),
    filename="best-model-epoch={epoch:02d}",
    monitor="loss/val",
    train_time_interval=timedelta(minutes=20),
    save_on_train_epoch_end=True,
    # auto_insert_metric_name=False,
    save_top_k=1,
    save_last=True,
    mode="min",
)
trainer = pl.Trainer(
    max_epochs=1,
    logger=logger,
    callbacks= [checkpoint_callback],
    log_every_n_steps=500,
    val_check_interval=5000,
    enable_progress_bar=True,
    accelerator='gpu',
    devices=1
)


trainer.fit(model, datamodule=dm)
wandb.finish()

ERROR:wandb.jupyter:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mtcastigl[0m. Use [1m`wandb login --relogin`[0m to force relogin


INFO:lightning_lite.utilities.seed:Global seed set to 42
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  rank_zero_warn(
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: Tru

Sanity Checking: 0it [00:00, ?it/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Training: 0it [00:00, ?it/s]

In [None]:
torch.cuda.empty_cache()
data=next(iter(train_dataloader))
input_ids = data['input_ids'].to(device)
attention_mask = data['attention_mask'].to(device)
logits = model.forward(input_ids,attention_mask)



NameError: ignored

In [None]:
loss_fn = nn.CrossEntropyLoss().to(device)
loss_fn(logits,data['targets'].to(device))

tensor(0.7074, device='cuda:0', grad_fn=<NllLossBackward0>)

In [None]:
a = torch.tensor([0,0,1])
b = torch.tensor([0,1,1])
(a==b).sum()/len(a)

tensor(0.6667)

INFO:lightning_lite.utilities.seed:Global seed set to 42
Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
ERROR:wandb.jupyter:Failed to detect the name of this notebook, you can set it manually with the WAN

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 
Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/click/termui.py", line 129, in prompt_func
    return f("")
  File "/usr/local/lib/python3.8/dist-packages/click/termui.py", line 54, in hidden_prompt_func
    return getpass.getpass(prompt)
  File "/usr/local/lib/python3.8/dist-packages/ipykernel/kernelbase.py", line 843, in getpass
    return self._input_request(prompt,
  File "/usr/local/lib/python3.8/dist-packages/ipykernel/kernelbase.py", line 904, in _input_request
    raise KeyboardInterrupt("Interrupted by user") from None
KeyboardInterrupt: Interrupted by user

During handling of the above exception, another exception occurred:

Traceback (most recent call 

Exception: ignored

In [None]:
prediction = trainer.predict(tokenized_datasets['val'])
preds = prediction.predictions.argmax(1)
(preds == tokenized_datasets['val']['label']).mean()

In [None]:
# Load data
test_df = pd.read_table("test_data.txt", header=None, names=['tweet'], dtype=str,on_bad_lines='skip')

# Processing
#test_df['tweet'] = test_df['tweet'].apply(lambda x: hlp.remove_stopwords(x))
#test_df['tweet'] = test_df['tweet'].apply(lambda x: hlp.remove_punct(x))
test_df['tweet'] = test_df['tweet'].apply(lambda x: hlp.add_space(x))
test_df['tweet'] = test_df['tweet'].apply(lambda x: hlp.remove_white_space(x))
#test_df['tweet'] = test_df['tweet'].apply(lambda x: hlp.remove_words_digits(x))
test_df['tweet'] = test_df['tweet'].apply(lambda x: hlp.to_lower(x))
#test_df['tweet'] = test_df['tweet'].apply(lambda x: hlp.remove_specific_words(x))
test_df['tweet'] = test_df['tweet'].apply(lambda x: hlp.remove_repeating_char(x))
test_df['tweet'] = test_df['tweet'].apply(lambda x: hlp.lemmatize(x))

test_dataset=DatasetDict()
test_dataset["test"]=Dataset.from_pandas(test_df)

def preprocess_function(examples):
    return tokenizer(examples["tweet"], truncation=True)

tokenized_test_dataset = test_dataset.map(preprocess_function, 
                                      batched=True, 
                                      num_proc=4)
del test_dataset #save space

In [None]:
test_predictions = trainer.predict(tokenized_test_dataset['test'])
test_preds = test_predictions.predictions.argmax(1)
test_preds[test_preds == 0] = -1
test_preds = pd.DataFrame(test_preds)
test_preds.index += 1
test_preds.to_csv('test_preds.csv',header='Prediction')