In [1]:
# Install the transformers library
#!pip install transformers
#!pip install datasets
#!pip install wandb
#!pip install pytorch-lightning

In [2]:
!nvidia-smi

Sun Dec  4 17:56:26 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   54C    P0    28W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
import torch
from torch import nn, optim
import torch.nn.functional as F
import pytorch_lightning as pl
from torch.utils.data import Dataset, DataLoader
import wandb
import pandas as pd
import numpy as np
from pathlib import Path
from datetime import timedelta
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
import helpers as hlp
from sklearn.model_selection import train_test_split
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    TrainingArguments, Trainer, DataCollatorWithPadding
)
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f'{device=}')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


device=device(type='cuda', index=0)


In [4]:
t_pos = pd.read_table("train_pos.txt", header=None, names=['tweet'], dtype=str,on_bad_lines='skip')
t_pos['label'] = 1
t_neg = pd.read_table("train_neg.txt", header=None, names=['tweet'], dtype=str,on_bad_lines='skip')
t_neg['label'] = 0
df = pd.concat((t_pos,t_neg), ignore_index=True)
#df['tweet'] = df['tweet'].apply(lambda x: hlp.remove_stopwords(x))
#df['tweet'] = df['tweet'].apply(lambda x: hlp.remove_punct(x))
df['tweet'] = df['tweet'].apply(lambda x: hlp.add_space(x))
df['tweet'] = df['tweet'].apply(lambda x: hlp.remove_white_space(x))
#df['tweet'] = df['tweet'].apply(lambda x: hlp.remove_words_digits(x))
df['tweet'] = df['tweet'].apply(lambda x: hlp.to_lower(x))
#df['tweet'] = df['tweet'].apply(lambda x: hlp.remove_specific_words(x))
df['tweet'] = df['tweet'].apply(lambda x: hlp.remove_repeating_char(x))
#df['tweet'] = df['tweet'].apply(lambda x: hlp.remove_single_char(x))
df['tweet'] = df['tweet'].apply(lambda x: hlp.lemmatize(x))
train_df, val_df = train_test_split(df,test_size=.1,random_state=42)
print(train_df.shape,val_df.shape)

(177273, 2) (19697, 2)


In [5]:
class TweetDataset(Dataset):
  """
  torch dataset class specific for our project
  """
  def __init__(self,tweets_df, tokenizer, max_len):
    super().__init__()
    self.tweets = tweets_df.tweet.to_numpy()
    self.targets = tweets_df.label.to_numpy()
    self.tokenizer = tokenizer
    self.max_len = max_len
  
  def __len__(self):
    return len(self.tweets)
  
  def __getitem__(self, item):
    """
    encodes the input with the bert tokenizer
    """
    tweets = str(self.tweets[item])
    targets = self.targets[item]
    encoding = self.tokenizer.encode_plus(
      tweets,
      add_special_tokens=True,
      max_length=self.max_len,
      return_token_type_ids=False,
      pad_to_max_length=True,
      return_attention_mask=True,
      return_tensors='pt',
    )
    return {
      'tweet_text': tweets,
      'input_ids': encoding['input_ids'].flatten(),
      'attention_mask': encoding['attention_mask'].flatten(),
      'targets': torch.tensor(targets, dtype=torch.long)
    }
  
class TweetDataModule(pl.LightningDataModule):

  def __init__(self,train_dataset, val_dataset, batch_size=8):
    super().__init__()
    self.train_dataset = train_dataset
    self.val_dataset = val_dataset
    self.batch_size = batch_size

  def train_dataloader(self):
      return DataLoader(self.train_dataset,self.batch_size)

  def val_dataloader(self):
      return DataLoader(self.val_dataset, self.batch_size)


class SentimentClassifier(pl.LightningModule):
  """
  Sentiment classifier model. adds a feed-forward NN to bert-base-uncased.
  """
  def __init__(self, model_name, lr = 2e-5, adam_eps=1e-8, weight_decay=0., warmup_steps=0):
    super(SentimentClassifier, self).__init__()
    self.save_hyperparameters()
    self.bert = BertModel.from_pretrained(model_name)
    self.drop = nn.Dropout(p=0.3)
    self.out = nn.Linear(self.bert.config.hidden_size, 2)
    self.loss_fn = nn.CrossEntropyLoss().to(device)
    self.lr = lr
    self.adam_eps = adam_eps
    self.warmup_steps = warmup_steps

  def forward(self, input_ids, attention_mask):
    pooled_output = self.bert(
      input_ids=input_ids,
      attention_mask=attention_mask
    )[1]
    output = self.drop(pooled_output)
    return F.softmax(self.out(output), dim=1)

  def training_step(self, batch, batch_idx):
      input_ids = batch['input_ids']
      attention_mask = batch['attention_mask']
      targets = batch['targets']
      logits = self.forward(input_ids,attention_mask)

      loss = self.loss_fn(logits,targets)
      self.log('train/loss', loss)
      return loss

  def validation_step(self, batch, batch_idx):
      input_ids = batch['input_ids']
      attention_mask = batch['attention_mask']
      targets = batch['targets']
      logits = self.forward(input_ids,attention_mask)
      preds = torch.argmax(logits,dim=1)
      acc = (preds == targets).sum()/len(targets)
      loss = self.loss_fn(logits,targets)
      self.log_dict({'val/loss': loss, 'val/acc': acc})
      return {'val_loss':loss, 'val_accuracy': acc}

  def predict_step(self, batch, batch_idx):
      input_ids = batch['input_ids']
      attention_mask = batch['attention_mask']
      logits = self.forward(input_ids,attention_mask)
      preds = torch.argmax(logits,dim=1)
      return logits, preds
  
  def configure_optimizers(self):
      optimizer = torch.optim.Adam(
          self.parameters(),
          lr=self.lr,
          eps=self.adam_eps
          )
      
      scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=self.warmup_steps,
            num_training_steps=self.trainer.estimated_stepping_batches,
        )
      scheduler = {"scheduler": scheduler, "interval": "step", "frequency": 1}
      return [optimizer], [scheduler]


In [None]:
wandb.init(project="TweetSentimentClassifier", entity="tcastigl", name='epochs1_small_dataset')
pl.seed_everything(42)
BATCH_SIZE=64
TOKENIZER_MAX_LEN=128
PRE_TRAINED_MODEL_NAME = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

dm = TweetDataModule(
    TweetDataset(train_df,tokenizer, TOKENIZER_MAX_LEN),
    TweetDataset(val_df,tokenizer, TOKENIZER_MAX_LEN),
    batch_size=BATCH_SIZE
  )
wandb.config.update({'batch_size': BATCH_SIZE,
                     'dataset':  'small',
                     'tokenizer_max_len': TOKENIZER_MAX_LEN})

#model = SentimentClassifier(PRE_TRAINED_MODEL_NAME, weight_decay=.2)
model = SentimentClassifier.load_from_checkpoint("logs/checkpoints/last.ckpt")
model = model.to(device)

logger = pl.loggers.WandbLogger(
    save_dir='logs',
    project='bertClassifier',
    name='try',
    resume='must'
)
checkpoint_callback = pl.callbacks.ModelCheckpoint(
    dirpath=Path('logs/', 'checkpoints'),
    filename="best-model-epoch={epoch:02d}",
    monitor="val/loss",
    train_time_interval=timedelta(minutes=20),
    save_on_train_epoch_end=True,
    # auto_insert_metric_name=False,
    save_top_k=1,
    save_last=True,
    mode="min",
)
trainer = pl.Trainer(
    max_epochs=2,
    logger=logger,
    callbacks= [checkpoint_callback],
    log_every_n_steps=50,
    val_check_interval=500,
    enable_progress_bar=True,
    accelerator='gpu',
    devices=1
)


trainer.fit(model, datamodule=dm)
wandb.finish()

ERROR:wandb.jupyter:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mtcastigl[0m. Use [1m`wandb login --relogin`[0m to force relogin


INFO:lightning_lite.utilities.seed:Global seed set to 42
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  rank_zero_warn(
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: Tru

Sanity Checking: 0it [00:00, ?it/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

In [None]:
test_df = pd.read_table("test_data.txt", header=None, dtype=str, names=['tweet'], on_bad_lines='skip')
for i, tweet in enumerate(test_df.tweet):
  test_df.loc[i, 'tweet'] = ''.join(tweet.split(',')[1:])

# Processing
#test_df['tweet'] = test_df['tweet'].apply(lambda x: hlp.remove_stopwords(x))
#test_df['tweet'] = test_df['tweet'].apply(lambda x: hlp.remove_punct(x))
test_df['tweet'] = test_df['tweet'].apply(lambda x: hlp.add_space(x))
test_df['tweet'] = test_df['tweet'].apply(lambda x: hlp.remove_white_space(x))
#test_df['tweet'] = test_df['tweet'].apply(lambda x: hlp.remove_words_digits(x))
test_df['tweet'] = test_df['tweet'].apply(lambda x: hlp.to_lower(x))
#test_df['tweet'] = test_df['tweet'].apply(lambda x: hlp.remove_specific_words(x))
test_df['tweet'] = test_df['tweet'].apply(lambda x: hlp.remove_repeating_char(x))
test_df['tweet'] = test_df['tweet'].apply(lambda x: hlp.lemmatize(x))
test_df.shape


In [None]:
class TweetTestDataset(Dataset):
  def __init__(self,tweets_df, tokenizer, max_len):
    super().__init__()
    self.tweets = tweets_df.tweet.to_numpy()
    self.tokenizer = tokenizer
    self.max_len = max_len
  
  def __len__(self):
    return len(self.tweets)
  
  def __getitem__(self, item):
    tweets = str(self.tweets[item])
    encoding = self.tokenizer.encode_plus(
      tweets,
      add_special_tokens=True,
      max_length=self.max_len,
      return_token_type_ids=False,
      pad_to_max_length=True,
      return_attention_mask=True,
      return_tensors='pt',
    )
    return {
      'tweet_text': tweets,
      'input_ids': encoding['input_ids'].flatten(),
      'attention_mask': encoding['attention_mask'].flatten(),
    }

In [None]:
test_dataloader = DataLoader(TweetTestDataset(test_df,tokenizer, TOKENIZER_MAX_LEN))
model = SentimentClassifier.load_from_checkpoint("logs/checkpoints/last.ckpt")
preds = trainer.predict(model, test_dataloader)

In [None]:
test_preds = np.array([int(preds[i][1]) for i in range(len(preds))])
test_preds[test_preds == 0] = -1
test_preds = pd.DataFrame(test_preds)
test_preds.index += 1
test_preds.to_csv('test_preds.csv',header='Prediction')