In [None]:
from pathlib import Path
import sys
import os
BASE_PATH = Path('/kaggle/input/tweet-dataset/')
print(sys.path)
sys.path.append('/kaggle/input/tweet-dataset/')
sys.path.append('/kaggle/input/bertClassifier-ckpt/')
sys.path.append('/kaggle/input/notebook9c29aaf992/')
print(sys.path)
os.environ["WANDB_API_KEY"] = '9583728ef6ed77991e73653a08b4ee2d328b1fd9'

In [2]:
import torch
from torch import nn, optim
import torch.nn.functional as F
import pytorch_lightning as pl
from torch.utils.data import Dataset, DataLoader
import wandb
import pandas as pd
import numpy as np
from pathlib import Path
from datetime import timedelta
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
dler = nltk.downloader.Downloader()
dler._update_index()
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
import helpers as hlp
from sklearn.model_selection import train_test_split
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    TrainingArguments, Trainer, DataCollatorWithPadding,
    AutoModel
)
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
for device_nb in range(torch.cuda.device_count()):
    torch.cuda.set_device(device_nb)
    print(torch.cuda.get_device_name(), device_nb)

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...
[nltk_data] Downloading package words to /usr/share/nltk_data...
[nltk_data]   Package words is already up-to-date!


Tesla T4 0
Tesla T4 1


In [3]:
print(20*'-','loading data', 20*'-')
t_pos = pd.read_table(BASE_PATH / "train_pos_full.txt", header=None, names=['tweet'], dtype=str,on_bad_lines='skip')
t_pos['label'] = 1
t_neg = pd.read_table(BASE_PATH /"train_neg_full.txt", header=None, names=['tweet'], dtype=str,on_bad_lines='skip')
t_neg['label'] = 0
df = pd.concat((t_pos,t_neg), ignore_index=True)
print(20*'-','preprocessing data', 20*'-')
#df['tweet'] = df['tweet'].apply(lambda x: hlp.remove_stopwords(x))
#df['tweet'] = df['tweet'].apply(lambda x: hlp.remove_punct(x))
df['tweet'] = df['tweet'].apply(lambda x: hlp.add_space(x))
df['tweet'] = df['tweet'].apply(lambda x: hlp.remove_white_space(x))
#df['tweet'] = df['tweet'].apply(lambda x: hlp.remove_words_digits(x))
df['tweet'] = df['tweet'].apply(lambda x: hlp.to_lower(x))
#df['tweet'] = df['tweet'].apply(lambda x: hlp.remove_specific_words(x))
df['tweet'] = df['tweet'].apply(lambda x: hlp.remove_repeating_char(x))
#df['tweet'] = df['tweet'].apply(lambda x: hlp.remove_single_char(x))
df['tweet'] = df['tweet'].apply(lambda x: hlp.unslang(x))
df['tweet'] = df['tweet'].apply(lambda x: hlp.lemmatize(x))
train_df, val_df = train_test_split(df,test_size=.025,random_state=42)
print(train_df.shape,val_df.shape)

-------------------- loading data --------------------
-------------------- preprocessing data --------------------
(2396839, 2) (61458, 2)


In [4]:
class TweetDataset(Dataset):
  """
  torch dataset class specific for our project
  """
  def __init__(self,tweets_df, tokenizer, max_len):
    super().__init__()
    self.tweets = tweets_df.tweet.to_numpy()
    self.targets = tweets_df.label.to_numpy()
    self.tokenizer = tokenizer
    self.max_len = max_len
  
  def __len__(self):
    return len(self.tweets)
  
  def __getitem__(self, item):
    """
    encodes the input with the bert tokenizer
    """
    tweets = str(self.tweets[item])
    targets = self.targets[item]
    encoding = self.tokenizer.encode_plus(
      tweets,
      add_special_tokens=True,
      max_length=self.max_len,
      #padding=True,
      #truncation=True,
      return_token_type_ids=False,
      pad_to_max_length=True,
      return_attention_mask=True,
      return_tensors='pt',
    )
    return {
      'tweet_text': tweets,
      'input_ids': encoding['input_ids'].flatten(),
      'attention_mask': encoding['attention_mask'].flatten(),
      'targets': torch.tensor(targets, dtype=torch.long)
    }
  
class TweetDataModule(pl.LightningDataModule):

  def __init__(self,train_dataset, val_dataset, batch_size=8):
    super().__init__()
    self.train_dataset = train_dataset
    self.val_dataset = val_dataset
    self.batch_size = batch_size

  def train_dataloader(self):
      return DataLoader(self.train_dataset,self.batch_size)

  def val_dataloader(self):
      return DataLoader(self.val_dataset, self.batch_size)


class SentimentClassifier(pl.LightningModule):
  """
  Sentiment classifier model. adds a feed-forward NN to bert-base-uncased.
  """
  def __init__(self, model_name, lr = 2e-5, adam_eps=1e-8, weight_decay=0., warmup_steps=0, freeze_bert=False):
    super(SentimentClassifier, self).__init__()
    self.save_hyperparameters()
    self.bert = BertModel.from_pretrained(model_name)
    if freeze_bert:
        self.bert.eval()
            # freeze params
        for param in self.bert.parameters():
                param.requires_grad = False

    self.drop = nn.Dropout(p=0.3)
    self.out = nn.Linear(self.bert.config.hidden_size, 2)
    self.loss_fn = nn.CrossEntropyLoss()
    self.lr = lr
    self.adam_eps = adam_eps
    self.warmup_steps = warmup_steps
    self.freeze_bert = freeze_bert
    
  def forward(self, input_ids, attention_mask):
        
    pooled_output = self.bert(
      input_ids=input_ids,
      attention_mask=attention_mask
    )[1]
    output = self.drop(pooled_output)
    return F.softmax(self.out(output), dim=1)

  def training_step(self, batch, batch_idx):
      input_ids = batch['input_ids']
      attention_mask = batch['attention_mask']
      targets = batch['targets']
      logits = self.forward(input_ids,attention_mask)

      loss = self.loss_fn(logits,targets)
      self.log('train/loss', loss)
      return loss

  def validation_step(self, batch, batch_idx):
      input_ids = batch['input_ids']
      attention_mask = batch['attention_mask']
      targets = batch['targets']
      logits = self.forward(input_ids,attention_mask)
      preds = torch.argmax(logits,dim=1)
      acc = (preds == targets).sum()/len(targets)
      loss = self.loss_fn(logits,targets)
      self.log_dict({'val/loss': loss, 'val/acc': acc})
      return {'val_loss':loss, 'val_accuracy': acc}

  def predict_step(self, batch, batch_idx):
      input_ids = batch['input_ids']
      attention_mask = batch['attention_mask']
      logits = self.forward(input_ids,attention_mask)
      preds = torch.argmax(logits,dim=1)
      return logits, preds
  
  def configure_optimizers(self):
      optimizer = torch.optim.Adam(
          self.parameters(),
          lr=self.lr,
          eps=self.adam_eps
          )
      
      scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=self.warmup_steps,
            num_training_steps=self.trainer.estimated_stepping_batches,
        )
      scheduler = {"scheduler": scheduler, "interval": "step", "frequency": 1}
      return [optimizer], [scheduler]

In [5]:
wandb.init(project="TweetSentimentClassifier", entity="tcastigl", name='bert_base_uncased_unslanged_conti2')
pl.seed_everything(42)
BATCH_SIZE=256
TOKENIZER_MAX_LEN=128
PRE_TRAINED_MODEL_NAME = 'bert-base-uncased'
#tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)
dm = TweetDataModule(
    TweetDataset(train_df,tokenizer, TOKENIZER_MAX_LEN),
    TweetDataset(val_df,tokenizer, TOKENIZER_MAX_LEN),
    batch_size=BATCH_SIZE
  )
wandb.config.update({'batch_size': BATCH_SIZE,
                     'dataset':  'big',
                     'tokenizer_max_len': TOKENIZER_MAX_LEN})

model = SentimentClassifier(PRE_TRAINED_MODEL_NAME, weight_decay=.2)
model = SentimentClassifier.load_from_checkpoint('/kaggle/input/notebook9c29aaf992/logs/checkpoints/last.ckpt', weight_decay=.2)

logger = pl.loggers.WandbLogger(
    save_dir='logs',
    project='TweetSentimentClassifier',
    name='bertweet_try',
    resume='must'
)
checkpoint_callback = pl.callbacks.ModelCheckpoint(
    dirpath=Path('logs/', 'checkpoints'),
    filename="best-model-epoch={epoch:02d}",
    monitor="val/loss",
    train_time_interval=timedelta(minutes=120),
    save_on_train_epoch_end=True,
    # auto_insert_metric_name=False,
    save_top_k=1,
    save_last=True,
    mode="min",
)
trainer = pl.Trainer(
    max_time="00:07:00:00",
    max_epochs=4,
    logger=logger,
    callbacks= [checkpoint_callback],
    log_every_n_steps=10,
    val_check_interval=5000,
    enable_progress_bar=True,
    accelerator='gpu',
    devices=2,
    strategy='dp'
)
print(20*'-','starting training', 20*'-')
trainer.fit(model, datamodule=dm)
#wandb.finish()
print(20*'-','training done', 20*'-')

[34m[1mwandb[0m: Currently logged in as: [33mtcastigl[0m. Use [1m`wandb login --relogin`[0m to force relogin


Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


FileNotFoundError: [Errno 2] No such file or directory: '/kaggle/input/bertClassifier-ckpt/last.ckpt'

In [6]:
print(20*'-','loading test data', 20*'-')
test_df = pd.read_table(BASE_PATH / "test_data.txt", header=None, dtype=str, names=['tweet'], on_bad_lines='skip')
for i, tweet in enumerate(test_df.tweet):
  test_df.loc[i, 'tweet'] = ''.join(tweet.split(',')[1:])
print(20*'-','preprocessing test data', 20*'-')
# Processing
#test_df['tweet'] = test_df['tweet'].apply(lambda x: hlp.remove_stopwords(x))
#test_df['tweet'] = test_df['tweet'].apply(lambda x: hlp.remove_punct(x))
test_df['tweet'] = test_df['tweet'].apply(lambda x: hlp.add_space(x))
test_df['tweet'] = test_df['tweet'].apply(lambda x: hlp.remove_white_space(x))
#test_df['tweet'] = test_df['tweet'].apply(lambda x: hlp.remove_words_digits(x))
test_df['tweet'] = test_df['tweet'].apply(lambda x: hlp.to_lower(x))
#test_df['tweet'] = test_df['tweet'].apply(lambda x: hlp.remove_specific_words(x))
test_df['tweet'] = test_df['tweet'].apply(lambda x: hlp.remove_repeating_char(x))
test_df['tweet'] = test_df['tweet'].apply(lambda x: hlp.lemmatize(x))
test_df.shape

-------------------- loading test data --------------------
-------------------- preprocessing test data --------------------


(10000, 1)

In [7]:
class TweetTestDataset(Dataset):
  def __init__(self,tweets_df, tokenizer, max_len):
    super().__init__()
    self.tweets = tweets_df.tweet.to_numpy()
    self.tokenizer = tokenizer
    self.max_len = max_len
  
  def __len__(self):
    return len(self.tweets)
  
  def __getitem__(self, item):
    tweets = str(self.tweets[item])
    encoding = self.tokenizer.encode_plus(
      tweets,
      add_special_tokens=True,
      max_length=self.max_len,
      return_token_type_ids=False,
      pad_to_max_length=True,
      return_attention_mask=True,
      return_tensors='pt',
    )
    return {
      'tweet_text': tweets,
      'input_ids': encoding['input_ids'].flatten(),
      'attention_mask': encoding['attention_mask'].flatten(),
    }

In [8]:

#BATCH_SIZE=128
#TOKENIZER_MAX_LEN=128
#PRE_TRAINED_MODEL_NAME = 'bert-base-uncased'
#tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)
#test_dataloader = DataLoader(TweetTestDataset(test_df,tokenizer, TOKENIZER_MAX_LEN))
#model = SentimentClassifier.load_from_checkpoint('/kaggle/input/bertclassifiertrained/last (1).ckpt')
'''
logger = pl.loggers.WandbLogger(
    save_dir='logs',
    project='bertClassifier',
    name='try',
)
checkpoint_callback = pl.callbacks.ModelCheckpoint(
    dirpath=Path('logs/', 'checkpoints'),
    filename="best-model-epoch={epoch:02d}",
    monitor="val/loss",
    train_time_interval=timedelta(minutes=120),
    save_on_train_epoch_end=True,
    # auto_insert_metric_name=False,
    save_top_k=1,
    save_last=True,
    mode="min",
)
trainer = pl.Trainer(
    max_time="00:12:00:00",
    max_epochs=2,
    callbacks= [checkpoint_callback],
    log_every_n_steps=50,
    val_check_interval=5000,
    enable_progress_bar=True,
    accelerator='gpu',
    devices=2,
    strategy='dp'
)

wandb.init(project="TweetSentimentClassifier", entity="tcastigl", name='testing_big_dataset_freeze_bert')
'''
print(20*'-','computing and saving predictions', 20*'-')
test_dataloader = DataLoader(TweetTestDataset(test_df,tokenizer, TOKENIZER_MAX_LEN))
preds = trainer.predict(model, test_dataloader)

test_preds = np.array([int(preds[i][1]) for i in range(len(preds))])
test_preds[test_preds == 0] = -1
test_preds = pd.DataFrame(test_preds)
test_preds.index += 1
test_preds.to_csv('test_preds_try.csv',header='Prediction')

-------------------- computing and saving predictions --------------------


Predicting: 106it [00:00, ?it/s]

