
Twitter has become an important communication channel in times of emergency.
The ubiquity of smartphones enables people to announce an emergency they’re observing in real time. Because of this, more agencies (e.g., disaster relief organizations and news agencies) are interested in programmatically monitoring Twitter.

But, it’s not always clear whether a person’s words are actually announcing a disaster. Take this example:

"Oh plus side LOOK AT THE SKY LAST NIGHT IT WAS ABLAZE"


The author explicitly uses the word “ABLAZE” but means it metaphorically. This is clear to a human right away, especially with the visual aid. But it’s less clear to a machine.

I want to build a machine learning model that predicts which tweets are about real disasters and which ones aren’t. I have access to a dataset of 10,000 tweets that were hand-classified. The language is Python, the frameworks are pytorch-lightning and transformers, and the base model is dabert.

The model predicts whether a given tweet is about a real disaster or not. If so, it predicts 1; if not, it predicts 0.


Columns :
id - a unique identifier for each tweet
text - the text of the tweet
location - the location the tweet was sent from (may be blank)
keyword - a particular keyword from the tweet (may be blank)
target - in train.csv only, this denotes whether a tweet is about a real disaster (1) or not (0)

This is Example Code:


import pytorch_lightning as pl
import torch
from torch.utils.data import DataLoader
from transformers import DabertTokenizer, DabertModel
import pandas as pd
from sklearn.model_selection import train_test_split
from torch import nn
import nltk
from nltk import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download('omw-1.4')



def get_sentence(c1, c2, c3):
    """Join three field values into one space-separated string.

    Any argument for which ``pd.isnull`` is true is replaced by an empty
    string first, so the result always contains both separating spaces
    (e.g. two missing leading fields yield two leading spaces).
    """
    fields = ["" if pd.isnull(value) else value for value in (c1, c2, c3)]
    return "{} {} {}".format(*fields)
    
def clean_text(data):
    """
    Add a ``clean_text`` column holding a normalised copy of ``data['text']``.

    Each text is lower-cased and split on whitespace; English stop words and
    tokens that are a single punctuation character are dropped; the remaining
    tokens are lemmatized and re-joined with single spaces. Missing texts are
    treated as empty strings.

    input: data: a dataframe containing texts to be cleaned (column ``text``)
    return: the same dataframe (modified in place) with an added column of
            clean text
    """
    # Imported here because the surrounding file never imports `punctuation`,
    # which made the original implementation fail with a NameError.
    from string import punctuation

    # NOTE(review): stopwords / WordNetLemmatizer need the NLTK 'stopwords'
    # and 'wordnet' corpora; this file only downloads 'omw-1.4' - confirm they
    # are installed in the target environment.
    # A single set gives O(1) membership tests in the per-token filter.
    excluded_tokens = set(stopwords.words('english')) | set(punctuation)
    lemmatizer = WordNetLemmatizer()

    def _clean(text):
        kept = (word for word in text.split() if word not in excluded_tokens)
        return ' '.join(lemmatizer.lemmatize(word) for word in kept)

    # Assign the whole column at once. The previous per-row
    # `data.loc[idx]['clean_text'] = ...` was a chained assignment on a
    # positional index, so the cleaned text was not reliably written back.
    data['clean_text'] = data['text'].fillna('').str.lower().map(_clean)
    return data
    
def _run_text_cleaning(data):
    """Call the module-level ``clean_text`` function on *data*.

    ``DisasterTweetClassifier.__init__`` has a boolean parameter that is also
    named ``clean_text`` and therefore shadows the function inside the
    constructor; this helper resolves the name at module scope instead.
    """
    return clean_text(data)


class DisasterTweetClassifier(pl.LightningModule):
    """Binary tweet classifier: pretrained encoder plus a linear head.

    The constructor loads the training CSV, optionally merges the ``keyword``
    and ``location`` columns into ``text``, optionally cleans the text, and
    splits the rows 80/20 into training and validation frames. The two
    dataloader hooks tokenize those frames on demand.

    Args:
        model_name: identifier passed to ``AutoTokenizer.from_pretrained`` and
            ``AutoModel.from_pretrained``.
        use_other_features: if true, ``text`` becomes
            ``"<keyword> <location> <text>"`` (missing values left blank).
        clean_text: if true, ``clean_text()`` is applied and the model is
            trained on the resulting ``clean_text`` column.
        learning_rate: learning rate used by ``configure_optimizers``.
    """

    def __init__(self, model_name="dabert-base-model", use_other_features=False, clean_text=False,
                 learning_rate=1e-5):
        super().__init__()
        # Imported locally: the import cell preceding this class does not
        # bring AutoTokenizer / AutoModel into scope.
        from transformers import AutoModel, AutoTokenizer

        self.num_classes = 2
        self.learning_rate = learning_rate
        # NOTE(review): the default "dabert-base-model" is kept for
        # compatibility, but it may not resolve to a published checkpoint -
        # confirm the intended model id (possibly a DeBERTa checkpoint).
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)
        # AutoModel only returns hidden states, so a linear head is needed to
        # produce one logit per class for CrossEntropyLoss.
        self.classifier = nn.Linear(self.model.config.hidden_size, self.num_classes)
        self.data = pd.read_csv("path/to/training/data.csv")
        if use_other_features:
            self.data["text"] = self.data.apply(
                lambda x: get_sentence(x["keyword"], x["location"], x["text"]), axis=1
            )
        # Column the dataloaders tokenize; switched to the cleaned column when
        # cleaning is requested so that the flag actually affects training.
        self.text_column = "text"
        if clean_text:
            # The boolean parameter shadows the clean_text() function here,
            # hence the module-level helper.
            self.data = _run_text_cleaning(self.data)
            self.text_column = "clean_text"
        self.train_data, self.val_data = train_test_split(self.data, test_size=0.2, random_state=42)
        self.loss_fn = nn.CrossEntropyLoss()

    def call(self, input_ids, attention_mask=None):
        """Return class logits of shape ``(batch, num_classes)``."""
        hidden_states = self.model(input_ids, attention_mask=attention_mask)[0]
        # Classify from the first token's hidden state.
        return self.classifier(hidden_states[:, 0])

    def forward(self, input_ids, attention_mask=None):
        """Forward pass; delegates to :meth:`call`."""
        return self.call(input_ids, attention_mask)

    def _batch_loss(self, batch):
        """Compute the cross-entropy loss for one batch.

        Accepts ``(input_ids, labels)`` as well as
        ``(input_ids, attention_mask, labels)`` batches.
        """
        input_ids, *mask, labels = batch
        attention_mask = mask[0] if mask else None
        logits = self.forward(input_ids, attention_mask)
        return self.loss_fn(logits, labels)

    def training_step(self, batch, batch_idx):
        # Training step
        loss = self._batch_loss(batch)
        self.log("train_loss", loss)
        return {"loss": loss}

    def validation_step(self, batch, batch_idx):
        # Validation step
        val_loss = self._batch_loss(batch)
        self.log("val_loss", val_loss, prog_bar=True)
        return {"val_loss": val_loss}

    def validation_epoch_end(self, outputs):
        # Validation end
        # NOTE(review): this hook exists in PyTorch Lightning 1.x (which the
        # file's `Trainer(gpus=1)` usage suggests); confirm before upgrading.
        avg_val_loss = torch.stack([x["val_loss"] for x in outputs]).mean()
        self.log("avg_val_loss", avg_val_loss)
        return {"avg_val_loss": avg_val_loss}

    def configure_optimizers(self):
        """Return the optimizer Lightning uses for training."""
        return torch.optim.AdamW(self.parameters(), lr=self.learning_rate)

    def _build_dataloader(self, frame, shuffle):
        """Tokenize *frame* and wrap ids, attention mask and labels in a DataLoader."""
        # Padding/truncation make every row the same length; without it the
        # token-id lists are ragged and cannot be turned into one tensor.
        encoded = self.tokenizer(
            frame[self.text_column].astype(str).tolist(),
            padding=True,
            truncation=True,
            return_tensors="pt",
        )
        # Go through NumPy: after the shuffled split the Series index is no
        # longer 0..n-1, and CrossEntropyLoss expects int64 class indices.
        labels = torch.tensor(frame["target"].to_numpy(), dtype=torch.long)
        dataset = torch.utils.data.TensorDataset(
            encoded["input_ids"], encoded["attention_mask"], labels
        )
        return DataLoader(dataset, batch_size=32, shuffle=shuffle)

    def train_dataloader(self):
        # Return a DataLoader for the training set
        # This will be used to iterate over the training data during training
        return self._build_dataloader(self.train_data, shuffle=True)

    def val_dataloader(self):
        # Return a DataLoader for the validation set
        # This will be used to iterate over the validation data during training
        return self._build_dataloader(self.val_data, shuffle=False)





The explanation does not need to be short. How should I write the DataLoader part?

In [None]:
import pytorch_lightning as pl
import torch
from torch.utils.data import DataLoader
from transformers import DabertTokenizer, DabertModel

class DisasterTweetClassifier(pl.LightningModule):
    """Tweet classifier built from a pretrained encoder and a linear head.

    Datasets are supplied by the caller; each item must be a tuple whose first
    element is the token-id tensor and whose last element is the label. Any
    elements in between (e.g. encoded location / keyword tensors) are ignored.

    Args:
        num_classes: number of output classes.
        test_data: optional dataset kept on ``self.test_data``.
        train_data: dataset served by ``train_dataloader``.
        val_data: dataset served by ``val_dataloader``.
        learning_rate: learning rate used by ``configure_optimizers``.
    """

    def __init__(self, num_classes=2, test_data=None, train_data=None, val_data=None,
                 learning_rate=1e-5):
        super().__init__()
        # `transformers` exports no DabertTokenizer / DabertModel (importing
        # them raises ImportError), so the Auto* factories are used instead.
        from transformers import AutoModel, AutoTokenizer

        self.num_classes = num_classes
        self.learning_rate = learning_rate
        # NOTE(review): "dabert-base-model" is kept unchanged; confirm it
        # resolves to a real checkpoint (possibly a DeBERTa model was meant).
        self.tokenizer = AutoTokenizer.from_pretrained("dabert-base-model")
        self.model = AutoModel.from_pretrained("dabert-base-model")
        # The bare encoder returns hidden states; the head maps them to logits.
        self.classifier = torch.nn.Linear(self.model.config.hidden_size, num_classes)
        # Previously missing, although both step methods call self.loss_fn.
        self.loss_fn = torch.nn.CrossEntropyLoss()
        self.train_data = train_data
        # Previously never stored, although val_dataloader reads self.val_data.
        self.val_data = val_data
        self.test_data = test_data

    def forward(self, input_ids):
        # Forward pass of the model
        hidden_states = self.model(input_ids)[0]
        # Classify from the first token's hidden state.
        # NOTE(review): no attention mask is passed, so padded positions are
        # attended to - consider adding the mask to the dataset.
        logits = self.classifier(hidden_states[:, 0])
        return logits

    def _batch_loss(self, batch):
        """Compute the loss from the first (ids) and last (labels) batch items."""
        input_ids, *_, labels = batch
        logits = self.forward(input_ids)
        return self.loss_fn(logits, labels)

    def training_step(self, batch, batch_idx):
        # Training step
        loss = self._batch_loss(batch)
        self.log("train_loss", loss)
        return {"loss": loss}

    def validation_step(self, batch, batch_idx):
        # Validation step
        val_loss = self._batch_loss(batch)
        self.log("val_loss", val_loss, prog_bar=True)
        return {"val_loss": val_loss}

    def validation_epoch_end(self, outputs):
        # Validation end
        avg_val_loss = torch.stack([x["val_loss"] for x in outputs]).mean()
        self.log("avg_val_loss", avg_val_loss)
        return {"avg_val_loss": avg_val_loss}

    def configure_optimizers(self):
        """Return the optimizer Lightning uses for training."""
        return torch.optim.AdamW(self.parameters(), lr=self.learning_rate)

    def prepare_data(self):
        # Load and prepare the data
        # This could involve reading the data from a file,
        # preprocessing it, and creating PyTorch tensors from it
        pass

    def _require_dataset(self, attribute):
        """Return the dataset stored under *attribute*, or fail with a clear message."""
        dataset = getattr(self, attribute)
        if dataset is None:
            raise ValueError(f"{attribute} was not provided to DisasterTweetClassifier")
        return dataset

    def train_dataloader(self):
        # Return a DataLoader for the training set
        return DataLoader(self._require_dataset("train_data"), batch_size=32, shuffle=True)

    def val_dataloader(self):
        # Return a DataLoader for the validation set
        return DataLoader(self._require_dataset("val_data"), batch_size=32, shuffle=False)


In [None]:
import pandas as pd
import torch

# `transformers` exports no DabertTokenizer, so the generic factory is used.
from transformers import AutoTokenizer

# Load the training data from a file or database
data = pd.read_csv("path/to/training/data.csv")

# NOTE(review): confirm "dabert-base-model" resolves to a real checkpoint.
tokenizer = AutoTokenizer.from_pretrained("dabert-base-model")


def _encode_column(values):
    """Tokenize one text column into a single padded token-id tensor.

    Missing values (``location`` and ``keyword`` may be blank) are encoded as
    empty strings. Padding/truncation give every row the same length, which
    is required to build one tensor for the whole column.
    """
    texts = values.fillna("").astype(str).tolist()
    return tokenizer(texts, padding=True, truncation=True, return_tensors="pt")["input_ids"]


# Tokenize the text, location, and keyword columns
input_ids = _encode_column(data["text"])
locations = _encode_column(data["location"])
keywords = _encode_column(data["keyword"])

# Class indices as int64, which is what nn.CrossEntropyLoss expects
labels = torch.tensor(data["target"].to_numpy(), dtype=torch.long)

# Combine the input and target tensors into a single dataset
train_data = torch.utils.data.TensorDataset(input_ids, locations, keywords, labels)


: 

In [None]:
class DisasterTweetClassifier(nn.Module):
    def __init__(self, model_name="dabert-base-model"):
        super().__init__()
        self.num_classes = 2
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)     
        self.data = pd.read_csv("path/to/training/data.csv")
        self.train_data,self.val_data=train_test_split(self.data,test_size=0.2,random_state=42)

    def forward(self, input_ids):
        # Forward pass of the model
        logits = self.model(input_ids)[0]
        return logits

    def call(self, input_ids):
        # Call method to forward input through the model
        logits = self.forward(input_ids)
        return logits

    def training_step(self, batch, batch_idx):
        # Training step
        input_ids, labels = batch
        logits = self.forward(


In [3]:
import pytorch_lightning as pl
import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModel
import pandas as pd
from sklearn.model_selection import train_test_split
from torch import nn
import nltk
from nltk import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download('omw-1.4')



def get_sentence(c1, c2, c3):
    """Return ``"<c1> <c2> <c3>"`` with null values rendered as empty text.

    A value counts as null when ``pd.isnull`` reports it as such; the two
    separating spaces are emitted regardless of which values are missing.
    """
    def _blank_if_missing(value):
        return "" if pd.isnull(value) else value

    return " ".join(format(_blank_if_missing(item)) for item in (c1, c2, c3))
    
def clean_text(data):
    """
    Add a ``clean_text`` column holding a normalised copy of ``data['text']``.

    Each text is lower-cased and split on whitespace; English stop words and
    tokens that are a single punctuation character are dropped; the remaining
    tokens are lemmatized and re-joined with single spaces. Missing texts are
    treated as empty strings.

    input: data: a dataframe containing texts to be cleaned (column ``text``)
    return: the same dataframe (modified in place) with an added column of
            clean text
    """
    # Imported here because the surrounding file never imports `punctuation`,
    # which made the original implementation fail with a NameError.
    from string import punctuation

    # NOTE(review): stopwords / WordNetLemmatizer need the NLTK 'stopwords'
    # and 'wordnet' corpora; this file only downloads 'omw-1.4' - confirm they
    # are installed in the target environment.
    # A single set gives O(1) membership tests in the per-token filter.
    excluded_tokens = set(stopwords.words('english')) | set(punctuation)
    lemmatizer = WordNetLemmatizer()

    def _clean(text):
        kept = (word for word in text.split() if word not in excluded_tokens)
        return ' '.join(lemmatizer.lemmatize(word) for word in kept)

    # Assign the whole column at once. The previous per-row
    # `data.loc[idx]['clean_text'] = ...` was a chained assignment on a
    # positional index, so the cleaned text was not reliably written back.
    data['clean_text'] = data['text'].fillna('').str.lower().map(_clean)
    return data
    
def _run_text_cleaning(data):
    """Call the module-level ``clean_text`` function on *data*.

    ``DisasterTweetClassifier.__init__`` has a boolean parameter that is also
    named ``clean_text`` and therefore shadows the function inside the
    constructor; this helper resolves the name at module scope instead.
    """
    return clean_text(data)


class DisasterTweetClassifier(pl.LightningModule):
    """Binary tweet classifier: pretrained encoder plus a linear head.

    The constructor loads the training CSV, optionally merges the ``keyword``
    and ``location`` columns into ``text``, optionally cleans the text, and
    splits the rows 80/20 into training and validation frames. The two
    dataloader hooks tokenize those frames on demand.

    Args:
        model_name: identifier passed to ``AutoTokenizer.from_pretrained`` and
            ``AutoModel.from_pretrained``.
        use_other_features: if true, ``text`` becomes
            ``"<keyword> <location> <text>"`` (missing values left blank).
        clean_text: if true, ``clean_text()`` is applied and the model is
            trained on the resulting ``clean_text`` column.
        learning_rate: learning rate used by ``configure_optimizers``.
    """

    def __init__(self, model_name="dabert-base-model", use_other_features=False, clean_text=False,
                 learning_rate=1e-5):
        super().__init__()
        self.num_classes = 2
        self.learning_rate = learning_rate
        # NOTE(review): the default "dabert-base-model" is kept for
        # compatibility, but it may not resolve to a published checkpoint -
        # confirm the intended model id (possibly a DeBERTa checkpoint).
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)
        # AutoModel only returns hidden states, so a linear head is needed to
        # produce one logit per class for CrossEntropyLoss.
        self.classifier = nn.Linear(self.model.config.hidden_size, self.num_classes)
        self.data = pd.read_csv("/home/ybi/study/Kaggle/Natural-Language-Processing-with-Disaster-Tweets/Data/train.csv")
        if use_other_features:
            self.data["text"] = self.data.apply(
                lambda x: get_sentence(x["keyword"], x["location"], x["text"]), axis=1
            )
        # Column the dataloaders tokenize; switched to the cleaned column when
        # cleaning is requested so that the flag actually affects training.
        self.text_column = "text"
        if clean_text:
            # The boolean parameter shadows the clean_text() function here,
            # hence the module-level helper.
            self.data = _run_text_cleaning(self.data)
            self.text_column = "clean_text"
        self.train_data, self.val_data = train_test_split(self.data, test_size=0.2, random_state=42)
        self.loss_fn = nn.CrossEntropyLoss()

    def call(self, input_ids, attention_mask=None):
        """Return class logits of shape ``(batch, num_classes)``."""
        hidden_states = self.model(input_ids, attention_mask=attention_mask)[0]
        # Classify from the first token's hidden state.
        return self.classifier(hidden_states[:, 0])

    def forward(self, input_ids, attention_mask=None):
        """Forward pass; delegates to :meth:`call`."""
        return self.call(input_ids, attention_mask)

    def _batch_loss(self, batch):
        """Compute the cross-entropy loss for one batch.

        Accepts ``(input_ids, labels)`` as well as
        ``(input_ids, attention_mask, labels)`` batches.
        """
        input_ids, *mask, labels = batch
        attention_mask = mask[0] if mask else None
        logits = self.forward(input_ids, attention_mask)
        return self.loss_fn(logits, labels)

    def training_step(self, batch, batch_idx):
        # Training step
        loss = self._batch_loss(batch)
        self.log("train_loss", loss)
        return {"loss": loss}

    def validation_step(self, batch, batch_idx):
        # Validation step
        val_loss = self._batch_loss(batch)
        self.log("val_loss", val_loss, prog_bar=True)
        return {"val_loss": val_loss}

    def validation_epoch_end(self, outputs):
        # Validation end
        # NOTE(review): this hook exists in PyTorch Lightning 1.x (which the
        # file's `Trainer(gpus=1)` usage suggests); confirm before upgrading.
        avg_val_loss = torch.stack([x["val_loss"] for x in outputs]).mean()
        self.log("avg_val_loss", avg_val_loss)
        return {"avg_val_loss": avg_val_loss}

    def configure_optimizers(self):
        """Return the optimizer Lightning uses for training."""
        return torch.optim.AdamW(self.parameters(), lr=self.learning_rate)

    def _build_dataloader(self, frame, shuffle):
        """Tokenize *frame* and wrap ids, attention mask and labels in a DataLoader."""
        # Padding/truncation make every row the same length; without it the
        # token-id lists are ragged and cannot be turned into one tensor.
        encoded = self.tokenizer(
            frame[self.text_column].astype(str).tolist(),
            padding=True,
            truncation=True,
            return_tensors="pt",
        )
        # Go through NumPy: after the shuffled split the Series index is no
        # longer 0..n-1, and CrossEntropyLoss expects int64 class indices.
        labels = torch.tensor(frame["target"].to_numpy(), dtype=torch.long)
        dataset = torch.utils.data.TensorDataset(
            encoded["input_ids"], encoded["attention_mask"], labels
        )
        return DataLoader(dataset, batch_size=32, shuffle=shuffle)

    def train_dataloader(self):
        # Return a DataLoader for the training set
        # This will be used to iterate over the training data during training
        return self._build_dataloader(self.train_data, shuffle=True)

    def val_dataloader(self):
        # Return a DataLoader for the validation set
        # This will be used to iterate over the validation data during training
        return self._build_dataloader(self.val_data, shuffle=False)




# Instantiate the DisasterTweetClassifier model
model = DisasterTweetClassifier()

# Trainer.fit() accepts neither `num_epochs` nor `learning_rate`. The learning
# rate is an optimizer setting, so it is stored on the model for its
# `configure_optimizers` hook to read.
model.learning_rate = 1e-5

# Instantiate a PyTorch Lightning Trainer; the epoch budget is a Trainer option
trainer = pl.Trainer(gpus=1, max_epochs=10)

# Train the model using the fit method
trainer.fit(model)


ImportError: cannot import name 'DabertTokenizer' from 'transformers' (/home/ybi/anaconda3/envs/SIMCSE/lib/python3.8/site-packages/transformers/__init__.py)

In [None]:
!