<a href="https://colab.research.google.com/github/fatmabdj/TweetsDisaster-NLP-/blob/main/tweetDisaster.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report


In [None]:

# Step 1: Load the labelled tweets.
# Expects the Kaggle dataset's training CSV (or your own CSV with 'text' and
# 'target' columns) in the working directory.
df = pd.read_csv('train.csv')  # Replace with your file path, e.g., 'disaster_tweets.csv'
# Preview the first rows to confirm the expected columns are present.
df.head()


Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [None]:
import re
def clean_text(text):
    """Normalize a raw tweet before tokenization.

    Strips URLs, @mentions and punctuation, then lower-cases the result.
    Whitespace is left untouched, so removed tokens may leave runs of spaces.

    Args:
        text: Raw tweet text. Missing values (``None`` or a float NaN, which
            is how pandas represents empty cells) are treated as empty text;
            any other non-string value is converted with ``str()``.

    Returns:
        The cleaned, lower-cased string; ``''`` for missing input.
    """
    # NaN is the only float that is not equal to itself; re.sub would raise
    # TypeError on None/NaN, aborting the whole DataFrame.apply().
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text)
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'@\w+', '', text)     # Remove mentions
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation (drops '#', keeps the hashtag word)
    return text.lower()
# Add a cleaned copy of every tweet as a new 'clean_text' column; the raw 'text' column is kept.
df['clean_text'] = df['text'].apply(clean_text)
# Preview raw and cleaned text side by side.
df.head()

Unnamed: 0,id,keyword,location,text,target,clean_text
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,our deeds are the reason of this earthquake ma...
1,4,,,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask canada
2,5,,,All residents asked to 'shelter in place' are ...,1,all residents asked to shelter in place are be...
3,6,,,"13,000 people receive #wildfires evacuation or...",1,13000 people receive wildfires evacuation orde...
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,just got sent this photo from ruby alaska as s...


In [None]:

# Convert the model inputs and labels to plain Python lists for splitting and tokenizing.
X = df['clean_text'].tolist()  # Cleaned tweet texts
y = df['target'].tolist()  # Labels (1 = disaster, 0 = not disaster)

In [None]:

# Optional: quick sanity check of dataset size and class balance.
print(f"Total tweets: {len(X)}")
# Labels are 0/1, so sum(y) counts the disaster tweets.
print(f"Disaster tweets: {sum(y)}, Non-disaster: {len(y) - sum(y)}")


Total tweets: 7613
Disaster tweets: 3271, Non-disaster: 4342


In [None]:
# Split data (80% train, 20% test); random_state is fixed so the split is reproducible.
# NOTE(review): the split is not stratified on y — consider stratify=y to keep the class ratio equal in both parts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
# Step 2: Load the pretrained BERT tokenizer and model.
# Both come from the same 'bert-base-uncased' checkpoint so vocabulary and weights match.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# The classification head is newly initialized (see the warning in the cell output) and is learned during fine-tuning.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)  # 2 labels: 0 (not disaster) and 1 (disaster)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Use GPU if available (faster training), otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Move the model to the selected device; as the cell's last expression this also displays the model structure.
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [None]:

# Step 3: Tokenize the data
def tokenize_function(texts):
    """Encode texts with the notebook-level ``tokenizer``.

    Args:
        texts: The text or list of texts to encode.

    Returns:
        Whatever ``tokenizer`` returns when called with padding and
        truncation enabled, ``max_length=128`` and PyTorch tensors requested.
    """
    encode_options = {
        "padding": True,
        "truncation": True,
        "max_length": 128,
        "return_tensors": "pt",
    }
    return tokenizer(texts, **encode_options)

# Encode the train and test texts separately, one tokenizer call per split.
train_encodings = tokenize_function(X_train)
test_encodings = tokenize_function(X_test)


In [None]:

# Step 4: Create PyTorch datasets
class TweetDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels."""

    def __init__(self, encodings, labels):
        """Store the inputs without copying.

        Args:
            encodings: Mapping from field name to an indexable of per-example
                values (one entry per example).
            labels: Sequence of labels aligned with the encodings.
        """
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return one example: every encoding field at ``idx`` plus a 'labels' tensor."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap each split's encodings and labels so they can be indexed example by example.
train_dataset = TweetDataset(train_encodings, y_train)
test_dataset = TweetDataset(test_encodings, y_test)


In [None]:

# Step 5: Set up training arguments
training_args = TrainingArguments(
    output_dir='./disaster_results',  # Checkpoints are written here
    num_train_epochs=3,               # Train for 3 epochs (increase if needed)
    per_device_train_batch_size=8,    # Batch size (adjust based on your GPU/CPU)
    per_device_eval_batch_size=8,
    warmup_steps=500,                 # Helps with training stability
    weight_decay=0.01,                # Regularization
    logging_dir='./logs',             # Logs
    logging_steps=10,
    eval_strategy="epoch",            # Evaluate after each epoch
    report_to="none",                 # Disable wandb to avoid login issues
    save_strategy="epoch",            # Save a checkpoint after each epoch (must match eval_strategy below)
    # In the recorded run the validation loss rises every epoch
    # (0.42 -> 0.45 -> 0.63), so the final-epoch weights are not the best ones.
    # Reload the checkpoint with the lowest validation loss when training ends.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    # NOTE(review): Step 6 passes the test set as eval_dataset, so the checkpoint
    # is selected on the same data the final report uses. Carve out a separate
    # validation split if an unbiased test score is needed.
)

In [None]:

# Step 6: Train the model
# No compute_metrics is supplied, so the per-epoch table reports losses only.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,  # NOTE(review): per-epoch evaluation runs on the test split; there is no separate validation set
)

# Run fine-tuning; the returned TrainOutput is displayed as the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.3373,0.423885
2,0.3954,0.450779
3,0.2712,0.631289


TrainOutput(global_step=2286, training_loss=0.3732411067212556, metrics={'train_runtime': 325.1551, 'train_samples_per_second': 56.189, 'train_steps_per_second': 7.03, 'total_flos': 516381140583000.0, 'train_loss': 0.3732411067212556, 'epoch': 3.0})

In [None]:
# Step 7: Evaluate on test set
predictions = trainer.predict(test_dataset)
# argmax over axis 1 selects the highest-scoring class index for each row of model outputs.
preds = torch.argmax(torch.tensor(predictions.predictions), axis=1)
accuracy = accuracy_score(y_test, preds)
print(f"Test Accuracy: {accuracy:.4f}")
print("Classification Report:")
# target_names are listed in label order: index 0 = not disaster, index 1 = disaster.
print(classification_report(y_test, preds, target_names=['Not Disaster', 'Disaster']))

Test Accuracy: 0.8365
Classification Report:
              precision    recall  f1-score   support

Not Disaster       0.85      0.87      0.86       874
    Disaster       0.82      0.79      0.80       649

    accuracy                           0.84      1523
   macro avg       0.83      0.83      0.83      1523
weighted avg       0.84      0.84      0.84      1523



In [None]:
# Step 8: Save the trained model for later use
# The same directory is reloaded with from_pretrained() in the inference cells below.
trainer.save_model('./disaster_bert_model')
print("Model saved to './disaster_bert_model'")
from transformers import BertTokenizer, BertForSequenceClassification
import torch

Model saved to './disaster_bert_model'


In [None]:
# Load the fine-tuned model back from the directory written in Step 8
# (this rebinds the `model`, `tokenizer` and `device` names used during training).
model = BertForSequenceClassification.from_pretrained('./disaster_bert_model')
# The tokenizer is reloaded from the base checkpoint, the same one used for training.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Switch to evaluation mode for inference.
model.eval()

def tweet_prediction(tweet_text):
    """Classify one tweet with the notebook-level ``tokenizer``, ``model`` and ``device``.

    Args:
        tweet_text: The tweet text to classify.

    Returns:
        ``"Disaster"`` when the predicted class index is 1, otherwise
        ``"Not Disaster"``.
    """
    encoded = tokenizer(tweet_text, return_tensors="pt", padding=True, truncation=True, max_length=128)
    # Move every input tensor to the device the model was placed on.
    batch = {}
    for name, tensor in encoded.items():
        batch[name] = tensor.to(device)
    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        logits = model(**batch).logits
        label_id = torch.argmax(logits, dim=1).item()
    if label_id == 1:
        return "Disaster"
    return "Not Disaster"
!pip install pyspellchecker

from spellchecker import SpellChecker

spell = SpellChecker()

def correct_spelling(text):
    """Spell-correct a text word by word with the notebook-level ``spell`` checker.

    Args:
        text: Input text; it is split on whitespace.

    Returns:
        The words joined by single spaces, each replaced by
        ``spell.correction(word)``. A word is kept unchanged when the checker
        returns no suggestion (``None``), which would otherwise make
        ``' '.join`` raise ``TypeError``.
    """
    corrected = []
    for word in text.split():
        suggestion = spell.correction(word)
        # Fall back to the original word when there is no (or an empty) suggestion.
        corrected.append(suggestion or word)
    return ' '.join(corrected)

tweet = "My heart is breaking after that breakup. It's like a tsunami of emotions."
# Apply the same clean_text() normalization the training tweets went through, so
# the model sees input in the form it was fine-tuned on. Cleaning first also
# strips punctuation, so the spell checker receives bare words.
corrected_tweet = correct_spelling(clean_text(tweet))
prediction = tweet_prediction(corrected_tweet)
print(prediction)

Collecting pyspellchecker
  Downloading pyspellchecker-0.8.4-py3-none-any.whl.metadata (9.4 kB)
Downloading pyspellchecker-0.8.4-py3-none-any.whl (7.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m38.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyspellchecker
Successfully installed pyspellchecker-0.8.4
Disaster


In [None]:
!pip install pyspellchecker


Collecting pyspellchecker
  Downloading pyspellchecker-0.8.4-py3-none-any.whl.metadata (9.4 kB)
Downloading pyspellchecker-0.8.4-py3-none-any.whl (7.2 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/7.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m7.2/7.2 MB[0m [31m224.7 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m126.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyspellchecker
Successfully installed pyspellchecker-0.8.4


In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch

# Load the fine-tuned model from the saved directory
# (this rebinds the notebook-level `model`, `tokenizer` and `device` names).
model = BertForSequenceClassification.from_pretrained('./disaster_bert_model')
# The tokenizer is reloaded from the base checkpoint, the same one used for training.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Switch to evaluation mode for inference.
model.eval()

def tweet_prediction(tweet_text):
    """Classify one tweet with the notebook-level ``tokenizer``, ``model`` and ``device``.

    Args:
        tweet_text: The tweet text to classify.

    Returns:
        ``"Disaster"`` when the predicted class index is 1, otherwise
        ``"Not Disaster"``.
    """
    encoded = tokenizer(tweet_text, return_tensors="pt", padding=True, truncation=True, max_length=128)
    # Move every input tensor to the device the model was placed on.
    batch = {}
    for name, tensor in encoded.items():
        batch[name] = tensor.to(device)
    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        logits = model(**batch).logits
        label_id = torch.argmax(logits, dim=1).item()
    if label_id == 1:
        return "Disaster"
    return "Not Disaster"

from spellchecker import SpellChecker
# Shared spell-checker instance used by correct_spelling().
spell = SpellChecker()

def correct_spelling(text):
    """Spell-correct a text word by word with the notebook-level ``spell`` checker.

    Args:
        text: Input text; it is split on whitespace.

    Returns:
        The words joined by single spaces, each replaced by
        ``spell.correction(word)``. A word is kept unchanged when the checker
        returns no suggestion (``None``), which would otherwise make
        ``' '.join`` raise ``TypeError``.
    """
    corrected = []
    for word in text.split():
        suggestion = spell.correction(word)
        # Fall back to the original word when there is no (or an empty) suggestion.
        corrected.append(suggestion or word)
    return ' '.join(corrected)

tweet = "My heart is breaking after that breakup. It's like a tsunami of emotions."
# Apply the same clean_text() normalization the training tweets went through, so
# the model sees input in the form it was fine-tuned on. Cleaning first also
# strips punctuation, so the spell checker receives bare words.
corrected_tweet = correct_spelling(clean_text(tweet))
prediction = tweet_prediction(corrected_tweet)
print(prediction)

Disaster


In [None]:
# Bundle the saved model directory into a single archive for download.
!zip -r disaster_bert_model.zip disaster_bert_model


  adding: disaster_bert_model/ (stored 0%)
  adding: disaster_bert_model/special_tokens_map.json (deflated 42%)
  adding: disaster_bert_model/vocab.txt (deflated 53%)
  adding: disaster_bert_model/config.json (deflated 49%)
  adding: disaster_bert_model/model.safetensors (deflated 7%)
  adding: disaster_bert_model/tokenizer_config.json (deflated 75%)


In [None]:
from google.colab import files
# Download the zipped model through the browser (Colab-only helper).
files.download('disaster_bert_model.zip')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
from google.colab import drive
# Mount Google Drive under /content/drive (Colab-only helper).
drive.mount('/content/drive')


# Copy the saved model directory to the top level of the mounted Drive.
!cp -r disaster_bert_model /content/drive/MyDrive/

Mounted at /content/drive
