In [1]:
!pip install transformers[torch] datasets scikit-learn
!pip install accelerate -U
!pip install --upgrade transformers

import numpy as np
import torch
import transformers
from datasets import load_dataset, DatasetDict, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support



In [2]:
from datasets import load_dataset

dataset = load_dataset("venetis/disaster_tweets")
print(dataset)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/310 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/988k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7613 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'keyword', 'location', 'text', 'target'],
        num_rows: 7613
    })
})


In [3]:
print(dataset['train'][0])

{'id': 1, 'keyword': None, 'location': None, 'text': 'Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all', 'target': 1}


In [4]:
import re
from transformers import AutoTokenizer
import torch
from datasets import Dataset, DatasetDict

In [5]:
from sklearn.model_selection import train_test_split

# Convert the dataset to a pandas DataFrame
df = dataset['train'].to_pandas()

# Split the data into train and test sets
train_df, test_df = train_test_split(df, test_size=0.1, stratify=df['target'], random_state=42)

train_df['labels'] = train_df['target']
test_df['labels'] = test_df['target']

# Convert DataFrames back to Dataset objects
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

In [6]:
train_dataset

Dataset({
    features: ['id', 'keyword', 'location', 'text', 'target', 'labels', '__index_level_0__'],
    num_rows: 6851
})

In [7]:
import re

# Define the text preprocessing function
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'[^a-z\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Preprocess text in each split
def preprocess_function(examples):
    examples['text'] = [preprocess_text(text) for text in examples['text']]
    return examples

# Apply the preprocessing function
train_dataset = train_dataset.map(preprocess_function, batched=True)
test_dataset = test_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/6851 [00:00<?, ? examples/s]

Map:   0%|          | 0/762 [00:00<?, ? examples/s]

In [8]:
import torch
from transformers import AdamW, AutoTokenizer, AutoModelForSequenceClassification

# Same as before
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
tokenized_train = tokenizer(train_dataset['text'])
tokenized_test = tokenizer(test_dataset['text'])

In [10]:
def tokenize_function(example):
    return tokenizer(example["text"], truncation=True)

In [11]:
tokenized_datasets_train = train_dataset.map(tokenize_function, batched=True)
tokenized_datasets_test = test_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/6851 [00:00<?, ? examples/s]

Map:   0%|          | 0/762 [00:00<?, ? examples/s]

In [12]:
tokenized_datasets_train

Dataset({
    features: ['id', 'keyword', 'location', 'text', 'target', 'labels', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 6851
})

In [13]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [14]:
!pip install evaluate
import evaluate

Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: evaluate
Successfully installed evaluate-0.4.2


In [15]:
def compute_metrics(eval_preds):
    metric = evaluate.load("glue", "mrpc")
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [16]:
training_args = TrainingArguments("test-trainer", evaluation_strategy="epoch")
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets_train,
    eval_dataset=tokenized_datasets_test,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
predictions = trainer.predict(tokenized_datasets_test)
#print(predictions.predictions.shape, predictions.label_ids.shape)

preds = np.argmax(predictions.predictions, axis=-1)

Downloading builder script:   0%|          | 0.00/5.75k [00:00<?, ?B/s]

In [18]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.504,0.525821,0.788714,0.773558
2,0.384,0.496457,0.832021,0.792208
3,0.2823,0.663554,0.80315,0.770642


TrainOutput(global_step=2571, training_loss=0.38815212731971577, metrics={'train_runtime': 198.3233, 'train_samples_per_second': 103.634, 'train_steps_per_second': 12.964, 'total_flos': 311069725753500.0, 'train_loss': 0.38815212731971577, 'epoch': 3.0})

In [19]:
predictions = trainer.predict(tokenized_datasets_test)
preds = np.argmax(predictions.predictions, axis=-1)

# Save predictions to a DataFrame
test_df['predictions'] = preds

# Save the DataFrame to a CSV file including text, true labels, and predictions
test_df[['text', 'labels', 'predictions']].to_csv('predictions.csv', index=False)

# Print a message to indicate completion
print("Predictions saved to predictions.csv")

Predictions saved to predictions.csv


In [20]:
import pandas as pd

new_df = pd.read_csv('/content/test.csv')

new_df['text'] = new_df['text'].apply(preprocess_text)

new_dataset = Dataset.from_pandas(new_df)
tokenized_new_dataset = new_dataset.map(tokenize_function, batched=True)
trainer = Trainer(model=model, data_collator=data_collator, tokenizer=tokenizer)
predictions = trainer.predict(tokenized_new_dataset)
preds = np.argmax(predictions.predictions, axis=-1)

new_df['predictions'] = preds


Map:   0%|          | 0/3263 [00:00<?, ? examples/s]

Predictions saved to new_predictions.csv


In [24]:
new_df = new_df.rename(columns={'predictions': 'target'})

# Save the DataFrame to a CSV file including id and target
new_df[['id', 'target']].to_csv('new_predictions_2.csv', index=False)

# Print a message to indicate completion
print("Predictions saved to new_predictions.csv")

Predictions saved to new_predictions.csv
