In [1]:
import torch
from transformers import AutoConfig, AutoModelForMaskedLM, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-multilingual-cased")
model = AutoModelForMaskedLM.from_pretrained("distilbert-base-multilingual-cased")

In [2]:
from datasets import load_dataset
ds = load_dataset("csv", data_files="title_conference.csv", split="train", delimiter = ',')

Using custom data configuration default-b6f591c4d5f61f22
Reusing dataset csv (C:\Users\paperspace\.cache\huggingface\datasets\csv\default-b6f591c4d5f61f22\0.0.0\652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a)


In [3]:
ds=ds.train_test_split(test_size=0.1)

In [4]:
def main():
    global tokenized_datasets
    tokenizer = AutoTokenizer.from_pretrained("distilbert-base-multilingual-cased")
    def tokenize_function(examples):
        #concat_str = [t + " " + s for t, s in zip(examples["title"], examples["selftext"])]
        return tokenizer(examples["Title"], padding='max_length', truncation=True, max_length = 64)
    tokenized_datasets = ds['train'].map(tokenize_function, batched=True, num_proc = 4)

if __name__ == "__main__":
    main()

    

In [5]:
tokenized_datasets

Dataset({
    features: ['Title', 'Conference', 'input_ids', 'attention_mask'],
    num_rows: 2256
})

In [6]:
#tokenized_datasets.cleanup_cache_files()
#ds.cleanup_cache_files()

In [None]:
!pip install deepspeed

In [None]:
%%bash
cat <<'EOT' > ds_config_zero3.json
{
    "fp16": {
        "enabled": "auto",
        "loss_scale": 0,
        "loss_scale_window": 1000,
        "initial_scale_power": 16,
        "hysteresis": 2,
        "min_loss_scale": 1
    },

    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": "auto",
            "betas": "auto",
            "eps": "auto",
            "weight_decay": "auto"
        }
    },

    "scheduler": {
        "type": "WarmupLR",
        "params": {
            "warmup_min_lr": "auto",
            "warmup_max_lr": "auto",
            "warmup_num_steps": "auto"
        }
    },

    "zero_optimization": {
        "stage": 3,
        "offload_optimizer": {
            "device": "cpu",
            "pin_memory": true
        },
        "offload_param": {
            "device": "cpu",
            "pin_memory": true
        },
        "overlap_comm": true,
        "contiguous_gradients": true,
        "sub_group_size": 1e9,
        "reduce_bucket_size": "auto",
        "stage3_prefetch_bucket_size": "auto",
        "stage3_param_persistence_threshold": "auto",
        "stage3_max_live_parameters": 1e9,
        "stage3_max_reuse_distance": 1e9,
        "stage3_gather_16bit_weights_on_model_save": true
    },

    "gradient_accumulation_steps": "auto",
    "gradient_clipping": "auto",
    "steps_per_print": 2000,
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "wall_clock_breakdown": false
}
EOT

In [6]:
# DeepSpeed requires a distributed environment even when only one process is used.
# This emulates a launcher in the notebook
import os

#os.environ["MASTER_ADDR"] = "localhost"
#os.environ["MASTER_PORT"] = "9994"  # modify if RuntimeError: Address already in use
#os.environ["RANK"] = "0"
#os.environ["LOCAL_RANK"] = "0"
#os.environ["WORLD_SIZE"] = "1"
#os.environ["PL_TORCH_DISTRIBUTED_BACKEND"] = "gloo" # for windows OS

from transformers import Trainer, TrainingArguments
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

training_args = TrainingArguments(
    output_dir = "Distil_Model",
    per_device_train_batch_size = 32,
    num_train_epochs = 200,
    #evaluation_strategy = "epoch",
    save_strategy = "no",
    learning_rate=3e-5,
    weight_decay=0.01,
    push_to_hub=False,
    #deepspeed="ds_config_zero3.json"
)

In [7]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    data_collator=data_collator,
)

In [8]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `DistilBertForMaskedLM.forward` and have been ignored: Title, Conference. If Title, Conference are not expected by `DistilBertForMaskedLM.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 2256
  Num Epochs = 200
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 14200


Step,Training Loss
500,2.7701
1000,2.1816
1500,1.8688
2000,1.6304
2500,1.4091
3000,1.2163
3500,1.0578
4000,0.954
4500,0.807
5000,0.754




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=14200, training_loss=0.7947192957703496, metrics={'train_runtime': 3598.6491, 'train_samples_per_second': 125.38, 'train_steps_per_second': 3.946, 'total_flos': 7491873746534400.0, 'train_loss': 0.7947192957703496, 'epoch': 200.0})

In [9]:
trainer.save_model()

Saving model checkpoint to Distil_Model
Configuration saved in Distil_Model\config.json
Model weights saved in Distil_Model\pytorch_model.bin


In [12]:
from transformers import AutoModel
model = AutoModel.from_pretrained("Distil_Model")

loading configuration file Distil_Model\config.json
Model config DistilBertConfig {
  "_name_or_path": "Distil_Model",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.21.2",
  "vocab_size": 119547
}

loading weights file Distil_Model\pytorch_model.bin
Some weights of the model checkpoint at Distil_Model were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing Di

In [13]:
with torch.no_grad():
    ds_with_bert_embeddings_v2 = ds["train"].map(lambda example: {'bert_embeddings': torch.mean(model(**tokenizer(example["Title"], return_tensors="pt")).last_hidden_state, dim = 1).squeeze().numpy()})

  0%|          | 0/2256 [00:00<?, ?ex/s]

In [17]:
ds_with_bert_embeddings_v2.add_faiss_index(column='bert_embeddings')
def predict(query):
    with torch.no_grad():
        question_embedding = torch.mean(model(**tokenizer(query, return_tensors="pt")).last_hidden_state, dim = 1).squeeze().numpy()
    
    scores, retrieved_examples = ds_with_bert_embeddings_v2.get_nearest_examples('bert_embeddings', question_embedding, k=3)
    return retrieved_examples["Conference"][0] + "|" + retrieved_examples["Title"][0]

  0%|          | 0/3 [00:00<?, ?it/s]

In [20]:
ds.save_to_disk("Saved_Datasets")

ds_with_bert_embeddings_v2.save_faiss_index('bert_embeddings', 'Saved_Datasets\\bert_embeddings.faiss')
ds_with_bert_embeddings_v2.drop_index
ds_with_bert_embeddings_v2.save_to_disk("Saved_Datasets")

Loading cached processed dataset at C:\Users\paperspace\.cache\huggingface\datasets\csv\default-b6f591c4d5f61f22\0.0.0\652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a\cache-18787776b1735576.arrow
Loading cached processed dataset at C:\Users\paperspace\.cache\huggingface\datasets\csv\default-b6f591c4d5f61f22\0.0.0\652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a\cache-fa0044f1d62893dd.arrow


ValueError: please remove all the indexes using `dataset.drop_index` before saving a dataset

In [18]:
with torch.no_grad():
    ds_with_test_predictions = ds["test"].map(lambda example: {'Predictions': predict(example["Title"])})

  0%|          | 0/251 [00:00<?, ?ex/s]

In [19]:
ds_with_test_predictions.to_csv("distil_predictions_conference_v5.csv")

Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

39450