In [28]:
from transformers import TextClassificationPipeline, AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification, TrainingArguments, Trainer, pipeline 
import pandas as pd
from datasets import load_dataset, Dataset
from pymongo import MongoClient

In [None]:
# Tokenizer and classification model are loaded from the same base checkpoint
# so that the vocabulary matches the model's embeddings.
BASE_CHECKPOINT = 'distilbert-base-uncased'

tokenizer = AutoTokenizer.from_pretrained(BASE_CHECKPOINT)
# Two output labels: the notebook assigns labels 0 and 1 when building its data.
model = AutoModelForSequenceClassification.from_pretrained(
    BASE_CHECKPOINT,
    num_labels=2,
)
# The collator is given the tokenizer so batches can be padded with it.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [3]:
# Open the "MITS" collection of the "Website_Chatbot" database on the local
# MongoDB server.
db_client = MongoClient(host="localhost", port=27017)
db = db_client["Website_Chatbot"]
collection = db["MITS"]

# Flatten the "questions" entry of every stored document into a single list.
questions = []
for document in collection.find():
    questions.extend(document["questions"])

# Two sources of questions: a CSV file on disk and the list gathered above.
# `context_data` is an alias of `questions`, not a copy.
non_context_data = pd.read_csv("Data/data.csv")
context_data = questions

In [4]:
def preprocess_fn(examples):
    """Tokenize the ``"text"`` entry of ``examples`` with the notebook's tokenizer.

    Truncation is enabled; no padding argument is passed here.

    Args:
        examples: Mapping with a ``"text"`` key whose value is handed straight
            to the tokenizer (a batch of strings when used with
            ``map(..., batched=True)``).

    Returns:
        Whatever the module-level ``tokenizer`` returns for that input.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

In [None]:
# Build labelled records: every "Question" value from the CSV frame gets
# label 0 and every question collected from MongoDB gets label 1.
# The column is iterated directly instead of via DataFrame.iterrows(), which
# builds a full Series per row just to read one field.
non_context_records = [
    {"label": 0, "text": question} for question in non_context_data["Question"]
]
context_records = [{"label": 1, "text": question} for question in context_data]
data = non_context_records + context_records

# Records -> DataFrame -> datasets.Dataset, then tokenize in batches.
data_df = pd.DataFrame(data)
data_dataset = Dataset.from_pandas(data_df)
tokenized_data = data_dataset.map(preprocess_fn, batched=True)

In [None]:
# Fine-tuning hyperparameters; the Trainer writes its checkpoints under
# ./results.
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    # `per_gpu_train_batch_size` is deprecated; the per-device argument is the
    # supported spelling and matches the eval batch-size argument below.
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
)

# No eval_dataset is supplied, so this Trainer is only set up for training.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

In [32]:
# Reload model weights from a saved checkpoint directory under the training
# output folder, then wrap them with the tokenizer in a text-classification
# pipeline for ad-hoc predictions.
tmp_model = AutoModelForSequenceClassification.from_pretrained(
    "results/checkpoint-3500",
)
pipe = TextClassificationPipeline(
    tokenizer=tokenizer,
    model=tmp_model,
)

loading configuration file results/checkpoint-3500\config.json
Model config DistilBertConfig {
  "_name_or_path": "results/checkpoint-3500",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "problem_type": "single_label_classification",
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.13.0",
  "vocab_size": 30522
}

loading weights file results/checkpoint-3500\pytorch_model.bin
All model checkpoint weights were used when initializing DistilBertForSequenceClassification.

All the weights of DistilBertForSequenceClassification were initialized from the model checkpoint at results/checkpoint-3500.
If your

[{'label': 'LABEL_0', 'score': 0.9999891519546509}]

In [36]:
# Spot-check the reloaded classifier on a single-word query; as the last
# expression in the cell, the pipeline's result is displayed as the output.
pipe("principal")

[{'label': 'LABEL_0', 'score': 0.9999831914901733}]

In [None]:
import torch
import mlflow

# Housekeeping after training: release PyTorch's cached CUDA memory and end
# the currently active MLflow run, if there is one.
torch.cuda.empty_cache()
mlflow.end_run()

In [None]:
# Load the "imdb" dataset and tokenize its "text" field in batches with the
# same preprocessing function used for the question data.
imdb = load_dataset(path="imdb")
tokenized_imdb = imdb.map(function=preprocess_fn, batched=True)