In [26]:
from transformers import TextClassificationPipeline, AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification, TrainingArguments, Trainer, pipeline 
import pandas as pd
from datasets import load_dataset, Dataset
from pymongo import MongoClient
from imblearn.over_sampling import RandomOverSampler
import numpy as np


In [2]:
# Sequence-classification model with a two-label head, initialised from the
# pretrained DistilBERT checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=2,
)

# Tokenizer from the same checkpoint, plus a collator that pads batches with it.
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'classifier.w

In [9]:
# Open the local MongoDB instance and select the collection to read from.
db_client = MongoClient(port=27017, host="localhost")
db = db_client["Website_Chatbot"]
collection = db["MITS"]

# Flatten the "questions" field of every document into one list.
questions = [
    question
    for document in collection.find()
    for question in document["questions"]
]

# Two data sources: questions gathered from MongoDB and rows read from a CSV file.
context_data = questions
non_context_data = pd.read_csv("Data/data.csv")

In [10]:
def preprocess_fn(examples):
    """Tokenize the ``"text"`` entry of ``examples`` with the module-level tokenizer.

    Truncation is enabled on the tokenizer call. The tokenizer's result is
    returned unchanged.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

In [19]:
# Label 0: the "Question" value of each CSV row.
data = [
    {"label": 0, "text": row.Question}
    for _, row in non_context_data.iterrows()
]
# Label 1: every question collected into context_data.
data.extend({"label": 1, "text": question} for question in context_data)

# Tabular and `datasets` views of the same labelled records.
data_df = pd.DataFrame(data)
data_dataset = Dataset.from_pandas(data_df)

100%|██████████| 12/12 [00:00<00:00, 52.39ba/s]


In [30]:
# data_df.label.value_counts()
# Oversample row *indices* rather than the raw strings: despite its name, `text`
# holds the index values of data_df as a single-column 2-D array (one row per
# example), which is the feature-matrix shape passed to fit_resample.
# reshape(-1, 1) infers the row count from the data, so this cell keeps working
# when the number of examples changes (it was previously hard-coded to 11243).
text = np.array(data_df.index).reshape(-1, 1)
labels = data_df.label

# Fixed random_state so the resampling is reproducible across runs.
over_sampler = RandomOverSampler(random_state=46)
x_resampled, y_resampled = over_sampler.fit_resample(text, labels)

In [32]:
# Display the shape of the resampled feature array returned by fit_resample.
x_resampled.shape

(19586, 1)

In [None]:
# x_resampled = [data_df.text.iloc[x].item() for x in x_resampled]


In [58]:
def _resampled_item_to_text(item):
    """Return the stripped text for one element of ``x_resampled``.

    ``fit_resample`` was fed the index values of ``data_df``, so on a fresh run
    each element is a one-element array holding a row index; it is looked up in
    ``data_df.text``. Elements that are already strings (the cell was re-run, or
    the indices were converted earlier) are simply stripped, which keeps the
    cell idempotent.
    """
    if isinstance(item, str):
        return item.strip()
    # The values came from data_df.index, so look the row up by label.
    row_label = int(np.ravel(item)[0])
    return data_df.text.loc[row_label].strip()


# Calling .strip() directly on the resampled index rows fails on a fresh kernel,
# so map each row back to its text first.
x_resampled = [_resampled_item_to_text(item) for item in x_resampled]

In [59]:
# Materialise the resampled labels as a plain Python list.
y_resampled = [*y_resampled]

In [60]:
# Re-assemble the resampled labels and texts into a frame, wrap it as a
# `datasets.Dataset`, and tokenize it in batches with preprocess_fn.
sampled_df = pd.DataFrame(
    data={"label": y_resampled, "text": x_resampled},
)
data_dataset = Dataset.from_pandas(sampled_df)
tokenized_data = data_dataset.map(function=preprocess_fn, batched=True)

100%|██████████| 20/20 [00:00<00:00, 54.05ba/s]


In [None]:
# Hyper-parameters for fine-tuning; Trainer output is written under ./results.
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    # `per_gpu_train_batch_size` is deprecated; `per_device_train_batch_size` is
    # the supported name and matches the eval batch-size argument below.
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01
)

# No eval_dataset is supplied, so this Trainer is only given training data.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data,
    # eval_dataset=tokenized_imdb["test"],
    tokenizer=tokenizer,
    data_collator=data_collator
)

trainer.train()

In [63]:
# tmp_token = tokenizer("Who are you", truncation=True)
# trainer.predict(tmp_token)
# tmp_token
# Reload the classifier weights from a saved Trainer checkpoint directory.
# NOTE(review): this is a numbered intermediate checkpoint; confirm it is the
# one intended for inference.
tmp_model = AutoModelForSequenceClassification.from_pretrained(
    "results/checkpoint-12000"
)

# Inference pipeline pairing the reloaded model with the notebook's tokenizer.
pipe = TextClassificationPipeline(
    tokenizer=tokenizer,
    model=tmp_model,
)

loading configuration file results/checkpoint-12000\config.json
Model config DistilBertConfig {
  "_name_or_path": "results/checkpoint-12000",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "problem_type": "single_label_classification",
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.13.0",
  "vocab_size": 30522
}

loading weights file results/checkpoint-12000\pytorch_model.bin
All model checkpoint weights were used when initializing DistilBertForSequenceClassification.

All the weights of DistilBertForSequenceClassification were initialized from the model checkpoint at results/checkpoint-12000.
If 

In [71]:
# Classify a single sample question with the inference pipeline.
result = pipe("Who is the vice principal")

In [74]:
# Show the label string of the first (only) prediction for the sample question.
result[0]['label']

'LABEL_0'

In [None]:
# Housekeeping cell: needs torch and mlflow.
import torch
import mlflow

# Release cached GPU memory held by PyTorch's CUDA allocator.
torch.cuda.empty_cache()
# End the currently active MLflow run.
mlflow.end_run()

In [None]:
# Load the "imdb" dataset and tokenize it in batches with preprocess_fn.
imdb = load_dataset(path="imdb")
tokenized_imdb = imdb.map(function=preprocess_fn, batched=True)