In [23]:
from transformers import TextClassificationPipeline, AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification, TrainingArguments, Trainer, pipeline 
import pandas as pd
from datasets import load_dataset, Dataset, DatasetDict
from pymongo import MongoClient
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from imblearn.under_sampling import RandomUnderSampler
import numpy as np

In [2]:
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'pre_clas

In [4]:
db_client = MongoClient(host="localhost", port=27017)
db = db_client["Website_Chatbot"]
collection = db["MITS"]

questions = []
for document in collection.find():
    questions += document["questions"]

non_context_data = pd.read_csv("Data/data.csv")
context_data = questions

def preprocess_fn(examples):
    return tokenizer(examples["text"], truncation=True)

data = [{"label":0, "text":x[1].Question} for x in non_context_data.iterrows()]
data += [{"label":1, "text":x} for x in context_data]
data_df = pd.DataFrame(data, index=None)
data_dataset = Dataset.from_pandas(data_df)


### Under Smapling

In [5]:
text = np.array(data_df.index).reshape(11243, -1)
labels = data_df.label
over_sampler = RandomUnderSampler(random_state=46)
x_resampled, y_resampled = over_sampler.fit_resample(text, labels)
x_resampled = [data_df.text.iloc[x].item() for x in x_resampled]

In [6]:
final_df = pd.DataFrame({"text":x_resampled, "label":y_resampled})
hf_dataset = Dataset.from_pandas(final_df)
tokenized_data = hf_dataset.map(preprocess_fn, batched=True)

100%|██████████| 3/3 [00:00<00:00, 25.64ba/s]


In [7]:
# Train, Test, Validation split
train_test = tokenized_data.train_test_split(test_size=.3)
test_valid = train_test['test'].train_test_split(test_size=.4)

# x_train, x_test, y_train, y_test = train_test_split(x_resampled, y_resampled, test_size=.2)

# x_train, x_validation , y_train, y_validation  = train_test_split(x_train, y_train, test_size=.25)


In [45]:
# print(len(x_train))
# print(len(x_test))
# print(len(x_validation))
# test_valid
train_test

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 2030
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 870
    })
})

In [8]:
final_dataset = DatasetDict({
    "train": train_test['train'],
    "test": test_valid['train'],
    "valid": test_valid['test']

})

In [15]:
# y_resampled = list(y_resampled)
# sampled_df = pd.DataFrame({"label": y_resampled, "text": x_resampled})
# data_dataset = Dataset.from_pandas(sampled_df)
# tokenized_data = data_dataset.map(preprocess_fn, batched=True)

100%|██████████| 3/3 [00:00<00:00, 27.18ba/s]


In [9]:
final_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 2030
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 522
    })
    valid: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 348
    })
})

In [51]:
training_args = TrainingArguments(
    output_dir="./results_undersampling",
    learning_rate=2e-5,
    per_gpu_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=final_dataset['train'],
    eval_dataset=final_dataset["valid"],
    tokenizer=tokenizer,
    data_collator=data_collator
)

trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the training set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text.
Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
***** Running training *****
  Num examples = 2030
  Num Epochs = 10
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 1

{'loss': 0.0005, 'learning_rate': 1.2125984251968505e-05, 'epoch': 3.94}


Model weights saved in ./results_undersampling\checkpoint-500\pytorch_model.bin
tokenizer config file saved in ./results_undersampling\checkpoint-500\tokenizer_config.json
Special tokens file saved in ./results_undersampling\checkpoint-500\special_tokens_map.json
                                       
  0%|          | 0/635 [06:54<?, ?it/s]            Saving model checkpoint to ./results_undersampling\checkpoint-1000
Configuration saved in ./results_undersampling\checkpoint-1000\config.json


{'loss': 0.0022, 'learning_rate': 4.251968503937008e-06, 'epoch': 7.87}


Model weights saved in ./results_undersampling\checkpoint-1000\pytorch_model.bin
tokenizer config file saved in ./results_undersampling\checkpoint-1000\tokenizer_config.json
Special tokens file saved in ./results_undersampling\checkpoint-1000\special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)


                                       
100%|██████████| 1270/1270 [02:26<00:00,  8.69it/s]

{'train_runtime': 146.4722, 'train_samples_per_second': 138.593, 'train_steps_per_second': 8.671, 'train_loss': 0.0010962399104096758, 'epoch': 10.0}





TrainOutput(global_step=1270, training_loss=0.0010962399104096758, metrics={'train_runtime': 146.4722, 'train_samples_per_second': 138.593, 'train_steps_per_second': 8.671, 'train_loss': 0.0010962399104096758, 'epoch': 10.0})

In [58]:
under_sampled_trained_model = AutoModelForSequenceClassification.from_pretrained("results_undersampling/checkpoint-1000/")
under_sampled_pipeline = TextClassificationPipeline(model=under_sampled_trained_model, tokenizer=tokenizer)

loading configuration file results_undersampling/checkpoint-1000/config.json
Model config DistilBertConfig {
  "_name_or_path": "results_undersampling/checkpoint-1000/",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "problem_type": "single_label_classification",
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.13.0",
  "vocab_size": 30522
}

loading weights file results_undersampling/checkpoint-1000/pytorch_model.bin
All model checkpoint weights were used when initializing DistilBertForSequenceClassification.

All the weights of DistilBertForSequenceClassification were initialized from the model che

In [20]:
# result = pipeline(final_dataset['test'])
x_test = final_dataset['test']['text']
y_test = final_dataset['test']['label']

In [17]:
prediction = pipeline(x_test)

In [18]:
prediction = [0 if x['label'] =="LABEL_0" else 1 for x in prediction]

In [24]:
print(f"The accuracy score is {accuracy_score(y_test, prediction)}")
print(f"The precision score is {precision_score(y_test, prediction)}")
print(f"The recall score is {recall_score(y_test, prediction)}")
print(f"The f1 score is {f1_score(y_test, prediction)}")

The accuracy score is 0.9980842911877394
The precision score is 0.9961389961389961
The recall score is 1.0
The f1 score is 0.9980657640232108


In [48]:
import torch 
torch.cuda.empty_cache()
import mlflow
mlflow.end_run()

### Over Sampling

In [41]:
text = np.array(data_df.index).reshape(11243, -1)
labels = data_df.label
over_sampler = RandomOverSampler(random_state=46)
x_resampled, y_resampled = over_sampler.fit_resample(text, labels)
x_resampled = [data_df.text.iloc[x].item() for x in x_resampled]

In [42]:
final_df = pd.DataFrame({"text":x_resampled, "label":y_resampled})
hf_dataset = Dataset.from_pandas(final_df)
tokenized_data = hf_dataset.map(preprocess_fn, batched=True)

100%|██████████| 20/20 [00:01<00:00, 19.49ba/s]


In [44]:
train_test = tokenized_data.train_test_split(test_size=.3)
test_valid = train_test['test'].train_test_split(test_size=.4)

In [47]:
# train_test
# test_valid
final_dataset = DatasetDict({
    "train": train_test['train'],
    "test": test_valid['train'],
    "valid": test_valid['test']

})

In [48]:
final_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 13710
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 3525
    })
    valid: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 2351
    })
})

In [49]:
training_args = TrainingArguments(
    output_dir="./results_oversampling",
    learning_rate=2e-5,
    per_gpu_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=final_dataset['train'],
    eval_dataset=final_dataset["valid"],
    tokenizer=tokenizer,
    data_collator=data_collator
)

trainer.train()

The following columns in the training set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text.
Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
***** Running training *****
  Num examples = 13710
  Num Epochs = 10
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 8570
Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
  6%|▌         | 500/8570 [00:59<14:46,  9.10it/s] Saving model checkpoint to ./results_oversampling\checkpoint-500
Configuration saved in ./

{'loss': 0.0452, 'learning_rate': 1.8833138856476082e-05, 'epoch': 0.58}


Model weights saved in ./results_oversampling\checkpoint-500\pytorch_model.bin
tokenizer config file saved in ./results_oversampling\checkpoint-500\tokenizer_config.json
Special tokens file saved in ./results_oversampling\checkpoint-500\special_tokens_map.json
 12%|█▏        | 1000/8570 [01:58<12:55,  9.76it/s] Saving model checkpoint to ./results_oversampling\checkpoint-1000
Configuration saved in ./results_oversampling\checkpoint-1000\config.json


{'loss': 0.0077, 'learning_rate': 1.766627771295216e-05, 'epoch': 1.17}


Model weights saved in ./results_oversampling\checkpoint-1000\pytorch_model.bin
tokenizer config file saved in ./results_oversampling\checkpoint-1000\tokenizer_config.json
Special tokens file saved in ./results_oversampling\checkpoint-1000\special_tokens_map.json
 18%|█▊        | 1500/8570 [02:57<13:26,  8.77it/s]  Saving model checkpoint to ./results_oversampling\checkpoint-1500
Configuration saved in ./results_oversampling\checkpoint-1500\config.json


{'loss': 0.0032, 'learning_rate': 1.649941656942824e-05, 'epoch': 1.75}


Model weights saved in ./results_oversampling\checkpoint-1500\pytorch_model.bin
tokenizer config file saved in ./results_oversampling\checkpoint-1500\tokenizer_config.json
Special tokens file saved in ./results_oversampling\checkpoint-1500\special_tokens_map.json
 23%|██▎       | 2000/8570 [03:57<12:38,  8.66it/s]  Saving model checkpoint to ./results_oversampling\checkpoint-2000
Configuration saved in ./results_oversampling\checkpoint-2000\config.json


{'loss': 0.0019, 'learning_rate': 1.5332555425904317e-05, 'epoch': 2.33}


Model weights saved in ./results_oversampling\checkpoint-2000\pytorch_model.bin
tokenizer config file saved in ./results_oversampling\checkpoint-2000\tokenizer_config.json
Special tokens file saved in ./results_oversampling\checkpoint-2000\special_tokens_map.json
 29%|██▉       | 2500/8570 [04:57<10:22,  9.75it/s]  Saving model checkpoint to ./results_oversampling\checkpoint-2500
Configuration saved in ./results_oversampling\checkpoint-2500\config.json


{'loss': 0.0, 'learning_rate': 1.4165694282380397e-05, 'epoch': 2.92}


Model weights saved in ./results_oversampling\checkpoint-2500\pytorch_model.bin
tokenizer config file saved in ./results_oversampling\checkpoint-2500\tokenizer_config.json
Special tokens file saved in ./results_oversampling\checkpoint-2500\special_tokens_map.json
 35%|███▌      | 3000/8570 [05:56<09:40,  9.59it/s]  Saving model checkpoint to ./results_oversampling\checkpoint-3000
Configuration saved in ./results_oversampling\checkpoint-3000\config.json


{'loss': 0.0, 'learning_rate': 1.2998833138856476e-05, 'epoch': 3.5}


Model weights saved in ./results_oversampling\checkpoint-3000\pytorch_model.bin
tokenizer config file saved in ./results_oversampling\checkpoint-3000\tokenizer_config.json
Special tokens file saved in ./results_oversampling\checkpoint-3000\special_tokens_map.json
 41%|████      | 3500/8570 [06:55<11:25,  7.40it/s]  Saving model checkpoint to ./results_oversampling\checkpoint-3500
Configuration saved in ./results_oversampling\checkpoint-3500\config.json


{'loss': 0.0, 'learning_rate': 1.1831971995332557e-05, 'epoch': 4.08}


Model weights saved in ./results_oversampling\checkpoint-3500\pytorch_model.bin
tokenizer config file saved in ./results_oversampling\checkpoint-3500\tokenizer_config.json
Special tokens file saved in ./results_oversampling\checkpoint-3500\special_tokens_map.json
 47%|████▋     | 4000/8570 [07:54<08:15,  9.22it/s]  Saving model checkpoint to ./results_oversampling\checkpoint-4000
Configuration saved in ./results_oversampling\checkpoint-4000\config.json


{'loss': 0.0, 'learning_rate': 1.0665110851808636e-05, 'epoch': 4.67}


Model weights saved in ./results_oversampling\checkpoint-4000\pytorch_model.bin
tokenizer config file saved in ./results_oversampling\checkpoint-4000\tokenizer_config.json
Special tokens file saved in ./results_oversampling\checkpoint-4000\special_tokens_map.json
 53%|█████▎    | 4500/8570 [08:53<06:20, 10.69it/s]  Saving model checkpoint to ./results_oversampling\checkpoint-4500
Configuration saved in ./results_oversampling\checkpoint-4500\config.json


{'loss': 0.0, 'learning_rate': 9.498249708284714e-06, 'epoch': 5.25}


Model weights saved in ./results_oversampling\checkpoint-4500\pytorch_model.bin
tokenizer config file saved in ./results_oversampling\checkpoint-4500\tokenizer_config.json
Special tokens file saved in ./results_oversampling\checkpoint-4500\special_tokens_map.json
 58%|█████▊    | 5000/8570 [09:52<05:42, 10.42it/s]  Saving model checkpoint to ./results_oversampling\checkpoint-5000
Configuration saved in ./results_oversampling\checkpoint-5000\config.json


{'loss': 0.0, 'learning_rate': 8.331388564760793e-06, 'epoch': 5.83}


Model weights saved in ./results_oversampling\checkpoint-5000\pytorch_model.bin
tokenizer config file saved in ./results_oversampling\checkpoint-5000\tokenizer_config.json
Special tokens file saved in ./results_oversampling\checkpoint-5000\special_tokens_map.json
 64%|██████▍   | 5500/8570 [10:52<05:45,  8.88it/s]Saving model checkpoint to ./results_oversampling\checkpoint-5500
Configuration saved in ./results_oversampling\checkpoint-5500\config.json


{'loss': 0.0, 'learning_rate': 7.164527421236873e-06, 'epoch': 6.42}


Model weights saved in ./results_oversampling\checkpoint-5500\pytorch_model.bin
tokenizer config file saved in ./results_oversampling\checkpoint-5500\tokenizer_config.json
Special tokens file saved in ./results_oversampling\checkpoint-5500\special_tokens_map.json
 70%|███████   | 6000/8570 [11:49<04:20,  9.86it/s]  Saving model checkpoint to ./results_oversampling\checkpoint-6000
Configuration saved in ./results_oversampling\checkpoint-6000\config.json


{'loss': 0.0, 'learning_rate': 5.9976662777129524e-06, 'epoch': 7.0}


Model weights saved in ./results_oversampling\checkpoint-6000\pytorch_model.bin
tokenizer config file saved in ./results_oversampling\checkpoint-6000\tokenizer_config.json
Special tokens file saved in ./results_oversampling\checkpoint-6000\special_tokens_map.json
 76%|███████▌  | 6500/8570 [12:49<04:37,  7.47it/s]Saving model checkpoint to ./results_oversampling\checkpoint-6500
Configuration saved in ./results_oversampling\checkpoint-6500\config.json


{'loss': 0.0, 'learning_rate': 4.830805134189031e-06, 'epoch': 7.58}


Model weights saved in ./results_oversampling\checkpoint-6500\pytorch_model.bin
tokenizer config file saved in ./results_oversampling\checkpoint-6500\tokenizer_config.json
Special tokens file saved in ./results_oversampling\checkpoint-6500\special_tokens_map.json
 82%|████████▏ | 7000/8570 [13:46<03:13,  8.13it/s]Saving model checkpoint to ./results_oversampling\checkpoint-7000
Configuration saved in ./results_oversampling\checkpoint-7000\config.json


{'loss': 0.0, 'learning_rate': 3.6639439906651113e-06, 'epoch': 8.17}


Model weights saved in ./results_oversampling\checkpoint-7000\pytorch_model.bin
tokenizer config file saved in ./results_oversampling\checkpoint-7000\tokenizer_config.json
Special tokens file saved in ./results_oversampling\checkpoint-7000\special_tokens_map.json
 88%|████████▊ | 7500/8570 [14:44<01:51,  9.63it/s]Saving model checkpoint to ./results_oversampling\checkpoint-7500
Configuration saved in ./results_oversampling\checkpoint-7500\config.json


{'loss': 0.0, 'learning_rate': 2.4970828471411906e-06, 'epoch': 8.75}


Model weights saved in ./results_oversampling\checkpoint-7500\pytorch_model.bin
tokenizer config file saved in ./results_oversampling\checkpoint-7500\tokenizer_config.json
Special tokens file saved in ./results_oversampling\checkpoint-7500\special_tokens_map.json
 93%|█████████▎| 8000/8570 [15:43<01:04,  8.84it/s]Saving model checkpoint to ./results_oversampling\checkpoint-8000
Configuration saved in ./results_oversampling\checkpoint-8000\config.json


{'loss': 0.0, 'learning_rate': 1.3302217036172696e-06, 'epoch': 9.33}


Model weights saved in ./results_oversampling\checkpoint-8000\pytorch_model.bin
tokenizer config file saved in ./results_oversampling\checkpoint-8000\tokenizer_config.json
Special tokens file saved in ./results_oversampling\checkpoint-8000\special_tokens_map.json
 99%|█████████▉| 8500/8570 [16:38<00:08,  8.70it/s]Saving model checkpoint to ./results_oversampling\checkpoint-8500
Configuration saved in ./results_oversampling\checkpoint-8500\config.json


{'loss': 0.0, 'learning_rate': 1.633605600933489e-07, 'epoch': 9.92}


Model weights saved in ./results_oversampling\checkpoint-8500\pytorch_model.bin
tokenizer config file saved in ./results_oversampling\checkpoint-8500\tokenizer_config.json
Special tokens file saved in ./results_oversampling\checkpoint-8500\special_tokens_map.json
100%|█████████▉| 8569/8570 [16:50<00:00,  8.16it/s]

Training completed. Do not forget to share your model on huggingface.co/models =)


100%|██████████| 8570/8570 [16:51<00:00,  8.48it/s]

{'train_runtime': 1011.442, 'train_samples_per_second': 135.549, 'train_steps_per_second': 8.473, 'train_loss': 0.0033963261495195663, 'epoch': 10.0}





TrainOutput(global_step=8570, training_loss=0.0033963261495195663, metrics={'train_runtime': 1011.442, 'train_samples_per_second': 135.549, 'train_steps_per_second': 8.473, 'train_loss': 0.0033963261495195663, 'epoch': 10.0})

In [59]:
over_sampled_trained_model = AutoModelForSequenceClassification.from_pretrained("results_oversampling/checkpoint-8500/")
over_sampled_pipeline = TextClassificationPipeline(model=over_sampled_trained_model, tokenizer=tokenizer)

loading configuration file results_oversampling/checkpoint-8500/config.json
Model config DistilBertConfig {
  "_name_or_path": "results_oversampling/checkpoint-8500/",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "problem_type": "single_label_classification",
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.13.0",
  "vocab_size": 30522
}

loading weights file results_oversampling/checkpoint-8500/pytorch_model.bin
All model checkpoint weights were used when initializing DistilBertForSequenceClassification.

All the weights of DistilBertForSequenceClassification were initialized from the model checkp

In [53]:
x_test = final_dataset["test"]['text']
y_test = final_dataset["test"]['label']

prediction = pipeline(x_test)

prediction = [0 if x['label'] == "LABEL_0" else 1 for x in prediction]

In [55]:
len(prediction)

3525

In [56]:
pipeline("who is the principal")

[{'label': 'LABEL_0', 'score': 0.9999991655349731}]

In [54]:
print(f"The accuracy score is {accuracy_score(y_test, prediction)}")
print(f"The precision score is {precision_score(y_test, prediction)}")
print(f"The recall score is {recall_score(y_test, prediction)}")
print(f"The f1 score is {f1_score(y_test, prediction)}")

The accuracy score is 0.9997163120567376
The precision score is 0.9994353472614342
The recall score is 1.0
The f1 score is 0.9997175939000282


In [64]:
real_world_question = ["Who is the principal", "who is the principal of mits",
"Good morning", "Who are you", "what is the eco club", 
"Where is the college located", "where is your location"]



In [65]:
#Over sampled model
over_sampled_pipeline(real_world_question)

[{'label': 'LABEL_0', 'score': 0.9999991655349731},
 {'label': 'LABEL_1', 'score': 0.999991774559021},
 {'label': 'LABEL_0', 'score': 0.9999994039535522},
 {'label': 'LABEL_0', 'score': 0.9999995231628418},
 {'label': 'LABEL_0', 'score': 0.999011754989624},
 {'label': 'LABEL_1', 'score': 0.8898599147796631},
 {'label': 'LABEL_0', 'score': 0.9999995231628418}]

In [66]:
under_sampled_pipeline(real_world_question)

[{'label': 'LABEL_1', 'score': 0.9999964237213135},
 {'label': 'LABEL_1', 'score': 0.999997615814209},
 {'label': 'LABEL_0', 'score': 0.9999982118606567},
 {'label': 'LABEL_0', 'score': 0.9999986886978149},
 {'label': 'LABEL_1', 'score': 0.9999985694885254},
 {'label': 'LABEL_1', 'score': 0.9999967813491821},
 {'label': 'LABEL_0', 'score': 0.9999982118606567}]