# Imports and Installation



In [1]:
#transformer models from hugging face open source website
!pip install transformers

Collecting transformers
  Downloading transformers-4.33.1-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.17.1-py3-none-any.whl (294 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.8/294.8 kB[0m [31m24.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m33.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m25.4 MB/s[0m eta [36m0:00:0

In [2]:
#import libraries
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
from transformers import TFAutoModel, AutoTokenizer
from sklearn.model_selection import train_test_split

# Exploring the Dataset

In [3]:
# Load the data
train_url = 'https://raw.githubusercontent.com/google-research-datasets/query-wellformedness/master/train.tsv'
test_url = 'https://raw.githubusercontent.com/google-research-datasets/query-wellformedness/master/test.tsv'
dev_url = 'https://raw.githubusercontent.com/google-research-datasets/query-wellformedness/master/dev.tsv'
#Formatting the data into query and well_formed columns
df_train = pd.read_table(train_url, sep='\t',names=['query', 'well_formed'])
df_test = pd.read_table(test_url, sep='\t',names=['query', 'well_formed'])
df_dev = pd.read_table(dev_url, sep='\t',names=['query', 'well_formed'])
#combine datasets into one dataframe
df_combine = df_train.append(df_test,ignore_index = True)
df = df_combine.append(df_dev,ignore_index = True)
df

  df_combine = df_train.append(df_test,ignore_index = True)
  df = df_combine.append(df_dev,ignore_index = True)


Unnamed: 0,query,well_formed
0,The European Union includes how many ?,0.2
1,What are Mia Hamms accomplishment ?,0.4
2,Which form of government is still in place in ...,1.0
3,When was the canal de panama built ?,0.8
4,What color is the black box on commercial aero...,0.6
...,...,...
25095,Mission and vision of amul dairy ?,0.0
25096,How did Geography help the success of the amer...,1.0
25097,Will a job as an electrician still exist in th...,1.0
25098,How many terms is the governor allowed to serve ?,1.0


# Preprocessing the Data

In [4]:
# Preprocess the data
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
max_len = 128

X = []
for query in df['query']:
    encoded = tokenizer.encode_plus(
        query,
        add_special_tokens=True,
        max_length=max_len,
        padding='max_length',
        truncation=True,
        return_tensors='tf'
    )
    X.append(encoded['input_ids'][0])
X = np.array(X)

Y = df['well_formed'].values

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

# Splitting the data

In [5]:
#splitting into training and validation sets
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=0.2, random_state=42)

# Transformer model

In [6]:
#building transformer model with linear activation, mean square error loss and mean absolute error metrics
def build_model(transformer, max_len):
    input_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_ids")
    sequence_output = transformer(input_ids)[0]
    cls_token = sequence_output[:, 0, :]
    out = Dense(1, activation='linear')(cls_token)
    model = Model(inputs=input_ids, outputs=out)
    optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)
    model.compile(optimizer=optimizer, loss='mse', metrics=['mae'])
    return model

# Fine-tuning the model

In [7]:
#fine-tuning language recognizing model bert-base-uncased
transformer_model = TFAutoModel.from_pretrained('bert-base-uncased')
model = build_model(transformer_model, max_len)

# Train the model
batch_size = 32
epochs = 10

model.fit(
    X_train,
    Y_train,
    validation_data=(X_val, Y_val),
    batch_size=batch_size,
    epochs=epochs
)

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

Epoch 1/10




Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7c5f02f5ee00>

# Testing the results

In [8]:
queries = ["What were the reasons for everyone to leave the company?", "tell me way city to the", "What is the capital of France?", "what was the reasons for everyone to leave the company"]

results = []

for query in queries:
    encoded = tokenizer.encode_plus(
        query,
        add_special_tokens=True,
        max_length=max_len,
        padding='max_length',
        truncation=True,
        return_tensors='tf'
    )
    X_test = encoded['input_ids']

    prediction = model.predict(X_test)[0][0]

    results.append((query, prediction))

for query, prediction in results:
    print(f"The predicted well-formedness score for query '{query}' is: {prediction}")


The predicted well-formedness score for query 'What were the reasons for everyone to leave the company?' is: 0.9441385269165039
The predicted well-formedness score for query 'tell me way city to the' is: -0.07047920674085617
The predicted well-formedness score for query 'What is the capital of France?' is: 0.9906370639801025
The predicted well-formedness score for query 'what was the reasons for everyone to leave the company' is: 0.5781500339508057


# Comparing with another pre-trained model

In [None]:
#testing another trained model from the forums https://huggingface.co/salesken/query_wellformedness_score
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
tokenizer = AutoTokenizer.from_pretrained("salesken/query_wellformedness_score")
model = AutoModelForSequenceClassification.from_pretrained("salesken/query_wellformedness_score")


Downloading pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at salesken/query_wellformedness_score were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
sentences = ["What were the reasons for everyone to leave the company?", "tell me way city to the", "What is the capital of France?", "what was the reasons for everyone to leave the company"]

features = tokenizer(sentences,  padding=True, truncation=True, return_tensors="pt")
model.eval()
with torch.no_grad():
    scores = model(**features).logits
print(scores)

tensor([[0.9530],
        [0.0323],
        [1.0022],
        [0.2357]])


Reference

@InProceedings{FaruquiDas2018,
  title = {{Identifying Well-formed Natural Language Questions}},
  author = {Faruqui, Manaal and Das, Dipanjan},
  booktitle = {Proc. of EMNLP},
  year = {2018}
}