In [None]:
!pip install sentence_transformers

In [2]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from sentence_transformers import SentenceTransformer, util
import pickle
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer
import torch
from torch.utils.data import DataLoader, TensorDataset

sns.set_theme()
tqdm.pandas()

# Modular functions for the end-to-end combined pipeline

In [3]:
# Colab-only step: mount Google Drive at /content/drive so that the files
# referenced through project_path (defined in the next cell) can be read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Root folder of the project on the mounted Google Drive.
project_path = "/content/drive/MyDrive/University/Winter 23 Courses/CSI 5180 - AI Virtual Assistants/Project"

# Saved artefacts used by the pipeline, both located under the project root.
kmeans_model_path = project_path + "/models/combined_approach/kmeans.pkl"
bert_model_path = project_path + "/models/bert_inscope_original_data/saved_models"

In [None]:
# Load the pickled clustering model. The file is opened in a `with` block so the
# handle is closed as soon as loading finishes.
# NOTE: pickle can execute arbitrary code while loading; only load files you trust.
with open(kmeans_model_path, 'rb') as kmeans_file:
    kmeans = pickle.load(kmeans_file)

# Sentence encoder used to embed queries before they are passed to kmeans.
mpnet = SentenceTransformer("all-mpnet-base-v2")

# Tokenizer and sequence classifier for the in-scope intent prediction.
# NOTE(review): presumably 'bert-large-uncased' is the tokenizer the saved classifier
# was fine-tuned with — confirm against the training notebook.
tokenizer = AutoTokenizer.from_pretrained('bert-large-uncased')
bert_model = AutoModelForSequenceClassification.from_pretrained(bert_model_path)
device = "cuda:0" if torch.cuda.is_available() else "cpu"
bert_model.to(device)
bert_model.eval()  # inference mode (disables dropout)

# Queries whose distance to the closest cluster exceeds this value are labelled 'oos'.
THRESHOLD = 0.95
df_test = pd.read_csv(f"{project_path}/data/original_dataset/original_test_data.csv", dtype={"query":"string","label":"string"})

# In-scope label names in sorted order; predict_inscope indexes this list with the
# classifier's argmax output.
# NOTE(review): assumes the classifier's class indices follow this sorted order — TODO confirm.
class_labels = sorted(df_test["label"].unique())
class_labels.remove('oos')

In [6]:
def compute_distance_of_closest_cluster(df):
    '''
    Add a 'min_distances' column to df and return df.

    Each query in the 'query' column is encoded with mpnet, the embeddings are passed to
    kmeans.transform, and the row-wise minimum of the result is stored in 'min_distances'.
    The input frame is modified in place.
    '''
    query_embeddings = mpnet.encode(df["query"].values)
    distances_to_clusters = kmeans.transform(query_embeddings)
    df["min_distances"] = distances_to_clusters.min(axis=1)
    return df

In [7]:
def predict_oos(df):
    '''
    Flag each query as out-of-scope ('oos') or 'in-scope'.

    input df is a Pandas dataframe with the text input under the column named 'query'.

    The frame is modified in place and also returned, with three added columns:
      - 'mpnet_embeddings_sentence': the mpnet embedding of each query
      - 'min_distances': the smallest value per row of kmeans.transform(embeddings)
      - 'pred_1': "oos" when min_distances > THRESHOLD, otherwise "in-scope"
    '''
    # Encode the queries once and reuse the embeddings for both the stored column and
    # the cluster distances (encoding is the expensive step, so it is not repeated).
    embedding_matrix = mpnet.encode(df["query"].values)

    # One embedding vector per row.
    df["mpnet_embeddings_sentence"] = list(embedding_matrix)
    df["min_distances"] = kmeans.transform(embedding_matrix).min(axis=1)
    df["pred_1"] = df["min_distances"].apply(
        lambda distance: "oos" if distance > THRESHOLD else "in-scope"
    )

    return df

In [47]:
def predict_inscope(df, single_instance):
    '''
    Predict the final intent of every query and store it in the 'pred_2' column.

    df is a Pandas dataframe with the text input under 'query' and the output of
    predict_oos under 'pred_1'. Rows whose 'pred_1' is 'oos' get 'oos'; all other rows
    get class_labels[argmax of the BERT logits].

    single_instance: when truthy, all queries go through the model in one forward pass;
    otherwise they are processed in batches of 64.

    The frame is modified in place and also returned.
    '''
    query_list = df["query"].to_list()
    encoded_query_list = tokenizer(query_list, padding=True, truncation=True, return_tensors='pt').to(device)
    input_ids = encoded_query_list['input_ids']
    token_type_ids = encoded_query_list['token_type_ids']
    attention_mask = encoded_query_list['attention_mask']

    if single_instance:
        with torch.no_grad():
            logits = bert_model(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)[0]
        predictions = torch.argmax(logits, dim=1).tolist()
    else:
        input_data = TensorDataset(input_ids, token_type_ids, attention_mask)
        dataloader = DataLoader(input_data, batch_size=64)

        predictions = []
        for batch_input_ids, batch_token_type_ids, batch_attention_mask in dataloader:
            with torch.no_grad():
                logits = bert_model(input_ids=batch_input_ids, token_type_ids=batch_token_type_ids, attention_mask=batch_attention_mask)[0]
            predictions.extend(torch.argmax(logits, dim=1).tolist())

    # predictions is ordered like the rows of df, so pair them by position rather than
    # by index label: a frame that was filtered, shuffled or otherwise lacks a 0..n-1
    # index would be misaligned (or raise) if the label were used as a list position.
    is_oos = (df['pred_1'] == 'oos').to_list()
    df['pred_2'] = [
        'oos' if oos_flag else class_labels[class_index]
        for oos_flag, class_index in zip(is_oos, predictions)
    ]

    return df

In [49]:
def predict_intent(input_data):
    '''
    Run the combined pipeline (predict_oos followed by predict_inscope) and print the result.

    input_data is either
      - a str: a single query; the query and its predicted intent are printed, or
      - a Pandas dataframe with 'query' and 'label' columns: predictions are added to the
        frame (it is modified in place by the pipeline) and two classification reports
        are printed, one over all intents and one for the binary oos / in-scope decision.

    Returns None.
    Raises KeyError if a dataframe without a 'label' column is passed.
    '''
    if isinstance(input_data, str):
        df = pd.DataFrame([[input_data]], columns=['query'])
        df = predict_oos(df)
        # Use the returned frame instead of relying on in-place mutation.
        df = predict_inscope(df, single_instance=True)
        print(f"\nInput query: {df['query'].iloc[0]}")
        print(f"Predicted intent: {df['pred_2'].iloc[0]}")
        return

    # Fail before the expensive model inference if the ground truth is missing.
    if "label" not in input_data.columns:
        raise KeyError("input dataframe must contain a 'label' column to build the reports")

    df = predict_oos(input_data)
    # Pass on the frame produced by predict_oos so the 'pred_1' column is guaranteed present.
    df = predict_inscope(df, single_instance=False)

    report = classification_report(df["label"], df["pred_2"])
    print(f"\nClassification report for test queries:")
    print(report)

    # Collapse the ground-truth labels to the same two classes as 'pred_1'.
    df["binary_label"] =  df.label.apply(lambda x: "oos" if x=="oos" else "in-scope")
    oos_report = classification_report(df["binary_label"], df["pred_1"])
    print(f"\nBinary Classification report (oos or inscope):")
    print(oos_report)

## Evaluate Test dataset on the Combined Approach

In [20]:
# Run the full pipeline on the labelled test set and print both classification reports.
# Note: the prediction columns are written onto df_test itself.
predict_intent(df_test)


Classification report for test queries:
                           precision    recall  f1-score   support

      accept_reservations       0.94      0.97      0.95        30
          account_blocked       0.90      0.93      0.92        30
                    alarm       1.00      0.97      0.98        30
       application_status       1.00      1.00      1.00        30
                      apr       0.94      1.00      0.97        30
            are_you_a_bot       0.94      1.00      0.97        30
                  balance       0.91      1.00      0.95        30
             bill_balance       0.90      0.87      0.88        30
                 bill_due       0.81      0.97      0.88        30
              book_flight       0.91      0.97      0.94        30
               book_hotel       0.94      1.00      0.97        30
               calculator       0.85      0.93      0.89        30
                 calendar       0.93      0.90      0.92        30
          calendar_u

## Evaluate the Combined Approach on test sentences

In [52]:
# Interactive demo: keep classifying typed queries until an empty line is entered.
while True:
    user_input = input("\nEnter a query (or press Enter to quit): ")
    if user_input == "":
        break
    predict_intent(user_input)


Enter a query (or press Enter to quit): call kevin now please

Input query: call kevin now please
Predicted intent: make_call

Enter a query (or press Enter to quit): make me some soup

Input query: make me some soup
Predicted intent: meal_suggestion

Enter a query (or press Enter to quit): build me a table

Input query: build me a table
Predicted intent: restaurant_reservation

Enter a query (or press Enter to quit): can you design and build a desk for me

Input query: can you design and build a desk for me
Predicted intent: oos

Enter a query (or press Enter to quit): I need to remember to make lunch

Input query: I need to remember to make lunch
Predicted intent: reminder_update

Enter a query (or press Enter to quit): 
