In [1]:
!pip install sentence_transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentence_transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 KB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.27.4-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m19.5 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m38.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub>=0.4.0
  Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1

In [6]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from sentence_transformers import SentenceTransformer, util
import pickle
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer
import torch
from torch.utils.data import DataLoader, TensorDataset

# Apply seaborn's default theme to any figures drawn later in the notebook.
sns.set_theme()
# Register tqdm with pandas so the progress_apply/progress_map methods become available.
tqdm.pandas()

# Modular functions for the end-to-end combined pipeline:

In [3]:
# Colab-only: mount Google Drive at /content/drive; the project paths used by
# this notebook live underneath that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Root folder of the project on the mounted Google Drive.
project_path = "/content/drive/MyDrive/University/Winter 23 Courses/CSI 5180 - AI Virtual Assistants/Project"

# Pickled KMeans model whose cluster distances drive the out-of-scope decision.
kmeans_model_path = f"{project_path}/models/combined_approach/kmeans.pkl"
# Directory of the saved sequence-classification checkpoint used for in-scope intents.
bert_model_path = f"{project_path}/models/bert_inscope_original_data/saved_models"

In [7]:
# SECURITY NOTE: pickle.load can execute arbitrary code stored in the file;
# only load a kmeans.pkl that you produced yourself.
# The context manager closes the file handle (a bare open() left it dangling).
with open(kmeans_model_path, 'rb') as kmeans_file:
    kmeans = pickle.load(kmeans_file)
# Sentence encoder whose embeddings are fed to the KMeans model.
mpnet = SentenceTransformer("all-mpnet-base-v2")

# NOTE(review): the tokenizer is pulled from the hub name 'bert-large-uncased'
# while the weights come from bert_model_path — confirm the two match.
tokenizer = AutoTokenizer.from_pretrained('bert-large-uncased')
bert_model = AutoModelForSequenceClassification.from_pretrained(bert_model_path)
device = "cuda:0" if torch.cuda.is_available() else "cpu"
bert_model.to(device)
bert_model.eval()  # inference mode: disables dropout

# Distance cut-off: a query whose nearest KMeans centre is farther than this
# is labelled "oos" by predict_oos.
THRESHOLD = 0.95
df_test = pd.read_csv(f"{project_path}/data/original_dataset/original_test_data.csv", dtype={"query":"string","label":"string"})
# In-scope label names; predict_inscope indexes this list with the model's argmax.
# NOTE(review): assumes the sorted order equals the label ids used when the
# checkpoint was trained — confirm.
class_labels = sorted(df_test["label"].unique())
class_labels.remove('oos')

In [31]:
def compute_distance_of_closest_cluster(df):
    '''
    Add a 'min_distances' column holding, for every row, the smallest value
    that kmeans.transform reports for the mpnet embedding of its 'query' text.

    input df is a Pandas dataframe with the text input under the column named 'query'.
    The dataframe is modified in place and also returned.
    '''
    query_embeddings = mpnet.encode(df["query"].values)
    # One row per query, one column per cluster; keep the per-row minimum.
    distances_to_clusters = kmeans.transform(query_embeddings)
    df["min_distances"] = distances_to_clusters.min(axis=1)
    return df

def predict_oos(df):
    '''
    Flag out-of-scope queries by their distance to the nearest KMeans cluster.

    input df is a Pandas dataframe with the text input under the column named 'query'

    Adds three columns to df (in place) and returns it:
      - 'mpnet_embeddings_sentence': the mpnet embedding of each query
      - 'min_distances': smallest value kmeans.transform gives for that embedding
      - 'pred_1': "oos" when min_distances > THRESHOLD, otherwise "in-scope"
    '''
    # Encode once and reuse the matrix for both the stored embeddings and the
    # cluster distances (previously every query was encoded a second time
    # inside compute_distance_of_closest_cluster).
    embedding_matrix = mpnet.encode(df["query"].values)

    # list() over a 2-D array yields one row vector per query.
    df["mpnet_embeddings_sentence"] = list(embedding_matrix)

    closest_distances = kmeans.transform(embedding_matrix).min(axis=1)
    df["min_distances"] = closest_distances
    df["pred_1"] = [
        "oos" if distance > THRESHOLD else "in-scope"
        for distance in closest_distances
    ]

    return df

def predict_inscope(df):
    '''
    Classify every query with the BERT model and write the final label to 'pred_2'.

    input df is a Pandas dataframe with the text input under the column named
    'query' and the out-of-scope decision from predict_oos under 'pred_1'.

    Rows whose 'pred_1' is "oos" get 'pred_2' = "oos"; every other row gets
    class_labels[argmax of the model logits for that row].
    The dataframe is modified in place and also returned.

    NOTE(review): assumes class_labels is ordered like the label ids the
    checkpoint was trained with — confirm.
    '''
    # Tokenize the rows of the frame that was passed in (the global df_test was
    # used here before, so any other input was silently ignored).
    query_list = df["query"].to_list()

    if not query_list:
        # Nothing to tokenize; still create the output column.
        df['pred_2'] = ''
        return df

    encoded_query_list = tokenizer(query_list, padding=True, truncation=True, return_tensors='pt').to(device)
    input_data = TensorDataset(
        encoded_query_list['input_ids'],
        encoded_query_list['token_type_ids'],
        encoded_query_list['attention_mask'],
    )
    # No shuffling, so predictions come back in the same order as the rows of df.
    dataloader = DataLoader(input_data, batch_size=64)

    predictions = []
    with torch.no_grad():
        for batch_input_ids, batch_token_type_ids, batch_attention_mask in dataloader:
            logits = bert_model(input_ids=batch_input_ids, token_type_ids=batch_token_type_ids, attention_mask=batch_attention_mask)[0]
            predictions.extend(torch.argmax(logits, dim=1).tolist())

    # Pair rows and predictions by position; indexing predictions with the
    # dataframe's index labels breaks on any non-default index.
    df['pred_2'] = [
        'oos' if oos_decision == 'oos' else class_labels[class_id]
        for oos_decision, class_id in zip(df['pred_1'], predictions)
    ]

    return df

def predict_intent(df):
    '''
    Run the combined pipeline on df: predict_oos first, then predict_inscope
    on its result, and return what predict_inscope returns.

    input df is a Pandas dataframe with the text input under the column named 'query'
    '''
    return predict_inscope(predict_oos(df))


In [32]:
# Run the full pipeline on the test set. The pipeline functions add their
# columns to the frame they are given and return that same frame, so
# df_results refers to df_test with the prediction columns attached.
df_results = predict_intent(df_test)

In [37]:
# Per-class precision/recall/F1 of the final prediction ('pred_2') against the gold 'label' column.
report = classification_report(df_results["label"], df_results["pred_2"])

In [39]:
# The report is a pre-formatted text table; print it so the line breaks render.
print(report)

                           precision    recall  f1-score   support

      accept_reservations       0.94      0.97      0.95        30
          account_blocked       0.90      0.93      0.92        30
                    alarm       1.00      0.97      0.98        30
       application_status       1.00      1.00      1.00        30
                      apr       0.94      1.00      0.97        30
            are_you_a_bot       0.94      1.00      0.97        30
                  balance       0.91      1.00      0.95        30
             bill_balance       0.90      0.87      0.88        30
                 bill_due       0.81      0.97      0.88        30
              book_flight       0.91      0.97      0.94        30
               book_hotel       0.94      1.00      0.97        30
               calculator       0.85      0.93      0.89        30
                 calendar       0.93      0.90      0.92        30
          calendar_update       0.97      0.97      0.97     

In [18]:
# Collapse the gold labels to the binary task: "oos" vs. everything else.
# df_results is the frame returned by predict_intent(df_test).
df_results["binary_label"] = df_results["label"].apply(lambda x: "oos" if x == "oos" else "in-scope")
# Score the binary out-of-scope decision, which predict_oos stores in 'pred_1';
# the pipeline never creates a column named 'pred', so the old lookup raised
# KeyError on a fresh Restart & Run All.
pd.DataFrame(classification_report(df_results["binary_label"], df_results["pred_1"], output_dict=True)).T

Unnamed: 0,precision,recall,f1-score,support
in-scope,0.950548,0.982444,0.966233,4500.0
oos,0.906949,0.77,0.832883,1000.0
accuracy,0.943818,0.943818,0.943818,0.943818
macro avg,0.928749,0.876222,0.899558,5500.0
weighted avg,0.942621,0.943818,0.941988,5500.0
