In [None]:
!pip install git+https://github.ibm.com/Kalyan-Dutia/encoders
!pip install scikit-learn
!pip install pandas
!pip install numpy

# Finding Similar Utterance Pairs / Identifying Mislabeled Utterances
This notebook shows how you can use the sentence vectors produced by the *encoders* library to find similar pairs of utterances.

At a high level this works in exactly the same way as the Watson Assistant feature. The main difference is likely to be a slightly different *encoder* used to transform utterances into vectors.

In [None]:
from encoders import encoders
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd
import os

import config

In [None]:
# ADD FILENAME HERE (path relative to the data folder, i.e. `config.data_dir`)
# The file should be a CSV with two columns (utterance, intent) and no header row.
filename = "workspace_training/puppy_questions.csv"

Note that the similarity matrix itself is obtained from the utterances in just two lines of code (the encoder call and `cosine_similarity`); everything after `similarity_mat` only turns that matrix into a readable table of similar utterance pairs.

In [None]:
def find_similar_pairs(df, diff_intents=True, threshold=0.8):
    """
    Given a dataframe with `utterance, intent` columns, returns a dataframe with all pairs of utterances with a similarity above the threshold.
    With diff_intents=True this becomes a function for identifying potentially mislabelled utterances - those which are similar, but have different intent labels.

    Each unordered pair is reported once, with "utterance 1" being the row that appears earlier in `df`.
    Rows are addressed by position, so `df` may carry any index (filtered, shuffled, non-integer, ...).

    :param df: dataframe containing `utterance, intent` columns
    :param diff_intents: whether to only return similar utterances which have different intent labels
    :param threshold: the minimum similarity of utterance pairs to return (strictly greater than)
    :return: dataframe with columns `utterance 1, intent 1, utterance 2, intent 2, similarity`,
        sorted by similarity in descending order; empty (with the same columns) when `df` has
        fewer than two rows or no pair exceeds the threshold
    """
    columns = ['utterance 1', 'intent 1', 'utterance 2', 'intent 2', 'similarity']

    # Fewer than two utterances cannot form a pair, so skip the encoder entirely.
    if len(df) < 2:
        return pd.DataFrame(columns=columns)

    print("Creating embedding matrix..")
    # assumes `encoders` returns one embedding vector per input utterance - TODO confirm
    embedding_mat = encoders(df['utterance'].tolist())

    print("Finding similar pairs...")
    similarity_mat = np.asarray(cosine_similarity(embedding_mat))

    # A pair qualifies if either orientation exceeds the threshold; keeping only the
    # strict upper triangle (k=1) drops self-matches and mirrored duplicates in one step.
    above = similarity_mat > threshold
    first, second = np.where(np.triu(above | above.T, k=1))

    # Positional lookups: `first`/`second` are matrix positions, not index labels.
    utterances = df['utterance'].to_numpy()
    intents = df['intent'].to_numpy()

    # Build the result in one shot instead of appending row by row (DataFrame.append
    # was removed in pandas 2.0 and was quadratic anyway).
    sim_df = pd.DataFrame(
        {
            'utterance 1': utterances[first],
            'intent 1': intents[first],
            'utterance 2': utterances[second],
            'intent 2': intents[second],
            'similarity': similarity_mat[first, second],
        },
        columns=columns,
    )

    if diff_intents:
        sim_df = sim_df[sim_df['intent 1'] != sim_df['intent 2']]

    return sim_df.sort_values('similarity', ascending=False)
    

In [None]:
# Read the labelled utterances; the CSV has no header row, so name the columns here.
df = pd.read_csv(
    os.path.join(config.data_dir, filename),
    header=None,
    names=['utterance', 'intent'],
)

# diff_intents=False keeps every similar pair, including pairs that share an intent label.
sim_df = find_similar_pairs(
    df,
    diff_intents=False,
    threshold=0.8,
)

In [None]:
# Show the similar pairs; find_similar_pairs returns them sorted with the most similar first.
sim_df