## Intent Classification: Simple ML approach

## Get Started

In [13]:
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np

## Load data

In [1]:
multiwoz_path = "../Multiwoz/"

In [2]:
def load_data(file_name):
  '''Given file name, read the file and return a list of lines'''
  with open(multiwoz_path+file_name, "r", encoding="utf-8") as f:
    data = f.read().splitlines()

  return data

In [3]:
X_train = load_data("WOZ_train_utt.txt")
y_train = load_data("WOZ_train_ans.txt")
X_dev = load_data("WOZ_dev_utt.txt")
y_dev = load_data("WOZ_dev_ans.txt")
X_test = load_data("WOZ_test_utt.txt")

In [4]:
X_train[:5]

['Guten Tag, I am staying overnight in Cambridge and need a place to sleep. I need free parking and internet.',
 'Hi there! Can you give me some info on Cityroomz?',
 'I am looking for a hotel named alyesbray lodge guest house.',
 'I am looking for a restaurant. I would like something cheap that has Chinese food.',
 "I'm looking for an expensive restaurant in the centre if you could help me."]

In [5]:
y_train[:5]

['find_hotel|hotel-area=centre|hotel-internet=yes|hotel-parking=yes',
 'find_hotel|hotel-name=cityroomz',
 'find_hotel|hotel-name=alyesbray lodge guest house',
 'find_restaurant|restaurant-food=chinese|restaurant-pricerange=cheap',
 'find_restaurant|restaurant-area=centre|restaurant-pricerange=expensive']

## Construct dataset for binary classification

In [6]:
def prepare_for_binary_classification(X, y):
  '''Split tags in y, pick only the first tag and remove `find_`'''
  y_binary = [answer.split("|")[0].split("_")[-1] for answer in y]

  return X, y_binary

In [7]:
X_binary_train, y_binary_train = prepare_for_binary_classification(X_train, y_train)
X_binary_dev, y_binary_dev = prepare_for_binary_classification(X_dev, y_dev)
X_binary_test = X_test

In [8]:
y_binary_train[:5]

['hotel', 'hotel', 'hotel', 'restaurant', 'restaurant']

## Baseline Score

In [9]:
def fit_transform_and_predict(X_train, y_train, X_dev, vec, cls):
  '''
  Generate predictions based on the arguments.
  X_train (list) : train data
  y_train (list) : train tags
  X_dev (list) : development (or test) data to be predicted
  vec : vectorizer
  cls : classifier
  '''
  # vectorize
  X_train_enc = vec.fit_transform(X_train, y_train)
  X_dev_enc = vec.transform(X_dev)

  # train
  cls.fit(X_train_enc, y_train)

  # predict
  preds = cls.predict(X_dev_enc)

  return preds

In [14]:
cls = DummyClassifier(strategy="most_frequent")
vec = CountVectorizer()

In [15]:
dev_pred = fit_transform_and_predict(X_binary_train, y_binary_train, X_binary_dev, vec, cls)

In [16]:
print("dev acc (Dummy): ", accuracy_score(y_binary_dev, dev_pred))

dev acc (Dummy):  0.5254237288135594


## Try different ML systems: SVC, Logistic Regression, and Random Forest

In [17]:
# svc
svc = SVC()
dev_svc_pred = fit_transform_and_predict(X_binary_train, y_binary_train, X_binary_dev, vec, svc)
print("dev acc (SVC): ", accuracy_score(y_binary_dev, dev_svc_pred))

dev acc (SVC):  0.9830508474576272


In [18]:
# lr
vec = CountVectorizer(ngram_range = (1, 1), 
                                        lowercase = True,
                                        stop_words = 'english')
lr = LogisticRegression()
dev_lr_pred = fit_transform_and_predict(X_binary_train, y_binary_train, X_binary_dev, vec, lr)
print("dev acc (lr): ", accuracy_score(y_binary_dev, dev_lr_pred))

dev acc (lr):  0.9951573849878934


In [19]:
# random forest

vec = CountVectorizer(ngram_range = (1, 2), 
                                        lowercase = True, 
                                        stop_words = 'english')
rf = RandomForestClassifier()
dev_rf_pred = fit_transform_and_predict(X_binary_train, y_binary_train, X_binary_dev, vec, rf)
print("dev acc (rf): ", accuracy_score(y_binary_dev, dev_lr_pred))

dev acc (rf):  0.9951573849878934


We chose `random forest classifier` for this task.

## Generate binary intent predictions for the development set and the test set

In [22]:
train_rf_pred = fit_transform_and_predict(X_binary_train, y_binary_train, X_binary_train, vec, rf)
dev_rf_pred = fit_transform_and_predict(X_binary_train, y_binary_train, X_binary_dev, vec, rf)
test_rf_pred = fit_transform_and_predict(X_binary_train, y_binary_train, X_binary_test, vec, rf)

In [23]:
assert len(dev_rf_pred) == 413
assert len(test_rf_pred) == 400
print("Success!")

Success!


In [24]:
with open("../predictions/0_train_binary_intent_pred.txt", "w", encoding="utf-8") as fout:
  for pred in train_rf_pred:
    fout.write(pred+"\n")

In [25]:
with open("../predictions/1_dev_binary_intent_pred.txt", "w", encoding="utf-8") as fout:
  for pred in dev_rf_pred:
    fout.write(pred+"\n")

In [26]:
with open("../predictions/2_test_binary_intent_pred.txt", "w", encoding="utf-8") as fout:
  for pred in test_rf_pred:
    fout.write(pred+"\n")

## Inspect weights of each feature

In [27]:
pipe_lr = make_pipeline(
    CountVectorizer(ngram_range = (1, 1), 
                                        lowercase = True,
                                        stop_words = 'english'),
    LogisticRegression()
)

pipe_lr.fit(X_binary_train, y_binary_train)
pipe_lr.score(X_binary_dev, y_binary_dev)

0.9951573849878934

In [28]:
weights = pipe_lr.named_steps['logisticregression'].coef_.flatten()
vocab = pipe_lr.named_steps['countvectorizer'].get_feature_names()
data={'features':vocab[-500:-480], 'coefficient':weights[-500:-480]}
inds = np.argsort(pipe_lr.named_steps['logisticregression'].coef_.flatten())

In [29]:
pipe_lr.classes_

array(['hotel', 'restaurant'], dtype='<U10')

In [30]:
hotal_words = [vocab[index] for index in inds[:20]]
restaurants_words = [vocab[index] for index in inds[-20:]]
hotel_words_weights = [(weights[index]) for index in inds[:20]]
restaurants_words_weights = [(weights[index]) for index in inds[-20:]]
pd.DataFrame(
    {
        "hotel feats": hotal_words, "hotel weights": hotel_words_weights,
        "restaurants feats": restaurants_words,"restaurants weights": restaurants_words_weights,
    })

Unnamed: 0,hotel feats,hotel weights,restaurants feats,restaurants weights
0,stay,-4.742314,italian,0.609503
1,hotel,-4.289174,called,0.615395
2,guesthouse,-3.356822,centre,0.61814
3,hotels,-2.094708,european,0.626479
4,star,-1.813542,india,0.627966
5,room,-1.761517,serves,0.69138
6,free,-1.686805,chinese,0.761186
7,guest,-1.581669,place,0.775645
8,guesthouses,-1.397516,expensive,0.806414
9,lodgings,-1.347547,indian,0.807875
