# Rule-based Approach
- These predictions use the intent predictions from the previous step.
- Aspects:
    - `hotel-stars`, `hotel-internet`, `hotel-parking` (success)
    - `hotel-area`, `restaurant-area` (success)
    - `hotel-type` (fail)

In [1]:
import pandas as pd
import numpy as np
import nltk
import nltk
from sklearn.metrics import accuracy_score
import warnings
# Silence every Python warning for the rest of the notebook.
# NOTE(review): this may also hide pandas warnings raised by later cells.
warnings.filterwarnings("ignore")
# Download the NLTK "punkt" tokenizer data (no-op when it is already present).
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/jeremy.zhang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# Load the training split and preview its first rows.
train_df = pd.read_csv('../data/dioData_train.csv')
train_df.head()

Unnamed: 0,find_hotel,find_restaurant,utts,hotel-area,hotel-internet,hotel-parking,hotel-name,hotel-pricerange,hotel-type,hotel-stars,restaurant-food,restaurant-pricerange,restaurant-area,restaurant-name
0,1.0,,"Guten Tag, I am staying overnight in Cambridge...",centre,yes,yes,,,,,,,,
1,1.0,,Hi there! Can you give me some info on Cityroomz?,,,,cityroomz,,,,,,,
2,1.0,,I am looking for a hotel named alyesbray lodge...,,,,alyesbray lodge guest house,,,,,,,
3,,1.0,I am looking for a restaurant. I would like so...,,,,,,,,chinese,cheap,,
4,,1.0,I'm looking for an expensive restaurant in the...,,,,,,,,,expensive,centre,


In [3]:
# Non-null count per column, i.e. how many rows carry a value for each label.
train_df.count()

find_hotel               1609
find_restaurant          2151
utts                     3760
hotel-area                326
hotel-internet            349
hotel-parking             335
hotel-name                292
hotel-pricerange          329
hotel-type                537
hotel-stars               355
restaurant-food          1103
restaurant-pricerange    1047
restaurant-area          1009
restaurant-name           264
dtype: int64

#### Integrate train_data with the train_intent_predictions

In [4]:
def read_intent_pred_and_combine(df, file_path):
    '''Return a copy of ``df`` with the predicted intents added as column ``intent``.

    The file at ``file_path`` is read as UTF-8 text with one predicted intent
    per line; line ``i`` is attached to row ``i`` of ``df`` (by position).
    The input frame is left untouched, so re-running a cell that calls this
    function always starts from the same data.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding one utterance per row.
    file_path : str or path-like
        Text file with one intent prediction per line.

    Returns
    -------
    pandas.DataFrame
        New frame with all columns of ``df`` plus ``intent``.

    Raises
    ------
    ValueError
        If the number of predictions differs from the number of rows.
    '''
    with open(file_path, "r", encoding="utf-8") as f:
        intent = f.read().splitlines()

    # Fail with an explicit message instead of pandas' generic length error.
    if len(intent) != len(df):
        raise ValueError(
            f"{file_path} holds {len(intent)} predictions "
            f"but the dataframe has {len(df)} rows"
        )

    # ``assign`` builds a new frame rather than mutating the caller's one.
    return df.assign(intent=intent)

In [5]:
# Attach the intent predictions from the previous step to the training data.
train_df = read_intent_pred_and_combine(train_df, "../predictions/0_train_binary_intent_pred.txt")

In [6]:
# Preview the frame again to check the new `intent` column.
train_df.head()

Unnamed: 0,find_hotel,find_restaurant,utts,hotel-area,hotel-internet,hotel-parking,hotel-name,hotel-pricerange,hotel-type,hotel-stars,restaurant-food,restaurant-pricerange,restaurant-area,restaurant-name,intent
0,1.0,,"Guten Tag, I am staying overnight in Cambridge...",centre,yes,yes,,,,,,,,,hotel
1,1.0,,Hi there! Can you give me some info on Cityroomz?,,,,cityroomz,,,,,,,,hotel
2,1.0,,I am looking for a hotel named alyesbray lodge...,,,,alyesbray lodge guest house,,,,,,,,hotel
3,,1.0,I am looking for a restaurant. I would like so...,,,,,,,,chinese,cheap,,,restaurant
4,,1.0,I'm looking for an expensive restaurant in the...,,,,,,,,,expensive,centre,,restaurant


### Create functions for predictions

In [7]:
# Spelled-out star ratings ("zero" .. "five") mapped to their digit strings.
HOTEL_STAR_MAPS = {
    word: str(value)
    for value, word in enumerate(("zero", "one", "two", "three", "four", "five"))
}

In [8]:
# Phrases signalling that the user has no preference about an attribute,
# e.g. "don't care", "not picky", "not a necessity", "is optional".
DONTCARE = r"(((don't|doesn't|does not|do not) (need( to)?|care))|(not picky)|((not|n't) (a )?necessity)|(is optional))"
# Phrases signalling that the user explicitly rejects an attribute,
# e.g. "don't want", "hate", "not needed".
# NOTE(review): "don't need" is matched by both DONTCARE and NEGATION, so the
# outcome for such utterances depends on which pattern a caller checks first.
NEGATION = r"((don't (want|need))|dont (want|need)|(doesn't need)|(don't need)|hate|(not needed))"

In [9]:
import re
def match_attribute(regex, utt):
    '''Return the first substring of ``utt`` matched by ``regex``.

    The search is unanchored (``re.search``). ``None`` is returned when the
    pattern does not occur anywhere in the utterance.
    '''
    found = re.search(regex, utt)
    return found.group() if found else None

In [10]:
def calculate_accuracy(subset_df, pred_column, pred_func):
    '''Score ``pred_func`` against the gold labels stored in ``pred_column``.

    ``pred_func`` is applied to every utterance in the ``utts`` column and the
    result is stored in a new column ``pred-<pred_column>``. Missing values are
    replaced by the string "dontknow" before the two columns are compared.
    The input frame is not modified, and nothing is written to disk.

    Returns a tuple ``(accuracy, frame)`` where ``frame`` holds the columns
    ``utts``, ``<pred_column>`` and ``pred-<pred_column>``.
    '''
    assert "utts" in subset_df

    pred_name = f"pred-{pred_column}"
    scored = subset_df.copy()
    scored[pred_name] = scored["utts"].map(pred_func)
    # Keep only what is needed to inspect errors; NaN means "no value".
    scored = scored[["utts", pred_column, pred_name]].replace(np.nan, "dontknow")
    return accuracy_score(scored[pred_column], scored[pred_name]), scored

In [11]:
def predict_hotel_type(utt):
    '''Predict the hotel type mentioned in an utterance.

    Returns "guesthouse" when the utterance contains "guesthouse" or
    "guest house", otherwise "hotel" when it contains "hotel", otherwise
    ``None``. Matching is case-sensitive.
    '''
    # Both spellings collapse to the single label "guesthouse", and a
    # guesthouse mention takes priority over a "hotel" mention.
    if match_attribute(r"(guesthouse|guest house)", utt):
        return 'guesthouse'
    return match_attribute(r"hotel", utt)

In [12]:
def predict_hotel_internet(utt):
    """Predict the user's internet requirement from an utterance.

    The lower-cased utterance is split into clauses on commas and on the
    whole words "but" / "and". The first clause that mentions internet or
    wifi decides the result:

    * "dontcare" if the clause also contains a DONTCARE phrase,
    * "no" if it also contains a NEGATION phrase,
    * "yes" otherwise.

    Returns "dontknow" when no clause mentions internet.
    """
    # Word boundaries stop words such as "standard" or "button" from being cut
    # in half, which would separate a preference phrase from its keyword.
    clauses = re.split(r"\b(?:but|and)\b|,", utt.lower())
    for clause in clauses:
        # The clause is already lower-cased, so lower-case keywords suffice.
        if not match_attribute(r"(internet|wifi|wi-fi)", clause):
            continue
        # DONTCARE is checked before NEGATION for this attribute.
        if match_attribute(DONTCARE, clause):
            return "dontcare"
        if match_attribute(NEGATION, clause):
            return "no"
        return "yes"
    return "dontknow"

In [13]:
def predict_hotel_parking(utt):
    '''Predict the user's parking requirement from an utterance.

    The lower-cased utterance is split into clauses on commas and on the
    whole words "but" / "and". The first clause that mentions parking decides
    the result:

    * "no" if the clause also contains a NEGATION phrase,
    * "dontcare" if it also contains a DONTCARE phrase,
    * "yes" otherwise.

    Returns "dontknow" when no clause mentions parking.
    '''
    # Word boundaries stop words such as "standard" or "button" from being cut
    # in half, which would separate a preference phrase from its keyword.
    clauses = re.split(r"\b(?:but|and)\b|,", utt.lower())
    for clause in clauses:
        if not match_attribute(r'parking', clause):
            continue
        # NEGATION is checked before DONTCARE for this attribute.
        if match_attribute(NEGATION, clause):
            return "no"
        if match_attribute(DONTCARE, clause):
            return "dontcare"
        return "yes"
    return 'dontknow'

In [14]:
def predict_hotel_star(utt):
    """Predict the requested hotel star rating from an utterance.

    Recognises "<n> star", "<n>-star", "<n>star" and "star (rating) of <n>",
    where <n> is a digit or one of the words "zero" .. "five". Matching is
    case-sensitive.

    Returns the rating as a digit string "0" .. "5", or "dontknow" when the
    utterance contains no rating in that range.
    """
    star_pattern = (
        r"((\d|zero|one|two|three|four|five)(-| )?star)"
        r"|(star (rating )?of (\d|zero|one|two|three|four|five))"
    )
    for found in re.finditer(star_pattern, utt):
        # Group 2 holds the number in "<n> star", group 6 in "star ... of <n>".
        token = found.group(2) or found.group(6)
        # Number words map to digits; digits pass through unchanged.
        star = HOTEL_STAR_MAPS.get(token, token)
        # Skip out-of-range digits such as "7 star" instead of raising KeyError.
        if star in HOTEL_STAR_MAPS.values():
            return star
    return "dontknow"

In [15]:
def predict_area(utt):
    """Predict the requested area (hotel or restaurant) from an utterance.

    Uses a small gazetteer of area names. Areas are tried in the fixed order
    north, centre, east, west, south, and the first one that occurs in the
    utterance (lower-case or capitalised) is returned. If none occurs, the
    spellings "center", "Center" and "central" map to "centre".

    Matching is a case-sensitive substring test, so an area name inside a
    longer word also counts.

    Returns one of "north", "centre", "east", "west", "south" or "dontknow".
    """
    areas = ("north", "centre", "east", "west", "south")

    for area in areas:
        if area.capitalize() in utt or area in utt:
            return area

    # Alternative spellings of "centre".
    if "center" in utt or "Center" in utt or "central" in utt:
        return "centre"

    # The former "city centre north" -> "dontcare" branch and the
    # nltk.word_tokenize fallback for "east" were unreachable: any utterance
    # reaching them had already returned from the loop above.
    return "dontknow"

In [16]:
# Split the training data by predicted intent so that each rule is evaluated
# only on utterances predicted for its own domain.
hotel_df = train_df[train_df['intent'] == "hotel"]
restaurant_df = train_df[train_df['intent'] == "restaurant"]

In [17]:
# Training accuracy of the hotel-type rule ([0] selects the score).
calculate_accuracy(hotel_df, 'hotel-type', predict_hotel_type)[0]

0.6681168427594779

In [18]:
# Training accuracy of the hotel-internet rule ([0] selects the score).
calculate_accuracy(hotel_df, 'hotel-internet', predict_hotel_internet)[0]

0.9788688626476072

In [19]:
# Training accuracy of the hotel-parking rule ([0] selects the score).
calculate_accuracy(hotel_df, 'hotel-parking', predict_hotel_parking)[0]

0.9770043505282784

In [20]:
# Training accuracy of the hotel-stars rule ([0] selects the score).
calculate_accuracy(hotel_df, 'hotel-stars', predict_hotel_star)[0]

0.9937849596022374

In [21]:
# Training accuracy of the area rule on hotel utterances ([0] selects the score).
calculate_accuracy(hotel_df, 'hotel-area', predict_area)[0]

0.9801118707271598

In [22]:
# Training accuracy of the area rule on restaurant utterances ([0] selects the score).
calculate_accuracy(restaurant_df, 'restaurant-area', predict_area)[0]

0.9721059972105998

#### It turns out that hotel-type is not reliably predictable with regex (about 0.67 accuracy on the training data), whereas hotel-stars, hotel-parking, hotel-internet and the area attributes work really well with simple rules.

### Evaluation on Dev (hotel-parking, hotel-internet, hotel-stars)

In [23]:
# Load the dev split and attach the intent predictions from the previous step.
dev_df = pd.read_csv('../data/dioData_dev.csv')
dev_df = read_intent_pred_and_combine(dev_df, "../predictions/1_dev_binary_intent_pred.txt")

# Separate the development set by predicted intent.
hotel_dev_df = dev_df[dev_df['intent'] == "hotel"]
restaurant_dev_df = dev_df[dev_df['intent'] == "restaurant"]

X_train = hotel_df['utts']
X_dev = hotel_dev_df['utts']

# Rule-based predictions for the utterances predicted as hotel requests.
y_pred_hotel_parking = X_dev.map(predict_hotel_parking)
y_pred_hotel_internet = X_dev.map(predict_hotel_internet)
y_pred_hotel_stars = X_dev.map(predict_hotel_star)

# Missing gold labels are encoded as 'dontknow' so that they compare equal to
# the value the rules return when an attribute is not mentioned.
y_dev_hotel_parking = hotel_dev_df['hotel-parking'].replace(np.nan, 'dontknow')
y_dev_hotel_internet = hotel_dev_df['hotel-internet'].replace(np.nan, 'dontknow')
y_dev_hotel_stars = hotel_dev_df['hotel-stars']

print("hotel parking accuracy score: ", accuracy_score(y_dev_hotel_parking, y_pred_hotel_parking))
print("hotel internet accuracy score: ", accuracy_score(y_dev_hotel_internet, y_pred_hotel_internet))
# Star ratings are compared as integers; 6 stands in for "no rating" on both sides.
print("hotel stars accuracy score: ", accuracy_score(y_dev_hotel_stars.replace(np.nan, 6).apply(int), y_pred_hotel_stars.replace('dontknow', '6').apply(int)))

dev_pred_df = pd.DataFrame(
    {
        "utts": X_dev,
        "hotel-internet": y_pred_hotel_internet,
        "hotel-parking": y_pred_hotel_parking,
        "hotel-stars": y_pred_hotel_stars,
    }
)

# Left-join the hotel predictions onto every dev utterance by index; rows not
# predicted as hotel requests get NaN, as do 'dontknow' predictions.
empty_dev = dev_df[["utts"]]

combined_dev = empty_dev.join(dev_pred_df, rsuffix="_hotel").replace(
    "dontknow", np.nan
)

combined_dev.pop('utts_hotel')
# The output must hold exactly one row per dev utterance. Comparing against the
# loaded frame avoids hard-coding the size of one particular dev file.
assert len(combined_dev) == len(dev_df), (
    f"expected {len(dev_df)} rows, got {len(combined_dev)}"
)
print("success!")

hotel parking accuracy score:  0.9746192893401016
hotel internet accuracy score:  0.9898477157360406
hotel stars accuracy score:  1.0
success!


In [24]:
# Store the combined dev-set hotel attribute predictions as CSV.
combined_dev.to_csv('../predictions/dev_hotel_attr_pred.csv')

### Evaluation on Dev data (on hotel-area and restaurant area)



In [25]:
# Evaluate the shared area rule separately on the hotel and restaurant dev
# subsets; each call returns the accuracy and a frame with the predictions.
hotel_score, dev_hotel_pred = calculate_accuracy(hotel_dev_df, 'hotel-area', predict_area)
rest_score, dev_restaurant_pred = calculate_accuracy(restaurant_dev_df, 'restaurant-area', predict_area)

print("Accuracy score for hotel area on dev data", hotel_score)
print("Accuracy score for restaurant area on dev data", rest_score)

Accuracy score for hotel area on dev data 0.9949238578680203
Accuracy score for restaurant area on dev data 0.9629629629629629


In [26]:
# Join the restaurant and hotel area predictions onto every dev utterance by
# index, then turn 'dontknow' back into NaN.
dev_rest = dev_df[['utts']].join(dev_restaurant_pred, rsuffix = '_restaurant')
combined_dev = dev_rest.join(dev_hotel_pred, rsuffix = '_hotel').replace('dontknow', np.nan)
# Drop the duplicated utterance columns and the gold labels, keeping only the
# utterance and the two prediction columns.
combined_dev = combined_dev.drop(columns = ['utts_restaurant', 'restaurant-area', 'hotel-area', 'utts_hotel'])

# Store in a separate file for calculating the accuracy score.
# NOTE: `combined_dev` is rebound here; it no longer holds the hotel attribute
# predictions built in the earlier dev evaluation cell.
combined_dev.to_csv('../predictions/dev_area_pred.csv')

### Prediction on Test (hotel-internet, hotel-parking, hotel-stars)

In [27]:
# Load the test split and attach the intent predictions from the previous step.
test_df = pd.read_csv('../data/dioData_test.csv')
test_df = read_intent_pred_and_combine(test_df, "../predictions/2_test_binary_intent_pred.txt")

# Split the test data by predicted intent.
hotel_test_df = test_df[test_df['intent'] == 'hotel']
restaurant_test_df = test_df[test_df['intent'] == 'restaurant']

In [28]:
# Add the rule-based predictions with ``assign`` instead of writing columns
# into a filtered slice of ``test_df``. This builds a new frame, avoids the
# chained-assignment pattern and keeps the cell safe to re-run.
hotel_test_df = hotel_test_df.assign(
    **{
        "hotel-stars": hotel_test_df["utts"].map(predict_hotel_star),
        "hotel-internet": hotel_test_df["utts"].map(predict_hotel_internet),
        "hotel-parking": hotel_test_df["utts"].map(predict_hotel_parking),
    }
)

# Left-join onto every test utterance by index; rows not predicted as hotel
# requests get NaN, as do 'dontknow' predictions.
combined_test = test_df[['utts']].join(hotel_test_df, rsuffix="_hotel").replace(
    "dontknow", np.nan
)

# Store in a csv file.
combined_test[
    ["utts", "hotel-internet", "hotel-parking", "hotel-stars"]
].to_csv("../predictions/test_hotel_attr_pred.csv")

### Prediction on Test (hotel area, restaurant area)

In [29]:
# calculate_accuracy is used here only for the prediction frame it returns;
# the accuracy value is discarded.
_, hotel_test = calculate_accuracy(hotel_test_df, 'hotel-area', predict_area)
_, restaurant_test = calculate_accuracy(restaurant_test_df, 'restaurant-area', predict_area)

# Join both prediction frames onto every test utterance by index and turn
# 'dontknow' back into NaN.
test_rest = test_df[['utts']].join(restaurant_test, rsuffix = '_restaurant')
combined_test = test_rest.join(hotel_test, rsuffix = '_hotel').replace('dontknow', np.nan)
combined_test = combined_test[["utts", "pred-restaurant-area", "pred-hotel-area"]]

# Store in a csv file.
combined_test.to_csv('../predictions/test_area_pred.csv')