## Predictions
- This notebook combines all generated predictions and generates the final predictions.

> **NOTE** Before running this code, make sure there are no `dev_all_pred.txt` or `test_all_pred.txt` files in the `predictions` folder.

In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score

In [2]:
def combine_df(left_df, right_df, lsuffix=None):
    '''Join right_df onto left_df and discard the left-hand copies of the shared columns.

    left_df -- dataframe whose 'index' column is matched against right_df's index
    right_df -- dataframe to attach; it keeps its own 'index' and 'utts' columns
    lsuffix -- suffix given to the overlapping columns coming from left_df
    returns -- the joined dataframe without the suffixed 'index' / 'utts' columns
    '''
    # After the join the left-hand duplicates carry the suffix; those are the ones removed.
    suffixed_duplicates = [f"{shared}{lsuffix}" for shared in ("index", "utts")]
    joined = left_df.join(right_df, on='index', lsuffix=lsuffix)
    return joined.drop(columns=suffixed_duplicates)

In [3]:
# Prediction CSVs for the dev set, one file per predictor. They are passed to
# combine_all_predictions; the paths are relative to the working directory.
dev_attr_pred = '../predictions/dev_hotel_attr_pred.csv'  # hotel attributes
dev_BERT_pred = '../predictions/dev_BERT_pred.csv'  # BERT slot filling
dev_area_pred = '../predictions/dev_area_pred.csv'  # restaurant / hotel area
dev_type_pred = '../predictions/dev_hotel_type_pred.csv'  # hotel type
dev_price_pred = '../predictions/dev_price_pred.csv'  # price range

In [4]:
def combine_all_predictions(attr_pred, BERT_pred, area_pred, type_pred, price_pred):
    '''
    Load every prediction file and merge them into a single dataframe.

    attr_pred -- path to the hotel attribute prediction CSV
    BERT_pred -- path to the BERT slot filling prediction CSV
    area_pred -- path to the area prediction CSV
    type_pred -- path to the hotel type prediction CSV
    price_pred -- path to the pricerange prediction CSV
    combined_df (out) -- dataframe holding all predictions, with the
                         'index' and 'utts' helper columns removed

    Side effect: prints combined_df.info().
    '''
    # Rule-based hotel attributes: this frame is the left-most side of the join chain.
    hotel_df = pd.read_csv(attr_pred, index_col=0).reset_index()

    # BERT slot filling predictions are read without designating an index column.
    slot_df = pd.read_csv(BERT_pred)

    # Area predictions: keep the utterance plus the two predicted columns and
    # drop the "pred-" prefix from their names.
    area_columns = ['utts', 'pred-restaurant-area', 'pred-hotel-area']
    area_renames = {"pred-restaurant-area": "restaurant-area", "pred-hotel-area": "hotel-area"}
    area_df = (
        pd.read_csv(area_pred, index_col=0)[area_columns]
        .reset_index()
        .rename(columns=area_renames)
    )

    # Hotel type and price range predictions.
    hotel_type_df = pd.read_csv(type_pred, index_col=0).reset_index()
    price_df = pd.read_csv(price_pred, index_col=0).reset_index()

    # Attach the frames one at a time, starting from the hotel attributes.
    combined_df = combine_df(hotel_df, slot_df, lsuffix='_hotel')
    for next_df in (area_df, hotel_type_df, price_df):
        combined_df = combine_df(combined_df, next_df, lsuffix='_combined')

    # The join helper columns are not part of the final predictions.
    combined_df = combined_df.drop(columns=['index', 'utts'])
    combined_df.info()

    return combined_df

In [5]:
# Merge the five dev prediction files into one frame; the call also prints the frame's info() summary.
dev_combined_df = combine_all_predictions(dev_attr_pred, dev_BERT_pred, dev_area_pred, dev_type_pred, dev_price_pred)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 413 entries, 0 to 412
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   hotel-internet         43 non-null     object 
 1   hotel-parking          42 non-null     object 
 2   hotel-stars            34 non-null     float64
 3   hotel-name             52 non-null     object 
 4   restaurant-food        103 non-null    object 
 5   restaurant-name        46 non-null     object 
 6   restaurant-area        83 non-null     object 
 7   hotel-area             36 non-null     object 
 8   hotel-type             33 non-null     object 
 9   hotel-pricerange       37 non-null     object 
 10  restaurant-pricerange  84 non-null     object 
dtypes: float64(1), object(10)
memory usage: 35.6+ KB


In [6]:
# Load the dev intent predictions, one whitespace-stripped entry per line of the file.
with open('../predictions/1_dev_binary_intent_pred.txt') as intent_file:
    intents = list(map(str.strip, intent_file))

In [7]:
def write_to_ans_format(df, intents, out_file, mode='w'):
    '''
    Convert the combined predictions into the expected answer format and write them to a file.

    Each output line starts with "find_restaurant" or "find_hotel" (nothing for any
    other intent), followed by "|slot=value" for every non-missing slot of the row,
    with the slots in alphabetical order.

    df -- combined dataframe to be used for generating answers
    intents -- a list of intent predictions, paired with the rows of df in order;
               pairing stops at the shorter of the two
    out_file -- the name of the file to store output answers
    mode -- file open mode. The default 'w' overwrites out_file, so re-running the
            notebook does not duplicate answers; pass 'a' to append instead.
    '''
    intent_prefix = {'restaurant': 'find_restaurant', 'hotel': 'find_hotel'}

    # Open the file once for the whole frame instead of re-opening it for every row.
    with open(out_file, mode) as fout:
        for (_, row), intent in zip(df.iterrows(), intents):
            slots = {}
            for attr, val in row.items():
                # Skip missing predictions (NaN / None / pd.NA).
                if pd.isna(val):
                    continue
                # Stars are stored as floats because the column contains NaN;
                # the answer format wants a plain integer.
                if attr == 'hotel-stars':
                    val = int(val)
                slots[attr] = val

            line = intent_prefix.get(intent, '')
            line += ''.join(f'|{attr}={slots[attr]}' for attr in sorted(slots))
            fout.write(line + '\n')

In [8]:
# Write one answer line per row of dev_combined_df to dev_all_pred.txt.
write_to_ans_format(dev_combined_df, intents, '../predictions/dev_all_pred.txt')

## Calculate development accuracy score

In [9]:
# Read the dev answers back from disk, one whitespace-stripped line per prediction.
with open("../predictions/dev_all_pred.txt") as pred_file:
    dev_predictions = list(map(str.strip, pred_file))

In [10]:
woz_directory = 'Multiwoz/'
# Reference answers for the dev set; dev_predictions must already have been
# loaded from dev_all_pred.txt by the cell above.
with open("../" + woz_directory + "WOZ_dev_ans.txt") as f:
    dev_correct = [answer.strip() for answer in f]
# accuracy_score takes (y_true, y_pred): the reference answers go first.
# The score is the fraction of lines that match exactly.
dev_accuracy = accuracy_score(dev_correct, dev_predictions)
assert dev_accuracy > 0.7, f"dev accuracy {dev_accuracy:.4f} is not above the 0.7 threshold"
print("Success!")

Success!


In [11]:
# Display the exact dev accuracy: the share of predicted lines identical to the reference answer lines.
accuracy_score(dev_predictions, dev_correct)

0.8450363196125908

## Prediction on test set

In [12]:
# Prediction CSVs for the test set, one file per predictor. They are passed to
# combine_all_predictions; the paths are relative to the working directory.
test_attr_pred = '../predictions/test_hotel_attr_pred.csv'  # hotel attributes
test_BERT_pred = '../predictions/test_BERT_pred.csv'  # BERT slot filling
test_area_pred = '../predictions/test_area_pred.csv'  # restaurant / hotel area
test_type_pred = '../predictions/test_hotel_type_pred.csv'  # hotel type
test_price_pred = '../predictions/test_price_pred.csv'  # price range

In [13]:
# Merge the five test prediction files into one frame; the call also prints the frame's info() summary.
test_combined_df = combine_all_predictions(test_attr_pred, test_BERT_pred, test_area_pred, test_type_pred, test_price_pred)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   hotel-internet         35 non-null     object 
 1   hotel-parking          34 non-null     object 
 2   hotel-stars            38 non-null     float64
 3   hotel-name             42 non-null     object 
 4   restaurant-food        106 non-null    object 
 5   restaurant-name        35 non-null     object 
 6   restaurant-area        102 non-null    object 
 7   hotel-area             35 non-null     object 
 8   hotel-type             27 non-null     object 
 9   hotel-pricerange       38 non-null     object 
 10  restaurant-pricerange  86 non-null     object 
dtypes: float64(1), object(10)
memory usage: 34.5+ KB


In [14]:
# Load the test intent predictions, one whitespace-stripped entry per line of the file.
with open('../predictions/2_test_binary_intent_pred.txt') as intent_file:
    test_intents = list(map(str.strip, intent_file))

In [15]:
# Write one answer line per row of test_combined_df to test_all_pred.txt.
write_to_ans_format(test_combined_df, test_intents, '../predictions/test_all_pred.txt')

#### Output `test_all_pred` in Kaggle CSV format

In [16]:
# Read the test answers back from disk, one whitespace-stripped line per prediction.
with open("../predictions/test_all_pred.txt") as pred_file:
    test_predictions = list(map(str.strip, pred_file))

In [17]:
# Export the test answers as a two-column CSV: the zero-based line number as ID
# and the answer string as a quoted Expected field.
header = 'ID,Expected\n'
with open("../kaggle_answers.csv", "w", encoding="utf-8") as fout:
    fout.write(header)
    for line_num, answer in enumerate(test_predictions):
        # The answer is written inside double quotes, so any double quote it
        # contains must be doubled to keep the CSV row well-formed.
        quoted_answer = answer.replace('"', '""')
        fout.write(f'{line_num},"{quoted_answer}"\n')