# WIP - Restaurant Recommendation Dialog System

## Members:
- Karpiński, R.R. (Rafał)
- Pavan, L. (Lorenzo)
- Rodrigues Luchetti, G.L. (Gustavo)
- Teunissen, N.D. (Niels)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from re import search

## Reading 'dialog_acts.dat' into dataframe

In [2]:
with open('dialog_acts.dat', 'r') as f:
    data = f.readlines()
    data = list(map(lambda x: x.rstrip("\n").split(" ", 1), data))
    
df = pd.DataFrame(np.array(data), columns = ['label', 'text'])

In [3]:
df.head(3)

Unnamed: 0,label,text
0,inform,im looking for a moderately priced restaurant ...
1,inform,any part of town
2,inform,bistro food


## Pre-Processing
Looking for null values, irrelevant or noisy text (literally, removing 'tv_noise' and 'noise') and repeated values. Formatting labels into numbers.

In [26]:
# transforming labels into numbers
df['label_id'] = df['label'].factorize()[0]
label_dict = df[['label','label_id']].drop_duplicates().set_index('label_id')
label_dict

Unnamed: 0_level_0,label
label_id,Unnamed: 1_level_1
0,inform
1,confirm
2,affirm
3,request
4,thankyou
5,
6,bye
7,reqalts
8,negate
9,hello


In [29]:
from sklearn.model_selection import train_test_split
# X - independent features (excluding target variable).
# y - dependent variables (target we're looking to predict).

X_train, X_test, y_train, y_test = train_test_split(
    df['text'], df['label_id'], test_size=0.15, random_state=10
)

print(X_train.head(3))
print()
print(y_train.head(3))

11472    moderately priced restaurant in the north part...
25465                                can i get the address
8867                                               address
Name: text, dtype: object

11472    0
25465    3
8867     3
Name: label_id, dtype: int64


## Building Baseline Systems

In [7]:
def keyword_matching(text):
    """
    Rule-based prediction of dialog acts based on utterances.
    Arguments:
        text: an utterance ('can i get the address').
        
    Returns:
        Returns a prediction for the given utterance ('request').
    """

    if search(r'\bhow about\b|\bwhat about\b|\banything else\b|\bare there\b|\bis there\b|\bwhat else\b', df.loc[i,'text']):
        return 'reqalts'
    elif search(r'\byes\b|\byeah\b|\bcorrect\b', text):
        return 'affirm'
    elif search(r'\bthank you\b', text):
        return 'thankyou'
    elif search(r'\bgoodbye\b|\bbye\b', text):
        return 'bye'
    elif search(r'\bdoes it\b|\bis it\b|\bdo they\b|\bis that\b|\bis there\b', text):
        return 'confirm'
    elif search(r'\bwhat is\b|\bwhats\b|\bmay i\b|\bcould i\b|\bwhat\b|\bprice range\b|\bpost code\b|\btype of\b|\baddress\b|\bphone number\b|\bcan i\b|\bcould i\b|\bcould you\b|\bdo you\b|\bi want+.address\b|\bi want+.phone\b|\bi would\b|\bwhere is\b', df.loc[i,'text']):
        return 'request'
    elif search(r'\bno\b|\bnot\b', text):
        return 'negate'
    elif search(r'\blooking for\b|\bdont care\b|\bdoesnt matter\b|\bexpensive\b|\bcheap\b|\bmoderate\b|\bi need\b|\bi want\b|\bfood\b|\bnorth\b', text):
        return 'inform'
    elif search(r'\bdont\b', text):
        return'deny'
    elif search(r'\bhello\b', text):
        predic'hello'
    elif search(r'\brepeat\b', text):
        predictions.append('repeat')
    elif search(r'\bmore\b', text):
        predictions.append('reqmore')
    elif search(r'\bstart\b', text):
        predictions.append('restart')
    elif search(r'\bokay\b|\bkay\b', text):
        predictions.append('ack')
    else:
        return 'inform'

In [8]:
def new_sentence(utterance):
    """
    Rule-based prediction of a dialog act based on a phrase.
    Arguments:
        utterance: string
        
    Returns:
        Returns the predicted dialog act.
    """
    utterance = utterance.lower()
    if search(r'\bhow about\b|\bwhat about\b|\banything else\b|\bare there\b|\bis there\b|\bwhat else\b', utterance):
        return 'reqalts'
    elif search(r'\byes\b|\byeah\b|\bcorrect\b',utterance):
        return 'affirm'
    elif search(r'\bthank you\b', utterance):
        return 'thankyou'
    elif search(r'\bgoodbye\b|\bbye\b', utterance):
        return 'bye'
    elif search(r'\bdoes it\b|\bis it\b|\bdo they\b|\bis that\b|\bis there\b', utterance):
        return 'confirm'
    elif search(r'\bwhat is\b|\bwhats\b|\bmay i\b|\bcould i\b|\bwhat\b|\bprice range\b|\bpost code\b|\btype of\b|\baddress\b|\bphone number\b|\bcan i\b|\bcould i\b|\bcould you\b|\bdo you\b|\bi want+.address\b|\bi want+.phone\b|\bi would\b|\bwhere is\b', utterance):
        return 'request'
    elif search(r'\bno\b|\bnot\b', utterance):
        return 'negate'
    elif search(r'\blooking for\b|\bdont care\b|\bdoesnt matter\b|\bexpensive\b|\bcheap\b|\bmoderate\b|\bi need\b|\bi want\b|\bfood\b|\bnorth\b',utterance):
        return 'inform'
    elif search(r'\bdont\b', utterance):
        return 'deny'
    elif search(r'\bhello\b|\bhi\b|\bhey\b', utterance):
        return 'hello'
    elif search(r'\brepeat\b', utterance):
        return 'repeat'
    elif search(r'\bmore\b', utterance):
        return 'reqmore'
    elif search(r'\bstart\b', utterance):
        return 'restart'
    elif search(r'\bokay\b|\bkay\b',utterance):
        return 'ack'
    else:
        return 'inform'
    

## Building Classifier Models

In [9]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

models = [
    RandomForestClassifier(n_estimators=100, max_depth=5, random_state=0),
    MultinomialNB(),
    LogisticRegression(random_state=0, max_iter=400),
]

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(sublinear_tf=True, # scale the words frequency in logarithmic scale
                        min_df=5, # remove the words which has occurred in less than ‘min_df’ number of files
                        ngram_range=(1, 2), # don't know what role n-grams play in vectorisation
                        stop_words='english', # it removes stop words which are predefined in ‘english’.
                        lowercase=True # everything to lowercase
                        )

X_train_tfidf = tfidf.fit_transform(X_train).toarray()
X_test_tfid = tfidf.fit_transform(X_test).toarray()

print(X_train_tfid)
print(y_train)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
11472    0
25465    3
8867     3
25417    4
24660    0
        ..
9372     6
7291     3
17728    3
7293     0
17673    7
Name: label_id, Length: 21675, dtype: int64


In [11]:
from sklearn.model_selection import cross_val_score

CV = 2
entries = []
cv_df = pd.DataFrame(index=range(CV * len(models)))

for model in models:
  model_name = model.__class__.__name__
  accuracies = cross_val_score(model, X_train_tfid, y_train, scoring='accuracy')
  for fold_idx, accuracy in enumerate(accuracies):
    entries.append((model_name, fold_idx, accuracy))
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])



In [12]:
mean_accuracy = cv_df.groupby('model_name').accuracy.mean()
std_accuracy = cv_df.groupby('model_name').accuracy.std()

acc = pd.concat([mean_accuracy, std_accuracy], axis= 1, 
          ignore_index=True)
acc.columns = ['Mean Accuracy', 'Standard deviation']

acc 

Unnamed: 0_level_0,Mean Accuracy,Standard deviation
model_name,Unnamed: 1_level_1,Unnamed: 2_level_1
LogisticRegression,0.88,0.002394
MultinomialNB,0.85707,0.00192
RandomForestClassifier,0.68286,0.025147


seems like LogisticRegression is the best or we can tweek the hyperparameters more. 

In [15]:
model = LogisticRegression(random_state=0, max_iter=400)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

ValueError: could not convert string to float: 'moderately priced restaurant in the north part of town'

## Evaluations

In [18]:
import sklearn.metrics

### Baseline Systems

In [16]:
def plot_confusion_matrix(labels,predictions):
    """Plots the confusion matrix
    Arguments:
    labels: array-like of shape (n_samples,)
    predictions: array-like of shape (n_samples,)
    Returns
    -------
    plot
        plots the confusion matrix
    """
    plt.rcParams.update(plt.rcParamsDefault)

    plt.rcParams['figure.figsize'] = [10, 10]
    plt.rcParams['font.size'] = 8
    sklearn.metrics.ConfusionMatrixDisplay.from_predictions(labels,predictions)

    plt.show()

In [17]:
predictions = (rules(df))
def baselineAccuracy(predictions, df):
    """Calculates the accuracy
        Arguments:
        predictions: list
        df: a pandas dataframe that contains a column named text with utterances.
        Returns
        -------
    Returns:
        Returns the accuracy
    """
    count = 0
    for i in range(0,len(predictions)):
        
        if(predictions[i].lower() == df.loc[i,'label'].lower()):
            count += 1
    return "Accuracy: "+str(round(count / len(predictions)*100,1))+"%"

NameError: name 'rules' is not defined

In [None]:
baselineAccuracy(predictions, df)

In [None]:
plot_confusion_matrix(df['label'],predictions)

In [None]:
 def metrics_overview(labels, predictions):
        """Prints metrics
        Arguments:
        labels: array-like of shape (n_samples,)
        predictions: array-like of shape (n_samples,)
        
        Prints different metrics related to the confusion matrix.
        """
        edges_confusion_matrix = sklearn.metrics.confusion_matrix(labels,predictions)

        FP = edges_confusion_matrix.sum(axis=0) - np.diag(edges_confusion_matrix)  
        
        FN = edges_confusion_matrix.sum(axis=1) - np.diag(edges_confusion_matrix)
        
        TP = np.diag(edges_confusion_matrix)
        
        TN = edges_confusion_matrix.sum() - (FP + FN + TP)
        
        
        # Sensitivity, hit rate, recall, or true positive rate
        TPR = TP/(TP+FN)
        print('TPR',TPR)
        print('Average TPR',np.average(TPR))
        print('_______________________________')
        # Specificity or true negative rate
        TNR = TN/(TN+FP)
        print('TNR',TNR)
        print('Average TNR',np.average(TNR))
        print('_______________________________')

        # Precision or positive predictive value
        PPV = TP/(TP+FP)
        print('PPV',PPV)
        print('Average PPV',np.average(PPV))
        print('_______________________________')

        # Negative predictive value
        NPV = TN/(TN+FN)
        print('NPV',NPV)
        print('Average NPV',np.average(NPV))
        print('_______________________________')

        # Fall out or false positive rate
        FPR = FP/(FP+TN)
        print('FPR',FPR)
        print('Average FPR',np.average(FPR))
        print('_______________________________')

        # False negative rate
        FNR = FN/(TP+FN)
        print('FNR',FNR)
        print('Average FNR',np.average(FNR))
        print('_______________________________')

        # False discovery rate
        FDR = FP/(TP+FP)
        print('FDR',FDR)
        print('Average FDR',np.average(FDR))
        print('_______________________________')

        # Overall accuracy
        ACC = (TP+TN)/(TP+FP+FN+TN)
        print('ACC',ACC)
        print('Average ACC',np.average(ACC))
        print('_______________________________')

        F1 = 2*((PPV*TPR)/(PPV+TPR))
        F1 = F1[~np.isnan(F1)]
        print('F1',F1)
        print('Average F1',np.average(F1))
        print('_______________________________')
        print((FP+FN)/(TP+FP+FN+TN))
        
metrics_overview(df['label'],predictions)

### Proper Models (Random Forest, Multinomial NB, Logistic Regression)

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn import metrics

print(metrics.classification_report(label_test, label_pred, 
                                    target_names= df['label'].unique()))

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt 

conf_mat = confusion_matrix(label_test, label_pred)
plt.figure(figsize = (20,5))
sns.heatmap(conf_mat, annot=True, cmap='Greens', fmt='d',
            xticklabels=label_dict.label.values, 
            yticklabels=label_dict.label.values)
plt.ylabel('Actual')
plt.xlabel('Predicted')

In [None]:
df.describe()

In [None]:
df.groupby('label').describe().sort_values(('text','count'))