## ML-approach to hotel-type, restaraunt-pricerange, and hotel-pricerange

## 1. Hotel-type predictions

In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score

In [2]:
def read_df_with_pred_intent(df_filename, intent_filename):
    '''combine intent predictions with the given dataframe'''
    df = pd.read_csv(df_filename)
    with open(intent_filename) as f:
        binary = f.read().splitlines()
    df['intent'] = binary
    return df

In [3]:
train_df = read_df_with_pred_intent('dioData_train.csv', 'train_binary_intent_pred.txt')
dev_df = read_df_with_pred_intent('dioData_dev.csv', 'dev_binary_intent_pred.txt')

In [4]:
hotel_train_df = train_df[train_df['intent'] == 'hotel']
hotel_dev_df = dev_df[dev_df['intent'] == 'hotel']

In [5]:
X_train = hotel_train_df['utts']
y_train = hotel_train_df['hotel-type'].replace(np.nan, 'dontknow')
X_dev = hotel_dev_df['utts']
y_dev = hotel_dev_df['hotel-type'].replace(np.nan, 'dontknow')

### Baseline DummyClassifier

In [6]:
pipe = make_pipeline(CountVectorizer(), DummyClassifier())

In [7]:
pipe.fit(X_train, y_train)



Pipeline(steps=[('countvectorizer', CountVectorizer()),
                ('dummyclassifier', DummyClassifier())])

In [8]:
pipe.score(X_train, y_train)

0.4872591671845867

### SVC

In [9]:
pipe_svc = make_pipeline(CountVectorizer(), SVC())
pipe_svc.fit(X_train, y_train)
print("training score: ", pipe_svc.score(X_train, y_train))
print("validation score: ", pipe_svc.score(X_dev, y_dev))

training score:  0.8831572405220633
validation score:  0.8686868686868687


### Random Forest

In [10]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier
pipe_rf = make_pipeline(CountVectorizer(ngram_range = (1, 3), 
                                        lowercase = True, 
                                        stop_words = 'english'
#                                         max_features = 2500
                                       ), 
                        RandomForestClassifier())
pipe_rf.fit(X_train, y_train)
print("training score: ", round(pipe_rf.score(X_train, y_train), 3))
print("validation score: ", round(pipe_rf.score(X_dev, y_dev), 3))

training score:  0.98
validation score:  0.818


### Prediction on dev set

In [11]:
X_dev_hotel_type_pred = pipe_svc.predict(X_dev)

In [12]:
dev_hotel_type_pred_df = pd.DataFrame(
    {
        "utts": X_dev,
        "hotel-type": X_dev_hotel_type_pred,
    }
)

In [13]:
empty_dev = dev_df[["utts"]]

In [14]:
combined_dev = empty_dev.join(dev_hotel_type_pred_df, rsuffix="_hotel").replace(
    "dontknow", np.nan
)

In [15]:
combined_dev.pop('utts_hotel')

0                                                    NaN
1      My husband and I are celebrating our anniversa...
2                                                    NaN
3      Are there any accommodations in the east part ...
4      I'm looking for a nice place to stay, somewher...
                             ...                        
408    I'm looking for info about 4-star accommodatio...
409                                                  NaN
410                                                  NaN
411                                                  NaN
412                                                  NaN
Name: utts_hotel, Length: 413, dtype: object

In [16]:
combined_dev.to_csv('dev_hotel_type_pred.csv')

### Prediction on TEST set

In [17]:
test_df = read_df_with_pred_intent('dioData_test.csv', 'test_binary_intent_pred.txt')

In [18]:
hotel_test_df = test_df[test_df['intent'] == 'hotel']

In [19]:
X_test = hotel_test_df['utts']

In [20]:
import warnings
warnings.filterwarnings("ignore")

In [21]:
hotel_test_df['hotel-type'] = pipe_svc.predict(X_test)

In [22]:
combined_dev = test_df[['utts']].join(hotel_test_df, rsuffix="_hotel").replace(
    "dontknow", np.nan
)

In [23]:
combined_dev[
    ["utts", "hotel-type"]
].to_csv("test_hotel_type_pred.csv")

## 2. Hotel-pricerange/Restaraunt-pricerange 

In [24]:
train_utt = train_df["utts"].tolist()
dev_utt = dev_df["utts"].tolist()
test_utt = test_df["utts"].tolist()

### 2.1. Restaraunt-pricerange 

In [25]:
rest_preds = train_df["restaurant-pricerange"].tolist()
dev_rest_preds = dev_df["restaurant-pricerange"].tolist()

### SVC

In [26]:
pipe_SVC_rest = make_pipeline(CountVectorizer(), SVC())
pipe_SVC_rest.fit(train_utt, rest_preds)
print("training score: ", pipe_SVC_rest.score(train_utt, rest_preds))
print("validation score: ", pipe_SVC_rest.score(dev_utt, dev_rest_preds))

training score:  0.9859042553191489
validation score:  0.9806295399515739


### Logistic Regression

In [27]:
from sklearn.linear_model import LogisticRegression
pipe_lr_rest = make_pipeline(CountVectorizer(ngram_range = (1,2), 
                                        lowercase = True,
                                        stop_words = 'english'),
                        LogisticRegression())
pipe_lr_rest.fit(train_utt, rest_preds)
print("training score: ", pipe_lr_rest.score(train_utt, rest_preds))
print("validation score: ", pipe_lr_rest.score(dev_utt, dev_rest_preds))

training score:  0.9986702127659575
validation score:  0.9854721549636803


### RandomForest (best)

In [28]:
from sklearn.ensemble import RandomForestClassifier
pipe_rf_rest = make_pipeline(CountVectorizer(ngram_range = (1, 2), 
                                        lowercase = True, 
                                        stop_words = 'english'),
                                        # max_features = 1000), 
                        RandomForestClassifier())
pipe_rf_rest.fit(train_utt, rest_preds)
print("training score: ", pipe_rf_rest.score(train_utt, rest_preds))
print("validation score: ", pipe_rf_rest.score(dev_utt, dev_rest_preds))

training score:  1.0
validation score:  0.9878934624697336


### 2.2. Hotel-pricerange

In [29]:
hotel_preds = train_df["hotel-pricerange"].tolist()
dev_hotel_preds = dev_df["hotel-pricerange"].tolist()

In [30]:
pipe = make_pipeline(CountVectorizer(), DummyClassifier())
pipe.fit(train_utt, hotel_preds)
pipe.score(train_utt, hotel_preds)

0.8375

### SVC

In [31]:
pipe_SVC_hotel = make_pipeline(CountVectorizer(), SVC())
pipe_SVC_hotel.fit(train_utt, hotel_preds)
print("training score: ", pipe_SVC_hotel.score(train_utt, hotel_preds))
print("validation score: ", pipe_SVC_hotel.score(dev_utt, dev_hotel_preds))

training score:  0.9933510638297872
validation score:  0.9806295399515739


### Logistic Regression (best)

In [32]:
pipe_lr_hotel = make_pipeline(CountVectorizer(ngram_range = (1,2), 
                                        lowercase = True,
                                        stop_words = 'english'),
                        LogisticRegression())
pipe_lr_hotel.fit(train_utt, hotel_preds)
print("training score: ", pipe_lr_hotel.score(train_utt, hotel_preds))
print("validation score: ", pipe_lr_hotel.score(dev_utt, dev_hotel_preds))

training score:  0.997872340425532
validation score:  0.9927360774818402


### RandomForest

In [33]:
pipe_rf_hotel = make_pipeline(CountVectorizer(ngram_range = (1, 2), 
                                        lowercase = True, 
                                        stop_words = 'english'),
                                        #max_features = 1000), 
                        RandomForestClassifier())
pipe_rf_hotel.fit(train_utt, hotel_preds)
print("training score: ", pipe_rf_hotel.score(train_utt, hotel_preds))
print("validation score: ", pipe_rf_hotel.score(dev_utt, dev_hotel_preds))

training score:  0.9997340425531915
validation score:  0.9757869249394673


## 3. Combine and output hotel/restaraunce-pricerange

In [34]:
final = pd.DataFrame({"utts":dev_utt,
              "hotel-pricerange": pipe_lr_hotel.predict(dev_utt).tolist(),
              "restaurant-pricerange": pipe_rf_rest.predict(dev_utt).tolist()})
final.to_csv('dev_price_pred.csv')

In [35]:
final_test = pd.DataFrame({"utts":test_utt,
              "hotel-pricerange": pipe_lr_hotel.predict(test_utt).tolist(),
              "restaurant-pricerange": pipe_rf_rest.predict(test_utt).tolist()})
final_test.to_csv('test_price_pred.csv')