# Restaurant Type Classification Using NLP

**Author: David Lightfoot**

In [None]:
# Environment setup (Colab): remove the preinstalled numpy/pandas/scipy and
# install exact pinned versions together with gensim 4.3.2.
# NOTE(review): presumably the pins exist because gensim 4.3.2 is not
# compatible with the newer preinstalled numpy/scipy -- confirm before bumping.
# The runtime may need a restart after this cell so the pinned builds are loaded.
!pip uninstall -y numpy pandas scipy
!pip install numpy==1.24.3 pandas==2.0.3 scipy==1.11.4 gensim==4.3.2

Found existing installation: numpy 2.0.2
Uninstalling numpy-2.0.2:
  Successfully uninstalled numpy-2.0.2
Found existing installation: pandas 2.2.2
Uninstalling pandas-2.2.2:
  Successfully uninstalled pandas-2.2.2
Found existing installation: scipy 1.15.3
Uninstalling scipy-1.15.3:
  Successfully uninstalled scipy-1.15.3
Collecting numpy==1.24.3
  Downloading numpy-1.24.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.6 kB)
Collecting pandas==2.0.3
  Downloading pandas-2.0.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Collecting scipy==1.11.4
  Downloading scipy-1.11.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.4/60.4 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting gensim==4.3.2
  Downloading gensim-4.3.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.3 kB)
Downloading numpy-1.24.3-cp311-cp311

In [None]:
import pickle
import pandas as pd
import itertools
from collections import Counter
import numpy as np
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from gensim.models import word2vec
import os
import string

In [None]:
# Fetch the NLTK resources the notebook relies on.
# NOTE(review): `ssl` is imported here but not used anywhere in this cell.
import ssl
# Tokenizer models -- presumably required by word_tokenize; confirm.
nltk.download('punkt')
nltk.download('punkt_tab')
# Stop-word list read by preprocess_df via stopwords.words('english').
nltk.download('stopwords')
# NOTE(review): no visible cell reads the 'words' corpus -- confirm it is needed.
nltk.download('words')
# WordNet data for WordNetLemmatizer / the wordnet POS constants.
nltk.download('wordnet')
# Lexicon for the VADER SentimentIntensityAnalyzer used for sentiment features.
nltk.download('vader_lexicon')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

In [None]:
# Colab-only: prompt the user to upload the two CSV inputs into the working
# directory. Later cells read them back by name ("train.csv" / "test.csv"),
# so the uploaded files must carry exactly those names.
from google.colab import files
print("upload train.csv file:")
uploaded_train = files.upload()
print("upload test.csv file:")
uploaded_test = files.upload()

upload train.csv file:


Saving train.csv to train.csv
upload test.csv file:


Saving test.csv to test.csv


In [None]:
def build_vocab(sentences):
    """Build frequency-ranked vocabulary lookups from tokenized sentences.

    Args:
        sentences: Iterable of token sequences (one sequence per sentence).

    Returns:
        Tuple ``(word_counts, vocabulary, vocabulary_inv)``:
          * ``word_counts``    -- ``Counter`` mapping token -> frequency.
          * ``vocabulary``     -- dict mapping token -> rank (0 = most frequent).
          * ``vocabulary_inv`` -- list of tokens ordered by that rank.
    """
    all_tokens = itertools.chain.from_iterable(sentences)
    word_counts = Counter(all_tokens)

    # most_common() orders by descending count; ties keep first-seen order.
    vocabulary_inv = [token for token, _count in word_counts.most_common()]

    vocabulary = {}
    for rank, token in enumerate(vocabulary_inv):
        vocabulary[token] = rank

    return word_counts, vocabulary, vocabulary_inv

In [None]:
def get_embeddings(inp_data, vocabulary_inv, size_features=200,
                   mode='skipgram',
                   min_word_count=2,
                   context=7):
    """Train a Word2Vec model and return one embedding row per vocabulary entry.

    Args:
        inp_data: Sentences encoded as sequences of integer indices into
            ``vocabulary_inv``.
        vocabulary_inv: List mapping index -> word.
        size_features: Dimensionality of the word vectors.
        mode: ``'skipgram'`` or ``'cbow'``; selects the Word2Vec training
            algorithm.
        min_word_count: Words occurring fewer times than this are ignored by
            Word2Vec (passed as ``min_count``).
        context: Context window size (passed as ``window``).

    Returns:
        ``numpy.ndarray`` of shape ``(len(vocabulary_inv), size_features)``.
        Row ``i`` is the trained vector for ``vocabulary_inv[i]``; words the
        trained model does not contain get a vector drawn uniformly from
        ``[-0.25, 0.25)`` using NumPy's global random state.

    Raises:
        ValueError: If ``mode`` is neither ``'skipgram'`` nor ``'cbow'``.
    """
    # Validate up front: previously an unknown mode left `sg` unbound and
    # surfaced as a confusing UnboundLocalError at the Word2Vec call.
    if mode == 'skipgram':
        sg, mode_label = 1, 'skip-gram'
    elif mode == 'cbow':
        sg, mode_label = 0, 'CBOW'
    else:
        raise ValueError(
            "Unsupported mode {!r}; expected 'skipgram' or 'cbow'".format(mode))

    num_workers = 15
    downsampling = 1e-3
    print('Training Word2Vec model...')
    # Word2Vec expects token strings, so decode the index-encoded sentences.
    sentences = [[vocabulary_inv[w] for w in s] for s in inp_data]
    print('Model: {}'.format(mode_label))
    embedding_model = word2vec.Word2Vec(sentences, workers=num_workers,
                                        sg=sg,
                                        vector_size=size_features,
                                        min_count=min_word_count,
                                        window=context,
                                        sample=downsampling,
                                        epochs=20)
    # The model is only kept in memory; the old "Saving ..." message was
    # misleading because nothing is ever written to disk.
    print("Word2Vec training complete (model kept in memory, not saved)")

    embedding_weights = np.zeros((len(vocabulary_inv), size_features))
    for i, word in enumerate(vocabulary_inv):
        if word in embedding_model.wv:
            embedding_weights[i] = embedding_model.wv[word]
        else:
            # Word was dropped by Word2Vec: fall back to a small random vector.
            embedding_weights[i] = np.random.uniform(-0.25, 0.25,
                                                     embedding_model.vector_size)
    return embedding_weights

In [None]:
def preprocess_df(df):
    """Normalise the ``text`` column of ``df`` in place and return ``df``.

    Each entry is processed as follows:
      1. punctuation characters are replaced by spaces,
      2. the text is split on whitespace,
      3. English stop words (plus ``'would'``) and single-character tokens
         are dropped,
      4. remaining tokens are lower-cased and lemmatized with a WordNet
         part-of-speech derived from tagging the single token,
      5. the tokens are re-joined with single spaces.

    Non-string entries (e.g. NaN for a missing review) become an empty
    string instead of raising ``AttributeError``.

    Args:
        df: DataFrame with a ``text`` column. The column is overwritten.

    Returns:
        The same DataFrame object, with ``text`` replaced by the cleaned text.
    """
    from functools import lru_cache

    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    # Tagging is done on one isolated word at a time, so the result depends
    # only on the word itself; memoising avoids re-tagging repeated words.
    @lru_cache(maxsize=None)
    def get_wordnet_pos(word):
        tag = nltk.pos_tag([word])[0][1][0].upper()
        return tag_dict.get(tag, wordnet.NOUN)

    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    stop_words.add('would')
    translator = str.maketrans(string.punctuation, ' ' * len(string.punctuation))

    def clean_text(sent):
        # Missing / non-text values cannot be translated; treat them as empty.
        if not isinstance(sent, str):
            return ""
        filtered_words = []
        for word in sent.translate(translator).split():
            lowered = word.lower()
            if lowered not in stop_words and len(word) > 1:
                filtered_words.append(
                    lemmatizer.lemmatize(lowered, get_wordnet_pos(lowered)))
        return " ".join(filtered_words)

    # Iterate over the column directly; iterrows() builds a Series per row.
    df["text"] = [clean_text(sent) for sent in df["text"]]
    return df

In [None]:
def extract_cuisine_keyword(text):
    """Return a 10-element 0/1 vector flagging cuisine keywords found in ``text``.

    Index layout: 0 american (new), 1 american (traditional), 2 asian fusion,
    3 canadian (new), 4 chinese, 5 italian, 6 japanese, 7 mediterranean,
    8 mexican, 9 thai.

    A cuisine is flagged when any of its keywords occurs as a substring of the
    lower-cased text. "asian fusion" is additionally flagged when at least two
    Asian cuisines are detected, or when at least one Asian and at least one
    non-Asian cuisine are detected.

    NOTE(review): matching is by substring, not by whole word, so e.g. "tom"
    also matches "tomato" -- confirm whether that is intended.

    Args:
        text: Review text. Any non-string value yields an all-zero vector.

    Returns:
        ``numpy.ndarray`` of shape ``(10,)`` with float 0.0 / 1.0 entries.
    """
    features = np.zeros(10)
    if not isinstance(text, str):
        return features
    lowered = text.lower()

    # (feature index, keywords, group). The group says which tally a hit feeds;
    # the fusion keywords themselves belong to neither tally.
    keyword_table = (
        (0, ('contemporary', 'farm-to-table', 'gastropub', 'artisanal'), 'non_asian'),
        (1, ('diner', 'homestyle', 'southern', 'bbq', 'grill'), 'non_asian'),
        (2, ('fusion', 'hybrid', 'multi', 'cultural', 'blend'), None),
        (3, ('canadian', 'poutine', 'toronto', 'montreal', 'quebec'), 'non_asian'),
        (4, ('chinese', 'hotpot', 'wok', 'dim', 'szechuan', 'beijing'), 'asian'),
        (5, ('italian', 'pasta', 'spaghetti', 'lasagna', 'risotto', 'carbonara'), 'non_asian'),
        (6, ('japanese', 'sushi', 'sashimi', 'ramen', 'miso', 'udon'), 'asian'),
        (7, ('mediterranean', 'hummus', 'falafel', 'pita', 'tahini'), 'non_asian'),
        (8, ('mexican', 'taco', 'burrito', 'quesadilla', 'enchilada', 'mole'), 'non_asian'),
        (9, ('thai', 'pad', 'tom', 'basil', 'lemongrass', 'bangkok'), 'asian'),
    )

    hits = {'asian': 0, 'non_asian': 0}
    for index, keywords, group in keyword_table:
        matched = False
        for keyword in keywords:
            if keyword in lowered:
                matched = True
                break
        if not matched:
            continue
        features[index] = 1
        if group is not None:
            hits[group] += 1

    # Mixed cuisines imply fusion: several Asian ones, or Asian plus non-Asian.
    several_asian = hits['asian'] >= 2
    asian_with_other = hits['asian'] >= 1 and hits['non_asian'] >= 1
    if several_asian or asian_with_other:
        features[2] = 1
    return features

In [None]:
# Lazily-created VADER analyzer shared by every call; see _get_sentiment_analyzer.
_SENTIMENT_ANALYZER = None


def _get_sentiment_analyzer():
    """Return the shared SentimentIntensityAnalyzer, creating it on first use."""
    global _SENTIMENT_ANALYZER
    if _SENTIMENT_ANALYZER is None:
        # Imported here so the function works regardless of which cell
        # performs the module-level import.
        from nltk.sentiment.vader import SentimentIntensityAnalyzer
        _SENTIMENT_ANALYZER = SentimentIntensityAnalyzer()
    return _SENTIMENT_ANALYZER


def extract_sentiment_features(text):
    """Return VADER polarity scores for ``text`` as a 4-element array.

    Args:
        text: Text to score. Any non-string value yields an all-zero vector.

    Returns:
        ``numpy.ndarray`` of shape ``(4,)`` holding, in order, the ``neg``,
        ``neu``, ``pos`` and ``compound`` entries of ``polarity_scores``.
    """
    if not isinstance(text, str):
        return np.zeros(4)
    # Reuse one analyzer rather than constructing a new one per document.
    sentiment = _get_sentiment_analyzer().polarity_scores(text)
    return np.array([
        sentiment['neg'],
        sentiment['neu'],
        sentiment['pos'],
        sentiment['compound']
    ])

In [None]:
import nltk
nltk.download('averaged_perceptron_tagger_eng')
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Seed NumPy's global RNG *before* the embedding step: get_embeddings() draws
# random fallback vectors from it, so seeding only just before the classifier
# left those vectors different on every run.
# NOTE(review): Word2Vec training itself runs with multiple workers and may
# still vary between runs -- confirm if strict reproducibility is required.
np.random.seed(42)


def weighted_doc_vector(tokens, embedding_weights, vocabulary, boosted_terms, boost):
    """Weighted mean of the embedding rows for ``tokens``.

    Tokens found in ``boosted_terms`` (compared lower-cased) count ``boost``
    times, all others once. Tokens missing from ``vocabulary`` are skipped.
    A document with no usable token yields an all-zero vector of the embedding
    width, so every document produces a row of the same shape.
    """
    vec = np.zeros(embedding_weights.shape[1])
    total_weight = 0.0
    for token in tokens:
        index = vocabulary.get(token)
        if index is None:
            continue  # out-of-vocabulary (only possible for unseen test words)
        weight = boost if token.lower() in boosted_terms else 1.0
        vec += embedding_weights[index] * weight
        total_weight += weight
    return vec / total_weight if total_weight > 0 else vec


# --- Load and preprocess -----------------------------------------------------
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

# Keep the raw "review" column (used for sentiment) and clean a copy in "text".
df_train["text"] = df_train["review"]
df_test["text"] = df_test["review"]
df_train = preprocess_df(df_train)
df_test = preprocess_df(df_test)

# --- Encode labels -----------------------------------------------------------
label_encoder = LabelEncoder()
df_train["encoded_label"] = label_encoder.fit_transform(df_train["label"])
label_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
print(f"Label mapping: {label_mapping}")

# --- Tokenize once and train embeddings on the training text -----------------
tagged_train_data = [word_tokenize(doc) for doc in df_train["text"]]
tagged_test_data = [word_tokenize(doc) for doc in df_test["text"]]
tagged_data = tagged_train_data  # kept under its former name as well

word_counts, vocabulary, vocabulary_inv = build_vocab(tagged_train_data)
inp_data = [[vocabulary[word] for word in text] for text in tagged_train_data]
embedding_weights = get_embeddings(inp_data, vocabulary_inv)

# --- Cuisine vocabulary that receives extra weight in the document vectors ---
cuisine_terms = set()
for cuisine_list in [
    ['contemporary', 'farm-to-table', 'gastropub', 'artisanal'],  # american (new)
    ['diner', 'homestyle', 'southern', 'bbq', 'grill'],  # american (traditional)
    ['fusion', 'hybrid', 'multi', 'cultural', 'blend'],  # asian fusion
    ['canadian', 'poutine', 'toronto', 'montreal', 'quebec'],  # canadian (new)
    ['chinese', 'hotpot', 'wok', 'dim', 'szechuan', 'beijing'],  # chinese
    ['italian', 'pasta', 'spaghetti', 'lasagna', 'risotto', 'carbonara'],  # italian
    ['japanese', 'sushi', 'sashimi', 'ramen', 'miso', 'udon'],  # japanese
    ['mediterranean', 'hummus', 'falafel', 'pita', 'tahini'],  # mediterranean
    ['mexican', 'taco', 'burrito', 'quesadilla', 'enchilada', 'mole'],  # mexican
    ['thai', 'pad', 'tom', 'basil', 'lemongrass', 'bangkok']  # thai
]:
    cuisine_terms.update(cuisine_list)
cuisine_weight = 4.0 # higher weight for cuisine terms to improve classification accuracy
print(f"Using fixed cuisine weight: {cuisine_weight}")

# --- Document vectors --------------------------------------------------------
# One shared helper for train and test. It always returns a vector of the
# embedding width, so an empty document no longer produces a ragged array.
train_vec = [
    weighted_doc_vector(doc, embedding_weights, vocabulary, cuisine_terms, cuisine_weight)
    for doc in tagged_train_data
]
test_vec = [
    weighted_doc_vector(doc, embedding_weights, vocabulary, cuisine_terms, cuisine_weight)
    for doc in tagged_test_data
]

# --- Hand-crafted features ---------------------------------------------------
train_cuisine_features = np.array([extract_cuisine_keyword(text) for text in df_train["text"]])
test_cuisine_features = np.array([extract_cuisine_keyword(text) for text in df_test["text"]])
# Sentiment is scored on the raw review, not the cleaned text.
train_sentiment_features = np.array([extract_sentiment_features(text) for text in df_train["review"]])
test_sentiment_features = np.array([extract_sentiment_features(text) for text in df_test["review"]])

train_features = np.hstack([
    np.array(train_vec),
    train_cuisine_features,
    train_sentiment_features
])

test_features = np.hstack([
    np.array(test_vec),
    test_cuisine_features,
    test_sentiment_features
])

# --- Train the classifier with balanced per-sample weights -------------------
class_weights = compute_class_weight('balanced', classes=np.unique(df_train["encoded_label"]), y=df_train["encoded_label"])
# Encoded labels are 0..k-1, so they index directly into class_weights.
sample_weights = class_weights[df_train["encoded_label"].to_numpy()]

clf = XGBClassifier(
    n_estimators=200,
    max_depth=5,
    learning_rate=0.08,
    subsample=0.85,
    colsample_bytree=0.85,
    random_state=42
)
clf.fit(train_features, df_train["encoded_label"], sample_weight=sample_weights)

# --- Predict and write the submission file -----------------------------------
numeric_preds = clf.predict(test_features)
preds = label_encoder.inverse_transform(numeric_preds)

# Ids are the row positions of the test set, in file order.
dic = {"Id": list(range(len(preds))), "Predicted": list(preds)}
dic_df = pd.DataFrame.from_dict(dic)
dic_df.to_csv("predicted.csv", index=False)

print("done")

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


Label mapping: {'american (new)': 0, 'american (traditional)': 1, 'asian fusion': 2, 'canadian (new)': 3, 'chinese': 4, 'italian': 5, 'japanese': 6, 'mediterranean': 7, 'mexican': 8, 'thai': 9}
Training Word2Vec model...
Model: skip-gram
Saving Word2Vec model embedding
Using fixed cuisine weight: 4.0
done


In [None]:
# Colab-only: trigger a browser download of the predictions file written by
# the training/prediction cell ("predicted.csv").
from google.colab import files
files.download('predicted.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>