In [1]:
import nltk
import numpy as np
from nltk import sent_tokenize, word_tokenize, PorterStemmer
from nltk.corpus import stopwords
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from nltk import pos_tag
from collections import Counter
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import RFE
from nltk import FreqDist
from nltk.stem import WordNetLemmatizer
from nltk import ngrams


In [2]:
from nltk.corpus import wordnet
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Han\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Han\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Han\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
import warnings
warnings.filterwarnings('ignore')

In [4]:
dt0 = pd.read_csv("New_York_reviews.csv") # read data file

In [5]:
dt0.head()

Unnamed: 0.1,Unnamed: 0,parse_count,restaurant_name,rating_review,sample,review_id,title_review,review_preview,review_full,date,city,url_restaurant,author_id
0,1,2,Lido,5,Positive,review_773559838,A Regular Treat,My wife and I have been eating dinner frequent...,My wife and I have been eating dinner frequent...,"October 8, 2020",New_York_City_New_York,https://www.tripadvisor.com/Restaurant_Review-...,UID_0
1,2,3,Lido,4,Positive,review_769429529,Good neighborhood spot!,Came with family for Labor Day weekend brunch ...,Came with family for Labor Day weekend brunch ...,"September 8, 2020",New_York_City_New_York,https://www.tripadvisor.com/Restaurant_Review-...,UID_1
2,3,4,Lido,1,Negative,review_745700258,Disappointing,Food was mediocre at best. The lamb chops are...,Food was mediocre at best. The lamb chops are ...,"February 17, 2020",New_York_City_New_York,https://www.tripadvisor.com/Restaurant_Review-...,UID_2
3,4,5,Lido,5,Positive,review_728859349,What a find in Harlem,My co-workers were volunteering at a foodbank ...,My co-workers were volunteering at a foodbank ...,"November 25, 2019",New_York_City_New_York,https://www.tripadvisor.com/Restaurant_Review-...,UID_3
4,5,6,Lido,5,Positive,review_728429643,Lunch,Lido is an intimate boutique style restaurant....,Lido is an intimate boutique style restaurant....,"November 23, 2019",New_York_City_New_York,https://www.tripadvisor.com/Restaurant_Review-...,UID_4


In [6]:
dt_whole = dt0[['restaurant_name', 'rating_review', 'sample', 'title_review', 'review_full', 'city']]
dt_whole.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 510463 entries, 0 to 510462
Data columns (total 6 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   restaurant_name  510463 non-null  object
 1   rating_review    510463 non-null  object
 2   sample           510463 non-null  object
 3   title_review     510462 non-null  object
 4   review_full      510461 non-null  object
 5   city             510461 non-null  object
dtypes: object(6)
memory usage: 23.4+ MB


In [7]:
# check number of each class
dt_whole['sample'].value_counts()

Positive                                                                                                                419763
Negative                                                                                                                 90699
https://www.tripadvisor.com/Restaurant_Review-g60763-d477523-Reviews-or290-Rosa_Mexicano-New_York_City_New_York.html         1
Name: sample, dtype: int64

In [8]:
# check unusual data
dt_whole[dt_whole['sample']=='https://www.tripadvisor.com/Restaurant_Review-g60763-d477523-Reviews-or290-Rosa_Mexicano-New_York_City_New_York.html']

Unnamed: 0,restaurant_name,rating_review,sample,title_review,review_full,city
300451,"February 5, 2017",New_York_City_New_York,https://www.tripadvisor.com/Restaurant_Review-...,,,


In [9]:
dt_whole.drop(300451, inplace=True)

In [10]:
dt_whole.dropna(inplace=True)

In [11]:
dt_whole['sample'].value_counts()

Positive    419762
Negative     90699
Name: sample, dtype: int64

In [12]:
# copy original data
dt = dt_whole

## Balance data

In [13]:
dt_majority = dt[dt['sample'] == 'Positive']
dt_minority = dt[dt['sample'] == 'Negative']

In [14]:
# Randomly select majority class data (positive reviews)
dt_majority_downsampled = resample(dt_majority, 
                                   replace=False,  
                                   n_samples=len(dt_minority),  # to match minority class
                                   random_state=61)

In [15]:
dt_balanced = pd.concat([dt_majority_downsampled, dt_minority])
dt_balanced = dt_balanced.sample(frac=1, random_state=61).reset_index(drop=True)

In [16]:
dt_balanced.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 181398 entries, 0 to 181397
Data columns (total 6 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   restaurant_name  181398 non-null  object
 1   rating_review    181398 non-null  object
 2   sample           181398 non-null  object
 3   title_review     181398 non-null  object
 4   review_full      181398 non-null  object
 5   city             181398 non-null  object
dtypes: object(6)
memory usage: 8.3+ MB


In [17]:
# Take out a part of the data for testing
# dt = dt_balanced
dt = dt_balanced[0:10000]

In [18]:
# encoding target variable
dt['sentiment'] = dt['sample'].apply(lambda x: 1 if x == 'Positive' else 0)

# Normalization

In [19]:
stopword = set(stopwords.words('english'))

In [20]:
negations = {"not", "no", "nor", "neither", "never", "n't"}

In [21]:
# keep negation for sentimental analysis
stopword = stopword - negations

In [22]:
porter = nltk.PorterStemmer()
lancaster = nltk.LancasterStemmer()

In [18]:
# using WordNetLemmatizer for Lemmatization
def get_wordnet_pos(word):
    """Map POS tag to the first character accepted by WordNetLemmatizer"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

def lemmatize_text(text):
    lemmatizer = WordNetLemmatizer()
    return ' '.join([lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in text])

In [24]:
# data preprocess
def process_row(row):
    tokens = word_tokenize(row)
    vocab = [w.lower() for w in tokens if w.lower() not in stopword and (w.isalpha() or w==['?', '!'] )]
    return lemmatize_text(vocab)
#     return ' '.join([porter.stem(t) for t in vocab])

In [25]:
dt['title review'] = dt['title_review'].apply(process_row)
dt['full review'] = dt['review_full'].apply(process_row)

In [26]:
dt.head()

Unnamed: 0,restaurant_name,rating_review,sample,title_review,review_full,city,sentiment,title review,full review
0,Sardi_s_Restaurant,2,Negative,Never Ever Again,Sardis was directly across from some Broadway ...,New_York_City_New_York,0,never ever,sardis directly across broadway theater crowd ...
1,Del_Frisco_s_Double_Eagle_Steakhouse,3,Negative,Overpriced,Had the prime strip steak - it was ok. The mea...,New_York_City_New_York,0,overprice,prime strip steak ok meat taste good medium ra...
2,Freemans_Restaurant,2,Negative,The best part stops when you walk in the front...,Walking down a secret alleyway to this charmin...,New_York_City_New_York,0,best part stop walk front door snarky attitude,walk secret alleyway charm farmhouse restauran...
3,Empire_Diner,4,Positive,Great Breakfast,Lovely diner near our hotel - not cheap but go...,New_York_City_New_York,1,great breakfast,lovely diner near hotel not cheap good staff l...
4,Rosa_Mexicano,3,Negative,Presentation over Substance,Rosa Mexicano was all about the ambiance. It's...,New_York_City_New_York,0,presentation substance,rosa mexicano ambiance showy touristy impressi...


### Pos Tag

In [27]:
# Counting the number of each lexical property in each review
def pos_tag_counter(text):
    words = word_tokenize(text)
    pos_tags = pos_tag(words, tagset='universal')
    tag_counts = Counter(tag for word, tag in pos_tags)
    return tag_counts

In [28]:
dt['pos_tag_counts'] = dt['full review'].apply(pos_tag_counter)

In [29]:
# saving pos_tag result as dataframe
pos_counts_df = pd.DataFrame()

for index, row in dt.iterrows():
    for tag, count in row['pos_tag_counts'].items():
        pos_counts_df.at[index, tag] = count

In [30]:
pos_counts_df.fillna(0, inplace=True)
pos_counts_df

Unnamed: 0,NOUN,ADV,ADP,VERB,ADJ,NUM,DET,PRT,PRON,X,CONJ,.
0,40.0,5.0,1.0,7.0,17.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,13.0,8.0,0.0,8.0,11.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
2,111.0,34.0,3.0,55.0,61.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0
3,10.0,3.0,1.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,85.0,13.0,5.0,21.0,34.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
9995,25.0,2.0,3.0,6.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9996,10.0,2.0,0.0,1.0,5.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
9997,25.0,9.0,0.0,7.0,15.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9998,12.0,0.0,0.0,3.0,8.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:
# pos_counts_df.drop('.', axis=1, inplace=True)

### Negation Handling

In [32]:
# Combining the negative word with the word following it
def handle_negations(text):
    negation_words = {"not", "no", "never", "n't", "nor", "none"}
    words = text.split()
    handled_words = []
    negation = False
    for word in words:
        if negation:
            word = "NOT_" + word
            negation = False
        if word in negation_words:
            negation = True
            continue
        handled_words.append(word)
    return ' '.join(handled_words)

In [33]:
dt['handled_negations'] = dt['full review'].apply(handle_negations)

In [34]:
dt.head()

Unnamed: 0,restaurant_name,rating_review,sample,title_review,review_full,city,sentiment,title review,full review,pos_tag_counts,handled_negations
0,Sardi_s_Restaurant,2,Negative,Never Ever Again,Sardis was directly across from some Broadway ...,New_York_City_New_York,0,never ever,sardis directly across broadway theater crowd ...,"{'NOUN': 40, 'ADV': 5, 'ADP': 1, 'VERB': 7, 'A...",sardis directly across broadway theater crowd ...
1,Del_Frisco_s_Double_Eagle_Steakhouse,3,Negative,Overpriced,Had the prime strip steak - it was ok. The mea...,New_York_City_New_York,0,overprice,prime strip steak ok meat taste good medium ra...,"{'ADJ': 11, 'NOUN': 13, 'VERB': 8, 'ADV': 8, '...",prime strip steak ok meat taste good medium ra...
2,Freemans_Restaurant,2,Negative,The best part stops when you walk in the front...,Walking down a secret alleyway to this charmin...,New_York_City_New_York,0,best part stop walk front door snarky attitude,walk secret alleyway charm farmhouse restauran...,"{'NOUN': 111, 'ADJ': 61, 'ADV': 34, 'VERB': 55...",walk secret alleyway charm farmhouse restauran...
3,Empire_Diner,4,Positive,Great Breakfast,Lovely diner near our hotel - not cheap but go...,New_York_City_New_York,1,great breakfast,lovely diner near hotel not cheap good staff l...,"{'ADV': 3, 'NOUN': 10, 'ADP': 1, 'ADJ': 5}",lovely diner near hotel NOT_cheap good staff l...
4,Rosa_Mexicano,3,Negative,Presentation over Substance,Rosa Mexicano was all about the ambiance. It's...,New_York_City_New_York,0,presentation substance,rosa mexicano ambiance showy touristy impressi...,"{'NOUN': 85, 'ADJ': 34, 'ADV': 13, 'ADP': 5, '...",rosa mexicano ambiance showy touristy impressi...


## Most Frequent

In [35]:
# get 100 most frequent words from all of reviews
# and then check whether each review include these words 
def get_top_frequent_words(data, max_features=100, ngram_range=(1, 1)):

    vectorizer = TfidfVectorizer(max_features=max_features, ngram_range=ngram_range)  # Directly limit to top 'n' frequent words
    tfidf_result = vectorizer.fit_transform(data)   
    tfidf_df = pd.DataFrame(tfidf_result.toarray(), columns=vectorizer.get_feature_names_out())
    binary_tfidf_df = tfidf_df.applymap(lambda x: 1 if x > 0 else 0)
    
    return tfidf_df, binary_tfidf_df
# the first output is a dataframe with TF-IDF score, and the second one is binary value

In [36]:
_, title_feature1 = get_top_frequent_words(dt['title review'], max_features=100)
_, full_feature1 = get_top_frequent_words(dt['handled_negations'], max_features=200)

## TF_IDF

In [37]:
# get 100 words with the best TF-IDF score from all of reviews or titles
# and then check whether each review include these words 
def get_top_tfidf_words(data, max_features=None, top_n=100, ngram_range=(1, 1)):
    
    vectorizer = TfidfVectorizer(max_features=max_features, ngram_range=ngram_range)
    tfidf_result = vectorizer.fit_transform(data)
    sorted_indices = np.argsort(tfidf_result.toarray().sum(axis=0))[::-1]
    feature_names = np.array(vectorizer.get_feature_names_out())
    tfidf_df = pd.DataFrame(tfidf_result.toarray(), columns=vectorizer.get_feature_names_out())
    tfidf_df = tfidf_df[feature_names[sorted_indices[:top_n]]]
    binary_tfidf_df = tfidf_df.applymap(lambda x: 1 if x > 0 else 0)

    return tfidf_df, binary_tfidf_df

In [38]:
_, title_feature2 = get_top_tfidf_words(dt['title review'], top_n=100)
_, full_feature2 = get_top_tfidf_words(dt['handled_negations'], top_n=200)

## N-Grams

In [39]:
# Using above two function to find bigram and trigram features for each review title
_, title_feature3 = get_top_frequent_words(dt['title review'], max_features=100, ngram_range=(2, 3))
_, title_feature4 = get_top_tfidf_words(dt['title review'], top_n=100, ngram_range=(2, 3))

In [40]:
# Using above two function to find bigram and trigram features for each review
_, full_feature3 = get_top_frequent_words(dt['handled_negations'], max_features=200, ngram_range=(2, 3))
_, full_feature4 = get_top_tfidf_words(dt['handled_negations'], max_features=5000, top_n=200, ngram_range=(2, 3))

In [41]:
# get length of each review
dt['length review'] = dt['full review'].apply(len)
dt['length title'] = dt['title review'].apply(len)

In [42]:
dt.head()

Unnamed: 0,restaurant_name,rating_review,sample,title_review,review_full,city,sentiment,title review,full review,pos_tag_counts,handled_negations,length review,length title
0,Sardi_s_Restaurant,2,Negative,Never Ever Again,Sardis was directly across from some Broadway ...,New_York_City_New_York,0,never ever,sardis directly across broadway theater crowd ...,"{'NOUN': 40, 'ADV': 5, 'ADP': 1, 'VERB': 7, 'A...",sardis directly across broadway theater crowd ...,468,10
1,Del_Frisco_s_Double_Eagle_Steakhouse,3,Negative,Overpriced,Had the prime strip steak - it was ok. The mea...,New_York_City_New_York,0,overprice,prime strip steak ok meat taste good medium ra...,"{'ADJ': 11, 'NOUN': 13, 'VERB': 8, 'ADV': 8, '...",prime strip steak ok meat taste good medium ra...,232,9
2,Freemans_Restaurant,2,Negative,The best part stops when you walk in the front...,Walking down a secret alleyway to this charmin...,New_York_City_New_York,0,best part stop walk front door snarky attitude,walk secret alleyway charm farmhouse restauran...,"{'NOUN': 111, 'ADJ': 61, 'ADV': 34, 'VERB': 55...",walk secret alleyway charm farmhouse restauran...,1780,46
3,Empire_Diner,4,Positive,Great Breakfast,Lovely diner near our hotel - not cheap but go...,New_York_City_New_York,1,great breakfast,lovely diner near hotel not cheap good staff l...,"{'ADV': 3, 'NOUN': 10, 'ADP': 1, 'ADJ': 5}",lovely diner near hotel NOT_cheap good staff l...,122,15
4,Rosa_Mexicano,3,Negative,Presentation over Substance,Rosa Mexicano was all about the ambiance. It's...,New_York_City_New_York,0,presentation substance,rosa mexicano ambiance showy touristy impressi...,"{'NOUN': 85, 'ADJ': 34, 'ADV': 13, 'ADP': 5, '...",rosa mexicano ambiance showy touristy impressi...,1087,22


## Concat features

In [43]:
# title features
title_feature = pd.concat([title_feature1, title_feature2, title_feature3, title_feature4], axis=1)
title_feature = title_feature.loc[:,~title_feature.columns.duplicated()]

In [44]:
title_feature.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Columns: 220 entries, amaze to good burger
dtypes: int64(220)
memory usage: 16.8 MB


In [45]:
# full review features
full_feature = pd.concat([full_feature1, full_feature2, full_feature3, full_feature4], axis=1)
full_feature = full_feature.loc[:,~full_feature.columns.duplicated()]

In [46]:
full_feature.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Columns: 427 entries, also to large portion
dtypes: int64(427)
memory usage: 32.6 MB


In [47]:
# all features
combined_features = pd.concat([title_feature, full_feature, pos_counts_df, dt['length review'], dt['length title']], axis=1)
combined_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Columns: 661 entries, amaze to length title
dtypes: float64(12), int64(649)
memory usage: 50.4 MB


## Split data

In [48]:
X = combined_features
y = dt['sentiment']

In [49]:
X.fillna(0, inplace=True)

In [50]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=61)

In [51]:
# based on what kind of model we choose
# scaler = StandardScaler()
# X_scaled = scaler.fit_transform(X)
# X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=61)

## Random Forest

In [52]:
model = RandomForestClassifier(random_state=61)
model.fit(X_train, y_train)

RandomForestClassifier(random_state=61)

In [53]:
predictions = model.predict(X_test)

# Evaluation
print(classification_report(y_test, predictions))
print("Accuracy:", accuracy_score(y_test, predictions))

              precision    recall  f1-score   support

           0       0.82      0.85      0.84      1537
           1       0.84      0.81      0.82      1463

    accuracy                           0.83      3000
   macro avg       0.83      0.83      0.83      3000
weighted avg       0.83      0.83      0.83      3000

Accuracy: 0.832


## Feature Selection

In [55]:
# TIME WARING!!!
# not much improvment

selector = RFE(estimator=RandomForestClassifier(n_estimators=100, random_state=61), n_features_to_select=50, step=1)
selector = selector.fit(X_train, y_train)

# Transform training and testing sets
X_train_selected = selector.transform(X_train)
X_test_selected = selector.transform(X_test)

# Fit the model on selected features
model = RandomForestClassifier(random_state=61)
model.fit(X_train_selected, y_train)

# Evaluate the model
accuracy = model.score(X_test_selected, y_test)
print("Accuracy with selected features: ", accuracy)

Accuracy with selected features:  0.7936666666666666


## GridSearch

In [54]:
# TIME WARING!!!
# not much improvment

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10]
}

rf = RandomForestClassifier(random_state=61)

# Initialize the Grid Search model
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)
print("Best parameters found: ", grid_search.best_params_)

# Evaluate the best model
best_model = grid_search.best_estimator_
accuracy = best_model.score(X_test, y_test)
print("Accuracy of the best model: ", accuracy)

Fitting 3 folds for each of 36 candidates, totalling 108 fits
Best parameters found:  {'max_depth': 30, 'min_samples_split': 10, 'n_estimators': 300}
Accuracy of the best model:  0.8336666666666667


## Restaurant Tag
I want to group reviews by restaurant name and use TF-IDF to find 20 most important words or bigrams as tags for each restaurant.

In [13]:
# Becasue this is not sentimental analysis, sentiment inbalance is ok, so I choose the whole originial dataset,
dt_whole.head()

Unnamed: 0,restaurant_name,rating_review,sample,title_review,review_full,city
0,Lido,5,Positive,A Regular Treat,My wife and I have been eating dinner frequent...,New_York_City_New_York
1,Lido,4,Positive,Good neighborhood spot!,Came with family for Labor Day weekend brunch ...,New_York_City_New_York
2,Lido,1,Negative,Disappointing,Food was mediocre at best. The lamb chops are ...,New_York_City_New_York
3,Lido,5,Positive,What a find in Harlem,My co-workers were volunteering at a foodbank ...,New_York_City_New_York
4,Lido,5,Positive,Lunch,Lido is an intimate boutique style restaurant....,New_York_City_New_York


In [14]:
dt_whole['rating_review'] = dt_whole['rating_review'].astype(int)

In [15]:
dt_whole.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 510461 entries, 0 to 510462
Data columns (total 6 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   restaurant_name  510461 non-null  object
 1   rating_review    510461 non-null  int32 
 2   sample           510461 non-null  object
 3   title_review     510461 non-null  object
 4   review_full      510461 non-null  object
 5   city             510461 non-null  object
dtypes: int32(1), object(5)
memory usage: 25.3+ MB


In [16]:
# Group reivew 
grouped_reviews = dt_whole.groupby('restaurant_name').apply(
    lambda x: pd.Series({
        'combined_reviews': ';'.join(x['title_review'] + ' ' + x['review_full']),
        'mean_rate': round(x['rating_review'].mean(),2)
    })).reset_index()

In [17]:
grouped_reviews

Unnamed: 0,restaurant_name,combined_reviews,mean_rate
0,12_Chairs,This is a great for breakfast This is a great ...,4.39
1,16_Handles,Yummy! Same as other 16 Handles. This place ne...,5.00
2,1902_Empire_Iii_Inc,"wrong order, rude staff nasty place. They got ...",1.00
3,1_Chimi_Sushi,Good sushi. Fast service Very good japanese...,5.00
4,211_New_Taco_Grill,"Dinner the food excellent, very good attention...",2.50
...,...,...,...
1800,Zio_Ristorante,Good standard Italian fare The food was good t...,4.38
1801,Zoe_Restaurant_Lounge,The Zoe Restaurant and Bar in Washington Heigh...,5.00
1802,Zoni_Cafe,Great value Close to Penn Station/ Empire Stat...,5.00
1803,Zoob_Zib_Thai_Authentic_Noodle_Bar,"Variety to the MAX!!! Wow, what a great select...",4.25


In [21]:
# Filter all eligible nouns
def filter_nouns_adj(text):
    lemmatizer = WordNetLemmatizer()
    stopword = set(stopwords.words('english'))    
    words = word_tokenize(text)
    vocab = [w.lower() for w in words if w.lower() not in stopword and w.isalpha() and w.lower() not in ['new york','york','nyc', 'restaurant',"restaurant", "food", "place", "drink"]]
    pos_tags = pos_tag(vocab, tagset='universal')
#     return vocab
    return ' '.join([lemmatizer.lemmatize(word) for word, tag in pos_tags if tag == 'NOUN'])

In [22]:
grouped_reviews['filtered_reviews'] = grouped_reviews['combined_reviews'].apply(filter_nouns_adj)

In [606]:
# check 5000 most frequent words and bigrams in all reviews and calculate TF-IDF score, then select 20 words with the best TF-IDF score
def tag_words(text, tag_index, max_features=5000, top_n=20, ngram_range=(1, 2)):
    vectorizer = TfidfVectorizer(max_features=max_features, ngram_range=ngram_range) 
    tfidf_result = vectorizer.fit_transform(text)

    words = vectorizer.get_feature_names_out()
    tfidf_df = pd.DataFrame(tfidf_result.toarray(), columns=words, index=tag_index)

    top_words_dict = {}
    for restaurant in tfidf_df.index:
        top_words = tfidf_df.loc[restaurant].nlargest(top_n).index.tolist()
        top_words_dict[restaurant] = top_words

    return top_words_dict

In [23]:
def tag_words(text, tag_index, max_features=5000, top_n=20, ngram_range=(1, 3)):
    vectorizer = TfidfVectorizer(max_features=max_features, ngram_range=ngram_range)
    tfidf_result = vectorizer.fit_transform(text)

    words = vectorizer.get_feature_names_out()
    tfidf_df = pd.DataFrame(tfidf_result.toarray(), columns=words, index=tag_index)

    top_words_dict = {}
    for restaurant in tfidf_df.index:
        top_words = tfidf_df.loc[restaurant].nlargest(top_n * 2).index.tolist()
        filtered_top_words = []

        for word in top_words:
            if not any(word in filtered_word and word != filtered_word for filtered_word in filtered_top_words):
                filtered_top_words.append(word)
            if len(filtered_top_words) >= top_n:
                break

        top_words_dict[restaurant] = filtered_top_words

    return top_words_dict

In [24]:
restaurant_tags = tag_words(grouped_reviews['filtered_reviews'], grouped_reviews['restaurant_name'])

In [25]:
for restaurant, tags in restaurant_tags.items():
    print(f"Restaurant: {restaurant}, Tags: {tags}")

Restaurant: 12_Chairs, Tags: ['breakfast', 'hummus', 'chair', 'service', 'soho', 'time', 'pita', 'falafel', 'salad', 'egg', 'staff', 'menu', 'schnitzel', 'lunch', 'brunch', 'day', 'coffee', 'dish', 'bread', 'price']
Restaurant: 16_Handles, Tags: ['topping', 'yummy', 'selection', 'abc', 'ability', 'absolute', 'abundance', 'accent', 'access', 'accident', 'accommodate', 'accommodation', 'accompaniment', 'account', 'acoustic', 'act', 'action', 'activity', 'actor', 'ad']
Restaurant: 1902_Empire_Iii_Inc, Tags: ['order staff', 'staff order', 'knew', 'customer service', 'bland', 'way', 'abc', 'ability', 'absolute', 'abundance', 'accent', 'access', 'accident', 'accommodate', 'accommodation', 'accompaniment', 'account', 'acoustic', 'act', 'action']
Restaurant: 1_Chimi_Sushi, Tags: ['sushi service', 'speciality', 'delivery', 'soup', 'abc', 'ability', 'absolute', 'abundance', 'accent', 'access', 'accident', 'accommodate', 'accommodation', 'accompaniment', 'account', 'acoustic', 'act', 'action', 'a

#### Testing code
Please ignore them

In [541]:
def filter_nouns_adj(text, n=2):
    words = word_tokenize(text)
    words = [w for w in words if w not in ['new york', 'york', 'nyc']]
    pos_tags = pos_tag(words, tagset='universal')
    n_grams = ngrams(pos_tags, n)
    
    # Filter n-grams to include only those with a noun and an adjective
    filtered_ngrams = [' '.join([word for word, tag in ngram if tag in ['NOUN', 'ADJ']])
                       for ngram in n_grams if any(tag == 'ADJ' for word, tag in ngram)
                       and any(tag == 'NOUN' for word, tag in ngram)]
    
    return ' '.join(filtered_ngrams)

grouped_reviews['filtered_reviews'] = grouped_reviews['combined_reviews'].apply(lambda x: filter_nouns_adj(x, n=2))

## Aspected Based Sentimental Analysis

In [None]:
def load_lexicon(filename):
    with open(filename, 'r', encoding='latin-1') as file:
        words = file.read().splitlines()
    return words

positive_words = load_lexicon('positive-words.txt')
negative_words = load_lexicon('negative-words.txt')