In [1]:
import re
import pickle
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns 

# NLP Imports
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Fetch the NLTK stopword corpus (a no-op when it is already installed).
# Only ordinary exceptions are caught, so Ctrl-C / kernel interrupts still
# propagate, and a failed download is reported instead of silently ignored.
try:
    nltk.download('stopwords')
except Exception as exc:
    # Best effort: a previously installed copy of the corpus may still load below.
    print(f"[warn] nltk.download('stopwords') failed: {exc!r}")

# English stopwords as a set for fast membership checks.
# NOTE(review): presumably this raises LookupError when the corpus is
# unavailable locally — confirm against the NLTK version in use.
STOPWORDS = set(stopwords.words('english'))

# ML Imports
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from xgboost import XGBClassifier
from wordcloud import WordCloud

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Read the restaurant CSV (path is relative to the notebook's working
# directory) and show its dimensions plus the first few rows.
data = pd.read_csv(
    'trip advisor restaurents  10k - trip_rest_neywork_1.csv'
)
print(f'Dataset shape: {data.shape}')
print(data.head())

Dataset shape: (10397, 6)
                          Title Number of review              Catagory  \
0  All Stars Sports Bar & Grill               21              Bar, Pub   
1                    Olio e Piu            2,998        Italian, Pizza   
2        Boucherie West Village            1,465    French, Steakhouse   
3             Club A Steakhouse            4,413  American, Steakhouse   
4     Piccola Cucina Estiatorio              403     Italian, Sicilian   

                                      Reveiw Comment  Popular food  \
0       “The fries were terrific also, hot crisp...”         fries   
1            “I love the food and our server Maria!”  filet mignon   
2  “The filet mignon was impeccable and the musse...       lobster   
3  “My seafood cocktail had wonderful large lump ...  cacio e pepe   
4  “penne al pomodoro and bucatini cacio e pepe w...       mussels   

  Online Order  
0          Yes  
1          Yes  
2          Yes  
3          Yes  
4          Yes  


In [3]:
# Column dtypes and non-null counts of the freshly loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10397 entries, 0 to 10396
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Title             10397 non-null  object
 1   Number of review  10397 non-null  object
 2   Catagory          10397 non-null  object
 3   Reveiw Comment    10397 non-null  object
 4   Popular food      10397 non-null  object
 5   Online Order      10397 non-null  object
dtypes: object(6)
memory usage: 487.5+ KB


In [4]:
# Number of missing (NaN/None) cells in each column.
# NOTE(review): this only counts real nulls. `describe()` reports the literal
# string 'No' as the most frequent value in several columns, so a zero here
# may not mean the column is complete — confirm how 'No' should be treated.
data.isna().sum()

Title               0
Number of review    0
Catagory            0
Reveiw Comment      0
Popular food        0
Online Order        0
dtype: int64

In [5]:
# Summary of every column. All columns are object dtype at this point, so the
# result lists count / unique / top / freq rather than numeric statistics.
data.describe()

Unnamed: 0,Title,Number of review,Catagory,Reveiw Comment,Popular food,Online Order
count,10397,10397,10397,10397,10397,10397
unique,7237,857,560,6029,539,4
top,Royal 35 Steakhouse,No,"Italian, Pizza",No,No,No
freq,82,1511,822,2199,7709,5729


In [6]:
# Character count of each review snippet. The count covers the whole stored
# string, including the surrounding quotation marks and any trailing "..."
# present in the text.
# `.str.len()` is the vectorised form of `.apply(len)`; it yields NaN for a
# missing comment instead of raising TypeError.
data['length'] = data['Reveiw Comment'].str.len()

In [7]:
# Preview the frame again to confirm the new `length` column is present.
data.head()

Unnamed: 0,Title,Number of review,Catagory,Reveiw Comment,Popular food,Online Order,length
0,All Stars Sports Bar & Grill,21,"Bar, Pub","“The fries were terrific also, hot crisp...”",fries,Yes,44
1,Olio e Piu,2998,"Italian, Pizza",“I love the food and our server Maria!”,filet mignon,Yes,39
2,Boucherie West Village,1465,"French, Steakhouse",“The filet mignon was impeccable and the musse...,lobster,Yes,82
3,Club A Steakhouse,4413,"American, Steakhouse",“My seafood cocktail had wonderful large lump ...,cacio e pepe,Yes,81
4,Piccola Cucina Estiatorio,403,"Italian, Sicilian",“penne al pomodoro and bucatini cacio e pepe w...,mussels,Yes,61


In [8]:
# Parse 'Number of review' from text into numbers.
#
# The raw values are strings such as "2,998"; every non-digit character is
# removed and the remainder parsed. Values that contain no digits at all end
# up as NaN (errors='coerce').
#
# The dtype guard makes the cell safe to re-run: once the column is numeric,
# converting it back to text would turn 2998.0 into "2998.0", and stripping
# the "." would silently produce 29980.
if not pd.api.types.is_numeric_dtype(data['Number of review']):
    data['Number of review'] = (
        data['Number of review']
        .astype(str)
        .str.replace(r'\D+', '', regex=True)   # remove non-digits
        .pipe(pd.to_numeric, errors='coerce')
    )


In [10]:
# Drop the rows whose review count could not be parsed (NaN after the numeric
# conversion), then renumber the index so it is contiguous again. Without the
# reset the surviving rows keep their original labels, leaving gaps.
data = (
    data
    .dropna(subset=['Number of review'])
    .reset_index(drop=True)
)


In [19]:
# Re-inspect dtypes and row count after cleaning.
# NOTE(review): the recorded output shows a RangeIndex and an int64
# 'Number of review' column, yet this cell's execution count (19) is higher
# than those of the reset_index / astype(int) cells further down (18, 13, 14).
# The output appears to have been captured after those cells ran; re-run the
# notebook top to bottom to refresh it.
data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8886 entries, 0 to 8885
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Title             8886 non-null   object
 1   Number of review  8886 non-null   int64 
 2   Catagory          8886 non-null   object
 3   Reveiw Comment    8886 non-null   object
 4   Popular food      8886 non-null   object
 5   Online Order      8886 non-null   object
 6   length            8886 non-null   int64 
dtypes: int64(2), object(5)
memory usage: 486.1+ KB


In [18]:
# Renumber the rows 0..n-1; drop=True discards the old index labels instead of
# keeping them as a new column.
data = data.reset_index(drop=True)


In [13]:
# Convert the parsed review counts to integers. Rounding first means any
# fractional value is rounded to the nearest whole number rather than
# truncated by the cast. The cast fails on NaN, so rows with missing counts
# must already have been dropped.
data['Number of review'] = data['Number of review'].round().astype(int)


In [14]:
# NOTE: the cell above already casts this column to int, so this cast changes
# nothing when the notebook runs in order. It is kept so the cell still works
# when executed on its own; the assertions make the expected end state explicit.
data['Number of review'] = data['Number of review'].astype(int)
assert pd.api.types.is_integer_dtype(data['Number of review']), \
    "'Number of review' should be an integer column after cleaning"
assert data['Number of review'].ge(0).all(), \
    "'Number of review' should never be negative"
