In [1]:
import numpy as np 
import pandas as pd 
import os

# Read data

In [2]:
df_reviews = pd.read_csv('data/Reviews CSV.csv')
df_reviewers = pd.read_csv('data/Reviewers (Users) CSV.csv')
df_restaurants = pd.read_csv('data/Resturants CSV.csv')

# Join data

In [22]:
df_reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 788471 entries, 0 to 788470
Data columns (total 10 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   date           788471 non-null  object
 1   reviewID       788470 non-null  object
 2   reviewerID     788471 non-null  object
 3   reviewContent  788469 non-null  object
 4   rating         788471 non-null  int64 
 5   usefulCount    788471 non-null  int64 
 6   coolCount      788471 non-null  int64 
 7   funnyCount     788471 non-null  int64 
 8   flagged        788471 non-null  object
 9   restaurantID   788471 non-null  object
dtypes: int64(4), object(6)
memory usage: 60.2+ MB


In [5]:
df_reviewers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16941 entries, 0 to 16940
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   reviewerID       16941 non-null  object
 1   name             16941 non-null  object
 2   location         16931 non-null  object
 3   yelpJoinDate     16941 non-null  object
 4   friendCount      16941 non-null  int64 
 5   reviewCount      16941 non-null  int64 
 6   firstCount       16941 non-null  int64 
 7   usefulCount      16941 non-null  int64 
 8   coolCount        16941 non-null  int64 
 9   funnyCount       16941 non-null  int64 
 10  complimentCount  16941 non-null  int64 
 11  tipCount         16941 non-null  int64 
 12  fanCount         16941 non-null  int64 
dtypes: int64(9), object(4)
memory usage: 1.7+ MB


In [6]:
df_restaurants.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 242652 entries, 0 to 242651
Data columns (total 30 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   restaurantID          242652 non-null  object 
 1   name                  242652 non-null  object 
 2   location              242652 non-null  object 
 3   reviewCount           242652 non-null  int64  
 4   rating                242652 non-null  float64
 5   categories            242652 non-null  object 
 6   address               242652 non-null  object 
 7   Hours                 106328 non-null  object 
 8   GoodforKids           127014 non-null  object 
 9   AcceptsCreditCards    182276 non-null  object 
 10  Parking               138119 non-null  object 
 11  Attire                109277 non-null  object 
 12  GoodforGroups         116329 non-null  object 
 13  PriceRange            179454 non-null  object 
 14  TakesReservations     102629 non-null  object 
 15  

In [23]:
# Columns to exclude prefix, for ease of joining
exclude = ['reviewID', 'reviewerID', 'restaurantID']

# Add prefix to distinctively differentiate columns

df_reviews.columns = [
    col if col in exclude else 'review_' + col
    for col in df_reviews.columns
]

df_reviewers.columns = [
    col if col in exclude else 'reviewer_' + col
    for col in df_reviewers.columns
]

df_restaurants.columns = [
    col if col in exclude else 'restaurant_' + col
    for col in df_restaurants.columns
]

In [24]:
# Merge the dataframes together
merged = df_reviews.merge(
    df_reviewers,
    on='reviewerID',
    how='inner'   
)

merged = merged.merge(
    df_restaurants,
    on='restaurantID',
    how='inner'  
)

print(f'length of merged: {len(merged)}')
merged.head(3)

length of merged: 700617


Unnamed: 0,review_date,reviewID,reviewerID,review_reviewContent,review_rating,review_usefulCount,review_coolCount,review_funnyCount,review_flagged,restaurantID,...,restaurant_GoodFor,restaurant_Alcohol,restaurant_NoiseLevel,restaurant_Ambience,restaurant_HasTV,restaurant_Caters,restaurant_WheelchairAccessible,restaurant_webSite,restaurant_phoneNumber,restaurant_filReviewCount
0,9/22/2012,GtwU21YOQn-wf4vWRUIx6w,bNYesZ944s6IJVowOnB0iA,"Unlike Next, which we'd eaten at the previous ...",5,0,0,0,N,pbEiXam9YJL3neCYHGwLUA,...,Dinner,Full Bar,Quiet,"Classy, Upscale",No,No,Yes,http://www.alinearestaurant.com,(312) 867-0110,136
1,9/22/2012,0LpVTc3,TRKxLC3y-ZvP45e5iilMtw,Probably one of the best meals I've had ever. ...,5,0,0,0,N,pbEiXam9YJL3neCYHGwLUA,...,Dinner,Full Bar,Quiet,"Classy, Upscale",No,No,Yes,http://www.alinearestaurant.com,(312) 867-0110,136
2,9/19/2012,tljtLzf68Fkwf,0EMm8umAqXZzyhxNpL4M9g,Service was impeccable. Experience and present...,3,2,0,0,N,pbEiXam9YJL3neCYHGwLUA,...,Dinner,Full Bar,Quiet,"Classy, Upscale",No,No,Yes,http://www.alinearestaurant.com,(312) 867-0110,136


In [25]:
# Replace null and NaN values in the flagged column with a default label
default_label = 'NR'
merged['review_flagged'] = merged['review_flagged'].fillna(default_label)
# Convert the flagged column to binary labels
merged['review_flagged'] = merged['review_flagged'].apply(lambda x: 1 if x == 'Y' else 0)
merged['review_flagged'].unique()

array([0, 1], dtype=int64)

# Feature Engineering for Businesses

In [48]:
df = df_restaurants.copy()

In [49]:
df['restaurant_location']

0                       Alinea - Lincoln Park - Chicago, IL
1                       Hot Doug's - Avondale - Chicago, IL
2                       Tru - Near North Side - Chicago, IL
3                            Crisp - Lakeview - Chicago, IL
4                         Schwa - Wicker Park - Chicago, IL
                                ...                        
242647    Antiquarian's  Delight - South Street District...
242648            Tired Hands Brewing Company - Ardmore, PA
242649    Lemon Grass Thai Restaurant - University City ...
242650             Tang Pho House - CLOSED - Naperville, IL
242651    T & T Lounge - CLOSED - Near West Side - Chica...
Name: restaurant_location, Length: 242652, dtype: object

### Extract location information

In [50]:
# Split by " - "
parts = df['restaurant_location'].str.split(' - ', expand=True)

# parts[1] = neighbourhood
# parts[2] = "City, State"

df['neighbourhood'] = parts[1]

# Now split city/state
df[['city', 'state']] = parts[2].str.split(', ', expand=True)

In [51]:
df[['neighbourhood', 'city', 'state']]

Unnamed: 0,neighbourhood,city,state
0,Lincoln Park,Chicago,IL
1,Avondale,Chicago,IL
2,Near North Side,Chicago,IL
3,Lakeview,Chicago,IL
4,Wicker Park,Chicago,IL
...,...,...,...
242647,South Street District,Philadelphia,PA
242648,"Ardmore, PA",,
242649,University City,Philadelphia,PA
242650,CLOSED,Naperville,IL


# End of Feature Engineering for Businesses

In [72]:
from sklearn.model_selection import train_test_split

X = merged.drop(columns=['review_flagged'])
y = merged['review_flagged']

# Assuming your full merged DataFrame is called df
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,              # 20% for testing
    stratify=y,                 # preserve class proportions
    random_state=42             # for reproducibility
)

print(X_test.shape)
print(y_test.shape)

# Check class balance
print("Train flag ratio:")
print(y_train.value_counts(normalize=True))

print("\nTest flag ratio:")
print(y_test.value_counts(normalize=True))

(140124, 50)
(140124,)
Train flag ratio:
review_flagged
0    0.991142
1    0.008858
Name: proportion, dtype: float64

Test flag ratio:
review_flagged
0    0.991144
1    0.008856
Name: proportion, dtype: float64


In [73]:
import xgboost as xgb
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, f1_score, accuracy_score, recall_score


# Handle non-numeric columns (convert to dummy variables)
X_train = X_train.select_dtypes(include=['number']).copy()

# Ensure same columns in train/test after dummy encoding
X_test = X_test[X_train.columns]

# Create and train the XGBoost model
model = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    learning_rate=0.1,
    max_depth=6,
    n_estimators=200,
    random_state=42,
    use_label_encoder=False
)

model.fit(X_train, y_train)

# Evaluate model
y_pred_proba = model.predict_proba(X_test)[:, 1]
y_pred = model.predict(X_test)

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# F1-score
f1 = f1_score(y_test, y_pred)
print(f"\nF1-score: {f1:.4f}")

# ROC-AUC
roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f"ROC-AUC: {roc_auc:.4f}")

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Recall
recall = recall_score(y_test, y_pred)
print(f"Recall: {recall:.4f}")

# Classification Report
print(classification_report(y_test, y_pred))


Confusion Matrix:
[[138624    259]
 [   330    911]]

F1-score: 0.7557
ROC-AUC: 0.9984
Accuracy: 0.9958
Recall: 0.7341
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    138883
           1       0.78      0.73      0.76      1241

    accuracy                           1.00    140124
   macro avg       0.89      0.87      0.88    140124
weighted avg       1.00      1.00      1.00    140124

