## Import Libraries

In [1]:
from pymongo import MongoClient
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
import math

## Connect to MongoDB Cluster

In [2]:
client = MongoClient("mongodb://localhost:27017")  # connect to local MongoDB Database
db = client.yelp  # use yelp database
serverStatusResult = db.command("serverStatus")  # check server status
# print(serverStatusResult)


## Business Collection DataFrame

In [3]:
business = db.business.find() # select business collection 
query = list(business) # get all observations
df = pd.DataFrame(query) # convert to pandas DataFrame

df = df.dropna()  # removing any NA values
df = df[df.is_open != 0]  # keeping only businesses that are open


## Cleaning Data

In [4]:
# Dropping unnecessary columns
df = df.drop(
    columns=[
        "_id",
        "business_id",
        "name",
        "address",
        "attributes",
        "hours",
        "latitude",
        "longitude",
        "postal_code",
        "is_open",
    ]
)
# split categories by comma delimiter and keep only the first category 
df["categories"] = df["categories"].str.split(pat=",", expand=True).dropna(axis=1)

# rearrange columns 
first_column = df.pop("stars")
df.insert(0, "stars", first_column)
last_column = df.pop("review_count")
df.insert(4, "review_count", last_column)

# standarize categorical variables 
df["city"] = df["city"].str.upper()
df["categories"] = df["categories"].str.upper()

df


Unnamed: 0,stars,city,state,categories,review_count
0,4.0,BOULDER,CO,GASTROPUBS,86
1,4.0,PORTLAND,OR,SALAD,126
2,4.5,PORTLAND,OR,ANTIQUES,13
4,4.0,ATLANTA,GA,GYMS,14
5,3.5,VANCOUVER,BC,RESTAURANTS,169
...,...,...,...,...,...
160576,4.0,WORTHINGTON,OH,AUTOMOTIVE,26
160577,4.0,AUSTIN,TX,EDUCATIONAL SERVICES,16
160578,4.5,ORLANDO,FL,AUTOMOTIVE,8
160579,1.0,NEEDHAM,MA,AUTOMOTIVE,11


## Data Encoding

In [5]:
# one hot encoding with dummy variables
city = pd.get_dummies(df["city"])
state = pd.get_dummies(df["state"])
category = pd.get_dummies(df["categories"])

# splitting dataframe into feautures and label
features = pd.concat([city, state, category, df[["review_count"]]], axis=1)
label = df["stars"].astype(str)


Unnamed: 0,ALAFAYA,ALAMONTE SPRINGS,ALLSTON,ALOHA,ALOMA,ALPHARETTA,ALTAMONTE,ALTAMONTE SPRINGS,ALTAMONTE SPRIN,ALTAMONTE SPRING,...,WINE TASTING ROOM,WINE TOURS,WINERIES,WOMEN'S CLOTHING,WRAPS,YELP EVENTS,YOGA,ZIPLINING,ZOOS,review_count
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,86
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,126
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,13
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,14
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,169
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
160576,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,26
160577,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,16
160578,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,8
160579,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,11


## Classification Modeling

In [6]:
# Create training and test sets
X_train, X_test, y_train, y_test = train_test_split(features, label, test_size=0.3) # 70% training and 30% test

# Use KNN Classifier and choose k based on sqrt(n)
k = round(math.sqrt(df.shape[0]))
knn = KNeighborsClassifier(n_neighbors=k)

# Training the model
knn.fit(X_train, y_train)

# Predicting with the model
y_pred = knn.predict(X_test)

# Finding Model Accuracy
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.26178985107556535
