## Import Libraries

In [13]:
from pymongo import MongoClient
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import metrics
import math

## Connect to MongoDB Cluster

In [14]:
client = MongoClient("mongodb://localhost:27017")  # connect to local MongoDB Database
db = client.yelp  # use yelp database
serverStatusResult = db.command("serverStatus")  # check server status
# print(serverStatusResult)


## Business Collection DataFrame

In [15]:
business = db.business.find() # select business collection 
query = list(business) # get all observations
df = pd.DataFrame(query) # convert to pandas DataFrame

df = df.dropna()  # removing any NA values

df = df.drop(
    columns=[
        "_id",
        "business_id",
        "name",
        "address",
        "hours",
        "attributes",
        "postal_code"
    ]
)
df

Unnamed: 0,city,state,latitude,longitude,stars,review_count,is_open,categories
0,Boulder,CO,40.017544,-105.283348,4.0,86,1,"Gastropubs, Food, Beer Gardens, Restaurants, B..."
1,Portland,OR,45.588906,-122.593331,4.0,126,1,"Salad, Soup, Sandwiches, Delis, Restaurants, C..."
2,Portland,OR,45.511907,-122.613693,4.5,13,1,"Antiques, Fashion, Used, Vintage & Consignment..."
4,Atlanta,GA,33.747027,-84.353424,4.0,14,1,"Gyms, Active Life, Interval Training Gyms, Fit..."
5,Vancouver,BC,49.251342,-123.101333,3.5,169,1,"Restaurants, Thai"
...,...,...,...,...,...,...,...,...
160578,Orlando,FL,28.566161,-81.298123,4.5,8,1,"Automotive, Tires, Towing, Auto Repair"
160579,Needham,MA,42.282399,-71.225169,1.0,11,1,"Automotive, Auto Detailing"
160581,Orlando,FL,28.499836,-81.047478,4.5,18,1,"Health Markets, Food, Specialty Food, Grocery"
160582,Orlando,FL,28.511615,-81.270020,5.0,8,0,"Arts & Entertainment, Paint & Sip, Art Classes..."


## Data Encoding

In [16]:
categories = pd.get_dummies(df["categories"].str.split(',', expand=True))
city = pd.get_dummies(df["city"])
state = pd.get_dummies(df["state"])
df = df.drop(columns=["city", "state", "categories"])

## Combining Data Frames

In [17]:
full = pd.concat([df, city, state, categories], axis=1)
full

Unnamed: 0,latitude,longitude,stars,review_count,is_open,ALTAMONTE SPRINGS,ATLANTA,AUBURNDALE,AUSTIN,Alafaya,...,30_ Active Life,30_ Cards & Stationery,31_ Automotive,31_ Jewelry,32_ Mobile Phones,32_ Restaurants,33_ Auto Repair,34_ Mobile Phone Repair,35_ Movers,36_ Dog Parks
0,40.017544,-105.283348,4.0,86,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,45.588906,-122.593331,4.0,126,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,45.511907,-122.613693,4.5,13,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,33.747027,-84.353424,4.0,14,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,49.251342,-123.101333,3.5,169,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
160578,28.566161,-81.298123,4.5,8,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
160579,42.282399,-71.225169,1.0,11,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
160581,28.499836,-81.047478,4.5,18,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
160582,28.511615,-81.270020,5.0,8,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Split into Training and Testing

In [18]:
y = full["is_open"]
X = full.drop(columns="is_open")

In [19]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3) # 70% training and 30% test

## Classification Modeling

## KNN

In [20]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.7405422958272447


## Random Forrest

In [21]:
from sklearn.ensemble import RandomForestClassifier
clf =  RandomForestClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.7939600584510472


## Decision Tree Classifier

In [22]:
from sklearn.tree import DecisionTreeClassifier
clt = DecisionTreeClassifier()
clt.fit(X_train, y_train)
y_pred = clt.predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.7299074525085241


## Naive Bayes

In [23]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.4200086594144071


## Logistic Regression

In [24]:
from sklearn.linear_model import LogisticRegression
logisticRegr = LogisticRegression()
logisticRegr.fit(X_train, y_train)
y_pred = logisticRegr.predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy: 0.7887102884667424
