In [1]:
import pandas as pd
import requests
from splinter import Browser
from bs4 import BeautifulSoup
import time
from sqlalchemy import create_engine
import json
import pymongo
from pymongo import MongoClient

# Extract CSVs into DataFrames

In [2]:
yelp_file = "../Resources/yelp_data.csv"
yelp_df = pd.read_csv(yelp_file)
yelp_df

Unnamed: 0.1,Unnamed: 0,restaurant_name,restaurant_url,restaurant_rating,price_cuisine,address1,address2
0,0,Rosie’s Trattoria,https://www.yelp.com/biz/rosie-s-trattoria-ran...,4.5,"$$$Italian, Bars, Venues & Event Spaces",1181 Sussex Tpke,"Randolph, NJ 07869"
1,1,SubUrban Bar & Kitchen,https://www.yelp.com/biz/suburban-bar-and-kitc...,4.0,"$$American (New), Bars, Pizza",500 NJ-10,"Randolph, NJ 07869"
2,2,Missy’s Main Street Cafe,https://www.yelp.com/biz/missys-main-street-ca...,5.0,"$Breakfast & Brunch, Cafes",181 E Main St,"Randolph, NJ 07869"
3,3,The Corner Bistro,https://www.yelp.com/biz/the-corner-bistro-ran...,4.0,"$$Diners, Italian",477 NJ-10,"Randolph, NJ 07869"
4,4,Beanbury,https://www.yelp.com/biz/beanbury-succasunna?o...,5.0,"Coffee & Tea, Breakfast & Brunch",37 Route 10 E,"Randolph, NJ 07869"
...,...,...,...,...,...,...,...
235,235,Clean Juice,https://www.yelp.com/biz/clean-juice-morristow...,4.5,"Juice Bars & Smoothies, Acai Bowls, Wraps",68 South St,"Randolph, NJ 07869"
236,236,LongHorn Steakhouse,https://www.yelp.com/biz/longhorn-steakhouse-f...,3.5,"$$Steakhouses, American (Traditional), Barbeque",50-J International Dr S,"Randolph, NJ 07869"
237,237,Hunan Chinese Room,https://www.yelp.com/biz/hunan-chinese-room-mo...,3.5,$$Chinese,255 Speedwell Ave,"Randolph, NJ 07869"
238,238,Brenda Lee Restaurant,https://www.yelp.com/biz/brenda-lee-restaurant...,3.5,Mexican,15 E Blackwell St,"Randolph, NJ 07869"


In [3]:
# All columns
yelp_df.columns

Index(['Unnamed: 0', 'restaurant_name', 'restaurant_url', 'restaurant_rating',
       'price_cuisine', 'address1', 'address2'],
      dtype='object')

In [15]:
# restaurant_name check
yelp_df["restaurant_name"].value_counts()
#yelp_df["restaurant_name"].count()

Sandwiches Unlimited Lunch Box            2
L & L Deli & Catering                     1
The 53 Grill                              1
JA Spice Island Jerk and American Food    1
Carver’s                                  1
                                         ..
Malay                                     1
Tierras y Sabores Restaurant              1
Paisano’s Pizzeria                        1
Bryans Luncheonette                       1
Blossom Asian Bistro                      1
Name: restaurant_name, Length: 239, dtype: int64

In [16]:
# address1 check
yelp_df["address1"].value_counts()

330 S Salem St          3
1250 Sussex Turnpike    2
39 W Clinton St         2
459 Main St             2
6B S Warren St          1
                       ..
25 E Main St            1
242 E Union Tpke        1
42 Main St              1
517 E Rte 10            1
12 Elm St               1
Name: address1, Length: 235, dtype: int64

In [17]:
# average rating
average_rating = yelp_df["restaurant_rating"].sum()/yelp_df["restaurant_name"].count()
print(average_rating)


4.04375


# Create new data with select columns

In [40]:
new_yelp_df = yelp_df[['restaurant_name', 'address1']].copy()
new_yelp_df

Unnamed: 0,restaurant_name,address1
0,Rosie’s Trattoria,1181 Sussex Tpke
1,SubUrban Bar & Kitchen,500 NJ-10
2,Missy’s Main Street Cafe,181 E Main St
3,The Corner Bistro,477 NJ-10
4,Beanbury,37 Route 10 E
...,...,...
235,Clean Juice,68 South St
236,LongHorn Steakhouse,50-J International Dr S
237,Hunan Chinese Room,255 Speedwell Ave
238,Brenda Lee Restaurant,15 E Blackwell St


# Clean DataFrame

In [33]:
# There are different length for restaurant_name and address1 values.So,we need to fill the empty values.
new_yelp_df['restaurant_name'] = new_yelp_df['restaurant_name'].fillna(0)
new_yelp_df['address1'] = new_yelp_df['address1'].fillna(0)
new_yelp_df

Unnamed: 0,restaurant_name,address1
0,Rosie’s Trattoria,1181 Sussex Tpke
1,SubUrban Bar & Kitchen,500 NJ-10
2,Missy’s Main Street Cafe,181 E Main St
3,The Corner Bistro,477 NJ-10
4,Beanbury,37 Route 10 E
...,...,...
235,Clean Juice,68 South St
236,LongHorn Steakhouse,50-J International Dr S
237,Hunan Chinese Room,255 Speedwell Ave
238,Brenda Lee Restaurant,15 E Blackwell St


# Store JSON data into a DataFrame

In [18]:
json_file = "../Resources/yelp_academic_dataset_business.json"
yelp_business_df = pd.read_json(json_file, lines=True)
yelp_business_df.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,f9NumwFMBDn751xgFiRbNA,The Range At Lake Norman,10913 Bailey Rd,Cornelius,NC,28031,35.462724,-80.852612,3.5,36,1,"{'BusinessAcceptsCreditCards': 'True', 'BikePa...","Active Life, Gun/Rifle Ranges, Guns & Ammo, Sh...","{'Monday': '10:0-18:0', 'Tuesday': '11:0-20:0'..."
1,Yzvjg0SayhoZgCljUJRF9Q,"Carlos Santo, NMD","8880 E Via Linda, Ste 107",Scottsdale,AZ,85258,33.569404,-111.890264,5.0,4,1,"{'GoodForKids': 'True', 'ByAppointmentOnly': '...","Health & Medical, Fitness & Instruction, Yoga,...",
2,XNoUzKckATkOD1hP6vghZg,Felinus,3554 Rue Notre-Dame O,Montreal,QC,H4C 1P4,45.479984,-73.58007,5.0,5,1,,"Pets, Pet Services, Pet Groomers",
3,6OAZjbxqM5ol29BuHsil3w,Nevada House of Hose,1015 Sharp Cir,North Las Vegas,NV,89030,36.219728,-115.127725,2.5,3,0,"{'BusinessAcceptsCreditCards': 'True', 'ByAppo...","Hardware Stores, Home Services, Building Suppl...","{'Monday': '7:0-16:0', 'Tuesday': '7:0-16:0', ..."
4,51M2Kk903DFYI6gnB5I6SQ,USE MY GUY SERVICES LLC,4827 E Downing Cir,Mesa,AZ,85205,33.428065,-111.726648,4.5,26,1,"{'BusinessAcceptsCreditCards': 'True', 'ByAppo...","Home Services, Plumbing, Electricians, Handyma...","{'Monday': '0:0-0:0', 'Tuesday': '9:0-16:0', '..."


In [20]:
# All columns
yelp_business_df.columns

Index(['business_id', 'name', 'address', 'city', 'state', 'postal_code',
       'latitude', 'longitude', 'stars', 'review_count', 'is_open',
       'attributes', 'categories', 'hours'],
      dtype='object')

In [38]:
# restaurant name check
yelp_business_df["name"].value_counts()

Starbucks                  1184
McDonald's                  854
Subway Restaurants          613
Tim Hortons                 388
Burger King                 337
                           ... 
Toothology                    1
American Tattoo Studios       1
Porsche Beachwood             1
PreFocus Solutions            1
                              1
Name: name, Length: 157229, dtype: int64

In [39]:
# restaurant address check
yelp_business_df["address"].value_counts()

                             8679
5757 Wayne Newton Blvd        107
7014 E Camelback Rd            89
100 City Centre Drive          84
3200 Las Vegas Blvd S          80
                             ... 
353 N Shore Dr                  1
712 Steels Corners Rd           1
1561 Brittain Rd                1
3591 Sheppard Avenue E          1
1209 Mount Royal Avenue E       1
Name: address, Length: 164423, dtype: int64

# Create new data with select columns

In [41]:
new_yelp_business_df = yelp_business_df[['name', 'address']].copy()
new_yelp_business_df

Unnamed: 0,name,address
0,The Range At Lake Norman,10913 Bailey Rd
1,"Carlos Santo, NMD","8880 E Via Linda, Ste 107"
2,Felinus,3554 Rue Notre-Dame O
3,Nevada House of Hose,1015 Sharp Cir
4,USE MY GUY SERVICES LLC,4827 E Downing Cir
...,...,...
209388,Nishi Sushi,9750 Weston Road
209389,Walmart,3240 Wilkinson Blvd
209390,Five Guys,7014-590 E Camelback Rd
209391,Indian Trail Dog Training,


# Clean DataFrame

In [42]:
new_yelp_business_df['name'] = new_yelp_business_df['name'].fillna(0)
new_yelp_business_df['address'] = new_yelp_business_df['address'].fillna(0)
new_yelp_business_df

Unnamed: 0,name,address
0,The Range At Lake Norman,10913 Bailey Rd
1,"Carlos Santo, NMD","8880 E Via Linda, Ste 107"
2,Felinus,3554 Rue Notre-Dame O
3,Nevada House of Hose,1015 Sharp Cir
4,USE MY GUY SERVICES LLC,4827 E Downing Cir
...,...,...
209388,Nishi Sushi,9750 Weston Road
209389,Walmart,3240 Wilkinson Blvd
209390,Five Guys,7014-590 E Camelback Rd
209391,Indian Trail Dog Training,


In [37]:
yelp_business_df1 = yelp_business_df.loc[yelp_business_df["name"] == "new_yelp_df['restaurant_name']"]
yelp_business_df1.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours


# Create database connection

In [63]:
# Making a Connection with MongoClient
myclient = pymongo.MongoClient("mongodb://localhost:27017/")
# database
mydb = myclient["NJ_Restaurants"]

In [64]:
# Check if Database Exists
print(myclient.list_database_names())

['ClassDB', 'Fruits_db', 'NJ_Restaurants', 'admin', 'config', 'craigslist_db', 'local', 'mars_db', 'store_inventory', 'team_db']


# Creating a Collection

In [65]:
mycol = mydb["Restaurants"]

In [66]:
# Check if Collection Exists
print(mydb.list_collection_names())

['Restaurants']


# Load DataFrames into database

In [67]:
# Load csv dataset
data = pd.read_csv('../Resources/yelp_data.csv')
data.reset_index(inplace=True)
data_dict = data.to_dict("records")

# Insert collection
mycol.insert_many(data_dict)

<pymongo.results.InsertManyResult at 0x2170bcad808>