<a href="https://colab.research.google.com/github/elizabethavargas/cloud-hw1/blob/master/other_scripts/yelp_opensearch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Upload Yelp Restaurant Data to Open Search

In [24]:
# install packages
!pip install boto3
!pip install opensearch-py
!pip install requests-aws4auth



### Open Saved Data

In [25]:
import pickle
with open('yelp_restaurants.pickle', 'rb') as f:
  yelp_restaurants = pickle.load(f)

In [26]:
yelp_restaurants[1]

{'id': 'mNeyjjcfrXdGf_mCyoolkw',
 'alias': 'lolas-new-york',
 'name': "Lola's",
 'image_url': 'https://s3-media0.fl.yelpcdn.com/bphoto/mqY1TlTF7-zwNlR1hjDlJg/o.jpg',
 'is_closed': False,
 'url': 'https://www.yelp.com/biz/lolas-new-york?adjust_creative=p6mhJwsttIzMzP5Ln3malA&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=p6mhJwsttIzMzP5Ln3malA',
 'review_count': 80,
 'categories': [{'alias': 'newamerican', 'title': 'New American'}],
 'rating': 4.6,
 'coordinates': {'latitude': 40.74481688811193, 'longitude': -73.9878205},
 'transactions': [],
 'location': {'address1': '2 W 28th St',
  'address2': '',
  'address3': None,
  'city': 'New York',
  'zip_code': '10001',
  'country': 'US',
  'state': 'NY',
  'display_address': ['2 W 28th St', 'New York, NY 10001']},
 'phone': '',
 'display_phone': '',
 'distance': 2216.123384764452,
 'business_hours': [{'open': [{'is_overnight': False,
     'start': '1700',
     'end': '2100',
     'day': 0},
    {'is_overnight': False, 

In [39]:
# convert restaurant info into correct format
restaurants = []
category_map = {
    # MEXICAN
    "Mexican": "Mexican",
    "New Mexican Cuisine": "Mexican",
    "Tex-Mex": "Mexican",
    "Tacos": "Mexican",
    "Latin American": "Mexican",

    # CHINESE
    "Chinese": "Chinese",
    "Cantonese": "Chinese",
    "Dim Sum": "Chinese",
    "Hainan": "Chinese",
    "Hong Kong Style Cafe": "Chinese",
    "Shanghainese": "Chinese",
    "Szechuan": "Chinese",
    "Taiwanese": "Chinese",

    # ITALIAN
    "Italian": "Italian",
    "Pizza": "Italian",
    "Pasta Shops": "Italian",
    "Sicilian": "Italian",
    "Tuscan": "Italian",

    # INDIAN
    "Indian": "Indian",
    "Pakistani": "Indian",
    "Bangladeshi": "Indian",
    "Sri Lankan": "Indian",
    "Himalayan/Nepalese": "Indian",

    # NEW AMERICAN
    "New American": "American",
    "American": "American",
    "Southern": "American",
    "Comfort Food": "American",
    "Steakhouses": "American",
    "Seafood": "American",
    "Burgers": "American",
    "Breakfast & Brunch": "American",
    "Diners": "American",
    "Gastropubs": "American",
    "Barbeque": "American"}

for r in yelp_restaurants:
  cuisine = None
  for c in r['categories']:
    if c['title'] in category_map.keys():
      cuisine = category_map[c['title']]
      break

  restaurants.append({"RestaurantID": r["id"],
    "Cuisine": cuisine})

In [43]:
len(restaurants)

1189

### Insert Restaurants into Open Search

In [6]:
from google.colab import userdata

# load AWS key from secrets
AWS_KEY = userdata.get('AWS_KEY').strip()
AWS_SECRET = userdata.get('AWS_SECRET').strip()

In [44]:
from opensearchpy import OpenSearch, RequestsHttpConnection, helpers
from requests_aws4auth import AWS4Auth
import boto3

# AWS credentials
region = "us-east-1"
service = "es"

awsauth = AWS4Auth(
    AWS_KEY,
    AWS_SECRET,
    region,
    service,
    session_token=None
)

# OpenSearch client
host = "search-domain1-otwi6xtzh23sgx5uo3vq75hfei.us-east-1.es.amazonaws.com"

client = OpenSearch(
    hosts=[{"host": host, "port": 443}],
    http_auth=awsauth,
    use_ssl=True,
    verify_certs=True,
    connection_class=RequestsHttpConnection
)

# Prepare bulk actions
actions = [
    {
        "_index": "restaurants",
        "_source": restaurant
    }
    for restaurant in restaurants
]

# Bulk insert
helpers.bulk(client, actions)

print("Bulk insert completed!")

Bulk insert completed!


### Check the Table

In [45]:
response = client.search(
    index="restaurants",
    body={"query": {"match_all": {}}}
)

print(response)

{'took': 115, 'timed_out': False, '_shards': {'total': 5, 'successful': 5, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 1189, 'relation': 'eq'}, 'max_score': 1.0, 'hits': [{'_index': 'restaurants', '_id': 'q6NkhpwBhvVSU2OaS29l', '_score': 1.0, '_source': {'RestaurantID': 'zRXMvxUX_rOliKZPpkWi_g', 'Cuisine': 'American'}}, {'_index': 'restaurants', '_id': 'raNkhpwBhvVSU2OaS29l', '_score': 1.0, '_source': {'RestaurantID': 'Q-zg3X3OQ0gNABmwlgiczw', 'Cuisine': 'American'}}, {'_index': 'restaurants', '_id': 'r6NkhpwBhvVSU2OaS29l', '_score': 1.0, '_source': {'RestaurantID': 'SaYTVG7wHCMDBofHCgXPaA', 'Cuisine': 'American'}}, {'_index': 'restaurants', '_id': 'sKNkhpwBhvVSU2OaS29l', '_score': 1.0, '_source': {'RestaurantID': 'z5hRX3iJ5Ty_S38iG_WY3Q', 'Cuisine': 'American'}}, {'_index': 'restaurants', '_id': 'tKNkhpwBhvVSU2OaS29l', '_score': 1.0, '_source': {'RestaurantID': 'OfKH43VSRWrrv35jC9upAA', 'Cuisine': 'American'}}, {'_index': 'restaurants', '_id': 'vKNkhpwBhvVSU2OaS29l', '_sco