# Space Needle TripAdvisor Scraper

## Import Libraries

In [1]:
import pandas as pd
from datetime import datetime
import time
import json

## Install (if necessary) and Import Third-Party Libraries

In [2]:
!pip install requests
import requests



In [3]:
!pip install beautifulsoup4
from bs4 import BeautifulSoup



In [4]:
!pip install tqdm
from copy import deepcopy
from tqdm import tqdm_notebook



In [5]:
!pip install python-dateutil
from dateutil import parser



## Functions

In [6]:
def _select_text(review, selector):
    """Return the text of the first node matching ``selector``, or None if there is none."""
    node = review.select_one(selector)
    return None if node is None else node.text


def _extract_rating(review):
    """Return the first digit of the ``bubble_NN`` class on the rating node, or None."""
    node = review.select_one('.ui_bubble_rating')
    if node is None:
        return None
    classes = node.get('class') or []
    if isinstance(classes, str):
        # Defensive: accept a plain space-separated string as well as a list
        # of class names.
        classes = classes.split()
    prefix = 'bubble_'
    for css_class in classes:
        digits = css_class[len(prefix):]
        if css_class.startswith(prefix) and digits[:1].isdigit():
            return digits[0]
    return None


def Process_Reviews(reviews, *, include_text=False):
    """Convert review elements into a list of flat dictionaries.

    Args:
        reviews: Iterable of parsed review elements. Each one must support
            ``select_one(css_selector)`` (for example the tags returned by
            ``Get_Reviews``).
        include_text: When True, also emit the 'Location', 'Quote' and
            'Review' keys. Defaults to False, which yields the keys
            'Reviewer', 'Rating', 'Rating Date' and 'Experience Date'.

    Returns:
        A list with one dict per review:
            * 'Reviewer'        - text of '.pointer_cursor div'.
            * 'Rating'          - one-character string taken from the
              ``bubble_NN`` class of '.ui_bubble_rating' (e.g. '5' for
              ``bubble_50``), or None if it cannot be found.
            * 'Rating Date'     - the 'title' of '.ratingDate' as 'YYYY-MM-DD'.
            * 'Experience Date' - 'YYYY-MM', or None when the node is absent
              or empty.
        Optional fields that are missing from the markup are reported as None.

    Raises:
        AttributeError / TypeError: if the reviewer or rating-date node is
            missing (these two fields are treated as mandatory).
        ValueError: if a date string does not match the expected format.
            Month names are parsed with ``%B``, which follows the current
            locale.
    """
    data = []

    for review in reviews:
        reviewer = review.select_one('.pointer_cursor div').text
        rating_date = datetime.strptime(
            review.select_one('.ratingDate')['title'], '%B %d, %Y'
        )

        # The first 20 characters are dropped before parsing - presumably the
        # 'Date of experience: ' label (TODO confirm against live markup).
        stay_text = _select_text(review, '.prw_reviews_stay_date_hsx')
        exp_date = stay_text[20:].strip() if stay_text else None

        # Keys are inserted in a fixed order because that order becomes the
        # column order once the list is loaded into a DataFrame.
        info = {'Reviewer': reviewer}
        if include_text:
            info['Location'] = _select_text(review, '.pointer_cursor strong')
        info['Rating'] = _extract_rating(review)
        if include_text:
            quote = _select_text(review, '.noQuotes')
            info['Quote'] = None if quote is None else quote.strip()
            info['Review'] = _select_text(review, '.partial_entry')
        info['Rating Date'] = rating_date.strftime('%Y-%m-%d')
        if exp_date:
            info['Experience Date'] = datetime.strptime(exp_date, '%B %Y').strftime('%Y-%m')
        else:
            info['Experience Date'] = None

        data.append(info)

    return data

In [7]:
def Get_Reviews(url, timeout=30):
    """Download ``url`` and return its review container elements.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up; forwarded
            to ``requests.get``. Defaults to 30.

    Returns:
        The list of elements matching the CSS selector '.review-container'
        in the downloaded page (empty when nothing matches).

    Raises:
        requests.exceptions.RequestException: on connection failures or when
            the timeout expires. The HTTP status code is not checked, so an
            error page simply yields an empty list.
    """
    # Without an explicit timeout a stalled connection would block the whole
    # scraping run indefinitely.
    r = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(r.content, 'html.parser')

    return soup.select('.review-container')

## Defaults

In [8]:
# The page URL is assembled as url_pre + str(page) + url_post. Because url_post
# starts with '0', page index p produces the path segment '-or<p>0-' (for
# example p=3 gives '-or30-' and p=0 gives '-or00-').
# NOTE(review): presumably '-or<N>-' is the review offset and each page holds
# 10 reviews - confirm against the site.
url_pre = 'https://www.tripadvisor.com/Attraction_Review-g60878-d123109-Reviews-or'
url_post = '0-Space_Needle-Seattle_Washington.html'

In [9]:
# Number of review pages the main loop requests (one HTTP request per page).
num_pages = 1675
# Alternative value left commented out - presumably for quick test runs.
#num_pages = 3

In [10]:
# Pause between consecutive page requests (passed to time.sleep in the main loop).
wait_time = 1 #in seconds

## Main Program

In [11]:
%%time

# Accumulates the per-review dictionaries from every page.
all_reviews = []

for page in range(num_pages):

    # Share of pages already finished. Scale to percent *before* formatting:
    # rounding the ratio to two decimals first would leave only whole percents
    # for the one-decimal display below.
    complete = 100 * page / num_pages
    # Show a 1-based page number so the last line reads "N of N".
    print (f'\rNow processing page {page + 1:,} of {num_pages:,} ({complete:.1f}% done)', end='')
    url = url_pre + str(page) + url_post
    reviews = Get_Reviews(url)
    page_data = Process_Reviews(reviews)
    all_reviews.extend(page_data)
    # Throttle requests so the site is not hit back-to-back.
    time.sleep(wait_time)

print ('\nDone.')

Now processing page 1,674 of 1,675 (99.9% done)Done.
Wall time: 1h 11min 29s


## Convert to Pandas DataFrame and eliminate duplicates

In [12]:
# Load the scraped records into a table and keep only the first occurrence of
# each fully identical row (all columns compared; original index labels kept).
df = pd.DataFrame(all_reviews).drop_duplicates()

## Output to JSON

In [13]:
# Column-oriented mapping: {column name -> {row index -> value}}.
j = df.to_dict(orient='dict')

In [14]:
# Serialize the column-oriented dictionary to disk as JSON.
with open('SN_Reviews_with_Exp_Date.json', mode='w') as out_file:
    json.dump(j, out_file)