In [9]:

from __future__ import unicode_literals

import json
from time import sleep

from bs4 import BeautifulSoup
from kafka import KafkaConsumer, KafkaProducer

import requests
# -*- coding: utf-8 -*-

def fetch_raw(hotel_url):
    html = None
    print('Processing..{}'.format(hotel_url))
    try:
        r = requests.get(hotel_url, headers=headers)
        if r.status_code == 200:
            html = r.text
    except Exception as ex:
        print('Exception while accessing raw html')
        print(str(ex))
    finally:
        return html.strip()


def get_hotels():
    hotels = []
    url = 'https://www.traveloka.com/en-th/hotel/thailand/region/bangkok-10000045/best-hotels-in-bangkok'
    print('Accessing list')

    try:
        r = requests.get(url, headers=headers)
        if r.status_code == 200:
            html = r.text
            #soup = BeautifulSoup(html, 'lxml')
            soup = BeautifulSoup(html, 'html.parser')
            #soup = BeautifulSoup(r.content, features="html")
            links = soup.findAll('h3')
            idx = 0
            for link in links:
                if link.find('a'):
                    sleep(2)
                    hotel = fetch_raw(link.find('a')['href'])
                    hotels.append(hotel)
                    idx += 1
                    if idx > 9:
                        break
    except Exception as ex:
        print('Exception in get_hotels')
        print(str(ex))
    finally:
        return hotels
      
def publish_message(producer_instance, topic_name, key, value):
    try:
        #print(key)
        #print(value)
        key_bytes =  bytes(key, encoding='utf-8')   #.encode('utf-8')
        value_bytes = bytes(value, encoding='utf-8') #encode('utf-8')
        
        producer_instance.send(topic_name, key=key_bytes, value=value_bytes)
        producer_instance.flush()
        print('Message published successfully.')
    except Exception as ex:
        print('Exception in publishing message')
        print(str(ex))

def connect_kafka_producer():
    _producer = None
    try:
        _producer = KafkaProducer(bootstrap_servers=['localhost:9092'], api_version=(0, 10))
    except Exception as ex:
        print('Exception while connecting Kafka')
        print(str(ex))
    finally:
        return _producer

if __name__ == '__main__':
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36',
        'Pragma': 'no-cache'
    }
    all_hotels = get_hotels()
    if len(all_hotels) > 0:
        kafka_producer = connect_kafka_producer()
        for hotel in all_hotels:
             
            publish_message(kafka_producer, 'thai_hotels_final', 'raw', hotel.strip())
        if kafka_producer is not None:
            kafka_producer.close()

Accessing list
Processing..https://www.traveloka.com/en-th/hotel/thailand/baron-residence-bangkok-1000000542715
Processing..https://www.traveloka.com/en-th/hotel/thailand/miracle-grand-convention-hotel---buy-now-stay-later-9000000881505
Processing..https://www.traveloka.com/en-th/hotel/thailand/miracle-grand-convention-hotel-1000000259952
Processing..https://www.traveloka.com/en-th/hotel/thailand/the-berkeley-hotel-pratunam-9000000001714
Processing..https://www.traveloka.com/en-th/hotel/thailand/amara-bangkok-1000000505774
Processing..https://www.traveloka.com/en-th/hotel/thailand/jasmine-grande-residence--1000000434118
Processing..https://www.traveloka.com/en-th/hotel/thailand/best-western-premier-sukhumvit-1000000409574
Processing..https://www.traveloka.com/en-th/hotel/thailand/grande-centre-point-sukhumvit-55-thong-lo-3000010022477
Processing..https://www.traveloka.com/en-th/hotel/thailand/bangkok-palace-1000000362312
Processing..https://www.traveloka.com/en-th/hotel/thailand/amari-

In [13]:

import json
from time import sleep

from bs4 import BeautifulSoup
from kafka import KafkaConsumer, KafkaProducer

def publish_message(producer_instance, topic_name, key, value):
    try:
        key_bytes = bytes(key, encoding='utf-8') 
        value_bytes = bytes(value, encoding='utf-8') 
        producer_instance.send(topic_name, key=key_bytes, value=value_bytes)
        producer_instance.flush()
        print('Message published successfully (producer).')
    except Exception as ex:
        print('Exception in publishing message')
        print(str(ex))


def connect_kafka_producer():
    _producer = None
    try:
        _producer = KafkaProducer(bootstrap_servers=['localhost:9092'], api_version=(0, 10))
    except Exception as ex:
        print('Exception while connecting Kafka')
        print(str(ex))
    finally:
        return _producer


def parse(markup):
    title = '-'
    starRating = '-'
    userRating = '-'
    reviews = []
    rec = {}

    try:

        soup = BeautifulSoup(markup)
        
        # title
        # <h1 class="_2kDpX tvat-hotelName" itemProp="name">Miracle Grand Convention Hotel</h1>
        title_section = soup.find_all('h1', {'class': '_2kDpX tvat-hotelName'})
        
        # starRating
        # <div itemProp="starRating" itemscope="" itemType="https://schema.org/Rating" class="_1RoiH _1u8y8">
        # <meta itemProp="ratingValue" content="5"/>
        starRating_section = soup.find_all('div', {'class': '_1RoiH _1u8y8'})
        
        # userRating
        # <div class="_3-G5M"><meta itemProp="ratingValue" content="8.7"/>
        userRating_section = soup.find_all('div', {'class': '_3-G5M'})
        
        # reviews
        # <div class="_2K0Zb _278Mz" id="123882031"><div itemProp="review" itemscope="" itemType="https://schema.org/Review" class="r-1guathk r-1yzf0co">
        # <div dir="auto" class="css-901oao r-1i6uqv8 r-1sixt3s r-ubezar r-majxgm r-135wba7 r-fdjqy7">8.5</div>
        reviews_section = soup.find_all('div', {'class': '_2K0Zb _278Mz'})
        
        if reviews_section:
            for review in reviews_section:
                review_section = review.find_all('div', {'class':'css-901oao r-1i6uqv8 r-1sixt3s r-ubezar r-majxgm r-135wba7 r-fdjqy7'})
                review_text = review_section[0].text.strip()
                if review_text != '':
                    reviews.append(float(review_text))
                    
        if userRating_section:
            for oneUserRating in userRating_section:
                userRating = oneUserRating.find('meta')['content']
                    

        if starRating_section:
            for oneStarRating in starRating_section:
                starRating = oneStarRating.find('meta')['content']

        if title_section:
            title = title_section[0].text.strip()
            

        rec = {'title': title, 'starRating': starRating, 'userRating': userRating, 'reviews': reviews}

    except Exception as ex:
        print('Exception while parsing')
        print(str(ex))
    finally:
        return json.dumps(rec)

if __name__ == '__main__':
    print('Running Consumer..')
    parsed_records = []
    topic_name = 'thai_hotels_final'
    parsed_topic_name = 'thai_parsed_hotels_final'

    consumer = KafkaConsumer(topic_name, auto_offset_reset='earliest',
                             bootstrap_servers=['localhost:9092'], api_version=(0, 10), consumer_timeout_ms=1000)
    for msg in consumer:
        html = msg.value
        result = parse(html)
        print (result)
        parsed_records.append(result)
    consumer.close()
    sleep(5)

    if len(parsed_records) > 0:
        print('Publishing records..')
        producer = connect_kafka_producer()
        for rec in parsed_records:
            publish_message(producer, parsed_topic_name, 'parsed', rec)

Running Consumer..
{"title": "Baron Residence Bangkok", "starRating": "5", "userRating": "8.3", "reviews": [9.7, 9.7, 9.7, 8.5, 9.7, 9.1, 8.8, 9.7, 9.7, 9.4]}
Exception while parsing
'NoneType' object is not subscriptable
{}
{"title": "Miracle Grand Convention Hotel", "starRating": "5", "userRating": "8.7", "reviews": [9.7, 9.4, 8.5, 8.5, 8.5, 8.5, 8.5, 10.0, 10.0, 8.5]}
{"title": "The Berkeley Hotel Pratunam", "starRating": "5", "userRating": "8.8", "reviews": [8.8, 9.4, 9.7, 9.1, 9.7, 9.7, 9.7, 9.0, 9.1, 9.1]}
{"title": "Amara Bangkok", "starRating": "5", "userRating": "8.8", "reviews": [9.1, 9.7, 9.7, 9.7, 9.4, 9.4, 10.0, 9.7, 9.7, 8.5]}
{"title": "Jasmine Grande Residence", "starRating": "5", "userRating": "8.6", "reviews": [9.7, 9.7, 9.4, 8.5, 9.7, 9.1, 9.4, 9.7, 9.7, 8.2]}
{"title": "Best Western Premier Sukhumvit", "starRating": "5", "userRating": "8.6", "reviews": [8.2, 9.4, 10.0, 9.7, 10.0, 9.7, 9.4, 9.7, 6.4, 9.4]}
{"title": "Grande Centre Point Sukhumvit 55 Thong Lo", "starR

In [14]:

if __name__ == '__main__':
    print('Running Consumer..')
    parsed_records = []
    topic_name = 'thai_hotels_final'
    parsed_topic_name = 'thai_parsed_hotels_final'

    consumer = KafkaConsumer(topic_name, auto_offset_reset='earliest',
                             bootstrap_servers=['localhost:9092'], api_version=(0, 10), consumer_timeout_ms=1000)
    for msg in consumer:
        html = msg.value
        result = parse(html)
        print (result)
        parsed_records.append(result)
    consumer.close()
    sleep(5)

    if len(parsed_records) > 0:
        print('Publishing records..')
        producer = connect_kafka_producer()
        for rec in parsed_records:
            publish_message(producer, parsed_topic_name, 'parsed', rec)

Running Consumer..
{"title": "Baron Residence Bangkok", "starRating": "5", "userRating": "8.3", "reviews": [9.7, 9.7, 9.7, 8.5, 9.7, 9.1, 8.8, 9.7, 9.7, 9.4]}
Exception while parsing
'NoneType' object is not subscriptable
{}
{"title": "Miracle Grand Convention Hotel", "starRating": "5", "userRating": "8.7", "reviews": [9.7, 9.4, 8.5, 8.5, 8.5, 8.5, 8.5, 10.0, 10.0, 8.5]}
{"title": "The Berkeley Hotel Pratunam", "starRating": "5", "userRating": "8.8", "reviews": [8.8, 9.4, 9.7, 9.1, 9.7, 9.7, 9.7, 9.0, 9.1, 9.1]}
{"title": "Amara Bangkok", "starRating": "5", "userRating": "8.8", "reviews": [9.1, 9.7, 9.7, 9.7, 9.4, 9.4, 10.0, 9.7, 9.7, 8.5]}
{"title": "Jasmine Grande Residence", "starRating": "5", "userRating": "8.6", "reviews": [9.7, 9.7, 9.4, 8.5, 9.7, 9.1, 9.4, 9.7, 9.7, 8.2]}
{"title": "Best Western Premier Sukhumvit", "starRating": "5", "userRating": "8.6", "reviews": [8.2, 9.4, 10.0, 9.7, 10.0, 9.7, 9.4, 9.7, 6.4, 9.4]}
{"title": "Grande Centre Point Sukhumvit 55 Thong Lo", "starR