# Getting travel blog content

1. Get travel blog urls with SERP API
2. Get all wordpress pages from original blog url
3. Get content from pages
4. Get metadata from pages
5. Use content to get location name with spacy's "en_core_web_sm"
6. Use geopy's Nominatim to get lat and long from location name
7. Write to DynamoDB

In [1]:
import os
import requests
from bs4 import BeautifulSoup
import warnings
from serpapi import GoogleSearch
from backend.off_the_path.src.utilities import clean_text
import numpy as np
import spacy
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut
from urllib.parse import urlparse
import pandas as pd
import boto3

# Load a pre-trained English language model
nlp = spacy.load("en_core_web_sm")
# Load the geolocator
geolocator = Nominatim(user_agent="off_the_path") 

[nltk_data] Downloading package stopwords to /Users/merho/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Get list of travel sites from google.

Query: "travel site:wordpress.com"


In [2]:
def serpapi_search(query, api_key):
    params = {
        "api_key": api_key,
        "engine": "google",
        "q": query,
        "hl": "en"
    }
    search = GoogleSearch(params)
    results = search.get_dict()
    return results.get('organic_results', [])

query = "travel site:wordpress.com"

Get all page links from base blog site.

In [3]:
def get_wordpress_pages(base_url):
    # init list to store page links
    all_pages = []
    # List of sitemap URL suffixes to try
    sitemap_paths = ["post-sitemap.xml", "sitemap-1.xml", "sitemap.xml", "sitemap_index.xml",]
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/126.0 Safari/537.36"
        ),
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
        "Referer": "https://www.google.com/"
    }
    
    # Try each sitemap path until a valid one is found
    sitemap_url = None
    for path in sitemap_paths:
        try:
            test_url = base_url.rstrip('/') + '/' + path
            r = requests.get(test_url, headers=headers, timeout=10)
            if 200 <= r.status_code < 300:
                sitemap_url = test_url
                sitemap_response = r
                break
        except requests.RequestException:
            continue 

    if not sitemap_url:
        print("No valid sitemap found.")
        return []

    # xml parser didnt work so html is a must, but it throws a warning
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=UserWarning)
        soup = BeautifulSoup(sitemap_response.text, 'html.parser')
    # find all links on page and store in a list
    links = soup.find_all('url')
    for l in links:
        loc = l.find('loc')
        if loc:
            all_pages.append(loc.text)
    return all_pages


In [4]:
def is_useful_url(url):

    parsed = urlparse(url)
    path = parsed.path.lower()

    # skip pagination
    if "/page/" in path:
        return False

    # skip categories, tags, authors
    skip_segments = ["category", "tag", "author"]
    if any(f"/{seg}/" in path for seg in skip_segments):
        return False

    # skip feeds
    if "/feed/" in path:
        return False

    # skip wp system URLs
    if "wp-json" in path or "wp-admin" in path:
        return False

    # skip attachments / misc
    bad_ext = (".jpg", ".png", ".gif", ".pdf", ".xml", ".zip")
    if path.endswith(bad_ext):
        return False

    # skip query-string pages entirely
    if parsed.query:
        return False

    # optional: only allow URLs that “look like” posts/pages
    # e.g., end in a slash and contain a slug
    if not path.endswith("/"):
        return False

    return True


In [5]:
def get_blog_page_content(page_url):
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/126.0 Safari/537.36"
        ),
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
        "Referer": "https://www.google.com/"
    }
    response = requests.get(page_url, headers=headers, timeout=15)

    # Parse the HTML with Beautiful Soup
    soup = BeautifulSoup(response.text, 'html.parser')

    paragraphs = soup.find_all('p')
    all_paras = " ".join(paragraphs)
    return all_paras

In [6]:
def get_blog_page_meta_data(page_url):
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/126.0 Safari/537.36"
        ),
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
        "Referer": "https://www.google.com/"
    }

    response = requests.get(page_url, headers=headers, timeout=15)

    # Parse the HTML with Beautiful Soup
    soup = BeautifulSoup(response.text, 'html.parser')

    title_list = soup.find_all('title')
    title = str(title_list[0])
    title = title.replace("<title>","").replace("</title>","")

    description_meta = soup.find('meta', attrs={'property': 'og:description'})
    if description_meta:
        description_content = description_meta.get('content')
    else:
        description_content = np.nan

    author_meta = soup.find('meta', attrs={'property': 'author'})
    if author_meta:
        author_content = author_meta.get('content')
    else:
        author_content = np.nan

    return title, description_content, author_content

In [7]:
def find_geo_name(title, description):

    doc = nlp(title)

    for ent in doc.ents:
        if ent.label_ == "GPE" or ent.label_ == "LOC": # GPE: Geopolitical Entity, LOC: Location
            return ent
        else:
            doc = nlp(description)
            for ent in doc.ents:
                if ent.label_ == "GPE" or ent.label_ == "LOC":
                    return ent

In [8]:
def get_lat_long(location_name):
    try:
        location = geolocator.geocode(location_name)
        if location:
            lat = location.latitude
            long = location.longitude
            return lat, long
        else:
            print(f"Could not find location for: {location_name}")
    except GeocoderTimedOut:
        print("Error: Geocoding service timed out.")
    except Exception as e:
        print(f"An error occurred: {e}")

In [9]:
query_urls = ['https://thetravellush.wordpress.com', 'https://dangerousbusiness.wordpress.com', 'https://ashleighbugg.wordpress.com']

In [None]:

## init dynamo DB connection
# dynamodb = boto3.resource('dynamodb', region_name='us-east-1') 
# table = dynamodb.Table('off-the-beaten-path-blog-posts')


id = 1
for blog_url in query_urls:
    # init empt dataframe
    df = pd.DataFrame(columns=["id_num", "blog_url",  "page_url","page_title", "page_description",
                   "page_author","content","location_name", "latitude","longitude"])
    print(blog_url)
    all_links = get_wordpress_pages(blog_url)
    for link in all_links:
        if not is_useful_url(link):
            print("Skipping:", link)
            continue
        try:
            # get blog content
            content = get_blog_page_content(link)
            # clean text
            clean_content = clean_text(content)
            # get blog meta data
            title, description, author =  get_blog_page_meta_data(link)
            place_name = find_geo_name(title, description)
            lat, long = get_lat_long(place_name)
            id_num = id.zfill(6)
            if title:
                new_row = {"id_num":id_num, "blog_url":blog_url,  "page_url":link,"page_title":title, 
                           "page_description": description, "page_author":author,
                           "content":clean_content,"location_name":place_name, 
                           "latitude":lat ,"longitude": long}
                df = df.append(new_row, ignore_index=True)
                id +=1
        except:
            print("Not enough info in",link)
            continue
    print(len(df))
    print(df.head(5))
    # # convert to list of dictionaries
    # result = df.to_dict(orient='records')
    # # add items to dynamoDB
    # with table.batch_writer() as batch:
    #     for item in result:
    #         batch.put_item(Item=item)


https://thetravellush.wordpress.com
Not enough info in https://travel-lush.com/blog/
Not enough info in https://travel-lush.com/the-reality-of-traveling-to-a-disaster-zone-coron-philippines/
Not enough info in https://travel-lush.com/backpacking-in-bogota-is-colombias-capital-worth-a-visit/
Not enough info in https://travel-lush.com/nitty-gritty-behind-finding-job-indonesia/
Not enough info in https://travel-lush.com/backpacker-expat-2014-travel/
Not enough info in https://travel-lush.com/sampling-phnom-penhs-food-scene/
Not enough info in https://travel-lush.com/one-year-blogiversary/
Not enough info in https://travel-lush.com/living-jakarta-6-months/
Not enough info in https://travel-lush.com/making-time-jakarta/
Not enough info in https://travel-lush.com/christmas-jakarta-just-not/
Not enough info in https://travel-lush.com/epic-quest-find-best-veggie-burger-jakarta/
Not enough info in https://travel-lush.com/life-in-jakarta-my-taxi-ride-from-hell/
Not enough info in https://travel-