In [1]:
from bs4 import BeautifulSoup
import numpy as np
from tqdm.notebook import tqdm
import glob
import json

In [None]:
class ArticleExtractor:

    def __init__(self, paths):
        self.paths = paths
        self.blog_articles = []

        for path in tqdm(paths):
            self._open(path)
            self._get_blog_articles(path)

        print(f'{len(self.blog_articles)} articles found')

    def _open(self, path):
        with open(path, 'r') as file:
            self.html = file.read()

    def _get_blog_articles(self, path):
        soup = BeautifulSoup(self.html, 'html.parser')
        articles = soup.find_all('article', {'class':'type-post'})

        if len(articles) == 1:
            self.blog_articles.append({'path':path, 'article':str(articles[0])})

    def save(self, path):
        with open(path, 'w') as file:
            json.dump(self.blog_articles, file)

        print(f'saved at: {path}')

### adventourbegins.com

In [19]:
class adventourbeginsArticleExtractor(ArticleExtractor):

    def _get_blog_articles(self, path):
        soup = BeautifulSoup(self.html, 'html.parser')
        articles = soup.find_all('article', {'class':'type-post'})

        if len(articles) == 1:
            self.blog_articles.append({'path':path, 'article':str(articles[0])})


folder       = 'adventourbegins.com'
html_files   = glob.glob(f'/Volumes/2024 ML/data/0_raw_blog_scrape/{folder}/**/*.html')
article_extractor = adventourbeginsArticleExtractor(html_files)
article_extractor.save(f'/Volumes/2024 ML/data/1_extracted_blogs/{folder}.json')

  0%|          | 0/239 [00:00<?, ?it/s]

  k = self.parse_starttag(i)


221 articles found
saved at: /Volumes/2024 ML/data/1_extracted_blogs/adventourbegins.com.json


### britonthemove.com

In [20]:
class britonthemoveArticleExtractor(ArticleExtractor):

    def _get_blog_articles(self, path):
        soup = BeautifulSoup(self.html, 'html.parser')

        def query(tag):
            if not tag.name == 'article':
                return False
            if not 'single-entry' in tag['class']:
                return False
            if not 'type-post' in tag['class']:
                return False
            return True

            
        articles = soup.find_all(query)

        if len(articles) == 1:
            self.blog_articles.append({'path':path, 'article':str(articles[0])})

folder       = 'britonthemove.com'
html_files   = glob.glob(f'/Volumes/2024 ML/data/0_raw_blog_scrape/{folder}/**/*.html')
article_extractor = britonthemoveArticleExtractor(html_files)
article_extractor.save(f'/Volumes/2024 ML/data/1_extracted_blogs/{folder}.json')

  0%|          | 0/502 [00:00<?, ?it/s]

475 articles found
saved at: /Volumes/2024 ML/data/1_extracted_blogs/britonthemove.com.json


### expertvagabond.com

In [21]:
class expertvagabondArticleExtractor(ArticleExtractor):

    def _get_blog_articles(self, path):
        soup = BeautifulSoup(self.html, 'html.parser')
        articles = soup.find_all('article', {'class':'type-post'})

        if len(articles) == 1:
            self.blog_articles.append({'path':path, 'article':str(articles[0])})


folder       = 'expertvagabond.com'
html_files   = glob.glob(f'/Volumes/2024 ML/data/0_raw_blog_scrape/{folder}/**/*.html')
article_extractor = expertvagabondArticleExtractor(html_files)
article_extractor.save(f'/Volumes/2024 ML/data/1_extracted_blogs/{folder}.json')

  0%|          | 0/352 [00:00<?, ?it/s]

314 articles found
saved at: /Volumes/2024 ML/data/1_extracted_blogs/expertvagabond.com.json


### jessieonajourney.com

In [22]:
class jessieonajourneyArticleExtractor(ArticleExtractor):

    def _get_blog_articles(self, path):
        soup = BeautifulSoup(self.html, 'html.parser')
        articles = soup.find_all('article', {'class':'type-post'})

        if len(articles) == 1:
            self.blog_articles.append({'path':path, 'article':str(articles[0])})


folder       = 'jessieonajourney.com'
html_files   = glob.glob(f'/Volumes/2024 ML/data/0_raw_blog_scrape/{folder}/**/*.html')
article_extractor = jessieonajourneyArticleExtractor(html_files)
article_extractor.save(f'/Volumes/2024 ML/data/1_extracted_blogs/{folder}.json')

  0%|          | 0/1329 [00:00<?, ?it/s]

1047 articles found
saved at: /Volumes/2024 ML/data/1_extracted_blogs/jessieonajourney.com.json


### likewhereyouregoing.com

In [23]:
class likewhereyouregoingArticleExtractor(ArticleExtractor):

    def _get_blog_articles(self, path):
        soup = BeautifulSoup(self.html, 'html.parser')
        articles = soup.find_all('article', {'class':'type-post'})

        if len(articles) == 1:
            self.blog_articles.append({'path':path, 'article':str(articles[0])})


folder       = 'likewhereyouregoing.com'
html_files   = glob.glob(f'/Volumes/2024 ML/data/0_raw_blog_scrape/{folder}/**/*.html')
article_extractor = likewhereyouregoingArticleExtractor(html_files)
article_extractor.save(f'/Volumes/2024 ML/data/1_extracted_blogs/{folder}.json')

  0%|          | 0/213 [00:00<?, ?it/s]

202 articles found
saved at: /Volumes/2024 ML/data/1_extracted_blogs/likewhereyouregoing.com.json


### ordinarytraveler.com

In [24]:
class ordinarytravelerArticleExtractor(ArticleExtractor):

    def _get_blog_articles(self, path):
        soup = BeautifulSoup(self.html, 'html.parser')

        def query(tag):
            if not tag.name == 'article':
                return False
            if not 'type-post' in tag['class']:
                return False
            if not 'single-entry' in tag['class']:
                return False
            return True
        
        articles = soup.find_all(query)

        if len(articles) == 1:
            self.blog_articles.append({'path':path, 'article':str(articles[0])})


folder       = 'ordinarytraveler.com'
html_files   = glob.glob(f'/Volumes/2024 ML/data/0_raw_blog_scrape/{folder}/*')
article_extractor = ordinarytravelerArticleExtractor(html_files)
article_extractor.save(f'/Volumes/2024 ML/data/1_extracted_blogs/{folder}.json')

  0%|          | 0/576 [00:00<?, ?it/s]

  soup = BeautifulSoup(self.html, 'html.parser')


532 articles found
saved at: /Volumes/2024 ML/data/1_extracted_blogs/ordinarytraveler.com.json


### plateandcompass.com

In [25]:
class plateandcompassArticleExtractor(ArticleExtractor):

    def _get_blog_articles(self, path):
        soup = BeautifulSoup(self.html, 'html.parser')
        
        articles = soup.find_all('article', {'class':'type-post'})

        if len(articles) == 1:
            self.blog_articles.append({'path':path, 'article':str(articles[0])})


folder       = 'plateandcompass.com'
html_files   = glob.glob(f'/Volumes/2024 ML/data/0_raw_blog_scrape/{folder}/**/*.html')
article_extractor = plateandcompassArticleExtractor(html_files)
article_extractor.save(f'/Volumes/2024 ML/data/1_extracted_blogs/{folder}.json')

  0%|          | 0/56 [00:00<?, ?it/s]

47 articles found
saved at: /Volumes/2024 ML/data/1_extracted_blogs/plateandcompass.com.json


### practicalwanderlust.com

In [26]:
class practicalwanderlustArticleExtractor(ArticleExtractor):

    def _get_blog_articles(self, path):
        soup = BeautifulSoup(self.html, 'html.parser')
        
        articles = soup.find_all('article', {'class':'type-post'})

        # if len(articles) == 1:
        self.blog_articles.append({'path':path, 'article':str(articles[0])})


folder       = 'practicalwanderlust.com'
html_files   = glob.glob(f'/Volumes/2024 ML/data/0_raw_blog_scrape/{folder}/**/*.html')
article_extractor = practicalwanderlustArticleExtractor(html_files)
article_extractor.save(f'/Volumes/2024 ML/data/1_extracted_blogs/{folder}.json')

  0%|          | 0/316 [00:00<?, ?it/s]

303 articles found
saved at: /Volumes/2024 ML/data/1_extracted_blogs/practicalwanderlust.com.json


### tavernatravels.com

In [27]:
class tavernatravelsArticleExtractor(ArticleExtractor):

    def _get_blog_articles(self, path):
        soup = BeautifulSoup(self.html, 'html.parser')
        
        articles = soup.find_all('div', {'class':'content'})

        if len(articles) == 1:
            self.blog_articles.append({'path':path, 'article':str(articles[0])})


folder       = 'tavernatravels.com'
html_files   = glob.glob(f'/Volumes/2024 ML/data/0_raw_blog_scrape/{folder}/**/*.html')
article_extractor = tavernatravelsArticleExtractor(html_files)
article_extractor.save(f'/Volumes/2024 ML/data/1_extracted_blogs/{folder}.json')

  0%|          | 0/270 [00:00<?, ?it/s]

262 articles found
saved at: /Volumes/2024 ML/data/1_extracted_blogs/tavernatravels.com.json


### theplanetd.com

In [28]:
class theplanetdArticleExtractor(ArticleExtractor):

    def _get_blog_articles(self, path):
        soup = BeautifulSoup(self.html, 'html.parser')

        def query(tag):
            if not tag.name == 'article':
                return False
            if not 'type-post' in tag['class']:
                return False
            if 'wp-show-posts-single' in tag['class']:
                return False
            return True
        
        articles = soup.find_all(query)

        if len(articles) == 1:
            self.blog_articles.append({'path':path, 'article':str(articles[0])})


folder       = 'theplanetd.com'
html_files   = glob.glob(f'/Volumes/2024 ML/data/0_raw_blog_scrape/{folder}/**/*.html')
article_extractor = theplanetdArticleExtractor(html_files)
article_extractor.save(f'/Volumes/2024 ML/data/1_extracted_blogs/{folder}.json')

  0%|          | 0/1651 [00:00<?, ?it/s]

1614 articles found
saved at: /Volumes/2024 ML/data/1_extracted_blogs/theplanetd.com.json


### virginiatraveltips.com

In [29]:
class virginiatraveltipsdArticleExtractor(ArticleExtractor):

    def _get_blog_articles(self, path):
        soup = BeautifulSoup(self.html, 'html.parser')

        def query(tag):
            if not tag.name == 'article':
                return False
            if not 'type-post' in tag['class']:
                return False
            if 'wp-show-posts-single' in tag['class']:
                return False
            return True
        
        articles = soup.find_all(query)

        if len(articles) == 1:
            self.blog_articles.append({'path':path, 'article':str(articles[0])})


folder       = 'virginiatraveltips.com'
html_files   = glob.glob(f'/Volumes/2024 ML/data/0_raw_blog_scrape/{folder}/**/*.html')
article_extractor = virginiatraveltipsdArticleExtractor(html_files)
article_extractor.save(f'/Volumes/2024 ML/data/1_extracted_blogs/{folder}.json')

  0%|          | 0/209 [00:00<?, ?it/s]

200 articles found
saved at: /Volumes/2024 ML/data/1_extracted_blogs/virginiatraveltips.com.json


### adventuresnsunsets.com

In [30]:
class adventuresnsunsetsArticleExtractor(ArticleExtractor):

    def _get_blog_articles(self, path):
        soup = BeautifulSoup(self.html, 'html.parser')

        def query(tag):
            if not tag.name == 'article':
                return False
            if not 'type-post' in tag['class']:
                return False
            if 'wp-show-posts-single' in tag['class']:
                return False
            return True
        
        articles = soup.find_all(query)

        if len(articles) == 1:
            self.blog_articles.append({'path':path, 'article':str(articles[0])})


folder       = 'www.adventuresnsunsets.com'
html_files   = glob.glob(f'/Volumes/2024 ML/data/0_raw_blog_scrape/{folder}/**/*.html')
article_extractor = adventuresnsunsetsArticleExtractor(html_files)
article_extractor.save(f'/Volumes/2024 ML/data/1_extracted_blogs/{folder}.json')

  0%|          | 0/527 [00:00<?, ?it/s]

507 articles found
saved at: /Volumes/2024 ML/data/1_extracted_blogs/www.adventuresnsunsets.com.json


### adventurouskate.com

In [31]:
class adventurouskateArticleExtractor(ArticleExtractor):

    def _get_blog_articles(self, path):
        soup = BeautifulSoup(self.html, 'html.parser')

        def query(tag):
            if not tag.name == 'article':
                return False
            if not 'type-post' in tag['class']:
                return False
            if 'wp-show-posts-single' in tag['class']:
                return False
            return True
        
        articles = soup.find_all(query)

        if len(articles) == 1:
            self.blog_articles.append({'path':path, 'article':str(articles[0])})


folder       = 'www.adventurouskate.com'
html_files   = glob.glob(f'/Volumes/2024 ML/data/0_raw_blog_scrape/{folder}/**/*.html')
article_extractor = adventurouskateArticleExtractor(html_files)
article_extractor.save(f'/Volumes/2024 ML/data/1_extracted_blogs/{folder}.json')

  0%|          | 0/948 [00:00<?, ?it/s]

939 articles found
saved at: /Volumes/2024 ML/data/1_extracted_blogs/www.adventurouskate.com.json


### alexinwanderland.com

In [32]:
class alexinwanderlandArticleExtractor(ArticleExtractor):

    def _get_blog_articles(self, path):
        soup = BeautifulSoup(self.html, 'html.parser')

        def query(tag):
            if not tag.name == 'article':
                return False
            if not 'page-content' in tag['class']:
                return False
            if 'wp-show-posts-single' in tag['class']:
                return False
            return True
        
        articles = soup.find_all(query)

        if len(articles) == 1:
            self.blog_articles.append({'path':path, 'article':str(articles[0])})


folder       = 'www.alexinwanderland.com'
html_files   = glob.glob(f'/Volumes/2024 ML/data/0_raw_blog_scrape/{folder}/**/*.html')
article_extractor = alexinwanderlandArticleExtractor(html_files)
article_extractor.save(f'/Volumes/2024 ML/data/1_extracted_blogs/{folder}.json')

  0%|          | 0/1747 [00:00<?, ?it/s]

1677 articles found
saved at: /Volumes/2024 ML/data/1_extracted_blogs/www.alexinwanderland.com.json


### alongdustyroads.com

In [33]:
class alongdustyroadsArticleExtractor(ArticleExtractor):

    def _get_blog_articles(self, path):
        soup = BeautifulSoup(self.html, 'html.parser')

        def query(tag):
            if not tag.name == 'article':
                return False
            # if not 'page-content' in tag['class']:
            #     return False
            # if 'wp-show-posts-single' in tag['class']:
            #     return False
            return True
        
        articles = soup.find_all(query)

        if len(articles) == 1:
            self.blog_articles.append({'path':path, 'article':str(articles[0])})


folder       = 'www.alongdustyroads.com'
html_files   = glob.glob(f'/Volumes/2024 ML/data/0_raw_blog_scrape/www.alongdustyroads.com/posts/*')
article_extractor = alongdustyroadsArticleExtractor(html_files)
article_extractor.save(f'/Volumes/2024 ML/data/1_extracted_blogs/{folder}.json')

  0%|          | 0/276 [00:00<?, ?it/s]

270 articles found
saved at: /Volumes/2024 ML/data/1_extracted_blogs/www.alongdustyroads.com.json


### danflyingsolo.com

In [34]:
class danflyingsoloArticleExtractor(ArticleExtractor):

    def _get_blog_articles(self, path):
        soup = BeautifulSoup(self.html, 'html.parser')
        
        articles = soup.find_all('article', {'class':'type-post'})

        if len(articles) == 1:
            self.blog_articles.append({'path':path, 'article':str(articles[0])})


folder       = 'www.danflyingsolo.com'
html_files   = glob.glob(f'/Volumes/2024 ML/data/0_raw_blog_scrape/{folder}/**/index.html')
article_extractor = danflyingsoloArticleExtractor(html_files)
article_extractor.save(f'/Volumes/2024 ML/data/1_extracted_blogs/{folder}.json')

  0%|          | 0/307 [00:00<?, ?it/s]

285 articles found
saved at: /Volumes/2024 ML/data/1_extracted_blogs/www.danflyingsolo.com.json


### globeguide.ca

In [35]:
class globeguideArticleExtractor(ArticleExtractor):

    def _get_blog_articles(self, path):
        soup = BeautifulSoup(self.html, 'html.parser')
        
        articles = soup.find_all('article', {'class':'type-post'})

        if len(articles) == 1:
            self.blog_articles.append({'path':path, 'article':str(articles[0])})


folder       = 'www.globeguide.ca'
html_files   = glob.glob(f'/Volumes/2024 ML/data/0_raw_blog_scrape/{folder}/**/index.html')
article_extractor = globeguideArticleExtractor(html_files)
article_extractor.save(f'/Volumes/2024 ML/data/1_extracted_blogs/{folder}.json')

  0%|          | 0/669 [00:00<?, ?it/s]

643 articles found
saved at: /Volumes/2024 ML/data/1_extracted_blogs/www.globeguide.ca.json


### goatsontheroad.com

In [36]:
class goatsontheroadArticleExtractor(ArticleExtractor):

    def _get_blog_articles(self, path):
        soup = BeautifulSoup(self.html, 'html.parser')
        
        articles = soup.find_all('article', {'class':'type-post'})

        if len(articles) == 1:
            self.blog_articles.append({'path':path, 'article':str(articles[0])})


folder       = 'www.goatsontheroad.com'
html_files   = glob.glob(f'/Volumes/2024 ML/data/0_raw_blog_scrape/{folder}/**/index.html')
article_extractor = goatsontheroadArticleExtractor(html_files)
article_extractor.save(f'/Volumes/2024 ML/data/1_extracted_blogs/{folder}.json')

  0%|          | 0/1954 [00:00<?, ?it/s]

'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
1818 articles found
saved at: /Volumes/2024 ML/data/1_extracted_blogs/www.goatsontheroad.com.json


### jonesaroundtheworld.com

In [37]:
class jonesaroundtheworld.ArticleExtractor(ArticleExtractor):

    def _get_blog_articles(self, path):
        soup = BeautifulSoup(self.html, 'html.parser')
        
        articles = soup.find_all('article', {'class':'type-post'})

        if len(articles) == 1:
            self.blog_articles.append({'path':path, 'article':str(articles[0])})


folder       = 'www.jonesaroundtheworld.com'
html_files   = glob.glob(f'/Volumes/2024 ML/data/0_raw_blog_scrape/{folder}/**/index.html')
article_extractor = jonesaroundtheworld.ArticleExtractor(html_files)
article_extractor.save(f'/Volumes/2024 ML/data/1_extracted_blogs/{folder}.json')

  0%|          | 0/1096 [00:00<?, ?it/s]

1016 articles found
saved at: /Volumes/2024 ML/data/1_extracted_blogs/www.jonesaroundtheworld.com.json


### legalnomads.com

In [38]:
class legalnomadsArticleExtractor(ArticleExtractor):

    def _get_blog_articles(self, path):
        soup = BeautifulSoup(self.html, 'html.parser')
        
        articles = soup.find_all('article', {'class':'type-post'})

        if len(articles) == 1:
            self.blog_articles.append({'path':path, 'article':str(articles[0])})

folder       = 'www.legalnomads.com'
html_files   = glob.glob(f'/Volumes/2024 ML/data/0_raw_blog_scrape/{folder}/**/index.html')
article_extractor = legalnomadsArticleExtractor(html_files)
article_extractor.save(f'/Volumes/2024 ML/data/1_extracted_blogs/{folder}.json')

  0%|          | 0/332 [00:00<?, ?it/s]

303 articles found
saved at: /Volumes/2024 ML/data/1_extracted_blogs/www.legalnomads.com.json


### nomadasaurus.com

In [39]:
class nomadasaurusArticleExtractor(ArticleExtractor):

    def _get_blog_articles(self, path):
        soup = BeautifulSoup(self.html, 'html.parser')

        def query(tag):
            if not tag.name == 'div':
                return False
            if not 'class' in tag.attrs:
                return False
            if not 'type-post' in tag['class']:
                return False
            if 'e-loop-item' in tag['class']:
                return False
            return True
        
        articles = soup.find_all(query)

        if len(articles) == 1:
            self.blog_articles.append({'path':path, 'article':str(articles[0])})


folder       = 'www.nomadasaurus.com'
html_files   = glob.glob(f'/Volumes/2024 ML/data/0_raw_blog_scrape/{folder}/**/index.html')
article_extractor = nomadasaurusArticleExtractor(html_files)
article_extractor.save(f'/Volumes/2024 ML/data/1_extracted_blogs/{folder}.json')

  0%|          | 0/1067 [00:00<?, ?it/s]

941 articles found
saved at: /Volumes/2024 ML/data/1_extracted_blogs/www.nomadasaurus.com.json


### nomadicmatt.com

In [40]:
class nomadicmattArticleExtractor(ArticleExtractor):

    def _get_blog_articles(self, path):
        soup = BeautifulSoup(self.html, 'html.parser')

        def query(tag):
            if not tag.name == 'article':
                return False
            if not 'class' in tag.attrs:
                return False
            if not 'type-post' in tag['class']:
                return False
            if 'e-loop-item' in tag['class']:
                return False
            return True
        
        articles = soup.find_all(query)

        if len(articles) == 1:
            self.blog_articles.append({'path':path, 'article':str(articles[0])})

folder       = 'www.nomadasaurus.com'
html_files   = glob.glob(f'/Volumes/2024 ML/data/0_raw_blog_scrape/www.nomadicmatt.com/travel-blogs/**/index.html')
article_extractor = nomadicmattArticleExtractor(html_files)
article_extractor.save(f'/Volumes/2024 ML/data/1_extracted_blogs/{folder}.json')

  0%|          | 0/161 [00:00<?, ?it/s]

161 articles found
saved at: /Volumes/2024 ML/data/1_extracted_blogs/www.nomadasaurus.com.json


### saltinourhair.com

In [42]:
class saltinourhairArticleExtractor(ArticleExtractor):

    def _get_blog_articles(self, path):
        soup = BeautifulSoup(self.html, 'html.parser')

        def query(tag):
            if not tag.name == 'div':
                return False
            if not 'class' in tag.attrs:
                return False
            if not 'blog-content' in tag['class']:
                return False
            if 'e-loop-item' in tag['class']:
                return False
            return True
        
        articles = soup.find_all(query)

        if len(articles) == 1:
            self.blog_articles.append({'path':path, 'article':str(articles[0])})


folder       = 'www.saltinourhair.com'
html_files   = glob.glob(f'/Volumes/2024 ML/data/0_raw_blog_scrape/{folder}/**/**/index.html')
article_extractor = saltinourhairArticleExtractor(html_files)
article_extractor.save(f'/Volumes/2024 ML/data/1_extracted_blogs/{folder}.json')

  0%|          | 0/597 [00:00<?, ?it/s]

482 articles found
saved at: /Volumes/2024 ML/data/1_extracted_blogs/www.saltinourhair.com.json


### theblondeabroad.com

In [44]:
class theblondeabroadArticleExtractor(ArticleExtractor):

    def _get_blog_articles(self, path):
        soup = BeautifulSoup(self.html, 'html.parser')

        def query(tag):
            if not tag.name == 'article':
                return False
            if not 'class' in tag.attrs:
                return False
            if not 'type-post' in tag['class']:
                return False
            if 'e-loop-item' in tag['class']:
                return False
            return True
        
        articles = soup.find_all(query)

        if len(articles) == 1:
            self.blog_articles.append({'path':path, 'article':str(articles[0])})


folder       = 'www.theblondeabroad.com'
html_files   = glob.glob(f'/Volumes/2024 ML/data/0_raw_blog_scrape/{folder}/**/index.html')
article_extractor = theblondeabroadArticleExtractor(html_files)
article_extractor.save(f'/Volumes/2024 ML/data/1_extracted_blogs/{folder}.json')

  0%|          | 0/1556 [00:00<?, ?it/s]

1401 articles found
saved at: /Volumes/2024 ML/data/1_extracted_blogs/www.theblondeabroad.com.json


### thebrokebackpacker.com

In [48]:
class thebrokebackpackerArticleExtractor(ArticleExtractor):

    def _get_blog_articles(self, path):
        soup = BeautifulSoup(self.html, 'html.parser')

        def query(tag):
            if not tag.name == 'article':
                return False
            if not 'class' in tag.attrs:
                return False
            if not 'post-single' in tag['class']:
                return False
            if 'e-loop-item' in tag['class']:
                return False
            return True
        
        articles = soup.find_all(query)

        if len(articles) == 1:
            self.blog_articles.append({'path':path, 'article':str(articles[0])})


folder       = 'www.thebrokebackpacker.com'
html_files   = glob.glob(f'/Volumes/2024 ML/data/0_raw_blog_scrape/{folder}/**/index.html')
article_extractor = thebrokebackpackerArticleExtractor(html_files)
article_extractor.save(f'/Volumes/2024 ML/data/1_extracted_blogs/{folder}.json')

  0%|          | 0/4257 [00:00<?, ?it/s]

'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
'utf-8' codec can't decode byte 0x89 in position 0: invalid start byte
4219 articles found
saved at: /Volumes/2024 ML/data/1_extracted_blogs/www.thebrokebackpacker.com.json


### theviennablog.com

In [51]:
class theviennablogArticleExtractor(ArticleExtractor):

    def _get_blog_articles(self, path):
        soup = BeautifulSoup(self.html, 'html.parser')

        def query(tag):
            if not tag.name == 'article':
                return False
            if not 'class' in tag.attrs:
                return False
            if not 'type-post' in tag['class']:
                return False
            if 'e-loop-item' in tag['class']:
                return False
            return True
        
        articles = soup.find_all(query)

        if len(articles) == 1:
            self.blog_articles.append({'path':path, 'article':str(articles[0])})


folder       = 'www.theviennablog.com'
html_files   = glob.glob(f'/Volumes/2024 ML/data/0_raw_blog_scrape/{folder}/**/index.html')
article_extractor = theviennablogArticleExtractor(html_files)
article_extractor.save(f'/Volumes/2024 ML/data/1_extracted_blogs/{folder}.json')

  0%|          | 0/763 [00:00<?, ?it/s]

'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
'utf-8' codec can't decode byte 0xf8 in position 4: invalid start byte
'utf-8' codec can't decode byte 0xc0 in position 4: invalid start byte
'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
'utf-8' codec can't decode byte 0xd8 in position 4: invalid continuation byte
'utf-8' codec can't decode byte 0xd2 in position 5: invalid continuation byte
'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
'utf-8' codec can't decode byte 0xff in position 0: invalid sta

### wanderingearl.com

In [53]:
class wanderingearlArticleExtractor(ArticleExtractor):

    def _get_blog_articles(self, path):
        soup = BeautifulSoup(self.html, 'html.parser')

        def query(tag):
            if not tag.name == 'article':
                return False
            if not 'class' in tag.attrs:
                return False
            if not 'type-post' in tag['class']:
                return False
            if 'e-loop-item' in tag['class']:
                return False
            return True
        
        articles = soup.find_all(query)

        if len(articles) == 1:
            self.blog_articles.append({'path':path, 'article':str(articles[0])})


folder       = 'www.wanderingearl.com'
html_files   = glob.glob(f'/Volumes/2024 ML/data/0_raw_blog_scrape/{folder}/**/index.html')
article_extractor = wanderingearlArticleExtractor(html_files)
article_extractor.save(f'/Volumes/2024 ML/data/1_extracted_blogs/{folder}.json')

  0%|          | 0/531 [00:00<?, ?it/s]

511 articles found
saved at: /Volumes/2024 ML/data/1_extracted_blogs/www.wanderingearl.com.json


### wanderingwelshgirl.com

In [55]:
class wanderingwelshgirlArticleExtractor(ArticleExtractor):

    def _get_blog_articles(self, path):
        soup = BeautifulSoup(self.html, 'html.parser')

        def query(tag):
            if not tag.name == 'article':
                return False
            if not 'class' in tag.attrs:
                return False
            if not 'type-post' in tag['class']:
                return False
            if 'e-loop-item' in tag['class']:
                return False
            return True
        
        articles = soup.find_all(query)

        if len(articles) == 1:
            self.blog_articles.append({'path':path, 'article':str(articles[0])})


folder       = 'www.wanderingwelshgirl.com'
html_files   = glob.glob(f'/Volumes/2024 ML/data/0_raw_blog_scrape/{folder}/**/index.html')
article_extractor = wanderingwelshgirlArticleExtractor(html_files)
article_extractor.save(f'/Volumes/2024 ML/data/1_extracted_blogs/{folder}.json')

  0%|          | 0/187 [00:00<?, ?it/s]

176 articles found
saved at: /Volumes/2024 ML/data/1_extracted_blogs/www.wanderingwelshgirl.com.json


### weseektravel.com

In [56]:
class weseektravelArticleExtractor(ArticleExtractor):

    def _get_blog_articles(self, path):
        soup = BeautifulSoup(self.html, 'html.parser')

        def query(tag):
            if not tag.name == 'article':
                return False
            if not 'class' in tag.attrs:
                return False
            if not 'type-post' in tag['class']:
                return False
            if 'e-loop-item' in tag['class']:
                return False
            return True
        
        articles = soup.find_all(query)

        if len(articles) == 1:
            self.blog_articles.append({'path':path, 'article':str(articles[0])})


folder       = 'www.weseektravel.com'
html_files   = glob.glob(f'/Volumes/2024 ML/data/0_raw_blog_scrape/{folder}/**/index.html')
article_extractor = weseektravelArticleExtractor(html_files)
# article_extractor.save(f'/Volumes/2024 ML/data/1_extracted_blogs/{folder}.json')

  0%|          | 0/741 [00:00<?, ?it/s]

725 articles found
