In [465]:
import json
from bs4 import BeautifulSoup
from titlecase import titlecase
import random
import glob
import re
from unidecode import unidecode
import pprint
from tqdm.notebook import tqdm

In [61]:
paths = glob.glob('/Users/codyfalkosky/Desktop/1_extracted_blogs/*.json')

In [1302]:
class ArticleParser:
    def __init__(self, example):
        self._removed_intro = False
        self.stop_headings = ['Book Your Trip: Logistical Tips and Tricks', 'Leave A Comment Cancel reply', "Hi, I'm Matthew Karsten"]
        self.break_headings = ['Like this Post? Pin it!', 'Related Posts', 'Like This Article? Pin it!', 'Related posts']
        self.stop_heading_chars = ['\t']
        self.data    = []
        self.example = example
        self.soup    = BeautifulSoup(example['article'], 'html.parser')
        self._get_blog_title()
        self._extract_and_clean()

    def _get_blog_title(self):
        # find title
        title = re.findall(r"/([^/]+)/index.html", self.example['path']) # case: /path/title/index.html
        if not title:
            title = re.findall(r"/([^/]+)$", self.example['path']) # case: /path/title

        # clean
        title = title[0]
        title = title.replace('-', ' ')
        title = title.replace('_', ' ')
        title = titlecase(title)

        # save
        self.title = title

    def _assess_heading(self, name, text):
        # an after blog tag (the blog is over)
        if name == 'aside':
            return 'break'

        if any([heading in text for heading in self.break_headings]):
            return 'break'

        # check for blank heading
        if not text:
            return 'continue'

        # check for bad headings (recurring advertisements, comment section)
        if text in self.stop_headings:
            return 'continue'

        # mostly strange comment headings (e.g. "\t\t\t\t\tcomments\t\t\t\t\t")
        if any([char in text for char in self.stop_heading_chars]):
            return 'continue'

        # headings that arent headings, they are paragraphs.
        if len(text) > 100:
            return 'continue'

        return 'good'
        
        

    def _extract_and_clean(self):
        'extracts all headings and heading content, cleans, and appends to self.data'

        cleaned_headings = []

        # find all headings 
        headings = self.soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'aside']) + ['END']

        
        for current_heading, next_heading in zip(headings, headings[1:]):

            # if the heading is good, use it
            match self._assess_heading(current_heading.name, current_heading.text):
                case 'break':
                    break
                case 'continue':
                    continue
                case 'good':
                    pass


            # reformat current heading for dataset and save (removes id, class, etc. attrs from tag)
            heading = f'''<{current_heading.name}>{unidecode(current_heading.text).strip()}</{current_heading.name}>'''
            cleaned_headings.append(heading)

            # find the area that is the content for this heading
            start = self.example['article'].find(str(current_heading)) + len(str(current_heading))

            if next_heading == 'END':
                end = None
            else:
                end = self.example['article'].find(str(next_heading))

            # extract content
            html = self.example['article'][start:end]
            soup = BeautifulSoup(html, 'html.parser')

            p_tags = soup.find_all('p')

            content = ''

            for p_tag in p_tags:
                if not p_tag.find_parent('p'):
                    content += p_tag.text.strip() + ' '
            
            # if really long skip`
            if len(content) > 5000:
                continue
        
            # clean content
            content = unidecode(content)
            content = content.replace(' \n', ' ')
            content = content.replace('\n', ' ')
            content = re.sub(r'(?<!\s)([A-Z])', r' \1', content) # add a space before a capitol word.
            if not self._removed_intro:
                len_pre = len(content)
                # remove starting line
                # content = re.sub(r'Written by .+?, \d\d\d\d ', '', content) 
                # content = re.sub(r'Adventurous Kate contains affiliate links. If you make a purchase through these links, I will earn a commission at no extra cost to you. Thanks!\s+', '', content)
                content = re.sub(r'^.+?\d{1,2}(st|nd|rd|th)?, \d{4} ', '', content)
                content = re.sub(r'^.+?\d{1,2}(st|nd|rd|th)?, \d{4} ', '', content)
                content = re.sub(r'^\s*By.+?\.', '', content)
                len_post = len(content)
                if len_pre != len_post:
                    self._removed_intro = True

            content = re.sub(r'\s*([A-Z]\s+){5,}.*([A-Z]\s+){5,}.*', '', content)
            content = re.sub(r'\s*Table of Contents\s*$', '', content)
            content = re.sub(r'\[.+=".+\]', '', content)
            content = content.strip()

            # if really short skip
            if len(content) < 100:
                continue
            
            # check for blank content, then append to article data
            if content:
                self.data.append({'heading':heading, 'content':content})        

        # append title and headings to data
        self.data.append({'title':self.title, 'headings':cleaned_headings})


## Parse Path 0

In [461]:
with open(paths[0], 'r') as file:
    data = json.load(file)

In [478]:
cleaned_data = []
indexes = range(len(data))

for idx in tqdm(indexes):
    parser = ArticleParser(data[idx])
    if len(parser.data) > 4:
        cleaned_data.append(parser.data)

  0%|          | 0/511 [00:00<?, ?it/s]

In [479]:
len(cleaned_data)

208

In [483]:
new_path = paths[0].replace('1_extracted_blogs', '2_cleaned_data')
with open(new_path, 'w') as file:
    json.dump(cleaned_data, file)

In [482]:
new_path

'/Users/codyfalkosky/Desktop/2_cleaned_data/www.wanderingearl.com.json'

## Parse Path 1

In [526]:
with open(paths[1], 'r') as file:
    data = json.load(file)

In [564]:
cleaned_data = []
indexes = range(len(data))

for idx in tqdm(indexes):
    parser = ArticleParser(data[idx])
    if len(parser.data) > 4:
        cleaned_data.append(parser.data)

  0%|          | 0/303 [00:00<?, ?it/s]

In [565]:
len(cleaned_data)

131

In [566]:
new_path = paths[1].replace('1_extracted_blogs', '2_cleaned_data')
with open(new_path, 'w') as file:
    json.dump(cleaned_data, file)

## Parse Path 2

In [567]:
P = 2

In [568]:
with open(paths[P], 'r') as file:
    data = json.load(file)

In [597]:
# random check

# idx = random.randint(0, len(data))
# parser = ArticleParser(data[idx])

# parser.data

In [598]:
cleaned_data = []
indexes = range(len(data))

for idx in tqdm(indexes):
    parser = ArticleParser(data[idx])
    if len(parser.data) > 4:
        cleaned_data.append(parser.data)

  0%|          | 0/939 [00:00<?, ?it/s]

In [599]:
len(cleaned_data)

628

In [600]:
new_path = paths[P].replace('1_extracted_blogs', '2_cleaned_data')
with open(new_path, 'w') as file:
    json.dump(cleaned_data, file)

## Parse Path 3

In [607]:
P = 3
paths[P]

'/Users/codyfalkosky/Desktop/1_extracted_blogs/www.wanderingwelshgirl.com.json'

In [602]:
with open(paths[P], 'r') as file:
    data = json.load(file)

In [611]:
# random check

# idx = random.randint(0, len(data))
# parser = ArticleParser(data[idx])

# parser.data

In [612]:
cleaned_data = []
indexes = range(len(data))

for idx in tqdm(indexes):
    parser = ArticleParser(data[idx])
    if len(parser.data) > 4:
        cleaned_data.append(parser.data)

  0%|          | 0/176 [00:00<?, ?it/s]

In [613]:
len(cleaned_data)

170

In [614]:
new_path = paths[P].replace('1_extracted_blogs', '2_cleaned_data')
print(new_path)
with open(new_path, 'w') as file:
    json.dump(cleaned_data, file)

/Users/codyfalkosky/Desktop/2_cleaned_data/www.wanderingwelshgirl.com.json


## Parse Path 4

In [615]:
P = 4
paths[P]

'/Users/codyfalkosky/Desktop/1_extracted_blogs/www.alongdustyroads.com.json'

In [616]:
with open(paths[P], 'r') as file:
    data = json.load(file)

In [635]:
# random check

# idx = random.randint(0, len(data))
# parser = ArticleParser(data[idx])

# parser.data

In [636]:
cleaned_data = []
indexes = range(len(data))

for idx in tqdm(indexes):
    parser = ArticleParser(data[idx])
    if len(parser.data) > 4:
        cleaned_data.append(parser.data)

  0%|          | 0/270 [00:00<?, ?it/s]

In [637]:
len(cleaned_data)

256

In [638]:
new_path = paths[P].replace('1_extracted_blogs', '2_cleaned_data')
print(new_path)
with open(new_path, 'w') as file:
    json.dump(cleaned_data, file)

/Users/codyfalkosky/Desktop/2_cleaned_data/www.alongdustyroads.com.json


## Parse Path 5

In [640]:
P = 5
paths[P]

'/Users/codyfalkosky/Desktop/1_extracted_blogs/www.theviennablog.com.json'

In [680]:
with open(paths[P], 'r') as file:
    data = json.load(file)

In [701]:
# # random check

# idx = random.randint(0, len(data))
# parser = ArticleParser(data[idx])

# parser.data

In [702]:
cleaned_data = []
indexes = range(len(data))

for idx in tqdm(indexes):
    parser = ArticleParser(data[idx])
    if len(parser.data) > 4:
        cleaned_data.append(parser.data)

  0%|          | 0/732 [00:00<?, ?it/s]

In [703]:
len(cleaned_data)

623

In [704]:
new_path = paths[P].replace('1_extracted_blogs', '2_cleaned_data')
print(new_path)
with open(new_path, 'w') as file:
    json.dump(cleaned_data, file)

/Users/codyfalkosky/Desktop/2_cleaned_data/www.theviennablog.com.json


## Parse Path 6

In [705]:
P = 6
paths[P]

'/Users/codyfalkosky/Desktop/1_extracted_blogs/www.adventuresnsunsets.com.json'

In [706]:
with open(paths[P], 'r') as file:
    data = json.load(file)

In [720]:
# random check

# idx = random.randint(0, len(data))
# parser = ArticleParser(data[idx])

# parser.data

In [721]:
cleaned_data = []
indexes = range(len(data))

for idx in tqdm(indexes):
    parser = ArticleParser(data[idx])
    if len(parser.data) > 4:
        cleaned_data.append(parser.data)

  0%|          | 0/507 [00:00<?, ?it/s]

In [722]:
len(cleaned_data)

469

In [723]:
new_path = paths[P].replace('1_extracted_blogs', '2_cleaned_data')
print(new_path)
with open(new_path, 'w') as file:
    json.dump(cleaned_data, file)

/Users/codyfalkosky/Desktop/2_cleaned_data/www.adventuresnsunsets.com.json


## Parse Path 7

In [724]:
P = 7
paths[P]

'/Users/codyfalkosky/Desktop/1_extracted_blogs/ordinarytraveler.com.json'

In [725]:
with open(paths[P], 'r') as file:
    data = json.load(file)

In [746]:
# random check

# idx = random.randint(0, len(data))
# parser = ArticleParser(data[idx])

# parser.data

In [747]:
cleaned_data = []
indexes = range(len(data))

for idx in tqdm(indexes):
    parser = ArticleParser(data[idx])
    if len(parser.data) > 4:
        cleaned_data.append(parser.data)

  0%|          | 0/532 [00:00<?, ?it/s]

In [748]:
len(cleaned_data)

461

In [749]:
new_path = paths[P].replace('1_extracted_blogs', '2_cleaned_data')
print(new_path)
with open(new_path, 'w') as file:
    json.dump(cleaned_data, file)

/Users/codyfalkosky/Desktop/2_cleaned_data/ordinarytraveler.com.json


## Parse Path 8

In [750]:
P = 8
paths[P]

'/Users/codyfalkosky/Desktop/1_extracted_blogs/www.thebrokebackpacker.com.json'

In [751]:
with open(paths[P], 'r') as file:
    data = json.load(file)

In [759]:
# random check

# idx = random.randint(0, len(data))
# parser = ArticleParser(data[idx])

# parser.data

In [760]:
cleaned_data = []
indexes = range(len(data))

for idx in tqdm(indexes):
    parser = ArticleParser(data[idx])
    if len(parser.data) > 4:
        cleaned_data.append(parser.data)

  0%|          | 0/4219 [00:00<?, ?it/s]

In [761]:
len(cleaned_data)

4196

In [762]:
new_path = paths[P].replace('1_extracted_blogs', '2_cleaned_data')
print(new_path)
with open(new_path, 'w') as file:
    json.dump(cleaned_data, file)

/Users/codyfalkosky/Desktop/2_cleaned_data/www.thebrokebackpacker.com.json


## Parse Path 9 - REVISIT

In [763]:
P = 9
paths[P]

'/Users/codyfalkosky/Desktop/1_extracted_blogs/www.alexinwanderland.com.json'

In [764]:
with open(paths[P], 'r') as file:
    data = json.load(file)

In [771]:
# random check

idx = random.randint(0, len(data))
parser = ArticleParser(data[idx])

parser.data

[{'heading': '<h1>Photo of the Week 230: Koh Tao</h1>',
  'content': "Greetings from Koh Tao... and twenty-six! I do want to thank you for a lovely outpouring of birthday wishes and responses to my big birthday post. Unfortunately, the year has gotten off to a bit of a rough start. For the second time in as many weeks, I opened my laptop to a black screen. While the first issue lost me a day of work and a heavy chunk of change, it was fixable and I brought my laptop back home the same night. This time, no such luck -- my screen went kaput, a fairly simple under-warranty fix back in New York, and a borderline nightmare here on a remote island in Thailand. I literally priced out some flights back to the U S yesterday, where this problem would be fixed with a few hours at an Apple store. Instead, I'm staying put and my computer is on its way to Bangkok, slated to return in three weeks. Fingers crossed the screen will be repaired and the entire machine won't be replaced, or I'll end up wit

In [760]:
cleaned_data = []
indexes = range(len(data))

for idx in tqdm(indexes):
    parser = ArticleParser(data[idx])
    if len(parser.data) > 4:
        cleaned_data.append(parser.data)

  0%|          | 0/4219 [00:00<?, ?it/s]

In [761]:
len(cleaned_data)

4196

In [762]:
new_path = paths[P].replace('1_extracted_blogs', '2_cleaned_data')
print(new_path)
with open(new_path, 'w') as file:
    json.dump(cleaned_data, file)

/Users/codyfalkosky/Desktop/2_cleaned_data/www.thebrokebackpacker.com.json


## Parse Path 10

In [772]:
P = 10
paths[P]

'/Users/codyfalkosky/Desktop/1_extracted_blogs/adventourbegins.com.json'

In [773]:
with open(paths[P], 'r') as file:
    data = json.load(file)

In [782]:
# random check

# idx = random.randint(0, len(data))
# parser = ArticleParser(data[idx])

# parser.data

In [783]:
cleaned_data = []
indexes = range(len(data))

for idx in tqdm(indexes):
    parser = ArticleParser(data[idx])
    if len(parser.data) > 4:
        cleaned_data.append(parser.data)

  0%|          | 0/221 [00:00<?, ?it/s]

In [784]:
len(cleaned_data)

218

In [785]:
new_path = paths[P].replace('1_extracted_blogs', '2_cleaned_data')
print(new_path)
with open(new_path, 'w') as file:
    json.dump(cleaned_data, file)

/Users/codyfalkosky/Desktop/2_cleaned_data/adventourbegins.com.json


## Parse Path 11

In [786]:
P = 11
paths[P]

'/Users/codyfalkosky/Desktop/1_extracted_blogs/plateandcompass.com.json'

In [787]:
with open(paths[P], 'r') as file:
    data = json.load(file)

In [791]:
# random check

# idx = random.randint(0, len(data))
# parser = ArticleParser(data[idx])

# parser.data

In [792]:
cleaned_data = []
indexes = range(len(data))

for idx in tqdm(indexes):
    parser = ArticleParser(data[idx])
    if len(parser.data) > 4:
        cleaned_data.append(parser.data)

  0%|          | 0/47 [00:00<?, ?it/s]

In [793]:
len(cleaned_data)

47

In [794]:
new_path = paths[P].replace('1_extracted_blogs', '2_cleaned_data')
print(new_path)
with open(new_path, 'w') as file:
    json.dump(cleaned_data, file)

/Users/codyfalkosky/Desktop/2_cleaned_data/plateandcompass.com.json


## Parse Path 12

In [795]:
P = 12
paths[P]

'/Users/codyfalkosky/Desktop/1_extracted_blogs/www.nomadasaurus.com.json'

In [796]:
with open(paths[P], 'r') as file:
    data = json.load(file)

In [814]:
# random check

# idx = random.randint(0, len(data))
# parser = ArticleParser(data[idx])

# parser.data

In [815]:
cleaned_data = []
indexes = range(len(data))

for idx in tqdm(indexes):
    parser = ArticleParser(data[idx])
    if len(parser.data) > 4:
        cleaned_data.append(parser.data)

  0%|          | 0/161 [00:00<?, ?it/s]

In [816]:
len(cleaned_data)

97

In [817]:
new_path = paths[P].replace('1_extracted_blogs', '2_cleaned_data')
print(new_path)
with open(new_path, 'w') as file:
    json.dump(cleaned_data, file)

/Users/codyfalkosky/Desktop/2_cleaned_data/www.nomadasaurus.com.json


## Parse Path 13

In [818]:
P = 13
paths[P]

'/Users/codyfalkosky/Desktop/1_extracted_blogs/practicalwanderlust.com.json'

In [819]:
with open(paths[P], 'r') as file:
    data = json.load(file)

In [828]:
# random check

# idx = random.randint(0, len(data))
# parser = ArticleParser(data[idx])

# parser.data

In [829]:
cleaned_data = []
indexes = range(len(data))

for idx in tqdm(indexes):
    parser = ArticleParser(data[idx])
    if len(parser.data) > 4:
        cleaned_data.append(parser.data)

  0%|          | 0/303 [00:00<?, ?it/s]

In [830]:
len(cleaned_data)

284

In [831]:
new_path = paths[P].replace('1_extracted_blogs', '2_cleaned_data')
print(new_path)
with open(new_path, 'w') as file:
    json.dump(cleaned_data, file)

/Users/codyfalkosky/Desktop/2_cleaned_data/practicalwanderlust.com.json


## Parse Path 14

In [832]:
P = 14
paths[P]

'/Users/codyfalkosky/Desktop/1_extracted_blogs/theplanetd.com.json'

In [833]:
with open(paths[P], 'r') as file:
    data = json.load(file)

In [839]:
# random check

# idx = random.randint(0, len(data))
# parser = ArticleParser(data[idx])

# parser.data

In [840]:
cleaned_data = []
indexes = range(len(data))

for idx in tqdm(indexes):
    parser = ArticleParser(data[idx])
    if len(parser.data) > 4:
        cleaned_data.append(parser.data)

  0%|          | 0/1614 [00:00<?, ?it/s]

In [841]:
len(cleaned_data)

1605

In [842]:
new_path = paths[P].replace('1_extracted_blogs', '2_cleaned_data')
print(new_path)
with open(new_path, 'w') as file:
    json.dump(cleaned_data, file)

/Users/codyfalkosky/Desktop/2_cleaned_data/theplanetd.com.json


## Parse Path 15

In [843]:
P = 15
paths[P]

'/Users/codyfalkosky/Desktop/1_extracted_blogs/expertvagabond.com.json'

In [844]:
with open(paths[P], 'r') as file:
    data = json.load(file)

In [896]:
# random check

# idx = random.randint(0, len(data))
# parser = ArticleParser(data[idx])

# parser.data

In [897]:
cleaned_data = []
indexes = range(len(data))

for idx in tqdm(indexes):
    parser = ArticleParser(data[idx])
    if len(parser.data) > 4:
        cleaned_data.append(parser.data)

  0%|          | 0/314 [00:00<?, ?it/s]

In [898]:
len(cleaned_data)

309

In [899]:
new_path = paths[P].replace('1_extracted_blogs', '2_cleaned_data')
print(new_path)
with open(new_path, 'w') as file:
    json.dump(cleaned_data, file)

/Users/codyfalkosky/Desktop/2_cleaned_data/expertvagabond.com.json


## Parse Path 16 - skipped, did not conform to scrape

In [900]:
P = 16
paths[P]

'/Users/codyfalkosky/Desktop/1_extracted_blogs/www.danflyingsolo.com.json'

In [901]:
with open(paths[P], 'r') as file:
    data = json.load(file)

In [921]:
# random check

idx = random.randint(0, len(data))
parser = ArticleParser(data[idx])

parser.data

[{'heading': '<h1>Summer Highs in Innsbruck, Where the Alps Are for All</h1>',
  'content': 'Home >> Destinations >> Europe >> Austria >> Summer Highs in Innsbruck, Where the Alps Are for All This website uses affiliate links which may earn a commission at no additional cost to you. As an Amazon Associate I earn from qualifying purchases. Updated: 12th March 2024'},
 {'title': 'Summer in Innsbruck Austria',
  'headings': ['<h1>Summer Highs in Innsbruck, Where the Alps Are for All</h1>']}]

In [897]:
cleaned_data = []
indexes = range(len(data))

for idx in tqdm(indexes):
    parser = ArticleParser(data[idx])
    if len(parser.data) > 4:
        cleaned_data.append(parser.data)

  0%|          | 0/314 [00:00<?, ?it/s]

In [898]:
len(cleaned_data)

309

In [899]:
new_path = paths[P].replace('1_extracted_blogs', '2_cleaned_data')
print(new_path)
with open(new_path, 'w') as file:
    json.dump(cleaned_data, file)

/Users/codyfalkosky/Desktop/2_cleaned_data/expertvagabond.com.json


## Parse Path 17

In [922]:
P = 17
paths[P]

'/Users/codyfalkosky/Desktop/1_extracted_blogs/likewhereyouregoing.com.json'

In [923]:
with open(paths[P], 'r') as file:
    data = json.load(file)

In [948]:
# random check

# idx = random.randint(0, len(data))
# parser = ArticleParser(data[idx])

# parser.data

In [949]:
cleaned_data = []
indexes = range(len(data))

for idx in tqdm(indexes):
    parser = ArticleParser(data[idx])
    if len(parser.data) > 4:
        cleaned_data.append(parser.data)

  0%|          | 0/202 [00:00<?, ?it/s]

In [950]:
len(cleaned_data)

198

In [951]:
new_path = paths[P].replace('1_extracted_blogs', '2_cleaned_data')
print(new_path)
with open(new_path, 'w') as file:
    json.dump(cleaned_data, file)

/Users/codyfalkosky/Desktop/2_cleaned_data/likewhereyouregoing.com.json


## Parse Path 18

In [952]:
P = 18
paths[P]

'/Users/codyfalkosky/Desktop/1_extracted_blogs/www.saltinourhair.com.json'

In [953]:
with open(paths[P], 'r') as file:
    data = json.load(file)

In [971]:
# random check

# idx = random.randint(0, len(data))
# parser = ArticleParser(data[idx])

# parser.data

In [972]:
cleaned_data = []
indexes = range(len(data))

for idx in tqdm(indexes):
    parser = ArticleParser(data[idx])
    if len(parser.data) > 4:
        cleaned_data.append(parser.data)

  0%|          | 0/482 [00:00<?, ?it/s]

In [973]:
len(cleaned_data)

469

In [974]:
new_path = paths[P].replace('1_extracted_blogs', '2_cleaned_data')
print(new_path)
with open(new_path, 'w') as file:
    json.dump(cleaned_data, file)

/Users/codyfalkosky/Desktop/2_cleaned_data/www.saltinourhair.com.json


## Parse Path 19

In [975]:
P = 19
paths[P]

'/Users/codyfalkosky/Desktop/1_extracted_blogs/www.goatsontheroad.com.json'

In [976]:
with open(paths[P], 'r') as file:
    data = json.load(file)

In [998]:
# random check

# idx = random.randint(0, len(data))
# parser = ArticleParser(data[idx])

# parser.data

In [1025]:
cleaned_data = []
indexes = range(len(data))

for idx in tqdm(indexes):
    parser = ArticleParser(data[idx])
    if len(parser.data) > 4:
        cleaned_data.append(parser.data)

  0%|          | 0/1818 [00:00<?, ?it/s]

  soup = BeautifulSoup(html, 'html.parser')


In [1026]:
len(cleaned_data)

1464

In [1027]:
new_path = paths[P].replace('1_extracted_blogs', '2_cleaned_data')
print(new_path)
with open(new_path, 'w') as file:
    json.dump(cleaned_data, file)

/Users/codyfalkosky/Desktop/2_cleaned_data/www.goatsontheroad.com.json


## Parse Path 20

In [1028]:
P = 20
paths[P]

'/Users/codyfalkosky/Desktop/1_extracted_blogs/tavernatravels.com.json'

In [1029]:
with open(paths[P], 'r') as file:
    data = json.load(file)

In [1039]:
# random check

# idx = random.randint(0, len(data))
# parser = ArticleParser(data[idx])

# parser.data

In [1040]:
cleaned_data = []
indexes = range(len(data))

for idx in tqdm(indexes):
    parser = ArticleParser(data[idx])
    if len(parser.data) > 4:
        cleaned_data.append(parser.data)

  0%|          | 0/262 [00:00<?, ?it/s]

In [1041]:
len(cleaned_data)

257

In [1042]:
new_path = paths[P].replace('1_extracted_blogs', '2_cleaned_data')
print(new_path)
with open(new_path, 'w') as file:
    json.dump(cleaned_data, file)

/Users/codyfalkosky/Desktop/2_cleaned_data/tavernatravels.com.json


## Parse Path 21

In [1044]:
P = 21
paths[P]

'/Users/codyfalkosky/Desktop/1_extracted_blogs/virginiatraveltips.com.json'

In [1045]:
with open(paths[P], 'r') as file:
    data = json.load(file)

In [1122]:
# random check

# idx = random.randint(0, len(data))
# parser = ArticleParser(data[idx])

# parser.data

In [1123]:
cleaned_data = []
indexes = range(len(data))

for idx in tqdm(indexes):
    parser = ArticleParser(data[idx])
    if len(parser.data) > 4:
        cleaned_data.append(parser.data)

  0%|          | 0/200 [00:00<?, ?it/s]

In [1124]:
len(cleaned_data)

199

In [1125]:
new_path = paths[P].replace('1_extracted_blogs', '2_cleaned_data')
print(new_path)
with open(new_path, 'w') as file:
    json.dump(cleaned_data, file)

/Users/codyfalkosky/Desktop/2_cleaned_data/virginiatraveltips.com.json


## Parse Path 22

In [1126]:
P = 22
paths[P]

'/Users/codyfalkosky/Desktop/1_extracted_blogs/www.globeguide.ca.json'

In [1127]:
with open(paths[P], 'r') as file:
    data = json.load(file)

In [1192]:
# random check

# idx = random.randint(0, len(data))
# parser = ArticleParser(data[idx])

# parser.data

In [1193]:
cleaned_data = []
indexes = range(len(data))

for idx in tqdm(indexes):
    parser = ArticleParser(data[idx])
    if len(parser.data) > 4:
        cleaned_data.append(parser.data)

  0%|          | 0/643 [00:00<?, ?it/s]

In [1194]:
len(cleaned_data)

539

In [1195]:
new_path = paths[P].replace('1_extracted_blogs', '2_cleaned_data')
print(new_path)
with open(new_path, 'w') as file:
    json.dump(cleaned_data, file)

/Users/codyfalkosky/Desktop/2_cleaned_data/www.globeguide.ca.json


## Parse Path 23

In [1196]:
P = 23
paths[P]

'/Users/codyfalkosky/Desktop/1_extracted_blogs/www.theblondeabroad.com.json'

In [1197]:
with open(paths[P], 'r') as file:
    data = json.load(file)

In [1217]:
# random check

# idx = random.randint(0, len(data))
# parser = ArticleParser(data[idx])

# parser.data

In [1218]:
cleaned_data = []
indexes = range(len(data))

for idx in tqdm(indexes):
    parser = ArticleParser(data[idx])
    if len(parser.data) > 4:
        cleaned_data.append(parser.data)

  0%|          | 0/1401 [00:00<?, ?it/s]

In [1219]:
len(cleaned_data)

1243

In [1220]:
new_path = paths[P].replace('1_extracted_blogs', '2_cleaned_data')
print(new_path)
with open(new_path, 'w') as file:
    json.dump(cleaned_data, file)

/Users/codyfalkosky/Desktop/2_cleaned_data/www.theblondeabroad.com.json


## Parse Path 24 - EXCLUDED

In [1221]:
P = 24
paths[P]

'/Users/codyfalkosky/Desktop/1_extracted_blogs/jessieonajourney.com.json'

In [1222]:
with open(paths[P], 'r') as file:
    data = json.load(file)

In [1363]:
# random check

# idx = random.randint(0, len(data))
# parser = ArticleParser(data[idx])

# parser.data

In [1364]:
cleaned_data = []
indexes = range(len(data))

for idx in tqdm(indexes):
    parser = ArticleParser(data[idx])
    if len(parser.data) > 4:
        cleaned_data.append(parser.data)

  0%|          | 0/1047 [00:00<?, ?it/s]

In [1365]:
len(cleaned_data)

498

In [1366]:
new_path = paths[P].replace('1_extracted_blogs', '2_cleaned_data')
print(new_path)
with open(new_path, 'w') as file:
    json.dump(cleaned_data, file)

/Users/codyfalkosky/Desktop/2_cleaned_data/jessieonajourney.com.json


## Parse Path 25

In [1367]:
P = 25
paths[P]

'/Users/codyfalkosky/Desktop/1_extracted_blogs/britonthemove.com.json'

In [1368]:
with open(paths[P], 'r') as file:
    data = json.load(file)

In [1420]:
# random check

# idx = random.randint(0, len(data))
# parser = ArticleParser(data[idx])

# parser.data

In [1421]:
cleaned_data = []
indexes = range(len(data))

for idx in tqdm(indexes):
    parser = ArticleParser(data[idx])
    if len(parser.data) > 4:
        cleaned_data.append(parser.data)

  0%|          | 0/475 [00:00<?, ?it/s]

In [1422]:
len(cleaned_data)

471

In [1423]:
new_path = paths[P].replace('1_extracted_blogs', '2_cleaned_data')
print(new_path)
with open(new_path, 'w') as file:
    json.dump(cleaned_data, file)

/Users/codyfalkosky/Desktop/2_cleaned_data/britonthemove.com.json


## Parse Path 26

In [1424]:
P = 26
paths[P]

'/Users/codyfalkosky/Desktop/1_extracted_blogs/www.jonesaroundtheworld.com.json'

In [1425]:
with open(paths[P], 'r') as file:
    data = json.load(file)

In [1551]:
# random check

# idx = random.randint(0, len(data))
# parser = ArticleParser(data[idx])

# parser.data

In [1552]:
cleaned_data = []
indexes = range(len(data))

for idx in tqdm(indexes):
    parser = ArticleParser(data[idx])
    if len(parser.data) > 4:
        cleaned_data.append(parser.data)

  0%|          | 0/1016 [00:00<?, ?it/s]

In [1553]:
len(cleaned_data)

966

In [1554]:
new_path = paths[P].replace('1_extracted_blogs', '2_cleaned_data')
print(new_path)
with open(new_path, 'w') as file:
    json.dump(cleaned_data, file)

/Users/codyfalkosky/Desktop/2_cleaned_data/www.jonesaroundtheworld.com.json
