In [1]:
import re
import json
import requests

from bs4 import BeautifulSoup

In [55]:
def download(url, timeout=30):
    """Fetch ``url`` with a browser-like User-Agent header.

    Args:
        url: Address to request with HTTP GET.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, which would stall the
            whole notebook on an unresponsive host.

    Returns:
        The ``requests`` response object, unmodified. The status code is
        not checked here.
    """
    response = requests.get(
        url,
        headers={'User-Agent': 'Mozilla/5.0'},
        timeout=timeout
    )

    return response

def parse(line):
    """Turn a ``"label - definition"`` line into a concept dict.

    The line is split on ``' - '``. The first piece is the label; all
    remaining pieces are re-joined with single spaces to form the
    definition. Returns ``None`` when the stripped label is empty.
    Otherwise returns a dict with the label's first character upper-cased
    and the definition lower-cased.
    """
    head, *tail = line.split(r' - ')

    label = head.strip()
    if not label:
        return None

    definition = ' '.join(tail).strip().lower()

    return dict(
        label=label[:1].upper() + label[1:],
        definition=definition
    )

def transform_unicode(text):
    """Replace selected non-ASCII characters and tidy spacing.

    Applies the substitutions below in order: typographic dashes and
    quotes, a few accented letters, the non-breaking space and the BOM are
    mapped to ASCII stand-ins. Then the literal marker ``(en)`` is removed
    and runs of spaces are collapsed to one.
    """
    substitutions = (
        ('—', '-'),
        ('\xa0', ' '),
        ('\u201c', '"'),
        ('\u201d', '"'),
        # NOTE(review): U+2019 becomes a backtick, not an apostrophe -
        # confirm this is intended.
        ('\u2019', "`"),
        ('\u2013', '-'),
        ('\u00e9', 'e'),
        ('\u00f4', 'o'),
        ('\u00fc', 'u'),
        ('\ufeff', ' '),
        (r'\(en\)', ''),
        # Must stay last: earlier replacements can create adjacent spaces.
        (r'[ ]+', ' '),
    )

    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    return text

### vinology

In [3]:
# Fetch the vinology glossary page and parse it. The parser is named
# explicitly so the resulting tree does not depend on which optional
# parsers happen to be installed (and bs4 does not warn about guessing).
bs = BeautifulSoup(
    download('https://www.vinology.com/wine-terms/').text,
    'html.parser'
)

In [4]:
def clean(text):
    """Normalise the vinology page text and drop its introductory blurb.

    Runs ``transform_unicode`` on ``text``, removes the site's two-sentence
    introduction (the sentences may be separated by any whitespace), and
    collapses runs of spaces left behind.
    """
    text = transform_unicode(text)

    # Raw strings: '\s' inside a normal string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions. The
    # resulting pattern is unchanged.
    intro_pattern = (
        r'We created this glossary of wine terminology for all wine lovers, especially our wine students.'
        r'\s+'
        r'In this wine dictionary, you can find the most important wine terms and their definitions. Enjoy!'
    )
    text = re.sub(intro_pattern, '', text)
    text = re.sub(r'[ ]+', ' ', text)

    return text

# Locate the article body and collect the text of every paragraph in it,
# one paragraph per line.
div = bs.find('div', {'class': 'entry-content'})
ps = div.find_all('p')
text = '\n'.join(el.getText() for el in ps)

# Parse each non-empty cleaned line; keep only lines that yield a concept.
parsed = (
    parse(line)
    for line in clean(text).split('\n')
    if len(line) != 0
)
concepts = [concept for concept in parsed if concept is not None]

In [5]:
# Persist the vinology concepts as indented JSON.
with open('./terminology/vinology.json', 'w') as vino_definitions:
    payload = json.dumps(concepts, indent=4)
    vino_definitions.write(payload)

### winemag

In [6]:
# Fetch the winemag glossary page and parse it. The parser is named
# explicitly so the resulting tree does not depend on which optional
# parsers happen to be installed (and bs4 does not warn about guessing).
bs = BeautifulSoup(
    download('https://www.winemag.com/glossary/').text,
    'html.parser'
)

In [7]:
# Every <dl class="terms"> list holds <dt>/<dd> elements.
dls = bs.find_all('dl', {'class': 'terms'})

# Applied in order to the serialised markup: drop the opening <dt ...> tag,
# fuse a closing </dt> and the following <dd> into ' - ', drop </dd>.
tag_rewrites = (
    (r'<dt[^>]+>', ''),
    (r'</dt>\s+<dd>', ' - '),
    (r'</dd>', ''),
)

concepts = []
for dl in dls:
    text = '\n'.join(str(node) for node in dl.find_all(['dt', 'dd']))

    for pattern, replacement in tag_rewrites:
        text = re.sub(pattern, replacement, text)
    text = transform_unicode(text)

    # Keep only the lines that parse into a concept.
    concepts.extend(
        concept
        for concept in map(parse, text.split('\n'))
        if concept is not None
    )

In [8]:
# Persist the winemag concepts as indented JSON.
with open('./terminology/winemag.json', 'w') as winemag_definitions:
    payload = json.dumps(concepts, indent=4)
    winemag_definitions.write(payload)

### usualwines

In [9]:
# Fetch the usualwines glossary page and parse it. The parser is named
# explicitly so the resulting tree does not depend on which optional
# parsers happen to be installed (and bs4 does not warn about guessing).
bs = BeautifulSoup(
    download('https://usualwines.com/blogs/knowledge-base/wine-terms').text,
    'html.parser'
)

In [56]:
article_wrapper = bs.find('div', { 'class': 'article-wrapper' })

# Serialise every paragraph back to markup, one per line, so the regexes
# below can work on the raw tags.
ps = article_wrapper.find_all('p')
text = '\n'.join(
    [str(el) for el in ps]
)

# Replacement templates are raw strings: '\g' inside a normal string literal
# is an invalid escape sequence and triggers a warning on recent Python
# versions. The runtime values are unchanged.

# A paragraph that is only bold text becomes the marker 'DEF: <term> - '.
# (No flags needed: the pattern has no ^ or $ anchors.)
text = re.sub(r'<p><b>(.+?)</b></p>', r'DEF: \g<1> - ', text)
# Unwrap inline tags, padding with spaces so words do not run together.
text = re.sub(r'<span[^>]+?>(.+?)</span>', r' \g<1> ', text)
text = re.sub(r'<a[^>]+?>(.+?)</a>', r' \g<1> ', text)
text = re.sub(r'<img[^>]+?/>', ' ', text)
# Consuming the whitespace (including the newline) before a remaining <p>
# pulls that paragraph onto the end of the preceding line.
text = re.sub(r'\s+<p>(.+?)</p>', r' \g<1> ', text)
text = re.sub(r'<strong>(.+?)</strong>', r' \g<1> ', text)

text = transform_unicode(text)

# Only lines that start with the marker are treated as definitions.
definitions = re.findall(
    r'^DEF:[^\n]+', text,
    flags=re.MULTILINE
)

concepts = []
for line in definitions:
    # Remove the marker before splitting into label and definition.
    line = re.sub(r'^DEF:', ' ', line)

    concept = parse(line)
    if concept is not None:
        concepts.append(concept)

In [57]:
# Persist the usualwines concepts as indented JSON. The handle is named for
# the file it writes (it was previously called `winemag_definitions`).
with open('./terminology/usualwines.json', 'w') as usualwines_definitions:
    usualwines_definitions.write(
        json.dumps(concepts, indent=4)
    )