In [2]:
import re
import json
import requests

from bs4 import BeautifulSoup

In [49]:
def download(url):
    response = requests.get(
        url,
        headers={'User-Agent': 'Mozilla/5.0'}
    )

    return response

def parse(line):
    parts = line.split(r' - ')

    label = parts[0].strip()
    definition = ' '.join(parts[1:]).strip()

    if len(label) == 0:
        return None

    return {
        'label': label[0].upper() + label[1:],
        'definition': definition.lower()
    }

def transform_unicode(text):
    text = re.sub('—', '-', text)
    text = re.sub('\xa0', ' ', text)
    text = re.sub('\u00a0', '', text)
    text = re.sub('\u201c', '"', text)
    text = re.sub('\u201d', '"', text)
    text = re.sub('\u2019', "'", text)
    text = re.sub('\u2018', "'", text)
    text = re.sub('\u2013', '-', text)
    text = re.sub('\u00e9', 'e', text)
    text = re.sub('\u00f4', 'o', text)
    text = re.sub('\u00fc', 'u', text)
    text = re.sub('\ufeff', ' ', text)

    text = re.sub(r'\(en\)', '', text)
    text = re.sub(r'[ ]+', ' ', text)

    text = re.sub(r'([,.])(\")', '\g<2>\g<1>', text)
    text = re.sub(r'\s([,.])(\s|$)', '\g<1>\g<2>', text)

    return text

### vinology

In [3]:
bs = BeautifulSoup(
    download('https://www.vinology.com/wine-terms/').text
)

In [4]:
def clean(text):
    text = transform_unicode(text)
    text = re.sub(
        'We created this glossary of wine terminology for all wine lovers, especially our wine students.\s+In this wine dictionary, you can find the most important wine terms and their definitions. Enjoy!',
        '',
        text
    )
    text = re.sub(r'[ ]+', ' ', text)

    return text

div = bs.find(
    'div',
    { 'class': 'entry-content' }
)

ps = div.find_all('p')

text = '\n'.join(
    [el.getText() for el in ps]
)

concepts = []
for line in clean(text).split('\n'):
    if len(line) == 0:
        continue

    concept = parse(line)
    if concept is not None:
        concepts.append(concept)

In [5]:
with open('./terminology/vinology.json', 'w') as vino_definitions:
    vino_definitions.write(
        json.dumps(concepts, indent=4)
    )

### winemag

In [6]:
bs = BeautifulSoup(
    download('https://www.winemag.com/glossary/').text
)

In [7]:
dls = bs.find_all('dl', { 'class': 'terms' })

concepts = []
for dl in dls:
    text = '\n'.join(
        [ str(l) for l in dl.find_all(['dt', 'dd']) ]
    )

    text = re.sub(r'<dt[^>]+>', '', text)
    text = re.sub(r'</dt>\s+<dd>', ' - ', text)
    text = re.sub(r'</dd>', '', text)
    text = transform_unicode(text)

    for line in text.split('\n'):
        concept = parse(line)
        if concept is not None:
            concepts.append(concept)

In [8]:
with open('./terminology/winemag.json', 'w') as winemag_definitions:
    winemag_definitions.write(
        json.dumps(concepts, indent=4)
    )

### usualwines

In [9]:
bs = BeautifulSoup(
    download('https://usualwines.com/blogs/knowledge-base/wine-terms').text
)

In [21]:
article_wrapper = bs.find('div', { 'class': 'article-wrapper' })

ps = article_wrapper.find_all('p')
text = '\n'.join(
    [str(el) for el in ps]
)

text = re.sub(
    r'<p><b>(.+?)</b></p>',
    'DEF: \g<1> - ',
    text,
    flags=re.MULTILINE
)
text = re.sub(
    r'<span[^>]+?>(.+?)</span>',
    ' \g<1> ',
    text
)
text = re.sub(
    r'<a[^>]+?>(.+?)</a>', 
    ' \g<1> ',
    text
)
text = re.sub(r'<img[^>]+?/>', ' ', text)
text = re.sub(r'\s+<p>(.+?)</p>', ' \g<1> ', text)
text = re.sub(r'<strong>(.+?)</strong>', ' \g<1> ', text)

text = transform_unicode(text)

definitions = re.findall(
    r'^DEF:[^\n]+', text,
    flags=re.MULTILINE
)

concepts = []
for line in definitions:
    line = re.sub(r'^DEF:', ' ', line)

    concept = parse(line)
    if concept is not None:
        concepts.append(concept)

AttributeError: 'NoneType' object has no attribute 'find_all'

In [11]:
with open('./terminology/usualwines.json', 'w') as winemag_definitions:
    winemag_definitions.write(
        json.dumps(concepts, indent=4)
    )

### tiedemannwines

In [7]:
bs = BeautifulSoup(
    download('https://www.tiedemannwines.com/wine-descriptions').text
)

In [22]:
text = ''
wrappers = bs.find_all('div', { 'class': 'sqs-block-content' })

for wrapper in wrappers:
    ps = wrapper.find_all('p')
    text += '\n'.join(
        [str(el) for el in ps]
    )

text = re.sub(
    r' - </strong>',
    '</strong> - ',
    text,
    flags=re.MULTILINE
)

text = re.sub(
    r'</?strong>',
    '',
    text
)

text = re.sub(
    r'</p>',
    '',
    text
)

text = re.sub(
    r'<p>',
    'DEF: ',
    text
)

text = transform_unicode(text)

concepts = []
for line in text.split('\n'):
    line = re.sub(r'^DEF:', ' ', line)

    concept = parse(line)
    if concept is not None:
        concepts.append(concept)

In [23]:
with open('./terminology/tiedemannwines.json', 'w') as tiedemannwines_definitions:
    tiedemannwines_definitions.write(
        json.dumps(concepts, indent=4)
    )

### wineinvestment

In [50]:
bs = BeautifulSoup(
    download('https://www.wineinvestment.com/wine-blog/2019/11/how-to-describe-wine-like-a-pro/').text
)

In [51]:
wrapper = bs.find('div', { 'class': 'm-generalContentBlock__content' })

concepts = []
for item in wrapper.find_all('div'):
    label = transform_unicode(item.find('h3').text)
    definition = transform_unicode(item.find('p').text)

    concepts.append({
        'label': label[0].upper() + label[1:],
        'definition': definition.lower()
    })

In [52]:
with open('./terminology/wineinvestment.json', 'w') as wineinvestment_definitions:
    wineinvestment_definitions.write(
        json.dumps(concepts, indent=4)
    )

### KB Candidate

In [53]:
sites = [
    'usualwines',
    'vinology',
    'winemag',
    'tiedemannwines',
    'wineinvestment'
]

kb_items = []
for site in sites:
    with open(f'terminology/{site}.json') as site_input_file:
        kb_items.extend(
            json.loads(site_input_file.read())
        )

kb_items = sorted(
    kb_items,
    key=lambda item: item['label']
)

In [54]:
kb = {}
concept_id = 1

def get_key(text: str) -> str:
    return text.lower()

for item in kb_items:
    
    label = item['label']
    definition = item['definition']

    key = get_key(label)

    if key in kb:
        kb[key]['definitions'].append(definition)
    else:
        kb[key] = {
            'label': label,
            'definitions': [definition],
            'alias': [],
            'concept': f'W-{concept_id}',
            'category': '-1'
        }

        concept_id += 1

In [55]:
with open('./terminology/kb_v1.json', 'w') as vino_definitions:
    vino_definitions.write(
        json.dumps(list(kb.values()), indent=4)
    )