# Scraping language data from Wikipedia using bs4 

In [None]:
'''
To collect the language data needed to train various language detection
models I'm going to scrape Wikipedia. The languages I'll be focusing on
are English, Japanese, Spanish, German, Russian, French, Italian, and Chinese,
since those 8 languages have the most Wikipedia articles. Since this is just
practice, I'll only be scraping one article in each language.
'''

In [1]:
import re, requests
from bs4 import BeautifulSoup
from collections import defaultdict
# Maps each language code handled by this notebook to a distinct integer
# (0-7), assigned in the order the codes are listed here.
FOCUS = {
    code: index
    for index, code in enumerate(
        ('en', 'ja', 'es', 'de', 'ru', 'fr', 'it', 'zh')
    )
}

In [2]:
# create function for retrieving wikipedia pages in different languages

def get_hyperlinks(target_link, timeout=10):
    """Map focus-language codes to the URL of the article in that language.

    Downloads ``target_link``, collects every ``<a>`` tag that carries a
    ``lang`` attribute, and keeps the ones whose language code is a key of
    ``FOCUS``.

    Args:
        target_link: URL of the page to start from. It is stored under the
            ``'en'`` key before the anchors are scanned, so an anchor with
            ``lang="en"`` on the page replaces it.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A ``defaultdict(str)`` mapping language code to an absolute URL.
        If several anchors share a language code, the last one wins.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    from urllib.parse import urljoin

    # Without a timeout requests may block forever on a stalled connection.
    r = requests.get(target_link, timeout=timeout)
    # Fail loudly instead of harvesting links from an error page.
    r.raise_for_status()
    soup = BeautifulSoup(r.content, 'html.parser')

    lang_links = defaultdict(str)
    lang_links['en'] = target_link

    for anchor in soup.find_all('a', lang=True):
        href = anchor.get('href')
        # Skip anchors outside the focus set or without a usable target.
        if anchor['lang'] in FOCUS and href:
            # Resolve relative / protocol-relative hrefs against the page
            # URL; absolute hrefs come back unchanged.
            lang_links[anchor['lang']] = urljoin(target_link, href)

    return lang_links

In [3]:
# these regexes will be used to split each article into a list of sentences

# Japanese/Chinese text: split on the ideographic full stop and on both the
# ASCII and the full-width (U+FF01, U+FF1F) exclamation/question marks.
# The delimiter itself is dropped from the resulting sentences.
e_sents = re.compile(r'[。!?\uff01\uff1f]')
# Whitespace-delimited languages: split on whitespace that follows '.', '?'
# or '!', except after patterns like "e.g." or a two-letter capitalised
# abbreviation such as "Mr.". The punctuation stays with its sentence.
w_sents = re.compile(r'(?<!\w\.\w\.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s')

# I also need to blacklist certain html tags to avoid scraping unwanted data

blacklist = [ '[document]', 'a', 'abbr', 'bdi', 'cite', 'div', 'h2', 'h3', 
              'label', 'li', 'script', 'span', 'style', 'sup', 'th', 'ul']

In [4]:
# create function for scraping sites/parsing sentences
 
def _split_sentences(text, lang):
    """Tidy the whitespace of ``text`` and split it into sentences for ``lang``."""
    text = re.sub(r'^\s+', r'', text)  # initial spaces

    if lang in ('ja', 'zh'):
        text = re.sub(r'\s{2,}', r' ', text)  # long spaces
        return e_sents.split(text)

    text = re.sub(r'\(\s+\)', r' ', text)  # empty parenthesis
    text = re.sub(r"\s+([,.'\":])", r'\1', text)  # spaced punctuation
    text = re.sub(r'(\s{2,})', r' ', text)  # long spaces
    return w_sents.split(text)


def fetch_lang_data(target_link, timeout=10):
    """Scrape an article in every focus language and split it into sentences.

    The language versions are looked up with ``get_hyperlinks``. For each
    page, every text node whose parent tag is not in ``blacklist`` is kept,
    the pieces are joined with single spaces, and the result is split with
    ``e_sents`` for 'ja'/'zh' or ``w_sents`` for every other language.

    Args:
        target_link: URL of the article to start from.
        timeout: Seconds to wait for each page before giving up.

    Returns:
        A list of ``(language_code, sentence)`` tuples, in the order the
        pages were fetched. Sentences are not filtered, so empty strings
        produced by the split are included.

    Raises:
        requests.RequestException: If a page request fails, times out, or
            returns an HTTP error status.
    """
    data = []

    for lang, link in get_hyperlinks(target_link).items():
        # Without a timeout requests may block forever on a stalled connection.
        r = requests.get(link, timeout=timeout)
        # Do not scrape "sentences" out of an error page.
        r.raise_for_status()
        soup = BeautifulSoup(r.content, 'html.parser')

        # Build the text in one join rather than repeated string +=.
        output = ''.join(
            '{} '.format(t)
            for t in soup.find_all(string=True)
            if t.parent.name not in blacklist
        )

        data.extend((lang, sent) for sent in _split_sentences(output, lang))

    return data


In [5]:
# Scrape the article (and the focus-language versions linked from it);
# fetch_lang_data returns a list of (language, sentence) tuples.

language_data = fetch_lang_data("https://en.wikipedia.org/wiki/Alice's_Adventures_in_Wonderland")

# create dataframe from language data: one row per scraped sentence

import pandas as pd

language_df = pd.DataFrame(language_data, columns=['language', 'sentence'])
language_df.head()

Unnamed: 0,language,sentence
0,en,Alice's Adventures in Wonderland - Wikipedia A...
1,en,"It tells of a young girl named, who falls thro..."
2,en,It is considered to be one of the best example...
3,en,"The tale plays with, giving the story lasting ..."
4,en,One of the best-known and most popular works o...


In [6]:
import spacy

# Name of the spaCy pipeline package to load for each language code.
spacy_packs = {
    'en': 'en_core_web_md',
    'ja': 'ja_core_news_md',
    'es': 'es_core_news_md',
    'de': 'de_core_news_md',
    'ru': 'ru_core_news_md',
    'fr': 'fr_core_news_md',
    'it': 'it_core_news_md',
    'zh': 'zh_core_web_md',
}

# create function that tokenizes each sentence in the dataframe, lemmatizes them, 
# and removes stopwords, punctuation, and numbers

def tokenize(sentence, lang):
    """Turn one sentence into a list of cleaned token strings.

    The sentence is run through the pipeline bound to the module-level name
    ``nlp``, which must be assigned before this function is called.

    Args:
        sentence: Text to process.
        lang: Language code of the sentence; only ``'zh'`` is special-cased.

    Returns:
        For ``'zh'``: the raw text of every token that is neither
        punctuation nor tagged ``NUM``. For any other language: the
        lower-cased lemma of every token that is not punctuation, a stop
        word, whitespace, or tagged ``NUM``.
    """
    doc = nlp(sentence)

    if lang == 'zh':
        return [
            token.text
            for token in doc
            if not (token.is_punct or token.pos_ == 'NUM')
        ]

    return [
        token.lemma_.lower()
        for token in doc
        if not (token.is_punct or token.is_stop or token.is_space
                or token.pos_ == 'NUM')
    ]

In [7]:
# encode language (integer encoding): look each row's language code up in
# FOCUS and store the resulting integer in a new 'language_code' column
language_df['language_code'] = language_df['language'].apply(lambda x: FOCUS[x])

'''
The code below would create one-hot vectors for each target language,
but since I'll be using the CrossEntropyLoss function later I have to encode
the target languages as integers instead
'''
# NOTE(review): the line below references `LABELS`, which is not defined in
# the visible code -- presumably an earlier name for FOCUS; confirm before
# re-enabling.
# import numpy as np
# language_df['one_hot_vector'] = language_df['language'].apply(lambda x: np.eye(8)[LABELS[x]].astype('int').tolist())

"\nThe code below would create one-hot vectors for each target language,\nbut since I'll be using the CrossEntropyLoss function later I have to encode\nthe target languages as integers instead\n"

In [8]:
# For every language: load its spaCy pipeline, select that language's
# sentences, tokenize them, and collect the resulting Series.

series_list = []
for lang, pack in spacy_packs.items():
    # tokenize() reads the pipeline from the module-level name `nlp`, so it
    # has to be rebound here before the sentences are processed.
    nlp = spacy.load(pack, disable=['ner', 'parser'])
    in_lang = language_df['language'] == lang
    series = language_df.loc[in_lang, 'sentence'].apply(tokenize, args=(lang,))
    series_list.append(series)

In [9]:
# Stitch the per-language token Series back into a single Series.

tokens = pd.concat(series_list)

# Attach the tokens as a new column, drop every row whose token list is
# empty, and save the result as a pickle.

language_df = language_df.assign(tokens=tokens)
is_empty = language_df['tokens'].apply(lambda toks: len(toks) == 0)
language_df = language_df[~is_empty]
language_df.to_pickle('language_data.pickle')

language_df.head()

Unnamed: 0,language,sentence,language_code,tokens
0,en,Alice's Adventures in Wonderland - Wikipedia A...,0,"[alice, adventures, wonderland, wikipedia, ali..."
1,en,"It tells of a young girl named, who falls thro...",0,"[tell, young, girl, name, fall, rabbit, hole, ..."
2,en,It is considered to be one of the best example...,0,"[consider, good, example, genre]"
3,en,"The tale plays with, giving the story lasting ...",0,"[tale, play, give, story, last, popularity, ad..."
4,en,One of the best-known and most popular works o...,0,"[well, know, popular, work, english, language,..."
