# Scraping language data from Wikipedia using bs4 

### To collect the language data needed to train various language detection models, I'm going to scrape articles from Wikipedia. The languages I'll be focusing on are English, Japanese, Spanish, German, Russian, French, Italian, and Chinese, since most Wikipedia articles are written in those 8 languages. Since this is just practice, I'll only be scraping one article (written in each language).

In [1]:
import re
from collections import defaultdict
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
# Language codes this notebook focuses on, each mapped to the integer label
# used later for integer encoding (the `language_code` column). The keys are
# also what get_hyperlinks() uses to decide which language links to keep.
FOCUS = { 'en': 0, 'ja': 1, 'es': 2, 'de': 3, 'ru': 4, 'fr': 5, 'it': 6, 
          'zh': 7 }

In [2]:
# Retrieve the links to a Wikipedia article's versions in the focus languages.

def get_hyperlinks(target_link):
    """Map each focus language code to the URL of the article in that language.

    Downloads ``target_link``, collects every ``<a>`` element that carries
    both a ``lang`` and an ``href`` attribute, and keeps those whose ``lang``
    is a key of ``FOCUS``.

    Args:
        target_link: URL of the article to start from. It is recorded as the
            ``'en'`` entry, so it is expected to be the English version.

    Returns:
        A ``defaultdict(str)`` mapping language code -> absolute URL. When
        several anchors share a language, the last one on the page wins
        (this includes an anchor marked ``lang="en"``, which replaces the
        initial ``'en'`` entry).

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.RequestException: on connection problems or timeout.
    """
    # a timeout keeps a stalled connection from hanging the notebook forever
    r = requests.get(target_link, timeout=30)
    # fail loudly instead of silently parsing an error page
    r.raise_for_status()
    soup = BeautifulSoup(r.content, 'html.parser')

    lang_links = defaultdict(str)
    lang_links['en'] = target_link

    # href=True skips anchors that have a lang attribute but no link target,
    # which would otherwise raise KeyError on anchor['href']
    for anchor in soup.find_all("a", lang=True, href=True):
        if anchor['lang'] in FOCUS:
            # resolve relative and protocol-relative ("//host/path") hrefs so
            # the result can be passed straight to requests.get
            lang_links[anchor['lang']] = urljoin(target_link, anchor['href'])

    return lang_links

In [3]:
# These regexes are used to split each article into a list of sentences.
# I prefer using my own custom regexes for this step of the processing.

# Matches one sentence terminator: full-width or ASCII '!' / '?', or the
# ideographic full stop. It is used with .split(), so the matched character
# itself is dropped from the resulting sentences.
e_sents = re.compile(r'[！!？?。]') # eastern languages
# Matches the single whitespace character that follows '.', '?' or '!',
# unless the text right before it is word-char '.' word-char '.' (e.g. "e.g.")
# or a capital letter, a lowercase letter and '.' (e.g. "Mr.").
w_sents = re.compile(r'(?<!\w\.\w\.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s') # western languages

# Names of the parent tags whose text is skipped while scraping, to avoid
# collecting unwanted data ('[document]' is the name of the BeautifulSoup
# root object itself).

blacklist = [ '[document]', 'a', 'abbr', 'bdi', 'cite', 'div', 'h2', 'h3', 
              'label', 'li', 'script', 'span', 'style', 'sup', 'th', 'ul']

In [4]:
# create function for scraping articles / parsing sentences
 
def fetch_lang_data(target_link):
    """Scrape an article in every focus language and split it into sentences.

    Looks up the per-language URLs with ``get_hyperlinks``, downloads each
    page, joins the text nodes whose direct parent tag is not in
    ``blacklist``, normalizes spacing, and splits the text with ``e_sents``
    (Japanese / Chinese) or ``w_sents`` (all other languages).

    Args:
        target_link: URL of the English Wikipedia article to start from.

    Returns:
        A list of ``(lang, sentence)`` tuples. Sentences are stripped of
        surrounding whitespace and empty ones are left out.

    Raises:
        requests.HTTPError: if any page answers with an error status.
        requests.RequestException: on connection problems or timeout.
    """
    data = []
    lang_links = get_hyperlinks(target_link)

    for lang, link in lang_links.items():
        # a timeout keeps one stalled download from hanging the whole run
        r = requests.get(link, timeout=30)
        # do not turn an error page into "sentences"
        r.raise_for_status()
        soup = BeautifulSoup(r.content, 'html.parser')

        # join once instead of growing a string with += for every text node
        output = ' '.join(
            t for t in soup.find_all(string=True)
            if t.parent.name not in blacklist
        ).strip()

        if lang in ('ja', 'zh'):
            # fix spacing
            output = re.sub(r'\s{2,}', r' ', output) # long spaces
            # parse sentences
            sentences = e_sents.split(output)
        else:
            # fix spacing
            output = re.sub(r'\(\s+\)', r' ', output) # empty parenthesis
            output = re.sub(r"\s+([,.'\":])", r'\1', output) # spaced punctuation
            output = re.sub(r'(\s{2,})', r' ', output) # long spaces
            # parse sentences
            sentences = w_sents.split(output)

        # splitting leaves an empty piece after a final terminator and, for
        # e_sents, a leading space on each following sentence; clean both up
        stripped = (sent.strip() for sent in sentences)
        data.extend((lang, sent) for sent in stripped if sent)

    # return list of tuples
    return data


In [8]:
import pandas as pd

# Scrape the article (and its versions in the other focus languages) into a
# list of (language, sentence) tuples.
article_link = "https://en.wikipedia.org/wiki/Alice's_Adventures_in_Wonderland"
language_data = fetch_lang_data(article_link)

# Tabulate the tuples: one row per scraped sentence.
language_df = pd.DataFrame(data=language_data, columns=['language', 'sentence'])

# Integer-encode each row's language through the FOCUS lookup table.
language_df['language_code'] = language_df['language'].apply(FOCUS.__getitem__)

# Optional one-hot encoding, left disabled. NOTE: `LABELS` is not defined in
# the visible cells; `FOCUS` looks like the intended lookup - confirm before
# enabling.
# import numpy as np
# language_df['one_hot_vector'] = language_df['language'].apply(lambda x: np.eye(8)[LABELS[x]].astype('int').tolist())

# preview the first rows
language_df.head()

Unnamed: 0,language,sentence,language_code
0,en,Alice's Adventures in Wonderland - Wikipedia A...,0
1,en,A young girl named falls through a rabbit hole...,0
2,en,It is seen as a prime example of the genre.,0
3,en,Its play with gives the story lasting populari...,0
4,en,One of the best-known works of Victorian Engli...,0


In [9]:
import spacy

# Name of the spaCy pipeline package to load for each focus language code;
# each value is passed to spacy.load() when that language is tokenized.
spacy_models = { 'en': 'en_core_web_md', 
                'ja': 'ja_core_news_md', 
                'es': 'es_core_news_md', 
                'de': 'de_core_news_md', 
                'ru': 'ru_core_news_md', 
                'fr': 'fr_core_news_md', 
                'it': 'it_core_news_md', 
                'zh': 'zh_core_web_md' }

# Tokenize one sentence with the currently loaded spaCy pipeline, removing
# stop words, punctuation, whitespace and numbers. Tokens are lemmatized and
# lowercased, except for Chinese, where the surface text is kept.

def tokenize(sentence, lang):
    """Return the cleaned tokens of ``sentence``.

    Relies on the module-level ``nlp`` pipeline, which the caller must have
    loaded for ``lang`` before calling this function.

    Args:
        sentence: Text to tokenize.
        lang: Language code of the sentence; only ``'zh'`` is special-cased.

    Returns:
        A list of strings. For ``'zh'`` these are the tokens' original text
        (presumably because the Chinese pipeline provides no usable lemmas -
        confirm); for every other language, the lowercased lemmas.
    """
    doc = nlp(sentence)
    keep_surface = lang == 'zh'
    # the same filters apply to every language: previously the 'zh' branch
    # let stop words and whitespace tokens through
    return [
        tok.text if keep_surface else tok.lemma_.lower()
        for tok in doc
        if not (tok.is_punct or tok.is_stop or tok.is_space)
        and tok.pos_ != 'NUM'
    ]

In [10]:
# For every focus language: load its spaCy pipeline, tokenize the sentences
# of that language, and collect the resulting Series.

series_list = []

for lang, model in spacy_models.items():
    # tokenize() reads the global `nlp`, so rebind it for the current language
    nlp = spacy.load(model, disable=['ner', 'parser'])
    series = (
        language_df
        .loc[language_df['language'] == lang, 'sentence']
        .apply(tokenize, args=(lang,))
    )
    series_list.append(series)

In [9]:
# stitch the per-language token Series back into a single Series
tokens = pd.concat(series_list)

# attach the tokens as a new column (matched up by row index), then drop
# every row whose token list came out empty
language_df = language_df.assign(tokens=tokens)
language_df = language_df.loc[
    ~language_df['tokens'].apply(lambda toks: not len(toks))
]

# persist the result as a pickle file
language_df.to_pickle('language_data.pickle')

# preview the first rows
language_df.head()

Unnamed: 0,language,sentence,language_code,tokens
0,en,Alice's Adventures in Wonderland - Wikipedia A...,0,"[alice, adventures, wonderland, wikipedia, ali..."
1,en,A young girl named falls through a rabbit hole...,0,"[young, girl, name, fall, rabbit, hole, fantas..."
2,en,It is seen as a prime example of the genre.,0,"[see, prime, example, genre]"
3,en,Its play with gives the story lasting populari...,0,"[play, give, story, last, popularity, adult, c..."
4,en,One of the best-known works of Victorian Engli...,0,"[well, know, work, victorian, english, fiction..."
