# Introduction

Aim: to recognize the tone of a Mandarin syllable from an audio recording.

TODO: Add detailed introduction

## Data Collection

Wiktionary (en.wiktionary.org) contains pronunciation examples for a large selection of Mandarin expressions. In this section, Mandarin expressions with available pronunciations are identified, and their audio files are downloaded.

In [9]:
import os
import pickle
import re
import time
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm

In [2]:
def clean_html(soup):
    """
    Detach every tag carrying one of the unwanted classes from the soup.
    The soup is modified in place; nothing is returned.
    """
    class_filter = {'class': ['sister-wikipedia', 'thumb', 'reference', 'cited-source']}
    unwanted_tags = soup.find_all(True, class_filter)
    for unwanted in unwanted_tags:
        unwanted.extract()
        
def parse_next_page_links(soup, category):
    """
    Return the hrefs of the 'next page' links of a category listing.

    Only links inside the 'mw-pages' div whose title equals `category`
    are considered. The list is empty when there is no such link.
    """
    pages_section = soup.find('div', {'id': 'mw-pages'})
    hrefs = []
    for anchor in pages_section.find_all('a', {'title': category}):
        if anchor.text != 'next page':
            continue
        hrefs.append(anchor['href'])
    return hrefs

def parse_category_words(soup):
    """
    Return the text of every link listed on a single Wiktionary category
    page (links inside the 'mw-content-ltr' div of the 'mw-pages' div).
    """
    pages_section = soup.find('div', {'id': 'mw-pages'})
    listing = pages_section.find('div', {'class': 'mw-content-ltr'})
    collected = []
    for anchor in listing.find_all('a'):
        collected.append(anchor.text)
    return collected

def get_category_data(soup, category):
    """
    Return all (linked) words from a Wiktionary category including multiple pages

    soup:     parsed html of the first page of the category
    category: category title, used to recognise the 'next page' links

    Follow-up pages are requested through the module-level `session`, so a
    session must exist before this function is called.
    Raises requests.HTTPError when a follow-up page is not served successfully.
    """
    words = []
    next_page_links = parse_next_page_links(soup, category)
    while next_page_links:
        words += parse_category_words(soup)
        #urljoin resolves the href against the site root; plain concatenation
        #yields a '//' path whenever the href itself starts with '/'
        next_url = urljoin('https://en.wiktionary.org/', next_page_links[0])
        response = session.get(next_url)
        #Stop here on an error response instead of parsing an error page
        response.raise_for_status()
        soup = BeautifulSoup(response.text.replace('>\n<', '><'), 'html.parser')
        clean_html(soup)
        next_page_links = parse_next_page_links(soup, category)
    #The last page has no 'next page' link, so its words are collected here
    words += parse_category_words(soup)
    return words

In [3]:
#Fetch and parse page 1 of Category:Mandarin terms with audio links
category = "Category:Mandarin terms with audio links"
session = requests.Session()
first_page_url = "https://en.wiktionary.org/wiki/{}".format(category)
response = session.get(first_page_url)
#Newlines between adjacent tags are removed before parsing
soup = BeautifulSoup(response.text.replace('>\n<', '><'), 'html.parser')

#Strip unwanted markup, then gather the words from every page of the category
clean_html(soup)
words = get_category_data(soup, category)

#Keep only the words made up entirely of characters in the range 4e00-9fff
non_chinese = re.compile(r'[^\u4e00-\u9fff]+')
words = list(filter(lambda w: non_chinese.search(w) is None, words))

#Show the first 10 words as examples
print(words[:10])

['鞄', '一', '一下兒', '一些', '一共', '一再', '一切', '一向', '一塊兒', '一定']


In [4]:
def _find_pinyin(pronunciations_box):
    """
    Return the text of the first link near the "Pinyin" label, or None
    """
    labels = pronunciations_box.find_all("a", text="Pinyin")
    if not labels:
        return None
    #The pinyin is looked up among the links three levels above the label
    container = labels[0].find_parent().find_parent().find_parent()
    for a in container.find_all("a"):
        if a.text != "Pinyin":
            return a.text
    return None


def _find_mandarin_audio(pronunciations_box):
    """
    Return the src of the audio source in the first "Mandarin" list item, or None
    """
    for li in pronunciations_box.find_all("li"):
        if li.find_all("a", text="Mandarin"):
            #Only the first Mandarin list item is inspected
            source_tag = li.find("source")
            if source_tag is None:
                return None
            return source_tag.get("src")
    return None


def get_pronunciation_source(word, session):
    """
    Return the pinyin and link to audio pronunciation example for a 
    Mandarin word from Wiktionary

    word:    the expression whose Wiktionary page is requested
    session: object whose .get(url) returns the response for the page

    Returns a (pinyin, source) tuple. An element is None when it cannot be
    found on the page, so that a single unusual page does not abort a long
    crawl; incomplete entries can be discarded afterwards.
    """
    response = session.get("https://en.wiktionary.org/wiki/{}".format(word))
    soup = BeautifulSoup(response.text.replace('>\n<', '><'), 'html.parser')
    pronunciations_box = soup.find("div", {"data-toggle-category": "pronunciations"})
    if pronunciations_box is None:
        #The page has no pronunciations box at all
        return None, None

    pinyin = _find_pinyin(pronunciations_box)
    source = _find_mandarin_audio(pronunciations_box)
    return pinyin, source

In [6]:
#Look up the pinyin and the audio source of every word with an audio link
pinyins, sources = [], []
for word in tqdm(words):
    #Results are appended one word at a time, so both lists stay aligned with `words`
    pinyin, source = get_pronunciation_source(word, session)
    pinyins.append(pinyin)
    sources.append(source)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4932.0), HTML(value='')))




In [7]:
#Assemble the results into one table and discard rows with a missing value
df = pd.DataFrame({"word": words, "pinyin": pinyins, "source": sources}).dropna()

#Persist the table with pickle
with open("dataset", "wb") as out:
    pickle.dump(df, out)

### Download audio files

In [15]:
#Create the target directory; no error when it already exists
os.makedirs("data/", exist_ok=True)
for word, source in tqdm(list(zip(df.word.tolist(), df.source.tolist()))):
    path = "data/{}.ogg".format(word)
    #Files that are already present are skipped, so the cell can be re-run
    if os.path.isfile(path):
        continue
    #Resolve the source against an https base: a scheme-less "//host/path"
    #source is fetched over https, and a source with its own scheme is kept as is
    response = session.get(urljoin("https://en.wiktionary.org/", source))
    if response.status_code == 200:
        with open(path, "wb") as file:
            file.write(response.content)
    else:
        print(response.status_code)
        print("Error downloading {}.ogg".format(word))
    #Avoid too many requests
    time.sleep(0.7)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4910.0), HTML(value='')))


