# Introduction

Aim: to recognize the tone of a Mandarin syllable from an audio recording.

TODO: Add detailed introduction

## Data Collection

Wiktionary (en.wiktionary.org) contains pronunciation examples for a large selection of Mandarin expressions. In this section, Mandarin expressions with available pronunciations are identified, and the audio files are downloaded.

In [9]:
import requests
from bs4 import BeautifulSoup
import re
from tqdm.notebook import tqdm
import pandas as pd
import pickle
import time
import os

In [2]:
def clean_html(soup):
    """
    Strip every tag that carries one of the unwanted CSS classes.

    The soup is modified in place; nothing is returned.
    """
    classes_to_drop = ['sister-wikipedia', 'thumb', 'reference', 'cited-source']
    matching_tags = soup.find_all(True, {'class': classes_to_drop})
    for unwanted in matching_tags:
        unwanted.extract()
        
def parse_next_page_links(soup, category):
    """
    Collect the href of each 'next page' link for a multi-page list.

    Only anchors inside the 'mw-pages' div whose title equals `category`
    are considered. The result is an empty list when none of them is
    labelled 'next page'.
    """
    pages_div = soup.find('div', {'id': 'mw-pages'})
    hrefs = []
    for anchor in pages_div.find_all('a', {'title': category}):
        if anchor.text == 'next page':
            hrefs.append(anchor['href'])
    return hrefs

def parse_category_words(soup):
    """
    Return the link texts (the words) listed on one Wiktionary category page.

    The words are the anchors found in the 'mw-content-ltr' div nested
    inside the 'mw-pages' div.
    """
    pages_div = soup.find('div', {'id': 'mw-pages'})
    listing = pages_div.find('div', {'class': 'mw-content-ltr'})
    return [anchor.text for anchor in listing.find_all('a')]

def get_category_data(soup, category):
    """
    Return all (linked) words from a Wiktionary category including multiple pages

    soup:     parsed (and already cleaned) first page of the category
    category: category title, used to recognise the 'next page' links

    Follow-up pages are fetched with the module-level `session`, parsed
    the same way as the first page, and cleaned with `clean_html`.
    """
    # Function-scope import keeps this cell self-contained.
    from urllib.parse import urljoin

    base_url = 'https://en.wiktionary.org/'
    words = []
    while True:
        words += parse_category_words(soup)
        next_page_links = parse_next_page_links(soup, category)
        if not next_page_links:
            return words
        # urljoin resolves hrefs that start with '/' without producing a
        # double slash after the host, and leaves absolute hrefs untouched.
        response = session.get(urljoin(base_url, next_page_links[0]))
        soup = BeautifulSoup(response.text.replace('>\n<', '><'), 'html.parser')
        clean_html(soup)

In [3]:
# Fetch page 1 of Category:Mandarin terms with audio links
session = requests.Session()
category = "Category:Mandarin terms with audio links"
response = session.get("https://en.wiktionary.org/wiki/" + category)
soup = BeautifulSoup(response.text.replace('>\n<', '><'), 'html.parser')

# Strip unwanted markup, then gather the words from every page of the category
clean_html(soup)
words = get_category_data(soup, category)

# Keep only the words made up entirely of characters in U+4E00-U+9FFF
words = [w for w in words if re.fullmatch(r'[\u4e00-\u9fff]*', w)]

# Show the first 10 words as a sample
print(words[:10])

['鞄', '一', '一下兒', '一些', '一共', '一再', '一切', '一向', '一塊兒', '一定']


In [4]:
def get_pronunciation_source(word, session):
    """
    Return the pinyin and link to audio pronunciation example for a
    Mandarin word from Wiktionary

    word:    the entry to look up on en.wiktionary.org
    session: object whose `get(url)` returns a response with a `.text`
             attribute (the notebook passes a `requests.Session`)

    Returns a `(pinyin, source)` tuple. Each item is None when the page
    does not provide it (no pronunciation box, no "Pinyin" link, no
    "Mandarin" list item, or no <source> tag), so that one incomplete page
    does not abort a long scraping loop with an AttributeError/IndexError.
    """
    response = session.get("https://en.wiktionary.org/wiki/{}".format(word))
    soup = BeautifulSoup(response.text.replace('>\n<', '><'), 'html.parser')
    pronunciations_box = soup.find("div", {"data-toggle-category": "pronunciations"})
    if pronunciations_box is None:
        return None, None

    # The pinyin reading is the first link, other than the "Pinyin" label
    # itself, found three levels above the label.
    pinyin = None
    pinyin_labels = pronunciations_box.find_all("a", text="Pinyin")
    if pinyin_labels:
        container = pinyin_labels[0].find_parent().find_parent().find_parent()
        readings = [a.text for a in container.find_all("a") if a.text != "Pinyin"]
        if readings:
            pinyin = readings[0]

    # The audio link is the <source> tag of the first list item that
    # contains a "Mandarin" link.
    source = None
    mandarin_items = [li for li in pronunciations_box.find_all("li")
                      if li.find_all("a", text="Mandarin")]
    if mandarin_items:
        source_tag = mandarin_items[0].find("source")
        if source_tag is not None:
            source = source_tag["src"]
    return pinyin, source

In [6]:
# Look up the pinyin and the audio source of every collected word,
# keeping both result lists aligned with `words`
pinyins, sources = [], []
for word in tqdm(words):
    pinyin, source = get_pronunciation_source(word, session)
    pinyins.append(pinyin)
    sources.append(source)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4932.0), HTML(value='')))




In [7]:
# Assemble the table and discard every row that has a missing value
df = pd.DataFrame({"word": words, "pinyin": pinyins, "source": sources}).dropna()

# Persist the table to the "dataset" file
with open("dataset", "wb") as file:
    pickle.dump(df, file)

### Download audio files

In [15]:
# Download every pronunciation into data/<word>.ogg. Files that already exist
# are skipped, so the cell can be re-run after an interruption.
os.makedirs("data/", exist_ok=True)
for word, source in tqdm(list(zip(df.word.tolist(), df.source.tolist()))):
    path = "data/{}.ogg".format(word)
    if os.path.isfile(path):
        continue
    # `source` is protocol-relative ("//host/..."); request it over HTTPS
    # instead of plain HTTP.
    response = session.get("https:" + source)
    if response.status_code == 200:
        with open(path, "wb") as file:
            file.write(response.content)
    else:
        print(response.status_code)
        print("Error downloading {}.ogg".format(word))
    #Avoid too many requests
    time.sleep(0.7)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4910.0), HTML(value='')))




## Data preprocessing

### Pinyin preprocessing

The ground truth data for supervised learning algorithms should be numbers (1, 2, 3, or 4) representing the Mandarin tone; syllables written without a tone mark are assigned 5. Therefore, these tone numbers must first be calculated from the pinyin pronunciation guides downloaded from Wiktionary. For example, yīgòng should be converted to \[1, 4\].

In [1]:
import pickle
import re
from itertools import groupby, cycle 

In [2]:
# Reload the table stored in the "dataset" pickle file
with open("dataset", "rb") as dataset_file:
    df = pickle.load(dataset_file)

In [3]:
def get_tones(pinyin, hanzi):
    """
    Returns a list of integers representing the tones for a pinyin word

    Every tone-marked vowel in `pinyin` contributes one number, in order of
    appearance: 1 (macron), 2 (acute), 3 (caron), 4 (grave).

    The result depends on how the number of tone marks compares with the
    number of characters in `hanzi`:
      * equal - the tones found are returned as they are;
      * fewer - the list is padded AT THE END with one 5 per unmarked
        character (the position of the unmarked syllable inside the word is
        not recovered);
      * more  - an empty list is returned.

    `pinyin` is NFC-normalised first, so a vowel followed by a combining
    diacritic is recognised exactly like the precomposed letter.
    """
    # Function-scope import keeps this cell self-contained.
    import unicodedata

    marked_vowels = {
        1: "āēīōūǖĀĒĪŌŪǕ",
        2: "áéíóúǘÁÉÍÓÚǗ",
        3: "ǎěǐǒǔǚǍĚǏǑǓǙ",
        4: "àèìòùǜÀÈÌÒÙǛ",
    }
    # Map each marked vowel to its tone; the literals are normalised too so
    # the lookup does not depend on how this source file was encoded.
    tone_of = {
        vowel: tone
        for tone, vowels in marked_vowels.items()
        for vowel in unicodedata.normalize("NFC", vowels)
    }

    tones = [
        tone_of[char]
        for char in unicodedata.normalize("NFC", pinyin)
        if char in tone_of
    ]

    missing = len(hanzi) - len(tones)
    if missing < 0:
        # More tone marks than characters: the pair cannot be aligned.
        return []
    return tones + [5] * missing

In [4]:
# Sanity check: a first-tone syllable followed by a fourth-tone syllable
get_tones("yīgòng", "一共")

[1, 4]

In some cases, the actual pronounced tone does not match the pinyin guide; however, these cases follow specific rules. For example, 可以 has pinyin kěyǐ, indicating the tone pair \[3, 3\], but the actual pronunciation is \[2, 3\].

In [5]:
def group_sequence(li):
    """
    A helper function for tone adjustment
    Takes a list of integers as its argument
    Yields tuples containing the maximal runs of li in which every
    element is exactly 1 greater than the previous one; runs of a
    single element are not reported
    e.g., [1, 2, 3, 1, 2, 2, 1, 1] yields (1, 2, 3) and (1, 2)

    An empty list yields nothing. The last element is never compared with
    the first one, so a run cannot wrap around the end of the list.
    """
    run = []
    for value in li:
        if run and value == run[-1] + 1:
            run.append(value)
            continue
        # The current run is broken: report it if it has at least two
        # elements, then start a new run at this value.
        if len(run) > 1:
            yield tuple(run)
        run = [value]
    if len(run) > 1:
        yield tuple(run)

def tone_adjustment(tones, hanzi):
    """
    Adjusts a list of tones according to the rules for mandarin tone
    adjustment and return the corrected list of tones
    (e.g., [3, 3] --> [2, 3])

    tones: list of tone numbers, aligned index-for-index with hanzi
    hanzi: the word the tones belong to

    Rules applied, in this order:
      1. In every run of adjacent third tones, all but the last become 2.
      2. The first "不", unless the word ends with "不", becomes 2 when the
         next tone is 4.
      3. The first "一", unless the word ends with "一" or the next
         character is one of 一二三四五六七八九, becomes 2 when the next tone
         is 4 and becomes 4 otherwise.

    A new list is returned; the list passed in is not modified. When tones
    is too short to cover the character following "不"/"一" (for example an
    empty list), rules 2 and 3 are skipped rather than raising IndexError.
    """
    adjusted = list(tones)  # work on a copy so the caller's list stays intact

    # Rule 1: a third tone directly followed by another third tone changes
    # to 2; the last tone of each run has no third tone after it and stays.
    for i in range(len(tones) - 1):
        if tones[i] == 3 and tones[i + 1] == 3:
            adjusted[i] = 2

    # Rule 2
    if "不" in hanzi and hanzi[-1] != "不":
        bu_index = hanzi.index("不")
        if bu_index + 1 < len(adjusted) and adjusted[bu_index + 1] == 4:
            adjusted[bu_index] = 2

    # Rule 3
    if "一" in hanzi and hanzi[-1] != "一":
        yi_index = hanzi.index("一")
        numerals = ("一", "二", "三", "四", "五", "六", "七", "八", "九")
        if hanzi[yi_index + 1] not in numerals and yi_index + 1 < len(adjusted):
            adjusted[yi_index] = 2 if adjusted[yi_index + 1] == 4 else 4

    return adjusted

In [6]:
# Demo: 可以 is written kěyǐ, i.e. two third tones in a row
hanzi, pinyin = "可以", "kěyǐ"
tones = get_tones(pinyin, hanzi)
tone_adjustment(tones, hanzi)

[2, 3]

In [7]:
# Compute the adjusted tone list of every row. The columns are read by label:
# integer keys on a label-indexed row Series fall back to positional access,
# which pandas has deprecated.
df["tones"] = df.apply(
    lambda row: tone_adjustment(get_tones(row["pinyin"], row["word"]), row["word"]),
    axis=1,
)
df.head()

Unnamed: 0,word,pinyin,source,tones
0,鞄,páo,//upload.wikimedia.org/wikipedia/commons/1/16/...,[2]
1,一,yī,//upload.wikimedia.org/wikipedia/commons/b/b0/...,[1]
2,一下兒,yīxiàr,//upload.wikimedia.org/wikipedia/commons/7/7c/...,"[2, 4, 5]"
3,一些,yīxiē,//upload.wikimedia.org/wikipedia/commons/5/5e/...,"[4, 1]"
4,一共,yīgòng,//upload.wikimedia.org/wikipedia/commons/5/51/...,"[2, 4]"
