In [1]:
import urllib
import time
import shutil
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

In [2]:
def mp3getter(lst, max_retries=5, retry_delay=2):
    '''
    Download speaker recordings from the accent.gmu.edu website and save them
    in the current working directory.

    Inputs: lst, a list of (language, num_speakers) entries; max_retries, the
            number of download attempts per file before the file is skipped;
            retry_delay, seconds to wait between attempts
    Outputs: None. For each entry, files {language}1.mp3 through
             {language}{num_speakers}.mp3 are written to disk. A file that
             still fails after max_retries attempts is skipped with a warning
             instead of being retried forever.
    '''
    # urlretrieve lives in urllib on Python 2 and urllib.request on Python 3.
    try:
        from urllib.request import urlretrieve
    except ImportError:
        from urllib import urlretrieve
    import warnings

    attempts = max(1, max_retries)
    for entry in lst:
        language, count = entry[0], entry[1]
        for i in range(1, count + 1):
            url = "http://accent.gmu.edu/soundtracks/{0}{1}.mp3".format(language, i)
            filename = '{0}{1}.mp3'.format(language, i)
            last_error = None
            for attempt in range(attempts):
                try:
                    urlretrieve(url, filename)
                except IOError as err:
                    # IOError covers URL, HTTP and socket errors; anything else
                    # (including KeyboardInterrupt) propagates to the caller.
                    last_error = err
                    if attempt + 1 < attempts:
                        time.sleep(retry_delay)
                else:
                    break
            else:
                warnings.warn('giving up on {0} after {1} attempts: {2}'.format(
                    url, attempts, last_error))

In [3]:
def lang_pages(lst):
    '''
    Build the landing-page url for each language.

    Inputs: lst, an iterable of language names
    Outputs: list of urls, one per language, in the same order as lst
    '''
    template = 'http://accent.gmu.edu/browse_language.php?function=find&language={}'
    return [template.format(name) for name in lst]

In [4]:
def get_languages():
    '''
    Scrape the language names listed on http://accent.gmu.edu/browse_language.php.

    Inputs: none
    Outputs: list with the text of every <li> found inside a
             <ul class="languagelist"> element, in page order
    '''
    url = "http://accent.gmu.edu/browse_language.php"
    html = get(url)
    soup = BeautifulSoup(html.content, 'html.parser')
    languages = []
    # find_all is the current BeautifulSoup spelling (findAll is the deprecated
    # alias) and matches the other scraping functions in this notebook.
    for ul in soup.find_all('ul', attrs={'class': 'languagelist'}):
        for li in ul.find_all('li'):
            languages.append(li.text)
    return languages

In [5]:
def get_language_urls(lst):
    '''
    Build the browse url for each language name.

    Inputs: lst, an iterable of language name strings
    Outputs: list of urls, one per language, in the same order as lst
    '''
    prefix = 'http://accent.gmu.edu/browse_language.php?function=find&language='
    return [prefix + name for name in lst]

In [6]:
def get_num(language):
    '''
    Look up the number of speakers listed for a language.

    Inputs: language, the language name as a string
    Outputs: integer parsed from the third whitespace-separated token of the
             first <h5> inside the first <div class="content"> of the language
             page, or 0 when the page does not have that structure
    '''
    url = 'http://accent.gmu.edu/browse_language.php?function=find&language=' + language
    html = get(url)
    soup = BeautifulSoup(html.content, 'html.parser')
    content = soup.find_all('div', attrs={'class': 'content'})
    try:
        num = int(content[0].find('h5').text.split()[2])
    except (AttributeError, IndexError, ValueError):
        # AttributeError: no <h5>; IndexError: no content div or a header with
        # fewer than three tokens; ValueError: the token is not an integer.
        num = 0
    return num

In [7]:
def get_formatted_languages(languages):
    '''
    Pair each language with its speaker count, in the format mp3getter expects.

    Inputs: languages, an iterable of language names
    Outputs: list of (language, num_speakers) tuples; languages for which
             get_num returns 0 are left out
    '''
    # Lazy pairs so get_num is called exactly once per language.
    counted = ((name, get_num(name)) for name in languages)
    return [(name, count) for name, count in counted if count != 0]

In [8]:
def get_nums(lst):
    '''
    Collect the number of speakers for each language page url.

    Inputs: lst, an iterable of language page urls
    Outputs: list of integers, one per url and in the same order; each is
             parsed from the third whitespace-separated token of the first
             <h5> inside the first <div class="content">, with 0 recorded when
             a page does not have that structure (same convention as get_num)
    '''
    nums = []
    for url in lst:
        html = get(url)
        soup = BeautifulSoup(html.content, 'html.parser')
        content = soup.find_all('div', attrs={'class': 'content'})
        try:
            nums.append(int(content[0].find('h5').text.split()[2]))
        except (AttributeError, IndexError, ValueError):
            # Keep the output aligned with the input instead of aborting the
            # whole scrape on one unparseable page.
            nums.append(0)
    return nums

In [9]:
def get_speaker_info(start, stop):
    '''
    Scrape the speaker detail pages for a range of speaker ids.

    Inputs: two integers bounding the speaker ids to scrape; the ids visited
            are range(start, stop), so start is inclusive and stop is exclusive
    Outputs: Pandas Dataframe containing speakerid, filename, birthplace,
             native_language, age, sex, age_onset of English. Speakers whose
             page cannot be parsed keep their speakerid and get empty strings
             for every other field. After each speaker the frame collected so
             far is also written to speaker_info_{stop}.csv. An empty id range
             returns an empty frame and writes no file.
    '''
    columns = ['speakerid', 'filename', 'birthplace', 'native_language',
               'age', 'sex', 'age_onset']
    # Returned as-is when the id range is empty, so df is always bound.
    df = pd.DataFrame(columns=columns)

    user_data = []
    for num in range(start, stop):
        info = {'speakerid': num, 'filename': '', 'birthplace': '', 'native_language': '',
                'age': '', 'sex': '', 'age_onset': ''}
        url = "http://accent.gmu.edu/browse_language.php?function=detail&speakerid={}".format(num)
        html = get(url)
        soup = BeautifulSoup(html.content, 'html.parser')
        body = soup.find_all('div', attrs={'class': 'content'})
        try:
            bio_items = soup.find_all('ul', attrs={'class': 'bio'})[0].find_all('li')
            age_sex = bio_items[3].text.split()
            parsed = {
                'filename': str(body[0].find('h5').text.split()[0]),
                # NOTE(review): the [13:-6] slice presumably drops a fixed label
                # prefix and a trailing link text -- confirm against the page markup.
                'birthplace': str(bio_items[0].text)[13:-6],
                'native_language': str(bio_items[1].text.split()[2]),
                'age': float(age_sex[2].strip(',')),
                'sex': str(age_sex[3].strip()),
                'age_onset': float(bio_items[4].text.split()[4].strip()),
            }
        except (AttributeError, IndexError, ValueError):
            # Missing or malformed page: keep the blank placeholders. Any
            # parsing failure blanks every field, never just some of them.
            pass
        else:
            info.update(parsed)
        user_data.append(info)
        # Rewritten on every iteration so partial results survive a failure
        # later in the scrape.
        df = pd.DataFrame(user_data)
        df.to_csv('speaker_info_{}.csv'.format(stop))
    return df

In [10]:
def copy_files(lst, path):
    '''
    Copy wav files into another directory.

    Inputs: lst, an iterable of file names without the .wav extension, resolved
            relative to the current working directory; path, the destination
            directory
    Outputs: None. Each {name}.wav is copied to {path}/{name}.wav with
             shutil.copy2.
    '''
    for stem in lst:
        source = '{}.wav'.format(stem)
        target = '{}/{}.wav'.format(path, stem)
        shutil.copy2(source, target)

In [11]:
# Scrape the language names from the site (network request via get_languages).
languages = get_languages()

In [12]:
# Display the scraped language names.
languages

[u'aceh',
 u'afrikaans',
 u'agni',
 u'agny',
 u'akan',
 u'albanian',
 u'amazigh',
 u'american sign language',
 u'amharic',
 u'ancient greek',
 u'antigua and barbuda creole english',
 u'anyin',
 u'appolo',
 u'arabic',
 u'aramaic',
 u'armenian',
 u'aromanian',
 u'ashanti',
 u'asl',
 u'azerbaijani',
 u'azerbaijani, south',
 u'azeri turk',
 u'babur',
 u'bafang',
 u'baga',
 u'bahasa indonesia',
 u'bai',
 u'balant',
 u'balanta ganja',
 u'bamanankan',
 u'bambara',
 u'bamun',
 u'banganthe',
 u'bangla',
 u'baoule',
 u'bari',
 u'basque',
 u'bassa',
 u'bavarian',
 u'belarusan',
 u'bengali',
 u'bikol',
 u'bislama',
 u'bosnian',
 u'bulgarian',
 u'burmese',
 u'cameroon creole english',
 u'cantonese',
 u'carolinian',
 u'catalan',
 u'cebuano',
 u'chaam',
 u'chagga',
 u'chaldean',
 u'chaldean neo aramaic',
 u'chamorro',
 u'chichewa',
 u'chin, mizo',
 u'chinese',
 u'chittagonian',
 u'chuukese',
 u'classical greek',
 u'cotocoli',
 u'creole',
 u'creole french',
 u'crioulo',
 u'croatian',
 u'czech',
 u'dan

In [None]:
# Inspect the type of a single scraped name.
type(languages[0])

In [None]:
# Build (language, num_speakers) tuples; get_num fetches one page per language.
formatted_lang = get_formatted_languages(languages)

In [None]:
# Display the (language, num_speakers) tuples.
formatted_lang

In [None]:
# Download every recording; files are saved as {language}{n}.mp3 in the working directory.
mp3getter(formatted_lang)