### Goal: to create a seq2seq model that returns a mainland Chinese translation for a given Taiwanese Mandarin sequence

In [1]:
from ted_crawler import TedCrawler
from pprint import pprint
import json
from utils import compare_times

### Building a corpus

#### Scraping data
1. Source: [Ted Talks](https://www.ted.com/talks)
2. Look for API
    - if found: break
2. Look for a pattern.
    - Examine network activity.
    - Look for a transcript file.
3. Write a program that exploits the pattern.
4. Profit.

In [4]:
# Instantiate crawler
crawler = TedCrawler()

Checking page 1...
Bookmark not found.


In [7]:
# Get a list of talk meta data
talk_list = crawler.get_talk_list()

In [22]:
# Get details for a talk
talk = crawler.get_talk_details(talk_list[20])
pprint(talk_details)

{'name': 'Frans de Waal',
 'post_date': 'Jun 2018 Rated Fascinating, Ingenious',
 'tags': None,
 'talk_link': '/talks/frans_de_waal_the_surprising_science_of_alpha_males',
 'title': 'The surprising science of alpha males',
 'transcripts': {}}


In [15]:
# Show list of languages on TED
crawler.get_languages()

{'Afrikaans': 'af',
 'Albanian': 'sq',
 'Algerian Arabic': 'arq',
 'Amharic': 'am',
 'Arabic': 'ar',
 'Armenian': 'hy',
 'Assamese': 'as',
 'Asturian': 'ast',
 'Azerbaijani': 'az',
 'Basque': 'eu',
 'Belarusian': 'be',
 'Bengali': 'bn',
 'Bislama': 'bi',
 'Bosnian': 'bs',
 'Bulgarian': 'bg',
 'Burmese': 'my',
 'Catalan': 'ca',
 'Cebuano': 'ceb',
 'Chinese, Simplified': 'zh-cn',
 'Chinese, Traditional': 'zh-tw',
 'Chinese, Yue': 'zh',
 'Creole, Haitian': 'ht',
 'Croatian': 'hr',
 'Czech': 'cs',
 'Danish': 'da',
 'Dutch': 'nl',
 'Dzongkha': 'dz',
 'English': 'en',
 'Esperanto': 'eo',
 'Estonian': 'et',
 'Filipino': 'fil',
 'Finnish': 'fi',
 'French': 'fr',
 'French (Canada)': 'fr-ca',
 'Galician': 'gl',
 'Georgian': 'ka',
 'German': 'de',
 'Greek': 'el',
 'Gujarati': 'gu',
 'Hakha Chin': 'cnh',
 'Hausa': 'ha',
 'Hebrew': 'he',
 'Hindi': 'hi',
 'Hungarian': 'hu',
 'Hupa': 'hup',
 'Icelandic': 'is',
 'Igbo': 'ig',
 'Indonesian': 'id',
 'Ingush': 'inh',
 'Irish': 'ga',
 'Italian': 'it',
 'J

In [23]:
# Get subtitles for a talk
crawler.get_subtitles(talk, 'zh-tw', 'zh-cn')

zh-cn not found.


{'talk_link': '/talks/susan_emmett_this_simple_test_can_help_kids_hear_better',
 'transcripts': {'zh-tw': {'paragraphs': [{'cues': [{'time': 1246,
       'text': '聽見聲音對於阿拉斯加的'},
      {'time': 3881, 'text': '原住民而言很重要。'}]},
    {'cues': [{'time': 6718, 'text': '聽力缺損會造成在開放水域捕魚、\n獵殺北美馴鹿，'},
      {'time': 11103, 'text': '以及採收莓類變得更困難，'},
      {'time': 13629, 'text': '這些活動對於阿拉斯加\n原住民文化而言都很重要。'},
      {'time': 18279, 'text': '並不是只有阿拉斯加農村的人\n才有聽力缺損問題。'},
      {'time': 21582, 'text': '它是全球性的問題。'},
      {'time': 23290, 'text': '「全球性疾病負擔研究」\n估計全世界有 11 億人'},
      {'time': 29493, 'text': '帶著聽力缺損在過生活。'},
      {'time': 32329, 'text': '這個數目比整個\n撒哈拉以南非洲的人口還多。'},
      {'time': 38198, 'text': '這些人當中，超過 80%\n都住在中低收入的國家，'},
      {'time': 42060, 'text': '許多人沒有辦法取得聽力照護。'}]},
    {'cues': [{'time': 46670, 'text': '他們的生活受到很大的衝擊。'},
      {'time': 51050, 'text': '阿努克是我在阿拉斯加\n治療的一位三歲男孩。'},
      {'time': 55596, 'text': '當他才幾乎要四個月大時，\n就開始出現耳朵感染。'},
      {'time': 59872, 'text': '他的父母帶他去診所，'},
      {'tim

In [2]:
# All transcripts
with open("TED_transcripts_1.json", encoding="utf8") as fp:
    transcripts = json.load(fp)

print(f"Total number of TED talks with zh-tw and zh-cn: {len(transcripts)}")

Total number of TED talks with zh-tw and zh-cn: 2771


#### JSON structure

In [3]:
for key in transcripts:
    print(key)

How I turn negative online comments into positive offline conversations
What I've learned about parenting as a stay-at-home dad
How work kept me going during my cancer treatment
A woman's fury holds lifetimes of wisdom
Visions of Africa's future, from African filmmakers
War and what comes after
SpaceX's plan to fly you across the globe in 30 minutes
A Parkland teacher's homework for us all
Why it's worth listening to people you disagree with
The "dead zone" of the Gulf of Mexico
The harm reduction model of drug addiction treatment
A printable, flexible, organic solar cell
What's missing in the global debate over refugees
What if we ended the injustice of bail?
How we need to remake the internet
How the arts help homeless youth heal and build
How language shapes the way we think
How a team of chefs fed Puerto Rico after Hurricane Maria
The Standing Rock resistance and our fight for indigenous rights
How I use the drum to tell my story
Should we create a solar shade to cool the earth?
To

In [4]:
t1 = transcripts['How I turn negative online comments into positive offline conversations']
print(t1.keys())

dict_keys(['talk_link', 'transcripts', 'name', 'title', 'post_date', 'tags'])


In [5]:
print(t1['transcripts'].keys())

dict_keys(['zh-cn', 'zh-tw'])


In [6]:
pprint(t1['transcripts']['zh-tw'])

{'paragraphs': [{'cues': [{'text': '嗨。', 'time': 476},
                          {'text': '我在網上常常會收到仇恨。', 'time': 2848},
                          {'text': '很多的仇恨。', 'time': 6373},
                          {'text': '這和我工作的領域有關。', 'time': 7563},
                          {'text': '我是數位創作者，', 'time': 10227},
                          {'text': '我專門為網路製做東西。', 'time': 11806},
                          {'text': '比如，幾年前，我做了一系列\n影片，叫做《每一個字》，',
                           'time': 15292},
                          {'text': '我把熱門電影拿來編輯，', 'time': 19111},
                          {'text': '縮減成由有色人種說出的幾個字，', 'time': 21253},
                          {'text': '這是種憑經驗且可以獲得的方式，', 'time': 24125},
                          {'text': '目的是在談好萊塢的人種比例問題。', 'time': 26673},
                          {'text': '後來，在跨性別恐懼的廁所法案', 'time': 30076},
                          {'text': '開始得到美國媒體的注意力，', 'time': 33506},
                          {'text': '我主持並製作了一系列的訪談，', 'time': 37069},
                          {'text'

In [7]:
pprint(t1['transcripts']['zh-cn'])

{'paragraphs': [{'cues': [{'text': '嗨。', 'time': 476},
                          {'text': '我在网上收到了仇恨的情绪。', 'time': 2848},
                          {'text': '很多很多。', 'time': 6373},
                          {'text': '这跟我的工作领域有关。', 'time': 7303},
                          {'text': '我是一名数字创作者，', 'time': 10197},
                          {'text': '专门制作网络作品。', 'time': 11866},
                          {'text': '大概几年前，我制作了一个\n视频系列，名叫“每字每句“，',
                           'time': 15082},
                          {'text': '是把流行影片剪辑成', 'time': 19139},
                          {'text': '只有非白人演员说台词的短视频，', 'time': 21309},
                          {'text': '以此来直观实际地讨论\n好莱坞的有色群体代表', 'time': 24161},
                          {'text': '这一话题。', 'time': 28593},
                          {'text': '后来，由于“跨性别厕所令”', 'time': 30076},
                          {'text': '开始在全美范围引发媒体关注，', 'time': 33552},
                          {'text': '我主持并制作了一个访谈系列，', 'time': 37095},
                          {'text': '名叫

In [8]:
pprint(compare_times(t1))

Total number of lines in "How I turn negative online comments into positive offline conversations"
TW:236	CN:236
None


### Because most of the lines were aligned, I didn't do any other alignment (and because I don't know how).

#### Cleaning
- I removed punctuation and saved each aligned text on a separate line in a text file.

In [14]:
with open("tw_cn_lines_cleaned_and_aligned.txt", encoding="utf8") as fp:
    clean_trans = fp.read().split("\n")

In [16]:
print(f"Total tw-cn pairs: {len(clean_trans)}")

Total tw-cn pairs: 658726


### seq2seq

- I used Jason Brownlee's [How to Develop a Neural Machine Translation System from Scratch](https://machinelearningmastery.com/develop-neural-machine-translation-system-keras/)

- Ran into a problem with numpy arrays; ultimately split tw and cn into separate arrays and combined them later
- Better than last semester, but still worthless...
- Still ran into memory issues