In [1]:
import json
import pandas as pd
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
# Load the small English spaCy pipeline once at module level; extract_named_entity
# below calls this shared `nlp` object on every paragraph.
nlp = en_core_web_sm.load()

In [2]:
# Load the raw TED transcripts for each language. Context managers guarantee
# the file handle is released even if json.load raises on malformed input.
file_path = "../transcripts/en/ted_talks_en.json"
with open(file_path, encoding="utf-8") as f:
    file_en = json.load(f)

file_path = "../transcripts/zh-cn/ted_talks_zh-cn.json"
with open(file_path, encoding="utf-8") as f:
    file_cn = json.load(f)

file_path = "../transcripts/ko/ted_talks_ko.json"
with open(file_path, encoding="utf-8") as f:
    file_ko = json.load(f)

### Create annotated JSON corpus

In [144]:
def process_text(text):
    """Split a raw transcript into a list of cleaned paragraphs.

    Newlines are turned into spaces, runs of spaces are collapsed to a single
    space, and the text is split on the literal ``[PARAGRAPH]`` marker.

    Args:
        text: Transcript string containing ``[PARAGRAPH]`` separators.

    Returns:
        List of paragraph strings with surrounding spaces removed. Fragments
        of length <= 1 (before stripping) are discarded, as are fragments
        that are empty once stripped.
    """
    flat = text.replace("\n", " ").strip(" ")
    # A single replace("  ", " ") only halves each run of spaces, so three or
    # more consecutive spaces/newlines would survive; loop until none remain.
    while "  " in flat:
        flat = flat.replace("  ", " ")

    cleaned = []
    for para in flat.split("[PARAGRAPH]"):
        if len(para) <= 1:
            continue
        stripped = para.strip(" ")
        # Guard against whitespace-only fragments leaking through as "".
        if stripped:
            cleaned.append(stripped)
    return cleaned

In [122]:
# Build the English corpus frame and replace each transcript string with the
# paragraph list produced by process_text.
en_corpus = pd.DataFrame(file_en).assign(
    text=lambda frame: frame["text"].apply(process_text)
)

In [145]:
# Build the Korean corpus frame and replace each transcript string with the
# paragraph list produced by process_text.
ko_corpus = pd.DataFrame(file_ko).assign(
    text=lambda frame: frame["text"].apply(process_text)
)

In [146]:
# Build the Chinese corpus frame and replace each transcript string with the
# paragraph list produced by process_text.
cn_corpus = pd.DataFrame(file_cn).assign(
    text=lambda frame: frame["text"].apply(process_text)
)

In [147]:
def convert_corpus_to_list(corpus):
    """Convert a corpus DataFrame into a list of per-row dictionaries.

    Args:
        corpus: pandas DataFrame with one row per talk.

    Returns:
        List of ``{column: value}`` dicts, one per row, in row order.
    """
    # to_dict(orient="records") is the built-in equivalent of a row-wise
    # apply(lambda row: row.to_dict(), axis=1): it avoids building a Series
    # per row and keeps each column's own value type instead of coercing the
    # whole row to a common dtype.
    return corpus.to_dict(orient="records")

In [148]:
def write_json(corpus, file_path, ensure_ascii=False):
    """Serialize ``corpus`` to ``file_path`` as indented UTF-8 JSON.

    Args:
        corpus: JSON-serializable object (here, a list of talk dicts).
        file_path: Destination path; the file is created or overwritten.
        ensure_ascii: When False (default), non-ASCII characters are written
            as-is so Korean/Chinese text stays readable in the output file.
            Pass True to emit ``\\uXXXX`` escapes instead.
    """
    with open(file_path, 'w', encoding='utf-8') as json_file:
        # The file is already opened as UTF-8, so escaping non-ASCII text
        # would only bloat the output and make it unreadable.
        json.dump(
            corpus,
            json_file,
            indent=4,
            separators=(',', ':'),
            ensure_ascii=ensure_ascii,
        )

In [149]:
def extract_named_entity(talk):
    """Annotate every paragraph of a talk with its spaCy named entities.

    Args:
        talk: Iterable of paragraph strings.

    Returns:
        List with one dict per paragraph: ``"text"`` holds the paragraph and
        ``"ents"`` holds a list of entity dicts with the keys ``"start"``,
        ``"end"``, ``"text"`` and ``"label"`` taken from the entity's
        ``start_char``, ``end_char``, ``text`` and ``label_`` attributes.
    """
    def _describe(entity):
        # Flatten one entity span into a plain, JSON-friendly dict.
        return {
            "start": entity.start_char,
            "end": entity.end_char,
            "text": entity.text,
            "label": entity.label_,
        }

    # Runs the module-level `nlp` pipeline once per paragraph.
    return [
        {
            "text": paragraph,
            "ents": [_describe(entity) for entity in nlp(paragraph).ents],
        }
        for paragraph in talk
    ]

In [150]:
def extract_paragraph(talk):
    """Wrap each paragraph of a talk in the annotated-paragraph structure.

    No entity extraction is performed; every paragraph gets an empty
    ``"ents"`` list.

    Args:
        talk: Iterable of paragraph strings.

    Returns:
        List of ``{"text": paragraph, "ents": []}`` dicts, one per paragraph,
        each with its own independent ``"ents"`` list.
    """
    return [{"text": paragraph, "ents": []} for paragraph in talk]

In [151]:
# Only the English paragraphs go through named-entity extraction; the Korean
# and Chinese paragraphs are wrapped with an empty entity list.
for frame, annotate in (
    (en_corpus, extract_named_entity),
    (ko_corpus, extract_paragraph),
    (cn_corpus, extract_paragraph),
):
    frame["text"] = frame["text"].apply(annotate)

In [156]:
# Turn each corpus frame into a list of per-talk dicts, ready for JSON output.
en_corpus_list, ko_corpus_list, cn_corpus_list = (
    convert_corpus_to_list(frame)
    for frame in (en_corpus, ko_corpus, cn_corpus)
)

In [157]:
# Destination files for the annotated corpora, one per language.
en_file_path = "../transcripts/en/annotated/annotated_ted_talks_en.json"
ko_file_path = "../transcripts/ko/annotated/annotated_ted_talks_ko.json"
cn_file_path = "../transcripts/zh-cn/annotated/annotated_ted_talks_cn.json"

# Write the corpora in the same order as before: English, Korean, Chinese.
for corpus_list, out_path in (
    (en_corpus_list, en_file_path),
    (ko_corpus_list, ko_file_path),
    (cn_corpus_list, cn_file_path),
):
    write_json(corpus_list, out_path)