In [1]:
from fastapi import FastAPI, Query
from fastapi.responses import FileResponse, HTMLResponse
from spacy import displacy
from typing import List
import pandas as pd
import numpy as np
import uvicorn
import spacy
import json
from collections import defaultdict

# Keyword Search Function

### 1. Take a keyword and language, search the keyword in English corpus, and return parallel paragraphs

In [2]:
def open_file_return_corpus(path):
    f = open(path, encoding="utf-8")
    corpus = json.load(f)
    f.close()
    return corpus

In [3]:
paths = ["../transcripts/en/final/final_annotations.json",
         "../transcripts/ko/filtered/filtered_annotated_ted_talks_ko.json", 
         "../transcripts/zh-cn/filtered/filtered_annotated_ted_talks_cn.json"]

en_corpus = open_file_return_corpus(paths[0])
ko_corpus = open_file_return_corpus(paths[1])
cn_corpus = open_file_return_corpus(paths[2])

In [4]:
valid_labels = {"PERSON", "ORG", "EVENT", "PRODUCT", "LOC", "WORK_OF_ART"}

In [5]:
def find_paragraphs(keyword, language='all'):
    """find paragraphs of parallel corpus, given keyword and language"""
    parallel_corpora = defaultdict(dict)
    df = None
    # find a paragraph in a corpus
    count = 0
    for i, talk in enumerate(en_corpus):
        for k, para in enumerate(talk["text"]):
            if keyword.lower() in para["text"].lower():
                parallel_corpora[count]["talk_id"] = i
                parallel_corpora[count]["title"] = " ".join(
                    talk["title"].split("_")
                ).title()
                parallel_corpora[count]["en"] = para["text"][:500] + "..."
                parallel_corpora[count]["para_id"] = k

                ents = set()
                for ent in para["ents"]:
                    if ent["label"] in valid_labels:
                        ents.add((ent["text"], ent["label"]))

                parallel_corpora[count]["entities"] = list(ents)
                
                count+=1
    
    # language option
    print(language)
    if language == "all":
        for key, para_dict in parallel_corpora.items():
            talk_id = para_dict["talk_id"]
            para_id = para_dict["para_id"]
            ko_para = ko_corpus[talk_id]["text"][para_id]
            cn_para = cn_corpus[talk_id]["text"][para_id]

            parallel_corpora[key]["ko"] = ko_para["text"][:500] + "..."
            parallel_corpora[key]["zh-cn"] = cn_para["text"][:500] + "..."
        df = pd.DataFrame(parallel_corpora).T[
            ["title", "en", "ko", "zh-cn", "entities", "para_id"]
        ]
    if language == "ko":
        for key, para_dict in parallel_corpora.items():
            talk_id = para_dict["talk_id"]
            para_id = para_dict["para_id"]
            ko_para = ko_corpus[talk_id]["text"][para_id]
            
            parallel_corpora[key]["ko"] = ko_para["text"][:500] + "..."
        df = pd.DataFrame(parallel_corpora).T[
            ["title", "en", "ko", "entities", "para_id"]
        ]
    if language == "cn":
        for key, para_dict in parallel_corpora.items():
            talk_id = para_dict["talk_id"]
            para_id = para_dict["para_id"]
            cn_para = cn_corpus[talk_id]["text"][para_id]
            
            parallel_corpora[key]["zh-cn"] = cn_para["text"][:500] + "..."
        df = pd.DataFrame(parallel_corpora).T[
            ["title", "en", "zh-cn", "entities", "para_id"]
        ]
        
    if language == "en_only":
        df = pd.DataFrame(parallel_corpora).T[["title", "en", "entities", "para_id"]]
    
    return df

In [6]:
find_paragraphs("iPhone", 'ko')

ko


Unnamed: 0,title,en,ko,entities,para_id
0,Golan Levin Art That Looks Back At You,Hello! My name is Golan Levin. I'm an artist a...,안녕하세요. 저는 Golan Levin입니다. 저는 예술가이자 공학자입니다. 저처럼...,"[(Golan Levin, PERSON), (Apple, ORG), (Artforu...",0
1,Rives The 4 A M Mystery,1932. Not just the earliest cryptic reference ...,1932년. 새벽 4시에 대해 제가 찾을 수 있는 가장 최초의 숨은 참고문은 아닐겁...,"[(The Giacometti Code, EVENT), (iPhones, PERSO...",11
2,Hasan Elahi Fbi Here I Am,"Now it's all done on my iPhone, and it all goe...",지금은 모든 것이 제 아이폰으로 이루어지죠. 그리고 이 모든 것은 제 서버로 바로 ...,"[(Maine, LOC), (Skowhegan, LOC), (iPhone, PROD...",15
3,Hasan Elahi Fbi Here I Am,"Bruno Giussani: Hasan, just curious. You said,...","Bruno Giussani: 하산, 그냥 궁금해서 그러는데, 당신은 ""지금은 모든 ...","[(iPhone, PRODUCT), (Hasan, PERSON), (Bruno Gi...",22
4,Greg Gage The Cockroach Beatbox,"When you think about the brain, it's difficult...",(음악) 뇌를 생각하면 이해하기 어려워요. 제가 지금 여러분께 심장이 어떤 일을 하...,"[(GG, PERSON), (Spikerbox, ORG), (Greg Gabe, P...",0
5,Beth Noveck Demand A More Open Source Government,And I want to be clear to mention that this op...,그리고 저는 분명하게 말하고 싶습니다. 이 열린 정부를 위한 혁명은 정부를 민영화하...,"[(Texas, LOC), (the State of Texas, LOC), (Dal...",14
6,Beth Noveck Demand A More Open Source Government,So a lot of these innovations are local. In Sa...,그래서 이런 혁신들은 대부분 지역적입니다. 캘리포니아주 산 라몬에서 그들은 아이폰 ...,"[(San Ramon, LOC), (iPhone, PRODUCT), (CPR, OR...",16
7,Sherry Turkle Connected But Alone,"Over and over I hear, ""I would rather text tha...","제가 끊임없이 듣는 말이, ""말하느니 그냥 문자할래요."" 입니다. 그리고 제가 보는...","[(Apple, ORG), (Facebook, ORG), (iPhone, PRODU...",11
8,Stephen Wolfram Computing A Theory Of All Know...,"Well, a crucial idea of Wolfram Alpha is that ...",울프램 알파의 핵심적인 아이디어 하나는 일반적인 인간의 언어를 통해 질문을 할 수 ...,"[(Wolfram Alpha, PERSON), (iPhone, PRODUCT)]",12
9,Molly Crockett Beware Neuro Bunk,So the first unproven claim is that you can us...,첫 번째 증명되지 않은 주장은 뇌스캔을 이용하여 사람들의 생각과 감정을 읽어낼 수 ...,"[(The New York Times, ORG), (You Love Your iPh...",15


# Entity Search Function

take an entity, return dataframe which contains its contexts

In [None]:
# 1. truncate sentence around the entity
# 2. language option

In [71]:
def find_entity(entity):
    '''only show two sentences around the entity'''
    entity_dict = defaultdict(dict)
    df = None
    count = 0
    for i, talk in enumerate(en_corpus):
        for k, para in enumerate(talk["text"]):
            for j, ent in enumerate(para["ents"]):
                if ent["text"].lower() == entity.lower() and ent["label"] in valid_labels:
                    entity_dict[count]["talk_id"] = i
                    entity_dict[count]["title"] = " ".join(talk["title"].split("_")).title()
                    entity_dict[count]["para_id"] = k
                    entity_dict[count]["entity"] = ent["text"]
                    entity_dict[count]["label"] = ent["label"]
                    entity_dict[count]["start_index"] = ent["start"]
                    entity_dict[count]["end_index"] = ent["end"]
                    try:
                        entity_dict[count]["text"] = " ".join([para["text"][:ent["start"]].split(".")[-1], ent["text"], para["text"][ent["end"]:].split(".")[0], "."])
                    except IndexError:
                        entity_dict[count]["text"] = para["text"] 
                    count += 1

    df = pd.DataFrame(entity_dict).T[["title", "entity", "label", "text"]]
    return df

In [72]:
x = find_entity("iPhone")
x

Unnamed: 0,title,entity,label,text
0,Golan Levin Art That Looks Back At You,iPhone,PRODUCT,And somehow the idea that one would want to m...
1,Hasan Elahi Fbi Here I Am,iPhone,PRODUCT,"Now it's all done on my iPhone , and it all g..."
2,Hasan Elahi Fbi Here I Am,iPhone,PRODUCT,"You said, ""Now everything automatically goes ..."
3,Beth Noveck Demand A More Open Source Government,iPhone,PRODUCT,So what is Texas doing? They're asking you an...
4,Beth Noveck Demand A More Open Source Government,iPhone,PRODUCT,"In San Ramon, California, they published an ..."
5,Sherry Turkle Connected But Alone,iPhone,PRODUCT,"So for example, many people share with me thi..."
6,Stephen Wolfram Computing A Theory Of All Know...,iPhone,PRODUCT,And if you look at things like the iPhone a...
7,Paul Romer Why The World Needs Charter Cities,iPhone,PRODUCT,How could we have buildings that are low cost...
8,Matt Killingsworth Want To Be Happier Stay In ...,iPhone,PRODUCT,"org, it uses the iPhone to monitor people's ..."
9,Roz Savage Why I M Rowing Across The Pacific,iPhone,PRODUCT,We're going to make an iPhone app out of it


In [88]:
def find_entity_or_label(keyword, mode="entity"):
    '''only show two sentences around the entity'''
    entity_dict = defaultdict(dict)
    df = None
    count = 0
    for i, talk in enumerate(en_corpus):
        for k, para in enumerate(talk["text"]):
            for j, ent in enumerate(para["ents"]):
                
                if mode == "entity":
                    search = ent["text"].lower()
                elif mode == "label":
                    if ent["label"] in valid_labels:
                        search = ent["label"].lower()
                        
                if search == keyword.lower() and ent["label"] in valid_labels:
                    entity_dict[count]["talk_id"] = i
                    entity_dict[count]["title"] = " ".join(talk["title"].split("_")).title()
                    entity_dict[count]["para_id"] = k
                    entity_dict[count]["entity"] = ent["text"]
                    entity_dict[count]["label"] = ent["label"]
                    entity_dict[count]["start_index"] = ent["start"]
                    entity_dict[count]["end_index"] = ent["end"]
                    try:
                        entity_dict[count]["text"] = " ".join([para["text"][:ent["start"]].split(".")[-1], ent["text"], para["text"][ent["end"]:].split(".")[0], "."])
                    except IndexError:
                        entity_dict[count]["text"] = para["text"] 
                    count += 1

    df = pd.DataFrame(entity_dict).T[["title", "entity", "label", "text"]]
    return df

In [89]:
find_entity_or_label("PRODUCT", "label")

Unnamed: 0,title,entity,label,text
0,Annie Bosler And Don Greene How To Practice Ef...,Myelin,PRODUCT,Myelin is similar to insulation on electric...
1,Mark Applebaum The Mad Scientist Of Music,S-tog,PRODUCT,I can also understand that sort of implicit c...
2,Tod Machover Dan Ellsey Inventing Instruments ...,these Music Shapers,PRODUCT,"So, we make squeezy instruments, like these ..."
3,Naif Al Mutawa Superheroes Inspired By Islam,series,PRODUCT,"But the biggest thing we've done to date, whic..."
4,Naif Al Mutawa Superheroes Inspired By Islam,Scooby Doo,PRODUCT,"This is one of my sons, Rayan, who's a Scooby..."
...,...,...,...,...
187,John Underkoffler Pointing To The Future Of Ui,Urp,PRODUCT,"And finally, to pull out all the stops, this i..."
188,John Underkoffler Pointing To The Future Of Ui,TAMPER,PRODUCT,"This is a system called TAMPER , which is a ..."
189,Bill Joy What I M Worried About What I M Excit...,Kevlar,PRODUCT,"In two dimensions, if you make, like, a fabr..."
190,Margaret Gould Stewart How Youtube Thinks Abou...,YouTube,PRODUCT,And we do this every time that a video is upl...


# Label Search Function

take a set of labels, return dataframe which contains its contexts

In [9]:
def find_labels(labels):
    labels_dict = defaultdict(dict)

    for i, talk in enumerate(en_corpus):
        for k, para in enumerate(talk["text"]):
            for j, ent in enumerate(para["ents"]):
                if ent["label"] in labels and ent["label"] in valid_labels:
                    labels_dict[i]["title"] = " ".join(talk["title"].split("_")).title()
#                     entity_dict[i]["para_id"] = k
                    labels_dict[i]["entity"] = ent["text"]
                    labels_dict[i]["label"] = ent["label"]
                    labels_dict[i]["start_index"] = ent["start"]
                    labels_dict[i]["end_index"] = ent["end"]
                    labels_dict[i]["text"] = para["text"] #"..." + para["text"][ent["start"]-400:ent["end"]+400] + "..."
    df = pd.DataFrame(labels_dict).T[["title", "label", "entity", "text"]].sort_values(by=["label"])
    return df

In [10]:
find_labels(["PERSON", "EVENT"])

Unnamed: 0,title,label,entity,text
329,Margaret Gould Stewart How Youtube Thinks Abou...,EVENT,"The ""JK Wedding [Entrance] Dance""",So their little wedding video went on to get o...
36,Vusi Mahlasela Thula Mama,EVENT,♫ ♫,"♫ My song of love and my song of life, my song..."
117,Esther Duflo Social Experiments To Fight Poverty,EVENT,no Industrial Revolution,Consider for example transporting goods. Befor...
105,Noah Feldman Politics And Religion Are Technol...,EVENT,the Cold War,And many of these Muslims further say that the...
210,Ivan Krastev Can Democracy Exist Without Trust,EVENT,the Cold War,And if you go to 1989 — something that basical...
...,...,...,...,...
107,Lemn Sissay A Child Of The State,PERSON,Norman,"""Because you don't love us, Norman, clearly yo..."
106,Chris Abani On Humanity,PERSON,Vusi,"There are some of you in this room, amazing pe..."
104,Amory Lovins A 40 Year Plan For Energy,PERSON,Edgar Woolard,Now our team at RMI helps smart companies to g...
101,Nellie Mckay Clonie,PERSON,Clonie,"♫ Gee, that's swell. I guess you're just my fa..."


# DisplaCy

In [24]:
nlp = spacy.load("zh_core_web_sm")

def render_paragraph(keyword, index, render_language):
    if render_language == 'en':
        df = find_paragraphs(keyword, "en_only")
        title = df.iloc[index]["title"].lower().replace(" ", "_")
        para_id = df.iloc[index]["para_id"]
        for talk in en_corpus:
            if talk["title"] == title:
                paragraph = talk["text"][para_id]
                print(paragraph)
        return displacy.render(paragraph, style="ent", manual=True)
    elif render_language == 'cn':
        print("CNCNCNCNCNCNCNCCNCNCCNNCNCNCCN")
        df = find_paragraphs(keyword, "cn")
        title = df.iloc[index]["title"].lower().replace(" ", "_")
        para_id = df.iloc[index]["para_id"]
        for talk in cn_corpus:
            if talk["title"] == title:
                paragraph = talk["text"][para_id]
                paragraph = nlp(paragraph['text'])
#                 print(paragraph)
# #                 paragraph = 
        return displacy.render(paragraph, style="ent", options={"ents":["PEOPLE", "LOC", "ORG", "EVENT", "WORK_OF_ART", "PRODUCT"]})

In [27]:
render_paragraph("art", 1, 'cn')

CNCNCNCNCNCNCNCCNCNCCNNCNCNCCN
cn
