In [1]:
"""
Get articles/texts, and filter only sentences with white-listed vocab OR named entities.

Contents:
-load packages
-create helper functions
-data intialization from csv file of words &
 extend vocab set
-build data structure to house info moving forward
-write to .json formats
-read from existing .json formats
-get white-listed of words 
-scrape NYT
"""

'\nGet articles/texts, and filter only sentences with white-listed vocab OR named entities.\n\nContents:\n-load packages\n-create helper functions\n-data intialization from csv file of words &\n extend vocab set\n-build data structure to house info moving forward\n-write to .json formats\n-read from existing .json formats\n-get white-listed of words \n-scrape NYT\n'

In [175]:
import bs4, requests, sys, codecs, urllib.request, re
from bs4 import SoupStrainer
from bs4.element import Comment
import random
import string
import json
import pandas as pd
import os
from os import listdir
from os.path import isfile, join
import sklearn
import jieba
import translators as ts
import pinyin
import pprint
pp = pprint.PrettyPrinter()
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', 0)

In [176]:
path = "/Users/elyebliss/Desktop/Vocabulary/language_learning/vocab_dfs/"
source_file = "chinese.json"

In [177]:
##METHODS

def tag_visible(element):
    if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
        return False
    if isinstance(element, Comment):
        return False
    return True


def text_from_html(body):
    soup = bs4.BeautifulSoup(body, 'html.parser')
    texts = soup.findAll(text=True)
    visible_texts = filter(tag_visible, texts)  
    return u" ".join(t.strip() for t in visible_texts)



user_agent = 'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0'
headers={'User-Agent':user_agent,}
parser = 'html.parser'  # or 'lxml' (preferred) or 'html5lib', if installed

In [178]:
def get_known(unknown_list):
    
    count_got = 0
    known_list = []
    for word in unknown_list:
        decision = str(input(word+"\nKnown =k"))
        if decision =='k':
            known_list.append(word)
            count_got +=1
            print("got "+str(count_got))
        elif decision=='q':
            break
        try:
            print(ts.google(word))
            print(pinyin.get(word))
        except:
            print('cant find')
    return known_list

In [179]:
def filter_text(webpage,start=None,stop=None,text=None,print_word_lvl=False,\
               return_percent=False):


    try:
        if text is None:
            request=urllib.request.Request(webpage,None,headers) #The assembled request
            response = urllib.request.urlopen(request)
            data = response.read()
            contents = text_from_html(data)
        else:
            contents = webpage

        known_array = []
        unk_array = []
        contents_array = re.split("。|\n",contents)

        if (start is not None) and (stop is not None):
            contents_array=contents_array[max(start,0):min(stop,len(contents_array))]

        disallowed_words = set()

        total_words = 0
        unknown_words = 0

        lines = []
        unknowns = []
        for line in contents_array:

            if bool(re.search(r'[\u4e00-\u9fff]',line)):

                tokenized = list(jieba.cut((line)))

                add_line = True

                unk_str = ""
                for word in tokenized:

                    total_words +=1

                    if bool(re.search(r'[\u4e00-\u9fff]',word)) and not ((word in vocab['white_listed']) or\
                                                                (word in vocab['black_listed'])):

                        unk_str += '"'+word+'"'+", "
                        add_line = False
                        disallowed_words.add(word)
                        unknown_words +=1

                if add_line:
                    known_array.append(line)
                    unk_array.append("...")
                else:
                    known_array.append("...")
                    unk_array.append(line)

                if len(unk_str)>0:
                    unk_str = unk_str[0:len(unk_str)-2]

                unknowns.append(unk_str)

        if print_word_lvl and total_words>0:
            print("word-level % known = "+str((1-(unknown_words/total_words))*100))

        if return_percent and total_words>0:
            return (1-(unknown_words/total_words))*100

        return_pd = pd.DataFrame(list(zip(known_array,unk_array,unknowns)))
        if len(return_pd)==0:
            return_pd = pd.DataFrame(pd.np.empty((0, 3)))
        return_pd.columns = ["knowns","unknowns","unk_words"]



        with open(path+'unknown_chinese_list.txt',"w") as outfile:
            outfile.write(str(set(disallowed_words)))
        return return_pd

    except:
        pass
    
    
    

#News: NYTimes

In [180]:
#read from existing .json formats
with open(path+source_file, "r") as path_in:
    vocab = json.loads(path_in.read())
vocab['white_listed'] = set(vocab['white_listed'])
vocab['unseparable'] = set(vocab['unseparable'])

#get 10k freq word info:
"""
3070
8337
"""
with open(path+'ch_freq.csv','r') as infile:
    freq_df = pd.read_csv(infile)
uncovered = []
for word in freq_df.word:
    if word not in vocab['white_listed']:
        uncovered.append(word)
print(len(uncovered))
print(len(vocab['white_listed']))

3070
8337


In [181]:
#NYtimes
request=urllib.request.Request('https://cn.nytimes.com/',None,headers)
resp = urllib.request.urlopen(request)
soup = bs4.BeautifulSoup(resp, parser, from_encoding=resp.info().get_param('charset'))

#and ('/china/' in str(link['href']) or '/usa/' in str(link['href']) or '/culture/' in str(link['href']) or '/opinion/' in str(link['href']) or '/morning-brief/' in str(link['href']) or '/technology/' in str(link['href']) or '/business/' in str(link['href']) or '/world/' in str(link['href']) or '/style/' in str(link['href'])):
pages = []
titles = []
seen = set()
for link in soup.find_all('a', href=True):
    if bool(re.search('[0-9]',str(link['href']))):
        if 'https://' in str(link['href']):
            page_append = str(link['href'])
            if page_append not in seen:
                seen.add(page_append)
                pages.append(page_append)

                subs = re.findall('(?<=/).*?(?=\/)',page_append)
                titles.append(subs[len(subs)-1])
        else:
            page_append = 'https://cn.nytimes.com'+str(link['href'])
            if page_append not in seen:
                seen.add(page_append)
                pages.append(page_append)
                subs = re.findall('(?<=/).*?(?=\/)',page_append)
                titles.append(subs[len(subs)-1])
#pages = list(dict.fromkeys(pages))

#titles

In [148]:
#load prev corpus and add
known_corpus = set()
with open(path+"all_known_chinese_news_lines.txt","r",encoding='utf-8') as infile:
    for line in infile.read().split('\n'):
        known_corpus.add(line)
len(known_corpus)

779

In [182]:
#Find % of all sentences of news
knowns = []
all_lines = 0

known_percents = []

for webpage in pages:
    
    test_df = filter_text(webpage)
    if test_df is not None:
        all_lines += len(list(test_df.knowns))

        known_percents.append(filter_text(webpage,return_percent=True))

        for item in list(test_df.knowns[test_df.knowns != '...']):
            knowns.append(item)
percent = len(knowns)/float(all_lines)
print("percent all known= "+str(percent))


percent all known= 0.12036553524804178


In [183]:
#sorted(list(set(knowns)),key=len,reverse=True)
sorted(list(set(knowns).difference(known_corpus)),key=len,reverse=True)


[' 视频制作： Axel Boada 、 Muyi Xiao Advertisement Continue reading the main story Site Information Navigation © 2022  The New York Times Company NYTCo Contact Us Accessibility Work with us Advertise T Brand Studio Your Ad Choices Privacy Policy Terms of Service Terms of Sale Site Map Canada International Help Subscriptions               ',
 ' Send any friend a story As a subscriber, you have 10 gift articles to give each month. Anyone can read what you share. Give this article Give this article Give this article By Emily Chan and Koney Bai 2022年12月1日， 20:57 美国东部时间 Image 周日，警方封锁了北京的部分街道，那里发生了反对中国“新冠清零”做法的抗议活动',
 '                     PLAINCLOTHES  OFFICER         PLAINCLOTHES  OFFICER             The New York Times; video via Reuters    拿着花的男子继续说：“乌鲁木齐的人是怎么死的？我们都清楚，对不对？”  随后，当局采取了行动',
 '人群回答：“没有！”      “中国人，希望勇敢一点！”          Reuters    人群里响起了掌声',
 ' 广告 我问在上海从事金融工作的20多岁的布鲁斯，这些抗议活动是否意味着人们对习近平的看法发生了改变',
 '中国的审查机构是世界上同类机构中最先进的，它在社交媒体上搜索并删除了无数显示抗议政府和愤怒爆发的帖子',
 '“结束封控！”“言论自由！”“我要电影！” 不少人对上海上周六的抗议

In [150]:
#manually remove from white
remove_from_white = []

for word in remove_from_white:
    if word in vocab['white_listed']:
        vocab['white_listed'].remove(word)

In [184]:
for item in list(set(knowns)):
    known_corpus.add(item)

with open(path+"all_known_chinese_news_lines.txt","w",encoding='utf-8') as outfile:
    for line in list(set(known_corpus)):
        outfile.write(line+'\n')
len(known_corpus)

909

In [185]:
websites = set()

with open(path+'viewed_websites_ch.txt',"r") as infile:
    for line in infile.read().split('\n'):
        websites.add(line)
print(len(websites))

30


In [None]:
"""regex to remove links with keywords:
-photos
-interactive
-video
"""

In [197]:
page_titles = list(zip(range(0,len(known_percents)),known_percents,titles,pages))
page_titles = [item for item in page_titles if type(item[1])==float]

page_titles = [item for item in page_titles if not (item[3] in websites)]

keywords = ['photo','interactive','video']
banned_articles = []
for item in page_titles:
    for word in keywords:
        if bool(re.search(word,item[3])):
                banned_articles.append(item)
page_titles = list(set(page_titles).difference(set(banned_articles)))
page_titles = sorted(page_titles,key = lambda x: x[1],reverse=True)
page_titles

[(13,
  90.97056483691328,
  'china-covid-protests-xi-jinping',
  'https://cn.nytimes.com/china/20221202/china-covid-protests-xi-jinping/'),
 (10,
  90.89825119236885,
  'china-protests-covid-beijing',
  'https://cn.nytimes.com/china/20221129/china-protests-covid-beijing/'),
 (21,
  90.74578116494284,
  'china-covid-protests-xi-jinping',
  'https://cn.nytimes.com/opinion/20221201/china-covid-protests-xi-jinping/'),
 (5,
  90.36323202372128,
  'china-covid-protest',
  'https://cn.nytimes.com/china/20221201/china-covid-protest/'),
 (24, 90.0, 'c02qatar', 'https://cn.nytimes.com/slideshow/20221202/c02qatar/'),
 (8,
  89.79074191502853,
  'china-protests-blank-sheets',
  'https://cn.nytimes.com/china/20221129/china-protests-blank-sheets/'),
 (6,
  89.53051643192488,
  'china-protests-covid-jobs',
  'https://cn.nytimes.com/business/20221130/china-protests-covid-jobs/'),
 (12,
  89.26605504587157,
  'china-world-cup-tv',
  'https://cn.nytimes.com/sports/20221129/china-world-cup-tv/'),
 (37,


In [198]:
webpage = pages[13]
if webpage in websites:
    print("already scanned! choose another!")
websites.add(webpage)

In [199]:

print(len(websites))
with open(path+'viewed_websites_ch.txt',"w") as outfile:
    for line in websites:
        outfile.write(line+'\n')      


31


In [201]:
#display() #percent_threshold=0.95
print(webpage)
art = filter_text(webpage,print_word_lvl=True)

https://cn.nytimes.com/china/20221202/china-covid-protests-xi-jinping/
word-level % known = 90.96337579617835


In [202]:
with open(path+'unknown_chinese_list.txt',"r") as input_file:
    new_words = input_file.read()
    new_words = re.sub("[\n\'\[\]\{\}]","",new_words)
    new_words = new_words.split(',')
    new_words = [line.strip() for line in new_words] #update regex
#new_words
print(len(new_words))
#known_manual = get_known(new_words)

192


In [203]:
import mitosheet
unk_df = pd.DataFrame(new_words)
unk_df.columns = ['word']
unk_df['status'] = pd.Series(['' for word in new_words])

In [204]:
mitosheet.sheet(unk_df, analysis_to_replay="id-adlszusppd")

MitoWidget(analysis_data_json='{"analysisName": "id-adlszusppd", "analysisToReplay": null, "code": [], "stepSu…

In [205]:
translations = []
pinyins = []
for i in range(0,len(unk_df)):
    try:
        translations.append(ts.google(unk_df.word.iloc[i]))
        pinyins.append(pinyin.get(unk_df.word.iloc[i]))
    except:
        translations.append('')
        pinyins.append('')
unk_df['translations']=translations
unk_df['pinyins']=pinyins

In [163]:
add_ints = [0, 4, 5, 7, 14, 24, 31, 34, 36, 39, 51, 57, 59, 64, 69, 75, 80, 85, 88, 92, 96, 99, 105, 110, 124, 130, 131, 132, 133, 134, 140, 145, 149, 151, 155, 156, 157, 174, 181, 182, 183, 187, 188, 189, 191]

In [164]:
for i in add_ints:
    # Set a cell value in status
    unk_df.at[i, 'status'] = 'k'

In [165]:
mitosheet.sheet(unk_df, analysis_to_replay="id-itpvddfexb")

MitoWidget(analysis_data_json='{"analysisName": "id-itpvddfexb", "analysisToReplay": null, "code": [], "stepSu…

In [166]:
from mitosheet import *; register_analysis("id-itpvddfexb");
    
# Set a cell value in status
unk_df.at[51, 'status'] = "k"

# Set a cell value in status
unk_df.at[73, 'status'] = "k"

# Set a cell value in status
unk_df.at[93, 'status'] = "k"

# Set a cell value in status
unk_df.at[110, 'status'] = "k"


In [170]:
add_words = list(unk_df.word[unk_df['status']=='k'])
print(len(add_words))
for word in add_words:
    vocab['white_listed'].add(word)

55


In [171]:
#write to .json formats
df = vocab
print(len(vocab['white_listed'])) 
df['white_listed'] = list(df['white_listed'])
df['unseparable'] = list(df['unseparable'])
df['black_listed'] = list(df['black_listed'])
with open(path+source_file, "w") as outfile:
    json.dump(df,outfile)

8337


In [172]:
#read from existing .json formats
with open(path+source_file, "r") as path_in:
    vocab = json.loads(path_in.read())
vocab['white_listed'] = set(vocab['white_listed'])
vocab['unseparable'] = set(vocab['unseparable'])
print(len(vocab['white_listed']))

8337


In [173]:
#display() #percent_threshold=0.95
#print(webpage)
filter_text(webpage,print_word_lvl=True)

word-level % known = 95.22960320998662


Unnamed: 0,knowns,unknowns,unk_words
0,...,国际 中国 商业与经济 镜头 科技 科学 健康 教育 文化 风尚 旅游 房地产 观点与评论 简繁中文 简体 繁体 纽约时报 出版语言 ENGLISH (英语) ESPAÑOL (西班牙语) 字体大小 小 中 大 超大 中国 中文 中 中英双语 双语 英文 英 新新世界 自豪、害怕与矛盾：中国年轻抗议者讲述自己的故事 袁莉 2022年11月30日 上海周日的抗议活动吸引了大批参与者，政府也派来了大批警察,"""风尚"", ""讲述"", ""袁莉"""
1,...,The New York Times 他们参加了自己的第一次示威,"""示威"""
2,喊出了自己的第一个抗议口号,...,
3,经历了自己与警察的第一次冲突,...,
4,...,后来他们回到家中，浑身发抖，不敢相信自己竟然挑战了世界上最强大的威权主义政府和中国几十年来最铁腕的领导人,"""威权"", ""铁腕"""
...,...,...,...
79,...,他回答说，“这一次不是大家的看法和态度改变了，而是有看法和持批评态度的人敢勇敢地说出来了,"""和持"", ""人敢"""
80,...,” 袁莉为《纽约时报》撰写“ 新新世界 ”专栏，专注中国及亚洲科技、商业和政治交叉议题,"""袁莉为"", ""撰写"", ""交叉"""
81,欢迎在Twitter上关注她： @liyuan6,...,
82,翻译：纽约时报中文网 点击查看本文英文版,...,


##Game of Thrones

In [6]:
#read from existing .json formats
with open(path+source_file, "r") as path_in:
    vocab = json.loads(path_in.read())
vocab['white_listed'] = set(vocab['white_listed'])
vocab['unseparable'] = set(vocab['unseparable'])
print(len(vocab['white_listed'])) #7216

7216


In [34]:
#get 10k freq word info:
"""
3127
8059
"""
with open(path+'ch_freq.csv','r') as infile:
    freq_df = pd.read_csv(infile)
uncovered = []
for word in freq_df.word:
    if word not in vocab['white_listed']:
        uncovered.append(word)
print(len(uncovered))
print(len(vocab['white_listed']))

3127
8059


In [6]:
got = 'https://www.feiku6.com/read/bingyuhuozhige/8505346.html' #increment when done

In [10]:
#Find % of all sentences of GoT currently known
knowns = []
all_lines = 0

for i in range(8505341,8505661):
    chapter = 'https://www.feiku6.com/read/bingyuhuozhige/'+ str(i)+'.html'
    test_df = filter_text(chapter)
    all_lines += len(list(test_df.knowns))
    for item in list(test_df.knowns[test_df.knowns != '...']):
        knowns.append(item)
percent = len(knowns)/float(all_lines)
print("latest percent = "+str(percent))
#Latest = 0.10810387984981226

#Get after when adding in new words
#print("today's catch % of corpus:")
#str(100*(percent-prev_percent))+"%" #need to be commented out if first restarting notebook

Building prefix dict from the default dictionary ...
Dumping model to file cache /var/folders/w0/802yh79s18j0bn54dk09dm2m0000gn/T/jieba.cache
Loading model cost 0.947 seconds.
Prefix dict has been built successfully.


latest percent = 0.10810387984981226


In [188]:
remove_from_white = []

for word in remove_from_white:
    if word in vocab['white_listed']:
        vocab['white_listed'].remove(word)

In [13]:
#display the newly available lines
print(len(set(knowns).difference(prev_knowns)))
sorted(list(set(knowns).difference(prev_knowns)),key=len,reverse=True)

0


[]

In [12]:
#Get before when adding in new words
prev_knowns = set(knowns)
prev_percent = percent
len(knowns)

11056

In [36]:
display(filter_text(got,150,165,print_word_lvl=True))


word-level % known = 77.04280155642024


Unnamed: 0,knowns,unknowns,unk_words
0,...,他一定是喝多了，两只脚仿佛打了结，当即与一位女侍撞个满怀，使一壶掺香料的葡萄酒泼洒在地，四座顿时响起哄堂大笑,"""当即"", ""一壶"", ""掺"", ""泼洒"", ""顿时"", ""哄堂大笑"""
1,...,琼恩眼中的热泪滚下面颊，有人想搀他，但他甩开善意的手，凭着辨不清地面的眼睛，继续朝大门跑去,"""热泪"", ""面颊"", ""搀"", ""甩开"", ""善意"", ""凭着"", ""辨不清"""
2,...,白灵紧随其后，奔进低垂的夜幕,"""紧随其后"", ""低垂"", ""夜幕"""
3,...,空荡的庭院分外寂静，内墙城垛上只有一位拉紧斗篷抵御寒意的守卫，独自蜷缩墙角，虽然看上去百无聊赖，表情悲苦，但琼恩却有一千个一万个想和他交换位置的愿望,"""空荡"", ""庭院"", ""分外"", ""寂静"", ""城垛"", ""斗篷"", ""抵御"", ""寒意"", ""蜷缩"", ""百无聊赖"""
4,...,除此之外，整座孤城四下漆黑，满是寂寥,"""整座"", ""孤城"", ""四下"", ""寂寥"""
5,...,琼恩曾去过一座被遗弃的庄园，那里杳无人迹、沉默阴郁，四下一片肃然，惟有巨石在默默倾诉过往主人的景况,"""一座"", ""被遗弃"", ""庄园"", ""杳无人迹"", ""阴郁"", ""四下"", ""肃然"", ""惟有"", ""巨石"", ""默默"", ""倾诉"", ""过往"", ""景况"""
6,...,今夜的临冬城便让琼恩联想起当时的情景,"""情景"""
7,...,笙歌舞乐从身后敞开的窗户向外流泻，正是他此刻最不想听的靡靡之音,"""笙歌"", ""敞开"", ""窗户"", ""流泻"", ""靡靡之音"""
8,...,他用衣袖抹去泪水，气恼自己如何把持不住，随后准备转身离开,"""气恼"", ""把持"""
9,“小子,...,


In [30]:
with open(path+'unknown_chinese_list.txt',"r") as input_file:
    new_words = input_file.read()
    new_words = re.sub("[\n\'\[\]]","",new_words)
    new_words = new_words.split(',')
    new_words = [line.strip() for line in new_words] 
#new_words
known_manual = get_known(new_words)

窗户
Known =k
window
善意
Known =k
bona fide
倾诉
Known =k
Talk about
抹
Known =kk
wipe
庄园
Known =k
manor
守卫
Known =kk
guard
敞开
Known =k
Open
情景
Known =k
scene
壁
Known =k
wall
斗篷
Known =k
cloak
侏儒
Known =k
dwarf
掺
Known =k
Mix
杳无人迹
Known =k
杳 杳 迹 迹 迹 杳
满是
Known =kk
Full
去过
Known =kk
Go
一座
Known =k
One by one
把持
Known =k
control
叫住
Known =kk
Stop
肃然
Known =k
with distinction
庭院
Known =k
patio
被遗弃
Known =k
Abandoned
搀
Known =k
mix
孤城
Known =k
Lonely city
整座
Known =k
Whole seat
墙角
Known =kk
Corner
四下
Known =kk
All over
架上
Known =k
Stand on the shelf
靡靡之音
Known =k
Voiced
气恼
Known =kk
Annoyed
默默
Known =k
silently
寂寥
Known =k
Lonely
蜷缩
Known =k
Curl up
凭着
Known =k
Rely on
热泪
Known =k
Tears
甩开
Known =kk
Throw away
夜幕
Known =k
Night
衣袖
Known =kk
Sleeve
特正
Known =k
Special
低垂
Known =k
Drooping
过往
Known =k
Past
拉紧
Known =kk
tension
石像鬼
Known =k
Stone demon
顿时
Known =k
Immediately
厅堂
Known =k
Hall
城垛
Known =k
City stack
奔进
Known =kk
Go
悲苦
Known =kk
sorrow
那家伙
Known =kk
That rascal
寒意
Known =k
chill
哄堂大笑

In [31]:
print("today's catch:"+str(len(known_manual)))
for word in known_manual:
    vocab['white_listed'].add(word)

today's catch:27


In [32]:
#Manually add
add_to_white = ['石像鬼','联想起']

for word in add_to_white:
    vocab['white_listed'].add(word)

In [33]:
remove_from_white = ['四下','气恼','甩开','分外','巨石','辨不清']

for word in remove_from_white:
    if word in vocab['white_listed']:
        vocab['white_listed'].remove(word)

In [40]:
#write to .json formats
df = vocab
print(len(vocab['white_listed']))
df['white_listed'] = list(df['white_listed'])
df['unseparable'] = list(df['unseparable'])
df['black_listed'] = list(df['black_listed'])
with open(path+source_file, "w") as outfile:
    json.dump(df,outfile)

7216


In [42]:
#read from existing .json formats
with open(path+source_file, "r") as path_in:
    vocab = json.loads(path_in.read())
vocab['white_listed'] = set(vocab['white_listed'])
vocab['unseparable'] = set(vocab['unseparable'])
print(len(vocab['white_listed']))

7216


##Code not in use

In [318]:
def filter_subtitles(text_file):


   # try:
    contents = text_file

    known_array = []
    unk_array = []
    contents_array = re.split("\n",contents)

    timestamps = []
    phrases = []
    for line in contents_array:

        if bool(re.search(':[0-9]{2}',line)):
            timestamps.append(line)
        else:
            phrases.append(line)

    if len(timestamps)==len(phrases):
        
        disallowed_words = set()

        known_array = []
        unknowns = []
        unk_array = []
        for line in phrases:

            tokenized = list(jieba.cut((line)))

            add_line = True

            unk_str = ""
            for word in tokenized:
                """
                The below should only trigger 'unknown' for the phrase
                if there is an unknown + chinese word. Non-chinese, or
                white-listed should not trigger.
                """
                if bool(re.search(r'[\u4e00-\u9fff]',word)) and not (word in vocab['white_listed']):

                    if len(word)==1:

                        unk_str += '"'+word+'"'+", "
                        add_line = False
                        disallowed_words.add(word)
                    else:
                        for char in word:
                            if not char in vocab['white_listed']:
                                add_line = False
                                disallowed_words.add(word)
                        if word in disallowed_words:
                            unk_str += '"'+word+'"'+", "

            if add_line:
                known_array.append(line)
                unk_array.append("...")
            else:
                known_array.append("...")
                unk_array.append(line)

            if len(unk_str)>0:
                unk_str = unk_str[0:len(unk_str)-1]

            unknowns.append(unk_str)

        return_pd = pd.DataFrame(list(zip(timestamps,known_array,unk_array,unknowns)))
        return_pd.columns = ["times","knowns","unknowns","unk_words"]

        with open(path+'unknown_chinese_list.txt',"w") as outfile:
            outfile.write(str(list(disallowed_words)))
        return return_pd
    """
        
    except:
        pass
    """
        

    
    

In [146]:
with open(path+"all_known_lines.txt","w") as outfile:
    for line in list(set(knowns)):
        outfile.write(line+'\n')

In [321]:
with open(path+"chinese_text_input.txt","r") as infile:
    display(filter_subtitles(infile.read()))


Unnamed: 0,times,knowns,unknowns,unk_words
0,0:10,...,欢迎大家回到 《同一屋檐下》 本节目由运动 APP Keep,"""屋檐下"","
1,0:15,...,赞助播出 Keep 直播课 燃脂更快乐 六个年轻人,"""赞助"", ""播出"", ""直播"", ""燃脂"","
2,0:20,...,被“关”在一个屋子里 已经“关”了这是第八期了 事情在向着一个很,"""屋子里"", ""第八期"","
3,0:26,...,奇妙的方向发展 是吧 有点诡异了 有点诡异 有点看不懂了,"""奇妙"", ""诡异"", ""诡异"","
4,0:32,...,就是让我们所有人 都看不明白的操作 这个陈晓伟,"""陈晓伟"","
5,0:38,...,怎么了陈晓伟 他居然约了吴嘉雯去看展 晚点吃完饭跟我说,"""陈晓伟"", ""吴嘉雯"", ""看展"","
6,0:44,...,然后买票 我们去看展 我们去吗 对 两个人吗,"""看展"","
7,0:50,...,我突然刚才有个想法 在脑海里 是不是金玉洁的话 他听进去了,"""金玉"", ""洁"","
8,0:55,...,不要太伤害吴嘉雯 有可能 最后我们的朋友天奇,"""吴嘉雯"", ""天奇"","
9,1:02,...,上个月工资三千五之后崩溃了 刘总的一番话 感觉让天奇瞬间清醒了,"""工资"", ""崩溃"", ""刘总"", ""一番话"", ""天奇"", ""瞬间"","


In [311]:
#Staging area
with open(path+"chinese_text_input.txt","r") as infile:
    textfile = infile.read()

text_list = re.split("。|\n",textfile)
text_list
known_phrases = set()
for line in text_list:
    
    if bool(re.search(r'[\u4e00-\u9fff]',line)) and not bool(re.search('[0-9]',line)):
        tokenized = list(jieba.cut((line)))

        add_line = True

        for word in tokenized:
            
            if not word in vocab['white_listed']:
                if not bool(re.search(r'[a-zA-Z]',word)):
                    if len(word)==1:
                        add_line = False
                    else:
                        for char in word:
                            if not char in vocab['white_listed']:
                                add_line = False
                    

        if add_line:
            known_phrases.add(line)
sorted(known_phrases,key=len)



[]

In [302]:
#Sanity check. is this phrase disqualified?
phrase = '你好'
tokenized = list(jieba.cut((phrase)))
for word in tokenized:
    if not word in vocab['white_listed']:
        if not bool(re.search(r'[a-zA-Z]',word)):
            if len(word)==1:
                print(word)
            else:
                for char in word:
                    if not char in vocab['white_listed']:
                        print(char)

In [300]:
for char in '你好':
    print(char)

你
好


In [384]:
#sort by a freq list:
#Upload from download
with open(path+'known_chinese_list.txt',"r") as input_file:
    zhongwen_words = input_file.read().split(',')
    print("today's catch in word count:")
    print(len(zhongwen_words))
zhongwen_words.append("你")
zhongwen_words.append("我")
most_freq = []
for line in zhongwen_words:
    word = line.replace("'","").strip()
    if word in set(freq_df.word):
        most_freq.append(word)
#sorted(most_freq,key = freq_df.frequency)
temp = freq_df[freq_df.word.isin(most_freq)]
list(set(temp.word))

today's catch in word count:
9


['你', '年纪', '老友', '我', '屋顶', '公爵', '可惜', '笑容']

In [71]:
vocab = {}
vocab['white_listed'] = list(vocab_all)
vocab['black_listed'] = []


In [204]:
#Sanity check. Current vocab size:
len(vocab['white_listed'])

Sanity check. Current vocab size:


2107

In [100]:
webpage = 'https://cn.nytimes.com/asia-pacific/20220805/taiwan-china-military-drills/'

In [176]:
#output newly knowns
with open(path+"new_known_chinese_lines.txt","w") as outfile:
    for line in list(set(knowns).difference(prev_knowns)):
        outfile.write(line+'\n')


In [100]:
remove_from_black = []

for word in remove_from_black:
    vocab['black_listed'].remove(word)

In [362]:
for word in freq_df.word:
    if word not in vocab['white_listed']:
        if len(word)>1:
            add_word= True
            for char in word:
                if char not in vocab['white_listed']:
                    add_word = False
            if add_word:
                vocab['white_listed'].add(word)

In [149]:
add_to_black = [ ]

for word in add_to_black:
    vocab['black_listed'].add(word)

In [117]:
"""
to do:

"""

In [232]:
#webpage = 
all_knowns = []
for webpage in pages:
    temp = filter_text(webpage)
    if temp is not None:
        temp = temp[temp.knowns != "..."]
        if len(temp.knowns.values)>0:
            for item in temp.knowns.array:
                if not (item in all_knowns):
                    all_knowns.append(item)
all_knowns


['也欢迎访问 中文网首页 阅读更多新闻',
 ' 广告 在美国和中国之间找到合适的位置很复杂',
 '所以，我们不能让中国人决定谁可以或不可以访问台湾',
 '这就是为什么佩洛西是对的，不能让中国来决定谁能、谁不能访问台湾',
 ' 广告 现在的情况就是这样',
 '’他们说，‘好吧，知道了',
 '让我们继续谈谈这个问题怎么解决',
 ' 但美国和中国都没有发生战争的必要',
 '“但我过得比他好',
 '“我很害怕，”他说',
 ' “最重要的是找一个我们可以在一起的地方，”他在2019年说',
 '“现在你又把他带走',
 ' “我很忙，”他说',
 '“没时间害怕',
 '“我们都有不同的命运',
 ' 他离开的时候，中国非常穷',
 '他没有离开的想法',
 '） 人们通常不知道他们正在被监视',
 '  世界曾认为它可以改变中国，很多方面的确改变了',
 '但对许多父母来说，这还不够',
 '的确会这样，但不是在中国',
 '他拒绝了',
 '“不，不，我不认为会有，”他说',
 '” 广告 他们没有收到这个信息']

In [69]:
##INPUT-OUTPUT
#vocab list:
with open("/Users/elyebliss/Desktop/Vocabulary/vocab_dfs/chinese_whitelisted.csv","r") as infile:
    whitelisted_lemmas = infile.read()


##VARIABLES
vocab_all = set()


for line in whitelisted_lemmas.split('\n'):
    if len(line) > 0:
        
        vocab = line.strip()
        vocab_all.add(vocab)
        
#len(whitelisted_lemmas.split('\n'))            
#pp.pprint(vocab_all)

In [50]:
#Upload from download
with open(path+'known_chinese_list.txt',"r") as input_file:
    zhongwen_words = input_file.read().split(',')
    print("today's catch in word count:")
    print(len(zhongwen_words))
for line in zhongwen_words:
    word = line.replace("'","").strip()
    vocab['white_listed'].add(word)
    #if len(word)>1:
    #    for char in word:
    #        vocab['white_listed'].add(char)


today's catch in word count:
5
