In [None]:
"""
Get articles/texts, and filter only sentences with white-listed vocab OR named entities.

Contents:
-load packages
-create helper functions
-data intialization from csv file of words &
 extend vocab set
-build data structure to house info moving forward
-write to .json formats
-read from existing .json formats
-get white-listed of words and inflections
-scrape journale en francais facile
"""

In [1]:
import bs4, requests, sys, codecs, urllib.request, re
from bs4 import SoupStrainer
from bs4.element import Comment
import random
import string
import json
import pandas as pd
import os
from os import listdir
from os.path import isfile, join
import mlconjug3
import sklearn
import goslate
from nltk.tokenize import word_tokenize, sent_tokenize
import pprint
pp = pprint.PrettyPrinter()
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', 0)
import mitosheet

In [2]:
def add_noun(add_word):

    if add_word[len(add_word)-1] != 's' and add_word[len(add_word)-1] != 'x':

        out_word = add_word + 's'

        return out_word
    else:
        return add_word

In [3]:
##METHODS

def tag_visible(element):
    if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
        return False
    if isinstance(element, Comment):
        return False
    return True


def text_from_html(body):
    soup = bs4.BeautifulSoup(body, 'html.parser')
    texts = soup.findAll(text=True)
    visible_texts = filter(tag_visible, texts)  
    return u" ".join(t.strip() for t in visible_texts)



user_agent = 'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0'
headers={'User-Agent':user_agent,}


In [4]:
def get_known(unknown_list):
    
    count_got = 0
    known_list = []
    count_checked = 0
    for word in unknown_list:
        count_checked += 1
        decision = str(input(word+"\nKnown =k"))
        if decision =='k':
            known_list.append(word)
            count_got +=1
            print("got "+str(count_got))
        elif decision=='q':
            break
        print(str(len(unknown_list)-count_checked)+' remaining')
    return known_list

In [4]:
def review_out(out_dict,prev_learned,eb_unk_in,eb_unk_m_in,examples):
    """
    out_dict = out
    prev_learned = eb_learned
    eb_unk_in =  eb_unk
    eb_unk_m_in = eb_unk_morph
    examples = eb_unk_examples
    
    decisions:
    k = word known, remove from eb_unk and eb_unk_morph
        and add to eb_learned
        
    m = remove morph from eb_unk and eb_unk_morph 
    
    a = add example to eb_unk_examples
    q = quit for day
    """
    for word in out_dict.keys():
        print("Lemma: "+word)
        print("Morph: "+str(out_dict[word]['morph']))
        
        if len(examples[word])>0:
            print("Previous examples: ")
            for sent in examples[word]:
                print(" - "+sent)
                    
        print("New examples: ")
        
        for sent in out_dict[word]['sents']:
            if sent not in examples[word]:
                print(sent)
                decision = str(input("options : k,m,a,q"))
                if decision =='k':
                    try:
                        examples[word].append(sent) #by default adds examples
                        prev_learned[word] = eb_unk_in[word].copy()
                        del eb_unk_in[word] #remove lemma

                        i = 0
                        while i < len(eb_unk_m_in):

                            if eb_unk_m_in[list(eb_unk_m_in)[i]] == word:
                               del eb_unk_m_in[list(eb_unk_m_in)[i]]
                            else:
                                i +=1
                    except:
                        pass

                if decision =='m':
                    morph = str(input("which morph:"))
                    try:
                        del eb_unk_m_in[morph]
                        eb_unk_in[word].remove(morph) 
                    except:
                        pass
                if decision =='a':
                    try:
                        examples[word].append(sent)
                    except:
                        pass
                if decision =='q':
                    break
    

In [5]:
def how_to_add(word):
    """
    p = plain, add without conjurgating or changing
    c = conjugate as normal verb
    x = noun that adds x when plural
    s = noun that add s when plural
    iv = inner verb. There is a verb in the expression that should be 
        conjurgated
    """
    extras_morphs = []
    extras_morphs.append(word)
    
    
    gs = goslate.Goslate()

    default_conjugator = mlconjug3.Conjugator(language='fr')

    decision = str(input(word+"\nc\nx\ns\niv"))
    
    if decision == 'c':

        try:
            temp = []
            test_verb = default_conjugator.conjugate(word)
            all_conjugated_forms = test_verb.iterate()
            for item in all_conjugated_forms:
                if item not in temp:
                    temp.append(item[len(item)-1])
            extras_morphs.extend(list(set(temp)))
        except:
            pass
    elif decision == 'x':
        extras_morphs.append(lemma+'x')
    elif decision == 's':
        extras_morphs.append(lemma+'s')
    elif decision == 'iv':
        morph = str(input("which morph:"))
        

        temp = []
        test_verb = default_conjugator.conjugate(morph)
        all_conjugated_forms = test_verb.iterate()
        for item in all_conjugated_forms:
            if item not in temp:
                temp.append(item[len(item)-1])

        for variant in list(set(temp)):
            if variant is not None:
                new_word = word.replace(morph,variant)
                extras_morphs.append(new_word)

    return extras_morphs

In [6]:
"""
For eb_unk words: search through all articles, return 
sentences containing any words for unknown vocabulary. 

return dict object, key is lemma, then values are morphs found
and array of examples. 
"""

def find_unks(webpages,eb_unk_mo,prev_examples):
    
    output = {}
    for webpage in webpages:
    #try:

        request=urllib.request.Request(webpage,None,headers) #The assembled request
        response = urllib.request.urlopen(request)
        data = response.read()
        contents = text_from_html(data)
      
        contents_array = sent_tokenize(contents)
 
        for line in contents_array:
            tokenized = word_tokenize(line, language='french')
            for word in tokenized:
                if word.lower() in eb_unk_mo:
                    
                    if word.lower() in prev_examples:
                        if line not in prev_examples[word.lower()]:
                            if word.lower() not in output:
                                output[eb_unk_mo[word.lower()]] = {} #key will be lemma
                                output[eb_unk_mo[word.lower()]]['morph'] = []
                                output[eb_unk_mo[word.lower()]]['sents'] = []

                            output[eb_unk_mo[word.lower()]]['morph'].append(word.lower())
                            output[eb_unk_mo[word.lower()]]['sents'].append(line)
                    
    return output
    #except:
    #    pass

In [7]:
def filter_text(webpage,text=None,print_word_lvl=False):


    #try:
    if text is None:
        request=urllib.request.Request(webpage,None,headers) #The assembled request
        response = urllib.request.urlopen(request)
        data = response.read()
        contents = text_from_html(data)
    else:
        contents = webpage

    line_array = []
    percent_array = []
    contents_array = sent_tokenize(contents)

    disallowed_words = set()
    total_words = 0
    unknown_words = 0
    lines = []
    unknowns = []
    
    for line in contents_array:
        
        line_total = 0
        line_unks = 0
        tokenized = word_tokenize(line, language='french')

        unk_str = ""
        for word in tokenized:
            total_words +=1
            line_total +=1

            if not (bool(re.search("[0-9]", word)) or\
                    bool(re.search("[A-Z]", word)) or\
                    bool(re.search("[.,\/#!$%\^&\*;:{}=\-_`~()«»]", word)) or\
                    word.lower() in vocab['white_listed']):
                if not ((word.lower() in vocab['white_listed']) or (word.lower() in vocab['black_listed'])):
                    unk_str += '"'+word.lower()+'"'+", "

                disallowed_words.add(word.lower())
                unknown_words +=1
                line_unks +=1


        line_array.append(line)
        line_percent = (line_total-line_unks)/line_total
        percent_array.append(line_percent)

        if len(unk_str)>0:
            unk_str = unk_str[0:len(unk_str)-1]

        unknowns.append(unk_str)

    if print_word_lvl and total_words>0:
        print("word-level % known = "+str((1-(unknown_words/total_words))*100))

    return_pd = pd.DataFrame(list(zip(line_array,percent_array,unknowns)))
    return_pd.columns = ["line","line_percent","unk_words"]

    with open(path+'unknown_french_dad_list.txt',"w") as outfile:
        outfile.write(str(list(disallowed_words)))

    return return_pd
    
    #except:
    #    pass

##Dad section

In [8]:
path = "/Users/elyebliss/Desktop/Vocabulary/vocab_dfs/"
source_file = "french_dad.json"

In [17]:
            
##Journal en francais facile:
print("Checking Journal en francais facile")
parser = 'html.parser'  # or 'lxml' (preferred) or 'html5lib', if installed
#request=urllib.request.Request('https://savoirs.rfi.fr/en/apprendre-enseigner/langue-fran%C3%A7aise/journal-en-francais-facile',None,headers)
request=urllib.request.Request('https://francaisfacile.rfi.fr/fr/podcasts/journal-en-fran%c3%a7ais-facile/',None,headers)

resp = urllib.request.urlopen(request)
soup = bs4.BeautifulSoup(resp, parser, from_encoding=resp.info().get_param('charset'))


pages = []
for link in soup.find_all('a', href=True):
    if '/fr/podcasts/le-journal-en-fran%C3%A7ais-facile/' in str(link['href']):
        #print(str(link['href']))
        str_page = 'https://savoirs.rfi.fr/'+str(link['href'])
        #str_page = 'https://savoirs.rfi.fr/en/apprendre-enseigner/langue-francaise/journal-en-francais-facile-'+str(re.search(r"[0-9]{8}.*",str(link['href'])).group(0))
        if str_page not in pages:
            pages.append(str_page)

pages

Checking Journal en francais facile


['https://savoirs.rfi.fr//fr/podcasts/le-journal-en-fran%C3%A7ais-facile/20221118-la-cor%C3%A9e-du-nord-tire-un-missile-la-cop27-black-ad-remporte-le-prix-d%C3%A9couvertes-rfi',
 'https://savoirs.rfi.fr//fr/podcasts/le-journal-en-fran%C3%A7ais-facile/20221117-600-prisonniers-lib%C3%A9r%C3%A9s-en-birmanie-l-ukraine-bombard%C3%A9e-que-fait-emmanuel-macron-%C3%A0-bangkok',
 'https://savoirs.rfi.fr//fr/podcasts/le-journal-en-fran%C3%A7ais-facile/20221116-la-fus%C3%A9e-artemis-d%C3%A9colle-lula-%C3%A0-la-cop27-un-missile-tombe-sur-la-pologne',
 'https://savoirs.rfi.fr//fr/podcasts/le-journal-en-fran%C3%A7ais-facile/20221115-la-russie-bombarde-l-ukraine-8-milliards-d-habitants-sur-terre-donald-trump-de-retour',
 'https://savoirs.rfi.fr//fr/podcasts/le-journal-en-fran%C3%A7ais-facile/20221114-joe-biden-et-xi-jinping-%C3%A0-bali-attentat-%C3%A0-istanbul-emmanuel-macron-veut-sanctionner-l-iran',
 'https://savoirs.rfi.fr//fr/podcasts/le-journal-en-fran%C3%A7ais-facile/20221111-l-ocean-viking-est

In [19]:
with open(path+"text_input_dad_fr.txt","r") as infile:
    textfile = infile.read()
filtered_art = filter_text(textfile,text= True)
#display(filtered_art)

In [29]:
all_unks = set()

with open(path+'unknown_french_dad_list.txt',"r") as input_file:
    new_words = input_file.read()
    new_words = re.sub('\n',"",new_words)
    new_words = new_words.split(',')
    new_words = [re.sub("^['|\"]|^\['|['|\"]$|'\]$|\[","",line.strip().lower()) for line in new_words] #update regex
    for word in new_words:
        all_unks.add(word)
len(all_unks)

In [45]:
all_unks=list(all_unks)
unk_df = pd.DataFrame(all_unks)
unk_df.columns = ['word']
unk_df['status'] = pd.Series(['' for word in all_unks])

In [48]:
mitosheet.sheet(unk_df, analysis_to_replay="id-gnhspsqdqa")

MitoWidget(analysis_data_json='{"analysisName": "id-gnhspsqdqa", "analysisToReplay": null, "code": [], "stepSu…

In [56]:
add_words = list(unk_df.word[unk_df['status']=='k'])
print(len(add_words))
for word in add_words:
    vocab['white_listed'].add(word)

['financement',
 'go',
 "d'entrer",
 'délégation',
 'concernant',
 'textes',
 'déstabilisation',
 'japonais',
 'tunisien',
 'tunisienne',
 'quatrième',
 "s'était",
 'conscient',
 'comité',
 'coréens',
 'délégués',
 "j'ai",
 'end',
 '"\'s',
 'coréen',
 'supérieure',
 'fossile']

In [58]:
filtered_art = filter_text(textfile,text= True)

In [61]:
filtered_art['theme']=['' for line in filtered_art.line]
mitosheet.sheet(filtered_art)

MitoWidget(analysis_data_json='{"analysisName": "id-djkrvxuaoi", "analysisToReplay": null, "code": [], "stepSu…

In [71]:
themes = set(filtered_art.theme[filtered_art['theme']!=''])
output = pd.DataFrame(columns=['theme','line'])
for theme in themes:
    temp = filtered_art[['theme','line']][filtered_art['theme']==theme]
    output = pd.concat([output,temp])
with open(path+'dad_article.csv','w') as outfile:
    output.to_csv(outfile,encoding='utf-8')

In [72]:
#write to .json formats
df = vocab
df['white_listed'] = list(df['white_listed'])
df['black_listed'] = list(df['black_listed'])
with open(path+source_file, "w") as outfile:
    json.dump(df,outfile)
print(len(vocab['white_listed']))

13374


In [73]:
#read from existing .json formats
with open(path+source_file, "r") as path_in:
    vocab = json.loads(path_in.read())
vocab['white_listed'] = set(vocab['white_listed'])
vocab['black_listed'] = set(vocab['black_listed'])
print(len(vocab['white_listed']))

13374


In [27]:
add_to_white = []

for word in add_to_white:
    vocab['white_listed'].add(word)

In [26]:
remove_from_white = []

for word in remove_from_white:
    if word in vocab['white_listed']:
        vocab['white_listed'].remove(word)

In [147]:
#read from existing .json formats
with open(path+source_file, "r") as path_in:
    vocab = json.loads(path_in.read())
vocab['white_listed'] = set(vocab['white_listed'])
vocab['black_listed'] = set(vocab['black_listed'])
print(len(vocab['white_listed']))

13352


In [24]:
with open(path+'unknown_french_dad_list.txt',"r") as input_file:
    new_words = input_file.read()
    new_words = re.sub('\n',"",new_words)
    new_words = new_words.split(',')
    new_words = [re.sub("^['|\"]|^\['|['|\"]$|'\]$","",line.strip().lower()) for line in new_words] #update regex
#new_words
print(len(new_words))
known_manual = get_known(new_words)

169
["d'avion
Known =kk
got 1
168 remaining
envahir
Known =k
167 remaining
explosé
Known =kk
got 2
166 remaining
informations
Known =k
165 remaining
concernent
Known =kk
got 3
164 remaining
manifesté
Known =kk
got 4
163 remaining
neutre
Known =k
162 remaining
d'au
Known =kk
got 5
161 remaining
retardée
Known =k
160 remaining
l'échec
Known =k
159 remaining
rescapés
Known =k
158 remaining
l'interdit
Known =k
157 remaining
d'activité
Known =kk
got 6
156 remaining
pertes
Known =k
155 remaining
exceptionnel
Known =kk
got 7
154 remaining
recouvert
Known =k
153 remaining
l'agriculture
Known =kk
got 8
152 remaining
producteurs
Known =kk
got 9
151 remaining
inextricables
Known =k
150 remaining
causé
Known =k
149 remaining
cinquantaine
Known =k
148 remaining
urgent
Known =kk
got 10
147 remaining
drapeau
Known =k
146 remaining
d'entamer
Known =k
145 remaining
transport
Known =kk
got 11
144 remaining
blocs
Known =k
143 remaining
spatial
Known =kk
got 12
142 remaining
épaves
Known =k
141 remaining


In [28]:
print("% known")
print(str(100*len(filtered_art[filtered_art.knowns != "..."])/len(filtered_art))+"%")
print(str(len(filtered_art[filtered_art.knowns != "..."]))+" lines")

% known
30.0%
30 lines


In [29]:
with open(path+'dad_article.csv','w') as outfile:
    filtered_art.to_csv(outfile,encoding='utf-8')

##Filter for lessons

In [242]:
with open(path+"text_input_dad_fr.txt","r") as infile:
    textfile = infile.read()
filtered_art = filter_text(textfile,text= True)
#display(filtered_art)

In [241]:
def line_decider(data,output_df,new_words):
    """
 
    y = add line
    if y, then give a theme
    
    """
    for i in range(0,len(data)):
        display(data.loc[[i]])
        
        add_words = str(input("add words:"))
        if len(add_words)>0:
            for word in add_words.split(','):
                word = word.replace(' ','')
                word = word.replace('"','')
                new_words.append(word)
                
        add_line = str(input("add line?"))
    
        if add_line=='y':
            add_theme = str(input("add theme:"))
            
            temp_df = pd.DataFrame(list(zip(add_theme,data.line.iloc[i])),columns=['theme','line'])
            output_df = pd.concat([output_df,temp_df])

In [235]:
new_words

['tests', 'élémentaire', 'bilingues']

In [243]:
new_words = []
output_df = pd.DataFrame(columns=['theme','line'])
line_decider(filtered_art,output_df,new_words)

Unnamed: 0,line,line_percent,unk_words
0,"La Corée du Nord tire un missile, missile qui est tombé au large du Japon.",1.0,


add words:
add line?y
add theme:Korea


Unnamed: 0,line,line_percent,unk_words
1,La COP27 la conférence sur le climat se donne un jour de plus pour réfléchir et espérer trouver un accord.,1.0,


add words:y
add line?COP


Unnamed: 0,line,line_percent,unk_words
2,Au sommaire aussi le 18ᵉ sommet de la Francophonie.,1.0,


add words:
add line?y
add theme:Franco


Unnamed: 0,line,line_percent,unk_words
3,"Nous serons en direct en Tunisie, à Djerba où notre envoyée spéciale nous attend.",1.0,


add words:
add line?y
add theme:Tunisia


Unnamed: 0,line,line_percent,unk_words
4,"Et puis le prix RFI Découverte, c'est de la musique revient cette année à Black AD C'est une chanteuse malienne que nous entendrons à la fin du journal.",0.966667,"""malienne"","


add words:"malienne"
add line?


Unnamed: 0,line,line_percent,unk_words
5,"Voilà pour les titres, Soyez les bienvenus.",1.0,


add words:
add line?


Unnamed: 0,line,line_percent,unk_words
6,"Et elle a une de ce journal, Adrien.",1.0,


add words:
add line?


Unnamed: 0,line,line_percent,unk_words
7,La Corée du Nord tire de plus en plus souvent des missiles vers le Japon.,1.0,


add words:
add line?y
add theme:Korea


Unnamed: 0,line,line_percent,unk_words
8,"Et le dernier en date, Mayeul, a été lancé de Corée du Nord vers 10 h 15 heure locale.",0.954545,"""h"","


add words:h
add line?y
add theme:Korea


Unnamed: 0,line,line_percent,unk_words
9,"Après avoir parcouru 1000 kilomètres dans les airs, le missile est tombé à environ 200 kilomètres des côtes japonaises.",0.857143,"""parcouru"", ""airs"", ""japonaises"","


add words:"japonaises"
add line?


Unnamed: 0,line,line_percent,unk_words
10,Colère et Inquiétude des autorités japonaises.,0.857143,"""japonaises"","


add words:
add line?y
add theme:Korea


Unnamed: 0,line,line_percent,unk_words
11,"Le premier ministre japonais Fumio Kishi était à Bangkok, au sommet de l'APEC, lorsqu'il a appris la nouvelle.",0.904762,"""japonais"", ""lorsqu'il"","


add words:
add line?y
add theme:Korea


Unnamed: 0,line,line_percent,unk_words
12,"À ce sommet, cette réunion, plusieurs dirigeants sont présents, dont Kamala Harris, Kamala Harris est la vice présidente américaine et elle apporte son soutien au Japon.",0.903226,"""dirigeants"", ""vice"", ""soutien"","


add words:"vice"
add line?


Unnamed: 0,line,line_percent,unk_words
13,"Écouter le récit de notre correspondante en Thaïlande, Carole Isoux.",0.916667,"""récit"","


add words:
add line?


Unnamed: 0,line,line_percent,unk_words
14,Cessez ces actions irréfléchies de déstabilisation.,0.714286,"""irréfléchies"", ""déstabilisation"","


add words:
add line?


Unnamed: 0,line,line_percent,unk_words
15,"C'est le message de Kamala Harris présente au sommet de l'Asie Pacifique, un missile inter balistique lancé par la Corée du Nord, a atterri ce matin dans les eaux territoriales japonaises.",0.882353,"""inter"", ""balistique"", ""atterri"", ""japonaises"","


add words:
add line?


Unnamed: 0,line,line_percent,unk_words
16,Un missile dont la puissance serait capable d'atteindre les côtes américaines a prévenu le gouvernement japonais.,0.764706,"""puissance"", ""d'atteindre"", ""prévenu"", ""japonais"","


add words:
add line?


Unnamed: 0,line,line_percent,unk_words
17,"Aussitôt, Kamala Harris a convoqué une réunion d'urgence à laquelle ont participé les dirigeants du Japon, de la Corée du Sud, de l'Australie, de la Nouvelle-Zélande et du Canada afin de condamner cette attaque qui serait peut-être une réponse de la Corée du Nord.",0.897959,"""convoqué"", ""participé"", ""dirigeants"", ""afin"", ""condamner"","


add words:"condamner"
add line?y
add theme:Korea


Unnamed: 0,line,line_percent,unk_words
18,"Après une réunion la semaine dernière à Phnom Penh entre le président américain Joe Biden et les dirigeants japonais et sud coréens, à l'issue de laquelle les trois chefs d'Etat ont averti la Corée du Nord de conséquence si elle venait à réitérer les essais nucléaires.",0.854167,"""dirigeants"", ""japonais"", ""coréens"", ""l'issue"", ""averti"", ""réitérer"", ""essais"","


add words:
add line?


Unnamed: 0,line,line_percent,unk_words
19,"La Chine, vers qui tous les regards se tournent désormais, est restée discrète, lors de la réunion du G20, il y a quelques jours à Bali Joe Biden s'était montré confiant dans le fait que la Chine ne souhaitait pas voir d'escalade dans le dossier nord coréen.",0.846154,"""désormais"", ""discrète"", ""lors"", ""s'était"", ""confiant"", ""d'escalade"", ""dossier"", ""coréen"","


add words:
add line?


Unnamed: 0,line,line_percent,unk_words
20,Carole Isoux Bangkok RFI.,1.0,


add words:
add line?


Unnamed: 0,line,line_percent,unk_words
21,Et un jour de plus pour trouver un accord à la COP 27.,1.0,


add words:
add line?


Unnamed: 0,line,line_percent,unk_words
22,La Conférence mondiale sur le climat qui se déroule en ce moment en Egypte est prolongée d'un jour.,0.894737,"""déroule"", ""prolongée"","


add words:
add line?


Unnamed: 0,line,line_percent,unk_words
23,"Elle devait se terminer aujourd'hui vendredi, mais finalement elle se terminera demain samedi.",0.933333,"""finalement"","


add words:"finalement"
add line?y
add theme:COP


Unnamed: 0,line,line_percent,unk_words
24,"Le temps, nous dit on, de passer à la vitesse supérieure, dit le président de la COP.",0.904762,"""vitesse"", ""supérieure"","


add words:
add line?


Unnamed: 0,line,line_percent,unk_words
25,Autrement dit d'aller plus vite pour tenter de trouver un accord qui aille à tout le monde.,1.0,


add words:
add line?y
add theme:COP


Unnamed: 0,line,line_percent,unk_words
26,"Parmi les absents remarqués à cette conférence sur le climat, le Japon, troisième puissance mondiale économique.",0.894737,"""absents"", ""puissance"","


add words:"absents", "puissance"
add line?y
add theme:COP


Unnamed: 0,line,line_percent,unk_words
27,"À Charm el-Cheikh, là où se déroule la COP, le Japon s'est vu décerner le prix « fossile » c'est un prix décerné par des ONG pour se moquer du Japon, car le pays ne lutte pas assez contre le réchauffement climatique.",0.869565,"""déroule"", ""décerner"", ""fossile"", ""décerné"", ""moquer"", ""lutte"","


add words:
add line?


Unnamed: 0,line,line_percent,unk_words
28,Correspondance à Tokyo de Bruno Duval.,1.0,


add words:
add line?


Unnamed: 0,line,line_percent,unk_words
29,Le Premier ministre Fumio Kishida n'a pas jugé bon d'assister à la COP27.,0.9375,"""d'assister"","


add words:
add line?


Unnamed: 0,line,line_percent,unk_words
30,"Pour les réseaux sociaux, cela illustre, je cite « la nonchalance, voire le jemenfoutisme » du Japon concernant l'enjeu climatique.",0.708333,"""réseaux"", ""illustre"", ""nonchalance"", ""voire"", ""jemenfoutisme"", ""concernant"", ""l'enjeu"","


add words:
add line?


Unnamed: 0,line,line_percent,unk_words
31,"Ces passants, en tout cas, n'apprécient pas.",0.8,"""passants"", ""n'apprécient"","


add words:
add line?


Unnamed: 0,line,line_percent,unk_words
32,arrière.,1.0,


add words:
add line?


Unnamed: 0,line,line_percent,unk_words
33,« La planète brûle et notre pays reste les bras ballants.,0.833333,"""brûle"", ""ballants"","


add words:
add line?


Unnamed: 0,line,line_percent,unk_words
34,"Franchement, c'est affolant.",0.8,"""affolant"","


add words:
add line?


Unnamed: 0,line,line_percent,unk_words
35,»\n\nquitter.,1.0,


add words:
add line?


Unnamed: 0,line,line_percent,unk_words
36,"« Kishida zappe la COP alors qu'il dirige le cinquième pays le plus émetteurs de CO2, les bras m'en tombent.",0.863636,"""zappe"", ""émetteurs"", ""m'en"","


add words:
add line?


Unnamed: 0,line,line_percent,unk_words
37,Cela renvoie au Monde une image terrible.,0.75,"""renvoie"", ""terrible"","


add words:
add line?


Unnamed: 0,line,line_percent,unk_words
38,"Le Japon fuit ses responsabilités ou pire, n'en est pas conscient.",0.692308,"""fuit"", ""pire"", ""n'en"", ""conscient"","


add words:
add line?


Unnamed: 0,line,line_percent,unk_words
39,»\n\n« Ce réchauffement climatique touche la vie des gens si concrètement qu'on ne peut plus se voiler la face?,0.904762,"""concrètement"", ""voiler"","


add words:
add line?


Unnamed: 0,line,line_percent,unk_words
40,"Surtout qu' année après année, la situation s'aggrave.",0.909091,"""s'aggrave"","


add words:
add line?


Unnamed: 0,line,line_percent,unk_words
41,"»\n\nEn effet, à Tokyo, désormais, en août, il ne fait plus 35 mais 40 degrés à l'ombre.",0.913043,"""désormais"", ""l'ombre"","


add words:
add line?


Unnamed: 0,line,line_percent,unk_words
42,"Chaque été, un millier de Japonais meurent d' hyperthermie.",0.833333,"""millier"", ""hyperthermie"","


add words:
add line?


Unnamed: 0,line,line_percent,unk_words
43,C'est trois fois plus qu'il y a 20 ans.,1.0,


add words:
add line?


Unnamed: 0,line,line_percent,unk_words
44,Les plages des 6852 îles que compte l'archipel se réduisent à vue d'oeil en raison de l'érosion.,0.777778,"""l'archipel"", ""réduisent"", ""d'oeil"", ""l'érosion"","


add words:
add line?


Unnamed: 0,line,line_percent,unk_words
45,"Et selon les experts, c'est l'élévation de la température des océans qui expliquent l'augmentation du nombre et de la violence des typhons qui, au Japon comme ailleurs en Asie, sont de plus en plus dévastateurs, donc meurtriers.",0.857143,"""l'élévation"", ""océans"", ""l'augmentation"", ""typhons"", ""dévastateurs"", ""meurtriers"","


add words:"l'élévation", "océans", "l'augmentation", "typhons", "dévastateurs"
add line?y
add theme:COP


Unnamed: 0,line,line_percent,unk_words
46,"Bruno Duval, Tokyo RFI.",1.0,


add words:
add line?


Unnamed: 0,line,line_percent,unk_words
47,À l'écoute du journal en français faciles RFI à Paris 17 h 05.\n\nfacile.,0.933333,"""h"","


add words:
add line?


Unnamed: 0,line,line_percent,unk_words
48,facile.,1.0,


add words:
add line?


Unnamed: 0,line,line_percent,unk_words
49,Le 18ᵉ sommet de la Francophonie commence demain en Tunisie.,1.0,


add words:
add line?y
add theme:Franco


Unnamed: 0,line,line_percent,unk_words
50,Près de 30 chefs d'Etat et de gouvernement sont attendus ce week end sur l'île de Djerba.,0.888889,"""week"", ""end"","


add words:"week", "end"
add line?y
add theme:Franco


Unnamed: 0,line,line_percent,unk_words
51,Ils ont tous en commun de parler la langue française.,0.909091,"""commun"","


add words:"commun"
add line?y
add theme:Franco


Unnamed: 0,line,line_percent,unk_words
52,"Le français, quatrième langue au monde la plus parlée et la plus utilisée sur Internet.",0.882353,"""quatrième"", ""utilisée"","


add words:"quatrième", "utilisée"
add line?y
add theme:Franco


Unnamed: 0,line,line_percent,unk_words
53,"Mais avant que ne débute le sommet, nous rejoignons notre envoyée spécial Clémentine Pawlotski en direct de Djerba.",0.95,"""débute"","


add words:
add line?


Unnamed: 0,line,line_percent,unk_words
54,"Clémentine, les discussions ont déjà débuté aujourd'hui et deux textes sont d'ailleurs actuellement à l'étude Clémentine.",0.833333,"""débuté"", ""textes"", ""l'étude"","


add words:
add line?


Unnamed: 0,line,line_percent,unk_words
55,"Oui Adrien, ce sont des discussions entre ministres et délégués de la francophonie.",0.866667,"""délégués"", ""francophonie"","


add words:
add line?


Unnamed: 0,line,line_percent,unk_words
56,"Je précise que les discussions entre chefs d'État et de gouvernement de la Francophonie, elles, ne débuteront que demain.",0.954545,"""débuteront"","


add words:
add line?


Unnamed: 0,line,line_percent,unk_words
57,"Alors, les deux textes à l'étude portent pour le premier sur la déclaration de Djerba.",0.882353,"""textes"", ""l'étude"","


add words:
add line?


Unnamed: 0,line,line_percent,unk_words
58,Cela fait plusieurs mois qu'un comité de rédaction travaille dessus.,0.727273,"""comité"", ""rédaction"", ""dessus"","


add words:
add line?


Unnamed: 0,line,line_percent,unk_words
59,"Et puis le deuxième texte porte, lui, sur la résolution des situations de crises et de conflits dans l'espace francophone.",0.956522,"""francophone"","


add words:
add line?


Unnamed: 0,line,line_percent,unk_words
60,"Il est notamment question, Adrien, de la situation sécuritaire au Sahel ou encore de la crise entre la République démocratique du Congo et le Rwanda.",0.928571,"""sécuritaire"", ""démocratique"","


add words:
add line?


Unnamed: 0,line,line_percent,unk_words
61,"Alors, cette conférence ministérielle, elle se poursuit cet après-midi selon nos informations, le pays qui assure actuellement la présidence de l'Organisation de la Francophonie, l'Arménie, devait faire le bilan des engagements pris lors du dernier sommet, celui organisé à Erevan en 2018.",0.918367,"""ministérielle"", ""présidence"", ""bilan"", ""lors"","


add words:
add line?


Unnamed: 0,line,line_percent,unk_words
62,"Et puis, au delà du volet politique, les ministres et chefs de délégation de la Francophonie doivent aussi se pencher sur des questions plus techniques, des questions plus logistiques liées au fonctionnement même de l'organisation ou encore à son financement.",0.840909,"""delà"", ""volet"", ""délégation"", ""pencher"", ""liées"", ""fonctionnement"", ""financement"","


add words:
add line?


Unnamed: 0,line,line_percent,unk_words
63,"Clémentine Pawlotski en direct de Tunisie pour suivre le Sommet de la Francophonie qui s'achèvera dimanche soir, à noter qu'aujourd'hui la police tunisienne a dispersé des manifestants juste avant ce sommet de la francophonie.",0.888889,"""s'achèvera"", ""qu'aujourd'hui"", ""tunisienne"", ""francophonie"","


add words:
add line?


Unnamed: 0,line,line_percent,unk_words
64,"Ces manifestants voulaient dénoncer la mauvaise gestion du gouvernement tunisien concernant les naufrages, les accidents en mer de bateaux remplis de migrants.",0.75,"""dénoncer"", ""gestion"", ""tunisien"", ""concernant"", ""naufrages"", ""remplis"","


add words:
add line?


Unnamed: 0,line,line_percent,unk_words
65,"Et justement, Adrien, une semaine après l'accueil du navire humanitaire l'Ocean Viking à Toulon, plus de la moitié des rescapés ne peuvent pas entrer en France.",0.866667,"""l'accueil"", ""navire"", ""moitié"", ""rescapés"","


add words:
add line?


Unnamed: 0,line,line_percent,unk_words
66,"Sur les 234 personnes secourues par le bateau ambulance en mer Méditerranée, 123 migrants, des hommes et femmes font l'objet de ce qu'on appelle un refus d'entrer sur le territoire français.",0.882353,"""secourues"", ""ambulance"", ""refus"", ""d'entrer"","


add words:
add line?


Unnamed: 0,line,line_percent,unk_words
67,Et puis refermons ce journal en musique.,1.0,


add words:
add line?


Unnamed: 0,line,line_percent,unk_words
68,"Après délibération du jury, la chanteuse malienne Awa Diallo, dite Black AD remporte cette année le prix RFI Découverte.",0.863636,"""délibération"", ""malienne"", ""remporte"","


add words:
add line?


Unnamed: 0,line,line_percent,unk_words
69,"Son nom de scène traduit son combat pour la beauté africaine et juste après avoir remporté le prix, la chanteuse partage sa joie au micro de Sébastien Jedor\n\nAlors je me sens bien.",0.914286,"""traduit"", ""remporté"", ""partage"","


add words:
add line?


Unnamed: 0,line,line_percent,unk_words
70,Il y a beaucoup de joie.,1.0,


add words:
add line?


Unnamed: 0,line,line_percent,unk_words
71,"En fait, j'ai même pas les mots pour exprimer ce que je ressens.",0.8,"""j'ai"", ""exprimer"", ""ressens"","


add words:
add line?


Unnamed: 0,line,line_percent,unk_words
72,"Moi, je suis vraiment contente!",1.0,


add words:
add line?


Unnamed: 0,line,line_percent,unk_words
73,Et voilà.,1.0,


add words:
add line?


Unnamed: 0,line,line_percent,unk_words
74,Ça représente vraiment une fierté.,0.666667,"""représente"", ""fierté"","


add words:
add line?


Unnamed: 0,line,line_percent,unk_words
75,Vraiment beaucoup de choses.,1.0,


add words:
add line?


Unnamed: 0,line,line_percent,unk_words
76,"C'est une grande fierté, c'est tout ce que je peux dire.",0.923077,"""fierté"","


add words:
add line?


Unnamed: 0,line,line_percent,unk_words
77,"Merci à Dieu, merci à la famille, merci à RFI j'y croyais parce que faut pas désespérer.",0.954545,"""désespérer"","


add words:
add line?


Unnamed: 0,line,line_percent,unk_words
78,Moi je suis comme ça dans tous les concours.,0.9,"""concours"","


add words:
add line?


Unnamed: 0,line,line_percent,unk_words
79,J'y crois.,1.0,


add words:
add line?


Unnamed: 0,line,line_percent,unk_words
80,"Je crie grâce à Dieu, j'ai été la lauréate.",0.818182,"""j'ai"", ""lauréate"","


add words:
add line?


Unnamed: 0,line,line_percent,unk_words
81,J'aimerais tout simplement dire que je suis prête.,1.0,


add words:
add line?


Unnamed: 0,line,line_percent,unk_words
82,Quel que soit ce qui va arriver.,1.0,


add words:
add line?


Unnamed: 0,line,line_percent,unk_words
83,"Je suis prête pour les scènes, pour tout ce qui va avec.",1.0,


add words:
add line?


Unnamed: 0,line,line_percent,unk_words
84,Je suis prête.,1.0,


add words:
add line?


Unnamed: 0,line,line_percent,unk_words
85,Let's go!,0.5,"""'s"", ""go"","


add words:
add line?


Unnamed: 0,line,line_percent,unk_words
86,Félicitations!,1.0,


add words:
add line?


Unnamed: 0,line,line_percent,unk_words
87,Bravo!,1.0,


add words:
add line?


Unnamed: 0,line,line_percent,unk_words
88,à Black AD qui remporte le prix RFI découverte 2022.,0.909091,"""remporte"","


add words:
add line?


In [244]:
output_df

Unnamed: 0,theme,line


In [344]:
with open(path+'unknown_french_dad_list.txt',"r") as input_file:
    new_words = input_file.read()
    new_words = re.sub('\n',"",new_words)
    new_words = new_words.split(',')
    new_words = [re.sub("^['|\"]|^\['|['|\"]$|'\]$","",line.strip().lower()) for line in new_words] #update regex
#new_words
print(len(new_words))
known_manual = get_known(new_words)

6
preuves
Known =k
5 remaining
provisions
Known =kk
got 1
4 remaining
raconté…
Known =k
3 remaining
aube
Known =k
2 remaining
inconnus
Known =k
1 remaining
éclaircir
Known =k
0 remaining


In [345]:
print(len(known_manual))
for word in known_manual:
    vocab['white_listed'].add(word)

1


In [346]:
#known_lines_only = list(set(list(filtered_art.knowns[filtered_art.knowns != "..."])))
#known_lines_only = sorted(known_lines_only,key=len)
known_lines_only = list(filtered_art.knowns[filtered_art.knowns != "..."])
known_lines_only = sorted(known_lines_only,key=len)


In [347]:
#export knowns: [filtered_art.knowns != "..."]
#known_list = list(filtered_art.knowns)
with open(path+"known_lines_dad_fr.txt","w") as outfile:
    for line in known_lines_only:
        outfile.write(line+"\n")

In [360]:
#write to .json formats
df = vocab
df['white_listed'] = list(df['white_listed'])
df['black_listed'] = list(df['black_listed'])
with open(path+source_file, "w") as outfile:
    json.dump(df,outfile)
print(len(vocab['white_listed']))

13199


In [361]:
#read from existing .json formats
with open(path+source_file, "r") as path_in:
    vocab = json.loads(path_in.read())
vocab['white_listed'] = set(vocab['white_listed'])
vocab['black_listed'] = set(vocab['black_listed'])
print(len(vocab['white_listed']))

13199


##EB section

In [79]:
#read eb unknown files from .json formats
"""
Remaining vocab size = 2325
Total learned = 31
"""
with open(path+'eb_unk.json', "r") as infile:
    eb_unk = json.loads(infile.read())
print("Remaining vocab size = "+str(len(eb_unk)))

with open(path+'eb_unk_mo.json', "r") as infile:
    eb_unk_mo = json.loads(infile.read())

with open(path+'eb_learned.json', "r") as infile:
    eb_learned= json.loads(infile.read())
print("Total learned = "+str(len(eb_learned)))

with open(path+'eb_unk_examples.json', "r") as infile:
    eb_unk_examples= json.loads(infile.read())



Remaining vocab size = 2323
Total learned = 33


In [80]:
"""
space = plain, add without conjurgating or changing
c = conjugate as normal verb
x = noun that adds x when plural
s = noun that add s when plural
iv = inner verb. There is a verb in the expression that should be 
    conjurgated
"""
with open("/Users/elyebliss/Desktop/Vocabulary/vocab_dfs/fr_eb_extra.txt","r") as infile:
    extras_lemmas = infile.read().split('\n')

for lemma in extras_lemmas:
    extras_morphs = how_to_add(lemma)
    
    eb_unk[lemma] = extras_morphs
    for morph in extras_morphs:        
        if morph not in eb_unk_mo:
            eb_unk_mo[morph] = lemma

print("Remaining vocab size = "+str(len(eb_unk)))

riposte
c
x
s
ivs
davantage 
c
x
s
iv
peiné
c
x
s
ivs
briguer
c
x
s
ivc
rien n’est joué 
c
x
s
iv
ne date pas d’hier
c
x
s
iv
agacer 
c
x
s
ivc
autochtones
c
x
s
iv
d'aplomb
c
x
s
iv
âpre
c
x
s
ivs
Remaining vocab size = 2332


In [81]:
out = find_unks(pages,eb_unk_mo,eb_unk_examples)

In [84]:
review_out(out,eb_learned,eb_unk,eb_unk_mo,eb_unk_examples)

Lemma: flux
Morph: ['flux', 'flux', 'flux', 'flux', 'flux', 'flux', 'flux', 'flux', 'flux', 'flux', 'flux', 'flux']
New examples: 
    Jump to navigation                                             TV5 Monde    TV5MONDE Plus    TV    Vos rendez-vous    Guide des programmes    Revoir (replay)    Emissions    Réception TV    Direct TV      Info    Langue française    Afrique    Plus    Et aussi …    Voyage    Tivi5mondeplus    Nos applications mobiles    Flux RSS et widgets    Nous suivre sur les réseaux sociaux    Marchés publics          TV5MONDE à travers le monde     TV5MONDE Amérique Latine FR, EN, ES     TV5MONDE Asie/Pacifique FR, EN     TV5MONDE Etats-Unis EN     TV5MONDE Europe FR, EN, NL, DE, RO     TV5MONDE Maghreb-Orient FR, EN, AR             INFO     Formulaire de recherche   Rechercher          Menu        Menu principal Accueil  Videos  Afrique  Terriennes  Culture  Les journaux  En continu       Suivez-nous  Facebook  Twitter             Ukraine-Russie : la guerre       

options : k,m,a,q
    Jump to navigation                                             TV5 Monde    TV5MONDE Plus    TV    Vos rendez-vous    Guide des programmes    Revoir (replay)    Emissions    Réception TV    Direct TV      Info    Langue française    Afrique    Plus    Et aussi …    Voyage    Tivi5mondeplus    Nos applications mobiles    Flux RSS et widgets    Nous suivre sur les réseaux sociaux    Marchés publics          TV5MONDE à travers le monde     TV5MONDE Amérique Latine FR, EN, ES     TV5MONDE Asie/Pacifique FR, EN     TV5MONDE Etats-Unis EN     TV5MONDE Europe FR, EN, NL, DE, RO     TV5MONDE Maghreb-Orient FR, EN, AR             INFO     Formulaire de recherche   Rechercher          Menu        Menu principal Accueil  Videos  Afrique  Terriennes  Culture  Les journaux  En continu       Suivez-nous  Facebook  Twitter             Info     Francophonie : l'actualité de la langue française dans le monde Sommet de la Francophonie  Sommet de la francophonie : où en est l'usage 

options : k,m,a,qa
Lemma: civière
Morph: ['civière']
Previous examples: 
 - Coupe du Monde 2022 : Amine Harit gravement blessé à une semaine du début de la compétition    14 nov 2022    Le Marocain de l'Olympique de Marseille Amine Harit est sorti sur une civière lors du match contre Monaco, dimanche en Ligue 1....             Coupe du monde 2022 : Thuram convoqué in extremis chez les Bleus, Kimpembe forfait    14 nov 2022  TV5MONDE AFP    Marcus Thuram, attaquant du Borussia Mönchengladbach, a été convoqué pour compléter le groupe de l'équipe de France retenu pour...            4 : 19    Coupe du monde : les Sénégalais de l'ASC Lebougui en route pour le Qatar !
New examples: 
Lemma: conformément
Morph: ['conformément']
Previous examples: 
 - "Je crois que c'était un missile russe, conformément au rapport des militaires" ukrainiens, a-t-il ajouté alors que les responsables de l'Otan ont estimé qu'il s'agissait probablement d'un missile du système ukrainien de défense anti-aérienne.
New

options : k,m,a,qa
Lemma: lancée
Morph: ['lancée', 'lancée']
New examples: 
Meta a embauché et s'est lancée dans de nouveaux projets en croyant que les recettes publicitaires resteraient élevées.
options : k,m,a,qk
“ Elle a embauché et s'est lancée dans de nouveaux projets en croyant que les recettes publicitaires resteraient élevées ", ajoute-t-elle.
options : k,m,a,q
Lemma: frileux
Morph: ['frileux']
New examples: 
Des actionnaires frileux faces aux nouveaux investissements de GAFAM's  “ Meta pensait que la croissance du commerce en ligne continuerait dans la durée ”, argumente  Debra Aho Williamson.
options : k,m,a,qa
Lemma: curer
Morph: ['curer']
New examples: 
Il leur a redit sa volonté de "curer le marigot" de Washington, selon sa formule consacrée pour désigner les élites honnies de la capitale fédérale.
options : k,m,a,qa
Lemma: âpre
Morph: ['âpre', 'âpre']
New examples: 
- Âpre bataille - Une partie de la nébuleuse conservatrice s'est déjà tournée vers un autre possible préten

In [85]:
#write to eb unknown files to .json formats
"""
Remaining vocab size = 2322
Total learned = 43
"""
with open(path+'eb_unk.json', "w") as outfile:
    json.dump(eb_unk,outfile)
print("Remaining vocab size = "+str(len(eb_unk)))

with open(path+'eb_unk_mo.json', "w") as outfile:
    json.dump(eb_unk_mo,outfile)

with open(path+'eb_learned.json', "w") as outfile:
    json.dump(eb_learned,outfile)
print("Total learned = "+str(len(eb_learned)))

with open(path+'eb_unk_examples.json', "w") as outfile:
    json.dump(eb_unk_examples,outfile)

Remaining vocab size = 2322
Total learned = 43


### Code no longer in use

In [162]:
##France24
print("Checking France24")
parser = 'html.parser'  # or 'lxml' (preferred) or 'html5lib', if installed
#request=urllib.request.Request('https://savoirs.rfi.fr/en/apprendre-enseigner/langue-fran%C3%A7aise/journal-en-francais-facile',None,headers)
request=urllib.request.Request('https://www.france24.com/fr/',None,headers)

resp = urllib.request.urlopen(request)
soup = bs4.BeautifulSoup(resp, parser, from_encoding=resp.info().get_param('charset'))


pages = []
for link in soup.find_all('a', href=True):
    str_ver = str(link['href'])
    
    #get regex such that links start with /fr/ and don't end
    #with /
    if bool(re.search('^\/fr\/.*',str_ver)) and (str_ver[len(str_ver)-1]!='/'):
        pages.append('https://www.france24.com'+str_ver)
pages = list(set(pages))

Checking France24


In [202]:
##France24
print("Checking 20minutes")
parser = 'html.parser'  # or 'lxml' (preferred) or 'html5lib', if installed
#request=urllib.request.Request('https://savoirs.rfi.fr/en/apprendre-enseigner/langue-fran%C3%A7aise/journal-en-francais-facile',None,headers)
request=urllib.request.Request('https://www.20minutes.fr/',None,headers)

resp = urllib.request.urlopen(request)
soup = bs4.BeautifulSoup(resp, parser, from_encoding=resp.info().get_param('charset'))


pages = []
for link in soup.find_all('a', href=True):
    str_ver = str(link['href'])
    
    final_segment = str_ver.split('/')[len(str_ver.split('/'))-1]
    #articles have https:// at the beginning and a date in the last segment
    if ('https://www.20minutes.fr' in str_ver)\
    and bool(re.search('[0-9]',final_segment)):
        pages.append(str_ver)
    
pages = list(set(pages))
pages

Checking 20minutes


['https://www.20minutes.fr/monde/4010811-20221119-etats-unis-joe-biden-marie-petite-fille-maison-blanche-huis-clos',
 'https://www.20minutes.fr/guide-achat/bon-plan-fdj/4010807-20221119-resultats-loto-fdj-resultats-tirage-samedi-19-novembre',
 'https://www.20minutes.fr/sport/4010800-20221119-masters-atp-novak-djokovic-qualifie-huitieme-finale',
 'https://www.20minutes.fr/sport/4010789-20221119-ski-americaine-mikaela-shiffrin-gagne-slalom-levi-premiere-course-saison',
 'https://www.20minutes.fr/videos/oh-my-fake/4010674-20221119-nasa-origine-etude-refute-fonte-glaces-antarctique',
 'https://www.20minutes.fr/justice/4010803-20221119-homme-demembre-pres-rouen-deux-accusees-condamnees-22-17-annees-reclusion',
 'https://www.20minutes.fr/television/4010518-20221119-star-academy-avant-votait-ur-maintenant-tete-telecrochets-heure-vote-utile',
 'https://www.20minutes.fr/sante/4010717-20221119-sante-mentale-maison-perchee-jeunes-souffrant-bipolarite-schizophrenie-entraident',
 'https://www.20min

In [203]:
len(pages)

68

In [178]:
return_line_percents(webpage)

[100.0,
 100.0,
 100.0,
 100.0,
 100.0,
 83.33333333333333,
 71.42857142857143,
 75.0,
 77.77777777777777,
 80.0,
 81.81818181818181,
 83.33333333333333,
 84.61538461538461,
 85.71428571428571,
 86.66666666666667,
 87.5,
 88.23529411764706,
 83.33333333333333,
 84.21052631578948,
 80.0,
 80.95238095238095,
 81.81818181818181,
 82.6086956521739,
 83.33333333333333,
 84.0,
 84.61538461538461,
 85.18518518518519,
 85.71428571428571,
 86.20689655172414,
 86.66666666666667,
 87.09677419354838,
 87.5,
 84.84848484848484,
 85.29411764705883,
 85.71428571428571,
 86.11111111111111,
 86.48648648648648,
 86.84210526315789,
 87.17948717948718,
 87.5,
 87.8048780487805,
 88.0952380952381,
 88.37209302325581,
 88.63636363636364,
 88.88888888888889,
 86.95652173913044,
 87.23404255319149,
 87.5,
 87.75510204081633,
 88.0,
 88.23529411764706,
 88.46153846153847,
 88.67924528301887,
 88.88888888888889,
 89.0909090909091,
 89.28571428571429,
 89.47368421052632,
 89.65517241379311,
 88.13559322033899,
 

In [177]:
def return_line_percents(webpage):
    request=urllib.request.Request(webpage,None,headers) #The assembled request
    response = urllib.request.urlopen(request)
    data = response.read()
    contents = text_from_html(data)

    known_array = []
    unk_array = []
    contents_array = sent_tokenize(contents)

    lines = []

    for line in contents_array:
        tokenized = word_tokenize(line, language='french')

        line_total = 0
        line_unks = 0
        for word in tokenized:

            line_total +=1

            if not (bool(re.search("[0-9]", word)) or\
                    bool(re.search("[A-Z]", word)) or\
                    bool(re.search("[.,\/#!$%\^&\*;:{}=\-_`~()«»]", word)) or\
                    word.lower() in vocab['white_listed']):

                    line_unks +=1

            lines.append(100*(line_total-line_unks)/line_total)
    return(lines)

In [None]:

if (start is not None) and (stop is not None):
    contents_array=contents_array[max(start,0):min(stop,len(contents_array))]

with open(path+'unknown_french_dad_list.txt',"w") as outfile:
    outfile.write(str(list(disallowed_words)))


In [91]:
            
##TV5Monde
print("Checking TV5Monde")
parser = 'html.parser'  # or 'lxml' (preferred) or 'html5lib', if installed
#request=urllib.request.Request('https://savoirs.rfi.fr/en/apprendre-enseigner/langue-fran%C3%A7aise/journal-en-francais-facile',None,headers)
request=urllib.request.Request('https://www.tv5monde.com/',None,headers)

resp = urllib.request.urlopen(request)
soup = bs4.BeautifulSoup(resp, parser, from_encoding=resp.info().get_param('charset'))


pages = []
for link in soup.find_all('a', href=True):
    if '/info/' in str(link['href']):
        #print(str(link['href']))
        #str_page = 'https://savoirs.rfi.fr/'+str(link['href'])
        #str_page = 'https://savoirs.rfi.fr/en/apprendre-enseigner/langue-francaise/journal-en-francais-facile-'+str(re.search(r"[0-9]{8}.*",str(link['href'])).group(0))
        if str(link['href']) not in pages:
            pages.append(str(link['href']))

pages

Checking TV5Monde


['https://information.tv5monde.com/info/ukraine-russie-la-guerre',
 'https://information.tv5monde.com/info/cop27-l-afrique-au-coeur-des-enjeux-climatiques',
 'https://information.tv5monde.com/info/la-francophonie-en-sommet-djerba',
 'https://information.tv5monde.com/info/coupe-du-monde-de-foot-2022-au-qatar',
 'https://information.tv5monde.com/info/decryptage-vrai-dire',
 'https://information.tv5monde.com/info/direct-sommet-de-la-francophonie-les-travaux-se-poursuivent-djerba-479093',
 'https://information.tv5monde.com/info/tunisie-quels-sont-les-grands-enjeux-du-xviii-sommet-de-la-francophonie-478730',
 'https://information.tv5monde.com/info/direct-ukraine-le-premier-ministre-britannique-rishi-sunak-promet-50-millions-de-livres-d-aide',
 'https://information.tv5monde.com/info/cop27-un-accord-trouve-extremis-sur-la-question-des-degats-climatiques-479072']

In [300]:
##add in extras
with open("/Users/elyebliss/Desktop/Vocabulary/vocab_dfs/fr_eb_extra.txt","r") as infile:
    extras_lemmas = infile.read().split('\n')

gs = goslate.Goslate()

default_conjugator = mlconjug3.Conjugator(language='fr')


for lemma in extras_lemmas:
    
    decision = how_to_add(lemma)
    
    if lemma not in eb_unk:
        eb_unk[lemma] = []
        
    extras_morphs = []
    extras_morphs.append(lemma)
    try:
        #if it's an infinitive, add all conjurgations
        test_verb = default_conjugator.conjugate(lemma)
        all_conjugated_forms = test_verb.iterate()
        for item in all_conjugated_forms:
            if item not in extras_morphs:
                extras_morphs.append(item[len(item)-1])
        extras_morphs = list(set(extras_morphs))
    except:
        
        #if not an expression:
        if len(lemma.split(' '))==1:
            #add plural nouns
            if lemma[len(lemma)-1]=='u':
                extras_morphs.append(lemma+'x')
            else:
                extras_morphs.append(lemma+'s')

    eb_unk[lemma] = extras_morphs
    for morph in extras_morphs:        
        if morph not in eb_unk_mo:
            eb_unk_mo[morph] = lemma

print("Remaining vocab size = "+str(len(eb_unk)))

Remaining vocab size = 2327


In [177]:
#make backups
eb_learned_backup = eb_learned.copy()
eb_unk_backup = eb_unk.copy()
eb_unk_mo_backup = eb_unk_mo.copy()
eb_unk_examples_backup = eb_unk_examples.copy()
print("unknowns before = "+str(len(eb_unk_backup)))

unknowns before = 2319


In [149]:
add_to_black = [ ]

for word in add_to_black:
    vocab['black_listed'].add(word)

In [8]:
#Upload from download
with open(path+'known_french_dad_list.txt',"r") as input_file:
    new_words = input_file.read()
    new_words = re.sub('\n',"",new_words)
    new_words = new_words.split(',')
    new_words = [line.replace('"',"").strip().lower() for line in new_words]
    print("daily catch in word count:")
    print(len(new_words))
    
    for line in new_words:   
        vocab['white_listed'].add(line.replace('"',"").strip().lower())

FileNotFoundError: [Errno 2] No such file or directory: '/Users/elyebliss/Desktop/Vocabulary/vocab_dfs/known_french_dad_list.txt'

In [65]:
pp.pprint(len(disallowed_words))

407


In [None]:
##France24:
print("Checking France24")
parser = 'html.parser'  # or 'lxml' (preferred) or 'html5lib', if installed
request=urllib.request.Request('https://www.france24.com/fr/info-en-continu/',None,headers)
resp = urllib.request.urlopen(request)
soup = bs4.BeautifulSoup(resp, parser, from_encoding=resp.info().get_param('charset'))

pages = []
for link in soup.find_all('a', href=True):
    if '/fr/info-en-continu/' in str(link['href']):
        pages.append(urljoin('https://www.france24.com',urllib.parse.quote(link['href'].encode('UTF-8'))))


#pages=["https://www.france24.com/fr/europe/20210607-tests-pass-sanitaire-comment-s-organise-l-europe-pour-accueillir-les-touristes"]
for webpage in pages:

    try:
        request=urllib.request.Request(webpage,None,headers) #The assembled request
        response = urllib.request.urlopen(request)
        data = response.read()
        contents = text_from_html(data).lower().split()

        frenchContents = set()
        for item in contents:
            if item.isalpha():
                frenchContents.add(item)
        
        
        print(len(frenchContents.difference(vocabSet)))

        if (len(frenchContents)>0) & (len(frenchContents.difference(vocabSet))<=120):
            
            withinLvl.write(str(webpage)+"\t"+str(len(frenchContents.difference(vocabSet)))+"\n")
            for word in frenchContents.difference(vocabSet):
                withinLvl.write("\t"+str(word))
                try:
                    withinLvl.write("\t"+str(gs.translate(word,'en')))
                except:
                    pass
                withinLvl.write("\n")
            print(webpage)
            print(frenchContents.difference(vocabSet))

                
        #print(webpage)
        
            
    except:
        pass
"""




In [117]:
"""
to do:

"""

In [49]:
##INPUT-OUTPUT
#vocab list:
with open("/Users/elyebliss/Desktop/Vocabulary/vocab_dfs/dad_whitelisted.csv","r") as infile:
    whitelisted_lemmas = infile.read()


##VARIABLES
vocab_all = set()
gs = goslate.Goslate()

default_conjugator = mlconjug3.Conjugator(language='fr')


for line in whitelisted_lemmas.split('\n'):
    if len(line) > 0:
        #print(line)
        vocab = line.lower().strip()
        vocab_all.add(vocab)
        try:
            #if it's an infinitive, add all conjurgations
            test_verb = default_conjugator.conjugate(vocab)
            all_conjugated_forms = test_verb.iterate()
            for item in all_conjugated_forms:
                
                vocab_all.add(item[len(item)-1])
                
            
        except:
            #might be a noun, add plural
            vocab_all.add(add_noun(vocab))
#len(whitelisted_lemmas.split('\n'))            
#pp.pprint(vocab_all)

In [83]:
vocab = {}
vocab['white_listed'] = list(vocab_all)
vocab['black_listed'] = []


In [27]:
#used for creating eb data

with open(path+'eb_unk_lemmas.csv','r') as infile:
    eb_unk_lemmas = pd.read_csv(infile)
with open(path+'eb_unk_morph.csv','r') as infile:
    eb_unk_morph = pd.read_csv(infile)
    
eb_unk = {}
for word in eb_unk_lemmas.lemme:
    eb_unk[word] = []

    for index, row in eb_unk_morph.iterrows():
        if row.lemme == word:
            eb_unk[word].append(row.ortho)
eb_unk_mo = {}
for lemma in eb_unk.keys():
    for morph in eb_unk[lemma]:
        eb_unk_mo[morph] = lemma

In [147]:
eb_learned = {}
eb_unk_examples = {}
for word in eb_unk.keys():
    eb_unk_examples[word] = []

In [130]:
#restore to backups
eb_learned = eb_learned_backup
eb_unk = eb_unk_backup
eb_unk_mo = eb_unk_mo_backup
eb_unk_examples = eb_unk_examples