<a href="https://colab.research.google.com/github/gretiere545/corpus/blob/main/CorpusPipe_V1_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Pipe Corpus v1.2
### version de transition (moindre effort)


*   Auteur : Gilles Retière
*   Date de création : 2022 01 10
*   Version : 1.2
*   Révision : 1
```


```

## Imports

In [1]:
#!/usr/bin/env python
# -*- coding: utf8 -*-
!export PYTHONIOENCODING=utf8
!pip install gspread-formatting

import pandas as pd
import numpy as np
import uuid
import random
import os
import re
import json

# general
pd.set_option("display.width",1000)

# gdrive
from google.colab import drive
from google.colab import files
drive.mount('/content/drive')
%cd /content/drive/MyDrive/Trad-Union/Corpus/ASAMLA

# Ce bout de code pour pouvoir downloader des google sheets dans des dataframes
from google.colab import auth
auth.authenticate_user()

# gspread
import gspread
from gspread_formatting import *
from gspread_dataframe import get_as_dataframe, set_with_dataframe
from oauth2client.client import GoogleCredentials
gc = gspread.authorize(GoogleCredentials.get_application_default())
from googleapiclient.discovery import build
service = build('sheets', 'v4')
drive_service = build('drive', 'v3')

import config



Collecting gspread-formatting
  Downloading gspread_formatting-1.0.5-py2.py3-none-any.whl (21 kB)
Installing collected packages: gspread-formatting
Successfully installed gspread-formatting-1.0.5
Mounted at /content/drive
/content/drive/MyDrive/Trad-Union/Corpus/ASAMLA


## Ouverture du fichier de paramétrage des langues (JSON)
*   Contient tous les paramètres spécifiques (typo, crédits, etc.)

In [2]:
def get_cc_config(f):
  vk = []
  try:
    with open(f) as vk_f:
        vk = json.load(vk_f)
  except BaseException as e:
    print(e)
  return vk

def set_cc_config (vk, f):
  # sauvegarde du fichier de paramétrage
  with open(f, 'w') as fp:
      json.dump(vk, fp)  

## Ouverture de **Corpus Central DataBase** (GC Drive compte Atos)
*   Cette feuille est la référence et est alimentée par les Corpus individuels
*   On récupère sous forme de dataframe la partie pivot (français uniquement)

In [3]:
#
# Ouverture de la Sheet Centrale (corpus_central_base)
#
def get_CCDB_wb(uri):
  sheet_central = uri
  wb_central = gc.open_by_url(sheet_central)
  return wb_central

def get_CCDB_data(wb, idx):
  t_corpus = wb.get_worksheet(idx)
  data_t_corpus = t_corpus.get_all_values()
  return data_t_corpus

#
# DataFrame CC global (toutes les langues)
#
def get_ccdf_global(CCDB):
  return pd.DataFrame(CCDB[1:], columns=CCDB[0])

#
# DataFrame CC Français (invariable)
#
def get_ccdf_fr(df_cc):
  # on supprime les doublons
  return df_cc[['uid','expression','glossaire','état','date','commentaires','index']].drop_duplicates()


## Ouverture de **Corpus Local DataBase** (GC Drive compte GRE)
*   Cette feuille est la référence comme Corpus de travail des interprètes
*   Une feuille par thème
*   On récupère sous forme de dataframe

In [4]:
#
# Ouverture de la Sheet Locale (corpus_local_base)
#
def get_CLDB(uri):
  wb_local = gc.open_by_url(uri)
  return wb_local  

#
# Chaque onglet est chargé dans un dataframe
#
def get_corpus_trad (wb, df_corpus, d_lang):
  t_corpus_trad = wb.get_worksheet(d_lang['idx'])
  data_t_corpus = t_corpus_trad.get_all_values()
  df_corpus_trad = pd.DataFrame(data_t_corpus[1:], columns=data_t_corpus[0])
  # on vérifie que chaque expression possède la même clé
  df_corpus = pd.merge(df_corpus,df_corpus_trad[['uid','traduction']],on='uid', how='left')
  df_corpus['traduction'] = df_corpus['traduction'].apply(lambda x:x.strip() if x is not np.nan else "")
  
  df_corpus.rename({'traduction': d_lang['trigramme']}, axis=1, inplace=True)
  return df_corpus

#
# Itération pour récupérer dans une liste tous les onglets de traduction (base locale ou vue interprètes)
#
def get_corpus_list (vk_languages, db_cl, df_cc_fr):
  vk_df_corpus = []
  for i in vk_languages:
    #i['uri'] = local_db_uri
    if i['update']=='false':
      print ("Processing " + i['trigramme'])
      df_corpus = get_corpus_trad(db_cl, df_cc_fr,i)
      vk_df_corpus.append(df_corpus)
  return vk_df_corpus


def get_corpus(df_corpus, langue):
  #ouverture de l'onglet Corpus dans un DF
  df = df_corpus[['uid','expression', langue,'index']]
  return df  

# cas des index normaux (français -> langue)
def get_corpus_rev(uri):
  wb = gc.open_by_key(uri)
  #ouverture de l'onglet Corpus dans un DF
  t_corpus = wb.worksheet('med_vac_synthese')
  data_t_corpus = t_corpus.get_all_values()
  df_corpus = pd.DataFrame(data_t_corpus[1:], columns=data_t_corpus[0])
  return df_corpus

def get_all_corpus_rev (vk_languages):
  # tableau de corpus indéxé par langue de traduction
  df_corpus_rev = []
  # itération sur chaque trigramme de langue
  for i in vk_languages:
    if i['update']=='false':
      df = get_corpus_rev(i['uri'])
      df_corpus_rev.append(df)
      if trace is True:
        print ("* get_all_corpus_rev " + i['language'])  

  return df_corpus_rev

# Fonctions
### Application des bonnes règles typographiques pour la ponctuation française : espace fine insécable
```
# https://www.typofute.com/l_espace_fine_insecable_dans_les_documents_html  
# https://www.compart.com/en/unicode/U+202F
```



In [5]:
#
# Substitution d'un caractère unicode par un autre (cas des alphabets cyrilliques))
#
def replace_unicode(word, vk_uni):
  for t_uni in vk_uni:
    word = word.replace (t_uni[1], t_uni[2])
  return word

#
# Substitution du ? final par un espace insécable + le ? (pour les césures PDF)
#
def narrow_no_break_space (s):
  s = " ".join(s.split())   # on vire tous les espaces en trop
  find = r'(\s*\?$)'        # on recherche le ? précédé ou pas d'espace \s*
  replace = u'\u202F'+ r'?'
  s = re.sub(find, replace, s)    # on remplace par un blanc insécable + le ?
  return s  

# Application des bonnes règles typographiques pour la ponctuation française : espace fine insécable
def set_typo_rules (df_cc_fr):
  print ("Application des bonnes règles typographiques")
  df_cc_fr['expression'] = df_cc_fr['expression'].apply(lambda x:narrow_no_break_space(x))  
  # itération langues
  for i in vk_languages:
    if i['update']=='false':
      df_cc_fr = pd.merge(df_cc_fr,vk_df_corpus[i['idx']-1][['uid',i['trigramme']]],on='uid', how='left')
      # Application des bonnes règles typographiques pour la ponctuation française : espace fine insécable
      df_cc_fr[i['trigramme']] = df_cc_fr[i['trigramme']].apply(lambda x:narrow_no_break_space(x))    
      if i['unicode_substition']!=[]:
        # substition de caractères unicodes (optionnel)
        print (i['trigramme'])
        df_cc_fr[i['trigramme']] = df_cc_fr[i['trigramme']].apply(lambda x:replace_unicode (x, i['unicode_substition'])) 
  return df_cc_fr  

In [6]:
def half_split(s):
    half, rem = divmod(len(s), 2)
    return s[:half + rem], s[half + rem:]

def convertTuple(tup):
    # initialize an empty string
    s = ''
    for item in tup:
        s = s + item
    return s    

def convertTupleStr(tup):
    # initialize an empty string
    vk = []
    for item in tup:
        vk.append(''+item)
    return vk    

def rtl_arabic (s, pdf):
    arabic_string = arabic_reshaper.reshape(s)
    arabic_string = arabic_string[::-1]
    w = pdf.get_string_width(arabic_string) + 6
    # problème d'inversion des parenthèses
    arabic_string = arabic_string.replace('(', '§')
    arabic_string = arabic_string.replace(')', '(')    
    arabic_string = arabic_string.replace('§', ')')
    return arabic_string 

#
# Liste de styles graphiques
#
def get_color_theme():
  vk_color_theme=[
  {'name':'Abstract vector geometric pattern. Symmetrical layout. Illustration eps 10.','color_1':'128, 191, 162','color_2':'137, 166, 93','color_3':'217, 184, 85','color_4':'217, 170, 85','color_5':'242, 242, 242'},
  {'name':'color theme_IMG_2040','color_1':'220, 118, 70','color_2':'243, 161, 75','color_3':'147, 173, 164','color_4':'191, 219, 207','color_5':'234, 223, 201'},               
  {'name':'Water textured background. Calm sea ripples','color_1':'3, 140, 140','color_2':'3, 166, 166','color_3':'3, 127, 140','color_4':'217, 170, 85','color_5':'242, 242, 242'},
  {'name':'Pink and blue abstract paper background from a curved sheet.','color_1':'217, 119, 173','color_2':'102, 127, 109','color_3':'208, 217, 242','color_4':'121, 150, 132','color_5':'217, 187, 169'},
  {'name':'Sort of blue','color_1':'255, 255, 255','color_2':'81, 129, 140','color_3':'47, 89, 115','color_4':'133, 166, 162','color_5':'60, 60, 255'},
  {'name':'asamla','color_1':'255, 255, 255','color_2':'166, 3, 33','color_3':'174, 186, 191','color_4':'242, 188, 27','color_5':'242, 140, 15','color_6':'242, 48, 5'},
  {'name':'Healthcare background with medical symbols in hexagonal frame','color_1':'255, 255, 255','color_2':'4, 173, 191','color_3':'167, 235, 242','color_4':'4, 191, 191','color_5':' 3, 166, 150','color_6':'4, 191, 157'},
  {'name':'healthcare background with medical symbols in hexagonal frame','color_1':'242, 242, 242','color_2':'122, 191, 179','color_3':' 149, 191, 184','color_4':'39, 140, 11','color_5':'88, 166, 144','color_6':'166, 3, 33'}
  ]    
  return vk_color_theme

#
# Transformation en image depuis PDF
#
def pdf2img(trigramme, output_pdf):
  images = convert_from_path(output_pdf)
  dossier = item['trigramme']
  if not os.path.exists(dossier):
    os.makedirs(dossier)

  os.chdir(config.root_path + dossier)     
  sous_dossier = "lex-fr-"+item['trigramme']
  if not os.path.exists(sous_dossier):
    os.makedirs(sous_dossier)

  os.chdir(config.root_path + dossier + "/" + sous_dossier) 
  file = "med-vac-" + sous_dossier + "-"
  i=0
  for img in images:
    i+=1
    img.save(file + '_' + str(i) + ".jpg", 'JPEG')

  os.chdir(config.root_path)
  return
  
#
# Transformation en image depuis PDF
#
def pdf2img_rev(trigramme, output_pdf):

  images = convert_from_path(output_pdf)
  dossier = item['trigramme']
  if not os.path.exists(dossier):
    os.makedirs(dossier)
  os.chdir(config.root_path + dossier)
  sous_dossier = "lex-" + item['trigramme'] + "-fr"
  if not os.path.exists(sous_dossier):
    os.makedirs(sous_dossier)

  os.chdir(config.root_path +  dossier + "/" + sous_dossier) 
  file = "med-vac-" + sous_dossier + "-"
  i=0
  for img in images:
    i+=1
    img.save(file + '_' + str(i) + ".jpg", 'JPEG')

  os.chdir(config.root_path)
  return

### Format google sheet

In [7]:
#
# Formatage gg sheet
#
def format_feuille(wb, nom_onglet):
  # onglet
  ws = wb.worksheet(nom_onglet)
  # Format de la partie gauche
  fmt = cellFormat(
      backgroundColor=color(0.91, 0.96, 0.93),
      textFormat=textFormat(bold=False, foregroundColor=color(0,0,0), fontSize='10'),
      horizontalAlignment='LEFT'
      )
  format_cell_range(ws, 'A:G', fmt)

  # format de l'entete
  fmt = cellFormat(
      backgroundColor=color(0.7725,0.8431,0.7922),
      textFormat=textFormat(bold=True, foregroundColor=color(0,0,0), fontSize='10'),
      horizontalAlignment='LEFT'
      )
  format_cell_range(ws, '1', fmt)
  
  # on gèle l'entete et les colonnes de gauche
  set_frozen(ws, rows=1, cols=7)
  set_column_width(ws, 'A', 100)
  set_column_width(ws, 'B', 300)
  return

#
# par défaut, le fichier est créé à la racine de drive, il faut le déplacer dans le bon dossier
# 
def move_sh (drive_service, file_id, folder_id):
  file = drive_service.files().get(fileId=file_id,
                                  fields='parents').execute()
  previous_parents = ",".join(file.get('parents'))
  # Move the file to the new folder
  file = drive_service.files().update(fileId=file_id,
                                      addParents=folder_id,
                                      removeParents=previous_parents,
                                      fields='id, parents').execute()
  return


#
# création d'une feuille vierge
# 
def create_sheet (service, title):
  spreadsheet = {
    'properties': {
        'title': title
    }
  } 
  spreadsheet = service.spreadsheets().create(body=spreadsheet,
                                    fields='spreadsheetId').execute()
  print('Spreadsheet ID: {0}'.format(spreadsheet.get('spreadsheetId')))

  return spreadsheet.get('spreadsheetId')  

### Fonctions du processus

In [8]:
#
# synchronisation des corpus
#
def sync_cc_local_to_central (db_cc_wb, df_cc, nom_onglet):
  df_corpus_synth = df_cc.copy()

  # si l'onglet existe déjà
  try:
    ws = db_cc_wb.worksheet(nom_onglet)
    db_cc_wb.del_worksheet(ws)
  except:
    print ("Onglet inexistant !")

  db_cc_wb.add_worksheet(nom_onglet, 1, 1)
  export_sheet = db_cc_wb.worksheet(nom_onglet)
  set_with_dataframe(export_sheet, df_corpus_synth)
  format_feuille(db_cc_wb, nom_onglet)
  if trace is True:
    print ("*****************************************")
    print ("* Synchro okay                          *")
    print ("*****************************************")
  return

In [9]:
#
# Règles d'indexation spéciales selon les langues
#
def set_special_index_rules_1 (lang, df_temp):
  # moment du tri alpha selon alphabet local
  alphabet = lang['alphabet']
  a = [x for x in alphabet]    
  # suppression des ? en début de phrase en espagnol
  if lang['trigramme'] == 'esp':
    df_temp['index'] = df_temp[lang['trigramme']].map(lambda x: x.lstrip('¿¡')).apply(lambda x:x[0].upper() if (len(x)>0) else "")  
  elif lang['trigramme'] == 'hun' or lang['trigramme'] == 'alb':
    # cas du hongrois, de l'albanais (index multilettres)
    liste_lettre = [half_split(x) for x in [convertTuple(t) for t in [half_split(x) for x in alphabet.split('-')]]]
    a = [
        initiale
            for t in liste_lettre
            for t2 in convertTupleStr(t)
            for initiale in t2.split()
    ]      
    df_temp['index'] = df_temp[lang['trigramme']].apply(lambda x:x[:2] if ((len(x)>0) and x[:2] in a) else x[0].upper() if (len(x)>0) else "")            
  else:
    df_temp['index'] = df_temp[lang['trigramme']].apply(lambda x:x[0].upper() if (len(x)>0) else "")     

  return df_temp

def set_special_index_rules_2 (lang, df_temp):
  if lang['trigramme'] == 'geo':
    # pas de majuscule en géorgien, on force en minuscule si besoin
    df_temp['index'] = df_temp['index'].apply(lambda x:x[0].lower() if (len(x)>0) else "") 
  return df_temp
  
#
# Regroupement du dataframe par catégories
#
def categorise_df (lang, df_temp):
  # moment du tri alpha selon alphabet local
  alphabet = lang['alphabet']
  a = [x for x in alphabet]  

  df_temp['index'] = df_temp['index'].astype("category")
  df_temp['index'].cat.set_categories(a, inplace=True)
  df_temp.sort_values(["index", lang['trigramme']], ascending=True, inplace=True)
  return df_temp

In [10]:
def create_reverse_indexed_corpus(vk_languages, df_corpus, service, drive_service):

  # tableau de corpus indéxé par langue de traduction
  df_corpus_trad = []
  # itération sur chaque trigramme de langue
  for i in vk_languages:
    if i['update']=='false':
      df_temp = df_corpus[['uid', ''.join(map(str, i['trigramme'])) ,'glossaire','état','date','commentaires', 'expression']].drop_duplicates()
      #
      # Règles d'indexation spéciales selon les langues (passe #1)
      #
      df_temp = set_special_index_rules_1 (i, df_temp)
      #
      # Regroupement par catégories
      #
      df_temp = categorise_df(i, df_temp)
      #
      # Règles d'indexation spéciales selon les langues (passe #2)
      #
      df_temp = set_special_index_rules_2 (i, df_temp)
      #
      # ajout à la liste de corpus
      #
      df_corpus_trad.append(df_temp)

      sh_trad = 'corpus_central_base_'+i['trigramme']
      nom_onglet = "med_vac_synthese"
      sh_id = i['uri']
      try:
        wb_trad = gc.open_by_key(sh_id)
        sh = wb_trad.worksheet("med_vac_synthese")
      except BaseException as e:
        print(e)
        sh_id = create_sheet(service, sh_trad)
        i['uri'] = sh_id
        move_sh (drive_service, sh_id, "1L8YxbtY9Rn0hEO-IkMtvikdJUEkXExyi")   
        wb_trad = gc.open_by_key(sh_id)
        sh = wb_trad.add_worksheet(nom_onglet, 1, 1)
      set_with_dataframe(sh, df_temp)
      format_feuille(wb_trad, nom_onglet) 

  return df_corpus_trad

In [11]:
def create_reversed_corpus_with_index (vk_languages, df_corpus, service, drive_service, nom_onglet):
  # tableau de corpus indéxé par langue de traduction
  df_corpus_trad = []
  # itération sur chaque trigramme de langue
  for i in vk_languages:
    if i['update']=='false':
      df_temp = df_corpus[['uid', ''.join(map(str, i['trigramme'])) ,'glossaire','état','date','commentaires', 'expression']].drop_duplicates()

      # moment du tri alpha selon alphabet local
      alphabet = i['alphabet']
      a = [x for x in alphabet]  

      # suppression des ? en début de phrase en espagnol
      if i['trigramme'] == 'esp':
        df_temp['index']=df_temp[i['trigramme']].map(lambda x: x.lstrip('¿¡')).apply(lambda x:x[0].upper() if (len(x)>0) else "")  
      elif i['trigramme'] == 'hun' or i['trigramme'] == 'alb':
        # cas du hongrois, de l'albanais (index multilettres)
        liste_lettre = [half_split(x) for x in [convertTuple(t) for t in [half_split(x) for x in alphabet.split('-')]]]
        a = [
            initiale
                for t in liste_lettre
                for t2 in convertTupleStr(t)
                for initiale in t2.split()
        ]      
        df_temp['index']=df_temp[i['trigramme']].apply(lambda x:x[:2] if ((len(x)>0) and x[:2] in a) else x[0].upper() if (len(x)>0) else "")            
      else:
        df_temp['index']=df_temp[i['trigramme']].apply(lambda x:x[0].upper() if (len(x)>0) else "")    

      df_temp['index'] = df_temp['index'].astype("category")
      df_temp['index'].cat.set_categories(a, inplace=True)
      df_temp.sort_values(["index",i['trigramme']], ascending=True, inplace=True)
      if i['trigramme'] == 'geo':
        # pas de majuscule en géorgien, on force en minuscule si besoin
        df_temp['index']=df_temp['index'].apply(lambda x:x[0].lower() if (len(x)>0) else "")     
      #df_temp = df_temp.sort_values(by=['index',i['trigramme']], ascending=True)    
      df_corpus_trad.append(df_temp)
      sh_trad = 'corpus_central_base_'+i['trigramme']
      # nom_onglet = "med_vac_synthese"
      sh_id = i['uri']
      try:
        wb_trad = gc.open_by_key(sh_id)
        sh = wb_trad.worksheet(nom_onglet)
        if trace is True:
          print ("* Reversing op for " + i['language'])
      except BaseException as e:
        print(e)
        sh_id = create_sheet(service, sh_trad)
        i['uri'] = sh_id
        move_sh (drive_service, sh_id, "1L8YxbtY9Rn0hEO-IkMtvikdJUEkXExyi")   
        wb_trad = gc.open_by_key(sh_id)
        sh = wb_trad.add_worksheet(nom_onglet, 1, 1)
      set_with_dataframe(sh, df_temp)
      format_feuille(wb_trad, nom_onglet)    
   
  return vk_languages

# Lancement du PIPE

In [12]:
version = "1.2.1"
nom_onglet = 'med_gen_synthese'
trace = True
debug = False
vk_corpus_saisie = []

In [13]:
# Config Langues
config_languages = 'med_vac_synthese.json'
vk_languages = get_cc_config(config_languages)

In [14]:
# Config Corpus (de consolidation)
config_corpus_central = "corpus_central_config.json"
vk_corpus_central = get_cc_config(config_corpus_central)

In [15]:
# Config Corpus locales (de saisie)
config_corpus_local = "corpus_local_config.json"
vk_corpus_local = get_cc_config(config_corpus_local)

In [16]:
# Base Centrale
# on charge l'onglet 1 (français)
db_cc_wb = get_CCDB_wb(vk_corpus_central[0]["uri"])
db_cc = get_CCDB_data (db_cc_wb, 1)
df_cc_global = get_ccdf_global(db_cc)
df_cc_fr = get_ccdf_fr(df_cc_global)

In [25]:
db_cl

<Spreadsheet 'ASAMLA - Corpus Médical Médecine Générale-Traduction-v1.0' id:1zB7hvjr2HufHcVgcJfEtrUUmR75wU5OCiL-_d4oDtTU>

In [56]:
# vecteur de corpus
vk_df_corpus = []
vk_df_trad = []
# on itère sur chaque corpus
for item in vk_corpus_local:
  print(item)
  # on lit la base locale
  db_cl = get_CLDB(item["uri"])
  # on construit la partie gauche (expression sans traduction)
  t_expr = db_cl.get_worksheet(vk_languages[0]['idx']) # le premier fait l'affaire
  data_t_expr = t_expr.get_all_values()
  df_corpus = pd.DataFrame(data_t_expr[1:], columns=data_t_expr[0])  
  df_corpus.rename({"français":"expression"}, axis=1, inplace=True)
  df_corpus = df_corpus[['uid','expression']]
  # on itère sur chaque langue
  for item_lang in vk_languages:
    print (item_lang)
    # on construit par ajout la partie droite (traductions)
    t_trad = db_cl.get_worksheet(item_lang['idx'])
    data_t_trad = t_trad.get_all_values()
    df_trad = pd.DataFrame(data_t_trad[1:], columns=data_t_trad[0])    
    # on renomme la colonne par le trigramme de la langue
    df_trad['traduction'] = df_trad['traduction'].apply(lambda x:x.strip() if x is not np.nan else "")
    df_trad.rename({"traduction":item_lang["trigramme"]}, axis=1, inplace=True)  
    # on ajoute une colonne sur la droite
    vk_df_trad.append(df_trad)
    df_corpus[item_lang["trigramme"]]=df_trad[item_lang["trigramme"]]

  # on obtient un vecteur qui contient les corpus avec transfo des onglets de traduction en colonnes
  vk_df_corpus.append(df_corpus)

{'corpus-name': 'Vaccination', 'corpus-filename': 'ASAMLA - Corpus Médical Vaccination-v2.0', 'domaine': 'med', 'trigramme': 'vac', 'onglet': 'med_vac_synthese', 'uri': 'https://docs.google.com/spreadsheets/d/1CclzYfFCW4srA3Lq_np2LpSrxj84JpcbzytL449DH8E', 'idx': 1, 'version': '1', 'sub-version': '1', 'creation-date': '2022-01-06', 'last-update': '2022-01-06', 'credits': 'Gilles Retière', 'model-filename': 'asamla-corpus_francais_med_vac-v1.2', 'model-uri': 'https://docs.google.com/spreadsheets/d/1-2SDHCFR5JOHCc4MiYSMo4V8yG7kfHbpIKV3q9ci6wE', 'meta': {'title-fr': 'Vaccination', 'title-en': 'Vaccination', 'subtitle-fr': '', 'subtitle-en': '', 'credits-fr': '', 'credits-en': ''}}
{'language': 'Arabe', 'trigramme': 'ams', 'uri': '134rEeVux-FvSxPt4EO4OM1M8PnGHa6P1ewtmnsnRfrs', 'idx': 1, 'update': 'false', 'alphabet': 'اآٱأإبتثجحخدذرزسشصضطظعغفقكلمنهةوؤيئىء', 'credits': 'Sonia ZARROUK, Wafa TAHRI', 'font-family': 'DejaVuSans', 'font-path': '/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf', 't

In [None]:
df_temp = []
i=0
for item in vk_corpus_local:
  db_cl = get_CLDB(item["uri"])
  t_corpus_trad = db_cl.get_worksheet(vk_languages[5]['idx'])
  data_t_corpus = t_corpus_trad.get_all_values()
  df_corpus_trad = pd.DataFrame(data_t_corpus[1:], columns=data_t_corpus[0])
  df_corpus_trad.rename({"français":"expression"}, axis=1, inplace=True)
  df_1notin2 = df_corpus_trad[~(df_corpus_trad['uid'].isin(df_cc_fr['uid']) )].reset_index(drop=True)
  df_temp.append(pd.concat([df_cc_fr,df_1notin2]).drop_duplicates(keep=False))
  df_temp[i].drop("traduction", 1, inplace=True)
  i=i+1

df = pd.concat(df_temp).drop_duplicates(keep=False)

In [None]:
get_corpus_list(vk_languages, db_cl, df_cc_fr)

In [None]:
# mock
db_cl = get_CLDB(vk_corpus_local[0]["uri"]) # vaccination

In [None]:
t_corpus_trad = db_cl.get_worksheet(vk_languages[5]['idx'])
data_t_corpus = t_corpus_trad.get_all_values()
df_corpus_trad = pd.DataFrame(data_t_corpus[1:], columns=data_t_corpus[0])

In [None]:
df_corpus_trad

In [None]:
df_corpus_trad.rename({"français":"expression"}, axis=1, inplace=True)

In [None]:
# difference sur uid
df_1notin2 = df_corpus_trad[~(df_corpus_trad['uid'].isin(df_cc_fr['uid']) )].reset_index(drop=True)

In [None]:
df_1notin2

Unnamed: 0,uid,expression,traduction


In [None]:
df_new2 = pd.concat([df_cc_fr,df_1notin2]).drop_duplicates(keep=False)

In [None]:
df_new2.drop("traduction", 1, inplace=True)

In [None]:
pd.concat([df_new,df_new2]).drop_duplicates(keep=False)

NameError: ignored

In [None]:
df_new

Unnamed: 0,uid,expression,glossaire,état,date,commentaires,index
0,f397cda1,A jeun,TRUE,validé,07/06/2021,,A
1,95c4a863,Accès fébrile concomitant chez un autre membre...,FALSE,supprimé,07/06/2021,,A
2,30579682,Accident,TRUE,validé,07/06/2021,,A
3,6f214e4c,Accident cérébral,TRUE,validé,07/06/2021,,A
4,8b195cf4,Allergie,TRUE,validé,07/06/2021,,A
...,...,...,...,...,...,...,...
167,fee1214e,Vitamines,TRUE,validé,07/06/2021,,V
168,39cba202,Vomissements,FALSE,validé,07/06/2021,,V
169,80aac977,Vos vaccinations sont-elles à jour ?,FALSE,validé,07/06/2021,,V
170,36e66161,Y a-t-il des cas contagieux à l'école ?,FALSE,validé,07/06/2021,,Y


In [None]:
df_corpus_trad

Unnamed: 0,uid,expression,traduction
0,f397cda1,A jeun,Pe stomacul gol
1,95c4a863,Accès fébrile concomitant chez un autre membre...,Mai are cineva temperatură în familie ?
2,30579682,Accident,Accident
3,6f214e4c,Accident cérébral,Accident vascular cerebral
4,8b195cf4,Allergie,Alergie
...,...,...,...
166,fee1214e,Vitamines,Vitamine
167,39cba202,Vomissements,Vomă/Vărsături
168,80aac977,Vos vaccinations sont-elles à jour ?,Sunteți la zi cu vaccinurile ?
169,36e66161,Y a-t-il des cas contagieux à l'école ?,La sșcoală sunt cazuri de infecție contagioasă ?


In [None]:
df_cc_fr

Unnamed: 0,uid,expression,glossaire,état,date,commentaires,index
0,f397cda1,A jeun,TRUE,validé,07/06/2021,,A
1,95c4a863,Accès fébrile concomitant chez un autre membre...,FALSE,supprimé,07/06/2021,,A
2,30579682,Accident,TRUE,validé,07/06/2021,,A
3,6f214e4c,Accident cérébral,TRUE,validé,07/06/2021,,A
4,8b195cf4,Allergie,TRUE,validé,07/06/2021,,A
...,...,...,...,...,...,...,...
167,fee1214e,Vitamines,TRUE,validé,07/06/2021,,V
168,39cba202,Vomissements,FALSE,validé,07/06/2021,,V
169,80aac977,Vos vaccinations sont-elles à jour ?,FALSE,validé,07/06/2021,,V
170,36e66161,Y a-t-il des cas contagieux à l'école ?,FALSE,validé,07/06/2021,,Y


In [None]:
# merge avec l'existant côté serveur
pd.merge(df_cc_fr,df_corpus_trad[['uid','traduction']],on='uid', how='left')

Unnamed: 0,uid,expression,glossaire,état,date,commentaires,index,traduction
0,f397cda1,A jeun,TRUE,validé,07/06/2021,,A,Pe stomacul gol
1,95c4a863,Accès fébrile concomitant chez un autre membre...,FALSE,supprimé,07/06/2021,,A,Mai are cineva temperatură în familie ?
2,30579682,Accident,TRUE,validé,07/06/2021,,A,Accident
3,6f214e4c,Accident cérébral,TRUE,validé,07/06/2021,,A,Accident vascular cerebral
4,8b195cf4,Allergie,TRUE,validé,07/06/2021,,A,Alergie
...,...,...,...,...,...,...,...,...
166,fee1214e,Vitamines,TRUE,validé,07/06/2021,,V,Vitamine
167,39cba202,Vomissements,FALSE,validé,07/06/2021,,V,Vomă/Vărsături
168,80aac977,Vos vaccinations sont-elles à jour ?,FALSE,validé,07/06/2021,,V,Sunteți la zi cu vaccinurile ?
169,36e66161,Y a-t-il des cas contagieux à l'école ?,FALSE,validé,07/06/2021,,Y,La sșcoală sunt cazuri de infecție contagioasă ?


In [None]:
# on itère sur la liste
for i in vk_corpus_local:
  print(i["corpus-name"])
  vk_corpus_saisie.append(i["uri"])
  db_cl = get_CLDB(i["uri"])
  for j in vk_languages:
    if j['update']=='false':
      print ("Processing " + j['trigramme'])
      #df_corpus = get_corpus_trad(db_cl, df_cc_fr,i)
      #vk_df_corpus.append(df_corpus)  
      # base locale
      t_corpus_trad = db_cl.get_worksheet(d_lang['idx'])
      data_t_corpus = t_corpus_trad.get_all_values()
      df_corpus_trad = pd.DataFrame(data_t_corpus[1:], columns=data_t_corpus[0])
 
      # on vérifie que chaque expression possède la même clé
    df_corpus = pd.merge(df_corpus,df_corpus_trad[['uid','traduction']],on='uid', how='left')
  df_corpus['traduction'] = df_corpus['traduction'].apply(lambda x:x.strip() if x is not np.nan else "")
  
  df_corpus.rename({'traduction': d_lang['trigramme']}, axis=1, inplace=True)
  return df_corpus      

On met à jour l'existant

On ajoute ce qui est inexistant côté central

In [None]:
df_new = pd.merge(df_cc_fr,df_corpus_trad[['uid','expression','traduction']],on='expression', how='right')
df_new.rename({"uid_x":"uid"}, axis=1, inplace=True)
df_new["uid"]=df_new["uid_y"]

In [None]:
df_new

Unnamed: 0,uid,expression,glossaire,état,date,commentaires,index,uid_y,traduction
0,f397cda1,A jeun,TRUE,validé,07/06/2021,,A,f397cda1,Pe stomacul gol
1,95c4a863,Accès fébrile concomitant chez un autre membre...,,,,,,95c4a863,Mai are cineva temperatură în familie ?
2,30579682,Accident,TRUE,validé,07/06/2021,,A,30579682,Accident
3,6f214e4c,Accident cérébral,TRUE,validé,07/06/2021,,A,6f214e4c,Accident vascular cerebral
4,8b195cf4,Allergie,TRUE,validé,07/06/2021,,A,8b195cf4,Alergie
...,...,...,...,...,...,...,...,...,...
166,fee1214e,Vitamines,TRUE,validé,07/06/2021,,V,fee1214e,Vitamine
167,39cba202,Vomissements,FALSE,validé,07/06/2021,,V,39cba202,Vomă/Vărsături
168,80aac977,Vos vaccinations sont-elles à jour ?,,,,,,80aac977,Sunteți la zi cu vaccinurile ?
169,36e66161,Y a-t-il des cas contagieux à l'école ?,,,,,,36e66161,La sșcoală sunt cazuri de infecție contagioasă ?


In [None]:
vk_df_corpus = get_corpus_list(vk_languages, db_cl, df_cc_fr)


  vk_df_corpus = []
  for i in vk_languages:
    #i['uri'] = local_db_uri
    if i['update']=='false':
      print ("Processing " + i['trigramme'])
      df_corpus = get_corpus_trad(db_cl, df_cc_fr,i)
      vk_df_corpus.append(df_corpus)
  return vk_df_corpus

In [None]:
vk_df_corpus

In [None]:
def get_corpus_trad (wb, df_corpus, d_lang):
  t_corpus_trad = wb.get_worksheet(d_lang['idx'])
  data_t_corpus = t_corpus_trad.get_all_values()
  df_corpus_trad = pd.DataFrame(data_t_corpus[1:], columns=data_t_corpus[0])
  # on vérifie que chaque expression possède la même clé
  df_corpus = pd.merge(df_corpus,df_corpus_trad[['uid','traduction']],on='uid', how='left')
  df_corpus['traduction'] = df_corpus['traduction'].apply(lambda x:x.strip() if x is not np.nan else "")
  
  df_corpus.rename({'traduction': d_lang['trigramme']}, axis=1, inplace=True)
  return df_corpus

In [None]:
df[1]

In [None]:
t_corpus_trad = db_cl.get_worksheet(d_lang['idx'])
data_t_corpus = t_corpus_trad.get_all_values()
df_corpus_trad = pd.DataFrame(data_t_corpus[1:], columns=data_t_corpus[0])

In [None]:
df=[]
for j in vk_corpus_local:
  print(j["uri"])
  db_cl = get_CLDB(j["uri"])

  
  df.append (get_corpus_trad(db_cl, df_cc_fr,vk_languages[1]))

https://docs.google.com/spreadsheets/d/1CclzYfFCW4srA3Lq_np2LpSrxj84JpcbzytL449DH8E
https://docs.google.com/spreadsheets/d/1zB7hvjr2HufHcVgcJfEtrUUmR75wU5OCiL-_d4oDtTU


In [None]:
vk_df_corpus = []
# pour chaque langue
for i in vk_languages:
  print(i['trigramme'])
  # pour chaque corpus local
  df_temp=[]
  for j in vk_corpus_local:
    print(j["corpus-name"])
    db_cl = get_CLDB(j["uri"])
    df_temp.append(get_corpus_trad(db_cl, df_cc_fr,i))
  vk_df_corpus.append(df_temp)
  df_all = pd.concat(df_temp)

In [None]:
config_corpus_local = "corpus_central_config.json"
vk_corpus_central = get_cc_config(config_corpus_local)

In [None]:
db_cc_wb = get_CCDB_wb(vk_corpus_central[0]['uri'])
idx = 1 #onglet FR
db_cc = get_CCDB_data (db_cc_wb, idx)
df_cc_global = get_ccdf_global(db_cc)
df_cc_fr = get_ccdf_fr(df_cc_global)

In [None]:
df_cc_fr

In [None]:
#
# Config Params
#
config_languages = 'med_vac_synthese.json'
vk_languages = get_cc_config(config_languages)
vk_corpus_saisie = []

# liste des corpus locaux
config_corpus_local = "corpus_local_config.json"
vk_corpus_local = get_cc_config(config_corpus_local)

# on itère sur la liste
for i in vk_corpus_local:
  print(i["corpus-name"])
  vk_corpus_saisie.append(i["uri"])

version = "2.4.8"
nom_onglet = 'med_gen_synthese'
trace = True
debug = False
vk_color_theme = get_color_theme()
idx = 2 #Med générale (1=Vaccination)

#
# Base Centrale
#
config_corpus_local = "corpus_central_config.json"
vk_corpus_central = get_cc_config(config_corpus_local)

db_cc_wb = get_CCDB_wb('https://docs.google.com/spreadsheets/d/1L8YB1aXHUJwUE9AE6xyn_xMHalinGR335Q7lntwbu1U')
db_cc = get_CCDB_data (db_cc_wb, idx)
df_cc_global = get_ccdf_global(db_cc)
df_cc_fr = get_ccdf_fr(df_cc_global)


Vaccination
Médecine générale


In [None]:
local_db_uri = vk_corpus_saisie[1]

In [None]:
db_cl = get_CLDB(local_db_uri)

In [None]:
vk_df_corpus = get_corpus_list(vk_languages, db_cl, df_cc_fr)

Processing ams
Processing eng
Processing tur
Processing rus
Processing ukr
Processing rou
Processing hun
Processing tig
Processing alb
Processing geo
Processing arm
Processing dar
Processing pst
Processing prs
Processing aze
Processing esp
Processing amh
Processing all
Processing pol
Processing som


In [None]:
vk_df_corpus

[         uid  ...                                ams
 0   f9bc13ef  ...  في أي وقت من اليوم تتناول الدواء؟
 1   0e9f09c9  ...                     هل انخفض وزنك؟
 2   6e65dde7  ...                             خُرّاج
 3   1178fb77  ...                        تحليل البول
 4   ef2f771b  ...                          تخدير عام
 ..       ...  ...                                ...
 90  3691346a  ...                      أين لديك ألم؟
 91  192bc4c8  ...                   من أيّ بلد أتيت؟
 92  c32f5fd9  ...     هل يوجد أمراض مزمنة في عائلتك؟
 93  0394c416  ...                                   
 94  d288f6d8  ...                                   
 
 [95 rows x 8 columns],
          uid                                         expression  ... index eng
 0   f9bc13ef  À quelle moment de la journée prenez-vous le m...  ...     A    
 1   0e9f09c9                        Votre poids a-t-il baissé ?  ...          
 2   6e65dde7                                              Abcès  ...          
 3   1

In [None]:
df_cc = set_typo_rules(df_cc_fr)

Application des bonnes règles typographiques
ukr


In [None]:
sync_cc_local_to_central (db_cc_wb, df_cc, nom_onglet)

*****************************************
* Synchro okay                          *
*****************************************


In [None]:
create_reversed_corpus_with_index(vk_languages, df_cc, service, drive_service, nom_onglet)

med_gen_synthese
Spreadsheet ID: 1q-Z7KdXzQJcdSRdtLAa3p4rBmrRSwNB3Ur53lnlqG98
med_gen_synthese
Spreadsheet ID: 1scltKezBNXd3v2jpzxhU3r69MgOhkv2psHqS-wsTTXY
med_gen_synthese
Spreadsheet ID: 1yZdMFXENzF9KDLyP5nR-QXmvfVqPTO8LCQUlIcYSLQg
med_gen_synthese
Spreadsheet ID: 18VXch3TkMmoRhfINPeSNL2iuR1OpgX0HPhJtN9-6kvk
med_gen_synthese
Spreadsheet ID: 1XUMVZmBmUrPI5-8-3gAkJfs_4CatKAfv84GFdMp1Ktc
med_gen_synthese
Spreadsheet ID: 1wt1jj7uZpTprw2FQBRB7HDRZRS77XMFNbID9-GRMSns
med_gen_synthese
Spreadsheet ID: 1SNxKGTYpXkYP3dtcA01jrar1KX5zSQrB-FgtpI5ObR0
med_gen_synthese
Spreadsheet ID: 1ckrQuLCF4bC9x9_2m25SpxTLDb58c9-WCuvfbT_CQ1I
med_gen_synthese
Spreadsheet ID: 1KNIcaucGX4hSqoMc5PRmG4HXWGjO0pX4IT6EWRvi_cU
med_gen_synthese
Spreadsheet ID: 1jWDRE4NNiMnlnHU4aNRnKrWyREG3iO5BckwNAWF8gUE
med_gen_synthese
Spreadsheet ID: 1E2P6kO2cvConO8wBnLbNtZCMxf490ao9y6CTQhNRCz0
med_gen_synthese
Spreadsheet ID: 1uUYG1DsH56n_-EdMuAC6OLZ990_MerWIOx8fMMl3lsY
med_gen_synthese
Spreadsheet ID: 1mKugj5wm4rCZXSjQUfeBM2Q-A03V31

[{'alphabet': 'اآٱأإبتثجحخدذرزسشصضطظعغفقكلمنهةوؤيئىء',
  'credits': 'Sonia ZARROUK, Wafa TAHRI',
  'font-family': 'DejaVuSans',
  'font-family-bold': 'DejaVuSans-Bold',
  'font-path': '/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf',
  'glossary-subtitle': 'عربي فرنسي',
  'idx': 1,
  'language': 'Arabe',
  'meta': {'credits-fr': '',
   'credits-tr': '',
   'subtitle-fr': '',
   'subtitle-tr': '',
   'title-fr': '',
   'title-tr': ''},
  'text-direction': 'rtl',
  'trigramme': 'ams',
  'unicode_substition': [],
  'update': 'false',
  'uri': '1q-Z7KdXzQJcdSRdtLAa3p4rBmrRSwNB3Ur53lnlqG98',
  'wals_code': 'ams'},
 {'alphabet': 'AaBbCcDdEeFfGgHhIiJjKkLlMmNnOoPpQqRrSsTtUuVvWwXxYyZz',
  'credits': 'Khalida BENHEDDER',
  'font-family': 'NotoSerif-Regular',
  'font-family-bold': 'NotoSerif-Bold',
  'font-path': '/usr/share/fonts/truetype/noto/NotoSerif-Regular.ttf',
  'glossary-subtitle': 'English-French',
  'idx': 2,
  'language': 'Anglais',
  'meta': {'credits-fr': '',
   'credits-tr': '',
 

In [None]:
#
# Config Params
#
config_languages = 'med_vac_synthese.json'
vk_languages = get_cc_config(config_languages)
vk_corpus_saisie = []
# vaccination
#local_db_uri = "https://docs.google.com/spreadsheets/d/1CclzYfFCW4srA3Lq_np2LpSrxj84JpcbzytL449DH8E"
#vk_corpus_saisie.append(local_db_uri)
# med gen
local_db_uri = "https://docs.google.com/spreadsheets/d/1zB7hvjr2HufHcVgcJfEtrUUmR75wU5OCiL-_d4oDtTU"
vk_corpus_saisie.append(local_db_uri)

version = "2.4.8"
debug = False
trace = True
step_1 = True
step_2 = True
step_3 = True
step_4 = False
vk_color_theme = get_color_theme()

#
# Base Centrale
#
db_cc_wb = get_CCDB_wb('https://docs.google.com/spreadsheets/d/1L8YB1aXHUJwUE9AE6xyn_xMHalinGR335Q7lntwbu1U')
db_cc = get_CCDB_data (db_cc_wb)
df_cc_global = get_ccdf_global(db_cc)
df_cc_fr = get_ccdf_fr(df_cc_global)
#
# Base Locale
#
db_cl = get_CLDB(local_db_uri)

if step_1 is True:
  vk_df_corpus = get_corpus_list(vk_languages, local_db_uri, df_cc_fr)
  #
  # application des règles typographiques et merge de tous les onglets
  #
  df_cc = set_typo_rules(df_cc_fr)


if step_2 is True:
  #
  # synchronisation des corpus
  #
  nom_onglet = 'med_gen_synthese'
  sync_cc_local_to_central (db_cc_wb, df_cc, nom_onglet)
  #
  # création des index inversés (un fichier par langue) des corpus
  #

if step_3 is True:
  create_reversed_corpus_with_index(vk_languages, df_cc, service, drive_service, nom_onglet)
  #
  #

  set_cc_config (vk_languages, config_languages)

  #
  # tableau de corpus indéxé par langue de traduction
  #
if step_4 is True:  
  df_corpus_trad = get_all_corpus_rev(vk_languages)

  make_pdf(vk_languages, df_corpus_trad, df_cc)


Processing ams
Processing eng
Processing tur
Processing rus
Processing ukr
Processing rou
Processing hun
Processing tig
Processing alb
Processing geo
Processing arm
Processing dar
Processing pst
Processing prs
Processing aze
Processing esp
Processing amh
Processing all
Processing pol
Processing som
ukr
*****************************************
* Synchro okay                          *
*****************************************
* Reversing op for Arabe
* Reversing op for Anglais
* Reversing op for Turc
* Reversing op for Russe


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


* Reversing op for Ukrainien
* Reversing op for Roumain
* Reversing op for Hongrois
* Reversing op for Tigrinya
* Reversing op for Albanais
* Reversing op for Géorgien
* Reversing op for Arménien
* Reversing op for Dari
* Reversing op for Pashto
* Reversing op for Fârsi
{
  "error": {
    "code": 503,
    "message": "The service is currently unavailable.",
    "status": "UNAVAILABLE"
  }
}

Spreadsheet ID: 1Le1Ek1EKvOMl1A4tSwNYnW5ubedTLuyDpVqqCuHDNnU
* Reversing op for Espagnol
* Reversing op for Amharique
* Reversing op for Allemand
<!DOCTYPE html>
<html lang=en>
  <meta charset=utf-8>
  <meta name=viewport content="initial-scale=1, minimum-scale=1, width=device-width">
  <title>Error 404 (Not Found)!!1</title>
  <style>
    *{margin:0;padding:0}html,code{font:15px/22px arial,sans-serif}html{background:#fff;color:#222;padding:15px}body{margin:7% auto 0;max-width:390px;min-height:180px;padding:30px 0 15px}* > body{background:url(//www.google.com/images/errors/robot.png) 100% 5px no-rep

In [None]:
vk_languages

# FIN

In [None]:
import sys, locale
print(sys.getfilesystemencoding())
print(locale.getpreferredencoding())

utf-8
UTF-8


In [None]:
!ls /usr/share/fonts/truetype/dejavu

DejaVuSans-Bold.ttf	 DejaVuSansMono.ttf  DejaVuSerif-Bold.ttf
DejaVuSansMono-Bold.ttf  DejaVuSans.ttf      DejaVuSerif.ttf


In [None]:
ord(u'ۍ')

1741

In [None]:
ord(u' ')

In [None]:
ord(u'К')


1050

In [None]:
vk_languages

[{'alphabet': 'اآٱأإبتثجحخدذرزسشصضطظعغفقكلمنهةوؤيئىء',
  'credits': 'Sonia ZARROUK, Wafa TAHRI',
  'font-family': 'DejaVuSans',
  'font-family-bold': 'DejaVuSans-Bold',
  'glossary-subtitle': 'عربي فرنسي',
  'idx': 1,
  'language': 'Arabe',
  'meta': {'credits-fr': '',
   'credits-tr': '',
   'subtitle-fr': '',
   'subtitle-tr': '',
   'title-fr': '',
   'title-tr': ''},
  'text-direction': 'rtl',
  'trigramme': 'ams',
  'unicode_substition': [],
  'update': 'false',
  'uri': '134rEeVux-FvSxPt4EO4OM1M8PnGHa6P1ewtmnsnRfrs'},
 {'alphabet': 'AaBbCcDdEeFfGgHhIiJjKkLlMmNnOoPpQqRrSsTtUuVvWwXxYyZz',
  'credits': 'Khalida BENHEDDER',
  'font-family': 'NotoSerif-Regular',
  'font-family-bold': 'NotoSerif-Bold',
  'glossary-subtitle': 'English-French',
  'idx': 2,
  'language': 'Anglais',
  'meta': {'credits-fr': '',
   'credits-tr': '',
   'subtitle-fr': '',
   'subtitle-tr': '',
   'title-fr': '',
   'title-tr': ''},
  'text-direction': 'ltr',
  'trigramme': 'eng',
  'unicode_substition': [],


In [None]:
set_cc_config (vk_languages, config_languages)

In [None]:
meta = {'meta':{'title-fr':'', 'title-tr':'', 'subtitle-fr':'', 'subtitle-tr':'', 'credits-fr':'', 'credits-tr':''}}

In [None]:
for i in vk_languages:
  i['meta']={'title-fr':'', 'title-tr':'', 'subtitle-fr':'', 'subtitle-tr':'', 'credits-fr':'', 'credits-tr':''}

In [None]:
make_pdf(vk_languages, df_corpus_trad, df_cc)

In [None]:
vk_languages = get_cc_config(config_languages)

In [None]:
vk_languages[4]

{'alphabet': 'АаБбВвГгҐґДдЕеЄєЖжЗзИиІіЇїЙйКкЛлМмНнОоПпРрСсТтУуФфХхЦцЧчШшЩщЮюЯя',
 'credits': 'Natalia SAVELIEVA',
 'font-family': 'NotoSerif-Regular',
 'font-family-bold': 'NotoSerif-Bold',
 'glossary-subtitle': 'Українсько-французька',
 'idx': 5,
 'language': 'Ukrainien',
 'meta': {'credits-fr': 'Traduction : Natalia SAVELIEVA',
  'credits-tr': 'Переклад : Наталія САВЕЛЬЄВА',
  'subtitle-fr': 'Français-Ukrainien',
  'subtitle-tr': 'Українсько-Французький',
  'title-fr': 'Vaccination',
  'title-tr': 'Вакцинація'},
 'text-direction': 'ltr',
 'trigramme': 'ukr',
 'unicode_substition': [['A', 'A', 'А'],
  ['B', 'B', 'В'],
  ['E', 'E', 'Е'],
  ['I', 'I', 'І'],
  ['K', 'K', 'К'],
  ['M', 'M', 'М'],
  ['P', 'P', 'Р'],
  ['T', 'T', 'Т']],
 'update': 'false',
 'uri': '16gWyW1_7z1OOt_ssYHL8iCCZGjOjQ5Wtfbkew9_uvnw'}

In [None]:
vk_languages[3]['meta']['credits-fr']='Traduction : Natalia SAVELIEVA'
vk_languages[3]['meta']['credits-tr']='Перевод : Наталья САВЕЛЬЕВА'
vk_languages[3]['meta']['title-fr']='Vaccination'
vk_languages[3]['meta']['title-tr']='Вакцинация'
vk_languages[3]['meta']['subtitle-fr']='Français-Russe'
vk_languages[3]['meta']['subtitle-tr']='Русско-Французский'


vk_languages[4]['meta']['credits-fr']='Traduction : Natalia SAVELIEVA'
vk_languages[4]['meta']['credits-tr']='Переклад : Наталія САВЕЛЬЄВА'
vk_languages[4]['meta']['title-fr']='Vaccination'
vk_languages[4]['meta']['title-tr']='Вакцинація'
vk_languages[4]['meta']['subtitle-fr']='Français-Ukrainien'
vk_languages[4]['meta']['subtitle-tr']='Українсько-Французький'

1Jj0FiQqKikotBpGUomjsg3_VnWB8zoCDNn66lRDECSo

In [None]:
df_corpus = df_cc.copy()

In [None]:
create_reverse_indexed_corpus(vk_languages, df_cc, service, drive_service)

ValueError: ignored

In [None]:
df_cc_fr.loc[df_cc_fr['uid']=='f80074ba']

Unnamed: 0,uid,expression,glossaire,état,date,commentaires,index,ams,eng,tur,rus,ukr,rou,hun,tig,alb,geo,arm,dar,pst,prs,aze,esp,amh,all
100,f80074ba,Méningite,True,validé,07/06/2021,,M,التهاب السحايا,Meningitis,Menenjit,Менингит,Менінгіт,Meningită,Meningitisz (agyhártyagyulladás),ናይ ዓጽሚ ምግታር ሕማም,Meningjiti,მენინგიტი,մենինգիտ,,,,,Meningitis,,
101,f80074ba,Méningite,True,validé,07/06/2021,,M,التهاب السحايا,Meningitis,Menenjit,Менингит,Менінгіт,Meningită,Agyhártyagyulladás,ናይ ዓጽሚ ምግታር ሕማም,Meningjiti,მენინგიტი,մենինգիտ,,,,,Meningitis,,


In [None]:
vk_df_corpus