<a href="https://colab.research.google.com/github/gretiere545/tradunion/blob/master/corpus_glossary_builder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Création d'un glossaire alphabétique des Corpus ASAMLA

## Chargement des Corpus

In [None]:
#!/usr/bin/env python
# -*- coding: utf8 -*-

!pip install fpdf
!pip install arabic_reshaper
# https://pyfpdfbook.wordpress.com/2015/03/22/putting-two-adjacent-multicell-blocks/
import pandas as pd
import numpy as np
from fpdf import FPDF
import arabic_reshaper
pd.set_option("display.width",1000)
# Ce bout de code pour pouvoir downloader des fichiers sur le drive
from google.colab import drive
drive.mount('/content/drive')
# Ce bout de code pour pouvoir downloader des google sheets dans des dataframes
from google.colab import auth
auth.authenticate_user()
import gspread
from gspread_dataframe import get_as_dataframe, set_with_dataframe
from oauth2client.client import GoogleCredentials
gc = gspread.authorize(GoogleCredentials.get_application_default())
!export PYTHONIOENCODING=utf8
%cd /content/drive/MyDrive/Trad-Union/Corpus/ASAMLA


In [None]:
# cyrillique
!apt-get update -qq
!apt-get install -y fonts-dejavu-core -qq

# amharique
!apt-get update
!apt-get install fonts-noto


In [5]:
# Open the central Google Sheet (corpus_central_base) through the gspread client.
sheet_central = 'https://docs.google.com/spreadsheets/d/1L8YB1aXHUJwUE9AE6xyn_xMHalinGR335Q7lntwbu1U'
wb_central = gc.open_by_url(sheet_central)

# Read the 'med_vac_synthese' tab as raw rows: the first row supplies the
# column names, the remaining rows are the data.
t_corpus = wb_central.worksheet('med_vac_synthese')
data_t_corpus = t_corpus.get_all_values()
df_corpus = pd.DataFrame(data=data_t_corpus[1:], columns=data_t_corpus[0])

# Working frame restricted to the identifier, the expression, the 'rus'
# translation and the alphabetical index.
df = df_corpus[['uid', 'expression', 'rus', 'index']]



In [57]:
def charge_corpus_langue(df_corpus, langue):
  """Return the corpus restricted to the columns needed for one language.

  Args:
    df_corpus: DataFrame holding at least the 'uid', 'expression' and 'index'
      columns plus one column named after `langue`.
    langue: name of the translation column to keep.

  Returns:
    A DataFrame with the columns 'uid', 'expression', `langue`, 'index',
    in that order.
  """
  colonnes = ['uid', 'expression', langue, 'index']
  return df_corpus[colonnes]


In [56]:
# One entry per target language: 'language' is the label printed in the PDF
# header, 'trigramme' is the corpus column name and the PDF file stem.
vk_lang_dict = [
    {'language': nom, 'trigramme': code}
    for nom, code in (
        ('Arabe', 'ams'),
        ('Turc', 'tur'),
        ('Russe', 'rus'),
        ('Ukrainien', 'ukr'),
        ('Roumain', 'rou'),
        ('Hongrois', 'hun'),
        ('Tigrinya', 'tig'),
        ('Albanais', 'alb'),
        ('Géorgien', 'geo'),
        ('Arménien', 'arm'),
        ('Dari', 'dar'),
        ('Pashto', 'pst'),
        ('Fârsi', 'prs'),
        ('Azéri', 'aze'),
        ('Espagnol', 'esp'),
        ('Amharique', 'amh'),
        ('Peul', 'fma'),
    )
]

In [73]:
class PDF(FPDF):
  """FPDF document carrying the ASAMLA glossary page header.

  The header shows the logo, the corpus title and a
  'Glossaire Français-<language>' line; the language label is supplied with
  `set_langue`.
  """

  def __init__(self):
      super().__init__()
      # Page size used for positioning inside the header.
      self.WIDTH = 210
      self.HEIGHT = 297
      # Plain attributes kept for backward compatibility; nothing in this
      # class reads them.
      # NOTE(review): FPDF is constructed with its defaults above and the
      # coordinates in this class look like millimetres, so 'in' is
      # presumably misleading - confirm before relying on it.
      self.format = 'A4'
      self.unit = 'in'
      # Default label, so header() no longer raises AttributeError when a
      # page is added before set_langue() has been called.
      self.local_language = ''
      self.set_margins(20.0, 10.0, 20.0)

  def header(self):
      """Draw the page header: logo, corpus title, glossary subtitle, rule."""
      # The logo is read from the current working directory.
      self.image('./logo_asamla.jpg', 10, 8, 33)
      self.set_text_color (0,0,0)
      self.set_font('Arial', '', 11)
      # Empty cell used as a spacer to push the title towards the right edge.
      self.cell(self.WIDTH - 80)
      self.cell(50, 1, 'Corpus Médical - Vaccination', 0, 0, 'R')
      self.ln(5)
      self.cell(self.WIDTH - 80)
      self.set_text_color (98,163,98)
      self.cell(50, 1, 'Glossaire Français-'+self.local_language, 0, 0, 'R')
      # Horizontal rule across the full page width, just below the subtitle.
      new_y = self.get_y()+7
      self.line(0, new_y, self.WIDTH, new_y)

  def print_page(self, images=None):
      """Add a new page.

      `images` is accepted for backward compatibility and is not used.
      """
      self.add_page()

  def set_langue(self, langue):
      """Set the language label appended to the header subtitle."""
      self.local_language = langue




In [75]:
def create_pdf_instance(item, df):
  """Write a two-column glossary PDF (French expression | translation).

  Rows are written in the order of `df`. Each time the value of the 'index'
  column changes, a large green heading with that value is emitted before
  the row.

  Args:
    item: dict with 'language' (label shown in the page header) and
      'trigramme' (name of the translation column in `df`, also used as the
      output file stem).
    df: DataFrame with the columns 'index', 'expression' and
      item['trigramme'].

  Returns:
    None. The document is written to '<trigramme>.pdf' in the current
    working directory.
  """
  trigramme = item['trigramme']

  pdf = PDF()
  pdf.set_langue(item['language'])

  dejavu_dir = '/usr/share/fonts/truetype/dejavu/'
  pdf.add_font('DejaVuSans', '', dejavu_dir + 'DejaVuSans.ttf', uni=True)
  # Fix: the bold family was loaded from the regular DejaVuSans.ttf, so the
  # index headings were drawn with the regular face.
  pdf.add_font('DejaVuSans-Bold', '', dejavu_dir + 'DejaVuSans-Bold.ttf', uni=True)
  # Amharic / Tigrinya
  pdf.add_font('NotoSerifEthiopic', '', '/usr/share/fonts/truetype/noto/NotoSerifEthiopic-Regular.ttf', uni=True)
  pdf.set_font('DejaVuSans', '', 9.0)

  # Fix: the Ethiopic font was registered but never selected (the set_font
  # call had to be un-commented by hand). Use it for the translation column
  # of the two languages it was registered for.
  # NOTE(review): Arabic-script columns are written without reshaping; the
  # original only held a commented-out arabic_reshaper experiment.
  if trigramme in ('tig', 'amh'):
    translation_font = 'NotoSerifEthiopic'
  else:
    translation_font = 'DejaVuSans'

  # Page format and margins are already configured by PDF.__init__.
  # A page must exist before anything can be drawn.
  pdf.add_page()

  effective_page_width = pdf.w - 2*pdf.l_margin
  effective_page_height = pdf.h - 2*pdf.b_margin
  multi_cell_width = (effective_page_width/2)-5
  right_column_x = effective_page_width/2 + pdf.l_margin

  rowh = 4.5
  ln = 6.5
  pdf.ln(ln)
  idx = ''  # index value of the previous row ('' before the first row)
  rows = zip(df['index'], df['expression'], df[trigramme])
  for letter, expression, translation in rows:
    if letter != idx:
      # New index value: emit the heading in the left column.
      pdf.ln(ln*1.5)
      ybefore = pdf.get_y()
      pdf.set_font('DejaVuSans-Bold','',24.0)
      pdf.set_text_color(98,163,98)
      pdf.multi_cell(multi_cell_width, rowh, letter)
      pdf.set_xy(right_column_x, ybefore)
      pdf.multi_cell(multi_cell_width, rowh, " ")
      idx = letter
      pdf.ln(ln)

    # Left column: French expression. Remember y so that the right column
    # starts on the same line.
    # NOTE(review): if the left cell wraps onto a new page, `ybefore` still
    # refers to the previous page - confirm whether that can happen with the
    # real data.
    ybefore = pdf.get_y()
    pdf.set_font('DejaVuSans', '', 9.0)
    pdf.set_text_color(0,0,0)
    pdf.multi_cell(multi_cell_width, rowh, expression)

    # Right column: translation.
    pdf.set_xy(right_column_x, ybefore)
    pdf.set_font(translation_font, '', 9.0)
    pdf.multi_cell(multi_cell_width, rowh, translation)

    pdf.ln(ln)

    # Start a new page once the cursor has passed the usable height.
    if effective_page_height - pdf.get_y() < 0:
      pdf.add_page()
      pdf.ln(ln*2)

  pdf.ln(ln)

  pdf.output(trigramme + '.pdf', 'F')

In [76]:
# For every language: extract its working frame from the corpus, keep it in
# df_corpus_langue, and generate the matching glossary PDF.
df_corpus_langue = []
for item in vk_lang_dict:
  corpus_langue = charge_corpus_langue(df_corpus, item['trigramme'])
  df_corpus_langue.append(corpus_langue)
  create_pdf_instance(item, corpus_langue)


## Classe de construction de PDF

In [65]:
# Stand-alone build of a single glossary from the working frame `df`
# (column 'rus'), written to 'tut58.pdf'.
# https://stackoverflow.com/questions/58801358/displaying-arabic-words-incorrectly-in-fpdf-python

pdf = PDF()
# Fix: the header label was 'ukr' although the column written below is 'rus'.
pdf.set_langue('Russe')
pdf.add_font('DejaVuSans', '', '/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf', uni=True)
# Fix: the bold family was loaded from the regular DejaVuSans.ttf.
pdf.add_font('DejaVuSans-Bold', '', '/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf', uni=True)
pdf.set_font('DejaVuSans', '', 9.0)
# Amharic / Tigrinya (registered, not used for this column)
pdf.add_font('NotoSerifEthiopic', '', '/usr/share/fonts/truetype/noto/NotoSerifEthiopic-Regular.ttf', uni=True)

# Page format and margins are already configured by PDF.__init__.
# A page must exist before anything can be drawn.
pdf.add_page()

effective_page_width = pdf.w - 2*pdf.l_margin
effective_page_height = pdf.h - 2*pdf.b_margin

multi_cell_width = (effective_page_width/2)-5
right_column_x = effective_page_width/2 + pdf.l_margin

rowh = 4.5
ln = 6.5
pdf.ln(ln)
idx = ''  # index value of the previous row ('' before the first row)
for letter, expression, translation in zip(df['index'], df['expression'], df['rus']):
  if letter != idx:
    # New index value: emit the heading in the left column.
    pdf.ln(ln*1.5)
    ybefore = pdf.get_y()
    pdf.set_font('DejaVuSans-Bold','',24.0)
    pdf.set_text_color(98,163,98)
    pdf.multi_cell(multi_cell_width, rowh, letter)
    pdf.set_xy(right_column_x, ybefore)
    pdf.multi_cell(multi_cell_width, rowh, " ")
    idx = letter
    pdf.ln(ln)

  # Left column: French expression; right column starts at the same y.
  ybefore = pdf.get_y()
  pdf.set_font('DejaVuSans', '', 9.0)
  pdf.set_text_color(0,0,0)
  pdf.multi_cell(multi_cell_width, rowh, expression)
  pdf.set_xy(right_column_x, ybefore)
  pdf.multi_cell(multi_cell_width, rowh, translation)

  pdf.ln(ln)

  # Start a new page once the cursor has passed the usable height.
  space_left = effective_page_height-pdf.get_y()
  if space_left < 0:
    pdf.add_page()
    pdf.ln(ln*2)

pdf.ln(ln)
pdf.output('tut58.pdf', 'F')

''

In [39]:
# List the contents of the DejaVu TrueType font directory.
ls -l /usr/share/fonts/truetype/dejavu/

total 2956
-rw-r--r-- 1 root root 705684 Jul 30  2016 DejaVuSans-Bold.ttf
-rw-r--r-- 1 root root    549 Jun 14 08:39 DejaVuSans.cw127.pkl
-rw-r--r-- 1 root root 331992 Jul 30  2016 DejaVuSansMono-Bold.ttf
-rw-r--r-- 1 root root 340712 Jul 30  2016 DejaVuSansMono.ttf
-rw-r--r-- 1 root root 136927 Jun 14 08:39 DejaVuSans.pkl
-rw-r--r-- 1 root root 757076 Jul 30  2016 DejaVuSans.ttf
-rw-r--r-- 1 root root 356088 Jul 30  2016 DejaVuSerif-Bold.ttf
-rw-r--r-- 1 root root 380132 Jul 30  2016 DejaVuSerif.ttf


In [None]:
# Import FPDF class
from fpdf import FPDF


# Layout test with a plain FPDF document: A4 paper, inches as unit of
# measure, written to 'test4.pdf'.
pdf = FPDF(format='A4', unit='in')
pdf.add_font('DejaVuSans', '', '/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf', uni=True)
pdf.add_font('NotoSerifEthiopic', '', '/usr/share/fonts/truetype/noto/NotoSerifEthiopic-Regular.ttf', uni=True)
pdf.set_font('DejaVuSans', '', 9.0)
# A page must exist before anything can be drawn.
pdf.add_page()

effective_page_width = pdf.w - 2*pdf.l_margin
effective_page_height = pdf.h - 2*pdf.b_margin
column_width = effective_page_width/2
row_height = 0.15  # inches

# Fix: the working frame `df` only holds the 'rus' translation, so df['tig']
# raised KeyError; the columns are read from the full corpus frame instead.
for expression, translation in zip(df_corpus['expression'], df_corpus['tig']):
  ybefore = pdf.get_y()
  # Left column: French expression. Previously the first set_font was
  # overridden at once, so both columns used the Ethiopic font.
  pdf.set_font('DejaVuSans', '', 9.0)
  pdf.multi_cell(column_width, row_height, expression)
  # Right column: Tigrinya translation, starting at the same y.
  pdf.set_xy(column_width + pdf.l_margin, ybefore)
  pdf.set_font('NotoSerifEthiopic', '', 9.0)
  pdf.multi_cell(column_width, row_height, translation)
  pdf.ln(row_height)
  # Start a new page once the cursor has passed the usable height.
  space_left = effective_page_height-pdf.get_y()
  if space_left < 0:
    pdf.add_page()

pdf.output('test4.pdf','F')

''

In [None]:
# Manual installation of DejaVuSans.ttf from a downloaded archive.
# The download line is commented out, so the archive unpacked below must
# already be present in the working directory.
# NOTE(review): the archive name used by unzip differs from the file name in
# the wget URL - confirm which download it refers to.
#!wget "http://sourceforge.net/projects/dejavu/files/dejavu/2.37/dejavu-fonts-ttf-2.37.zip"
!unzip "download?family=dejavu-fonts-ttf-2"

# Move the regular face into the system TrueType font directory.
!mv DejaVuSans.ttf /usr/share/fonts/truetype/

# Rebuild the fontconfig cache (-f: force, -v: verbose).
!fc-cache -f -v