# Construire un EAF à partir d'un fichier de sous-titres issu de Whisper
- le traitement est basé sur un SRT
- chaque LINGUISTIC_TYPE_REF des TIER doit correspondre à un LINGUISTIC_TYPE_ID dans les LINGUISTIC_TYPE


In [1]:
import lxml.etree as ET
import codecs

## Sélection du fichier SRT

In [10]:
fSRT="FranceArgentine-DecWhisper-TransGold.srt"
fSRT="Whisper_NouvelleZelande-France.srt"
fEAF=fSRT.replace(".srt",".eaf")
print (fEAF)

Whisper_NouvelleZelande-France.eaf


In [11]:
with codecs.open(fSRT,"r",encoding="utf8") as inFile:
    lignesSRT=[l.strip() for l in inFile.readlines()]
if lignesSRT[0].endswith("1"): lignesSRT[0]="1"
else: print("vérifier le fichier SRT")

## Construction de la coquille
- racine ANNOTATION_DOCUMENT
        

In [2]:
ANNOTATION_DOCUMENT=ET.Element("ANNOTATION_DOCUMENT",AUTHOR="",DATE="",FORMAT="3.0", VERSION="3.0")

### Header
- header
     - media_descriptor
     - property
 

In [3]:
HEADER=ET.SubElement(ANNOTATION_DOCUMENT,"HEADER",MEDIA_FILE="",TIME_UNITS="milliseconds")
MEDIA_DESCRIPTOR=ET.SubElement(HEADER,"MEDIA_DESCRIPTOR",MEDIA_URL="",MIME_TYPE="",RELATIVE_MEDIA_URL="")
PROPERTY=ET.SubElement(HEADER,"PROPERTY", NAME="lastUsedAnnotationId")

### Time order for Time slots
- time order
    - time slot


In [4]:
TIME_ORDER=ET.SubElement(ANNOTATION_DOCUMENT,"TIME_ORDER")

### Tiers for Annotations
- tier
    - annotation
        - alignable annotation
            - annotation value


In [5]:
WhisperTIER=ET.SubElement(ANNOTATION_DOCUMENT,"TIER",ANNOTATOR="Whisper",LINGUISTIC_TYPE_REF="Whisper",PARTICIPANT="Whisper",TIER_ID="Whisper")

In [6]:
JournalTIER=ET.SubElement(ANNOTATION_DOCUMENT,"TIER",ANNOTATOR="",LINGUISTIC_TYPE_REF="Journaliste",PARTICIPANT="Journaliste",TIER_ID="Journaliste")

In [7]:
ExpertTIER=ET.SubElement(ANNOTATION_DOCUMENT,"TIER",ANNOTATOR="",LINGUISTIC_TYPE_REF="Expert",PARTICIPANT="Expert",TIER_ID="Expert")

In [8]:
TerrainTIER=ET.SubElement(ANNOTATION_DOCUMENT,"TIER",ANNOTATOR="",LINGUISTIC_TYPE_REF="Terrain",PARTICIPANT="Terrain",TIER_ID="Terrain")

### Linguistic type

In [9]:
LINGUISTIC_TYPE=ET.SubElement(ANNOTATION_DOCUMENT,"LINGUISTIC_TYPE",GRAPHIC_REFERENCES="false",LINGUISTIC_TYPE_ID="Whisper",TIME_ALIGNABLE="true")
LINGUISTIC_TYPE=ET.SubElement(ANNOTATION_DOCUMENT,"LINGUISTIC_TYPE",GRAPHIC_REFERENCES="false",LINGUISTIC_TYPE_ID="Journaliste",TIME_ALIGNABLE="true")
LINGUISTIC_TYPE=ET.SubElement(ANNOTATION_DOCUMENT,"LINGUISTIC_TYPE",GRAPHIC_REFERENCES="false",LINGUISTIC_TYPE_ID="Expert",TIME_ALIGNABLE="true")
LINGUISTIC_TYPE=ET.SubElement(ANNOTATION_DOCUMENT,"LINGUISTIC_TYPE",GRAPHIC_REFERENCES="false",LINGUISTIC_TYPE_ID="Terrain",TIME_ALIGNABLE="true")

## Conversion des sous-titres vers EAF

In [12]:
def getSeconds(time):
    h, m, s = [float(i) for i in time.replace(",",".").split(':')]
    result=((60*h+m)*60+s)*1000
    return str(int(result))

def getSubtitles(lignes):
    result=[]
    currentState="start"
    currentNum=1
    for l in lignes:
        if currentState=="start" and str(currentNum)==l:
            num=l
            currentState="time"
        elif currentState=="time":
            t1,t2=l.split(" --> ")
            t1=getSeconds(t1)
            t2=getSeconds(t2)
            currentState="text"
        elif currentState=="text":
            text=l
            currentState="skip"
        elif currentState=="skip" and l=="":
            currentState="start"
            currentNum+=1
            result.append({"num":num,"start":t1,"end":t2,"text":text})
    return result        

In [13]:
subtitles=getSubtitles(lignesSRT)

In [14]:
for s in subtitles:
    ANNOTATION=ET.SubElement(WhisperTIER,"ANNOTATION")
    aid="au"+s["num"]
    ALIGNABLE_ANNOTATION=ET.SubElement(ANNOTATION,"ALIGNABLE_ANNOTATION",
                                       ANNOTATION_ID=aid,
                                       TIME_SLOT_REF1=aid+s["start"],
                                       TIME_SLOT_REF2=aid+s["end"]
                                      )
    ANNOTATION_VALUE=ET.SubElement(ALIGNABLE_ANNOTATION,"ANNOTATION_VALUE")
    ANNOTATION_VALUE.text=s["text"]
    TIME_SLOT=ET.SubElement(TIME_ORDER,"TIME_SLOT",
                            TIME_SLOT_ID=aid+s["start"],
                            TIME_VALUE=s["start"],
                           )
    TIME_SLOT=ET.SubElement(TIME_ORDER,"TIME_SLOT",
                            TIME_SLOT_ID=aid+s["end"],
                            TIME_VALUE=s["end"],
                           )

## Exportation du fichier EAF

In [15]:
print (ET.tostring(ANNOTATION_DOCUMENT,pretty_print=True,encoding="utf8").decode("utf8"))

<ANNOTATION_DOCUMENT AUTHOR="" DATE="" FORMAT="3.0" VERSION="3.0">
  <HEADER MEDIA_FILE="" TIME_UNITS="milliseconds">
    <MEDIA_DESCRIPTOR MEDIA_URL="" MIME_TYPE="" RELATIVE_MEDIA_URL=""/>
    <PROPERTY NAME="lastUsedAnnotationId"/>
  </HEADER>
  <TIME_ORDER>
    <TIME_SLOT TIME_SLOT_ID="au10" TIME_VALUE="0"/>
    <TIME_SLOT TIME_SLOT_ID="au12000" TIME_VALUE="2000"/>
    <TIME_SLOT TIME_SLOT_ID="au22000" TIME_VALUE="2000"/>
    <TIME_SLOT TIME_SLOT_ID="au24000" TIME_VALUE="4000"/>
    <TIME_SLOT TIME_SLOT_ID="au34000" TIME_VALUE="4000"/>
    <TIME_SLOT TIME_SLOT_ID="au36000" TIME_VALUE="6000"/>
    <TIME_SLOT TIME_SLOT_ID="au46000" TIME_VALUE="6000"/>
    <TIME_SLOT TIME_SLOT_ID="au48000" TIME_VALUE="8000"/>
    <TIME_SLOT TIME_SLOT_ID="au58000" TIME_VALUE="8000"/>
    <TIME_SLOT TIME_SLOT_ID="au510000" TIME_VALUE="10000"/>
    <TIME_SLOT TIME_SLOT_ID="au610000" TIME_VALUE="10000"/>
    <TIME_SLOT TIME_SLOT_ID="au612000" TIME_VALUE="12000"/>
    <TIME_SLOT TIME_SLOT_ID="au712000" TIME

In [16]:
EAF=ET.ElementTree(ANNOTATION_DOCUMENT)
EAF.write(fEAF, pretty_print=True, encoding='utf-8', xml_declaration=True)