# WhisperTS joining JSON to SRT

In [1]:
import re,codecs,json,unidecode
import pandas as pd
import networkx as nx
import matplotlib.pylab as p

In [2]:
repJson="/Users/gilles/sDrive/Cours/Bordeaux/L2-Corpus/Whisper/JSON/"

In [3]:
json1=repJson+"extrait-00.json"
with open(json1) as inFile:
    whisper1=json.load(inFile)

### Gestion des timestamps

In [4]:
def second2timestamp(seconds):
    sHours=int(seconds/3600)
    sMinutes=int((seconds-sHours*3600)/60)
    sSeconds=int((seconds-sHours*3600-sMinutes*60))
    sMilliseconds=int((seconds%1)*1000)
    return "%02d:%02d:%02d,%03d"%(sHours,sMinutes,sSeconds,sMilliseconds)

def timestamp2second(timestamp):
    m=re.match("(\d\d):(\d\d):(\d\d),(\d\d\d)",timestamp)
    if m:
        seconds=(int(m.group(1))*60+int(m.group(2)))*60+int(m.group(3))+float(m.group(4))/1000
        return seconds
    else:
        print("problème de timestamp")
        return

def shiftTime(timestamp,decal):
    return second2timestamp(timestamp2second(timestamp)+decal)

In [5]:
punct = list('''!()[]{};:-'"\,<>./?@#$%^&*_~''')
maxSize=8

def removePunct(texte):
    result=texte
    for p in punct:
        result=result.replace(p,"")
    return result

### Gestion des structures JSON de Whisper

In [6]:
def getWords(whisper):
    lWords=[]
    for segment in whisper["segments"]:
        for word in segment["words"]:
            lWords.append((second2timestamp(word["start"]),
                              second2timestamp(word["end"]),
                              word["text"],
                              word["confidence"]))        
    return [(t1,t2,w,c) for t1,t2,w,c in lWords if w!=""]

def getList(tWhisper):
    return [unidecode.unidecode(removePunct(w)).lower() for _,_,w,_ in tWhisper]

In [7]:
srt=getWords(whisper1)
srtList=getList(srt)

### Trouver la jointure

In [8]:
def findSample(sample,ref):
    i=0
    while i<len(ref)-len(sample):
        # print("ref",i)
        if sample==ref[i:i+len(sample)]:
            return i
        else:
            i+=1
    return None

### Collage des mots
1. on cherche la partie commune entre le nouveau segment et les sous-titres précédents
2. on peut régler la longueur de la partie commune pour optimiser le collage *maxSize*
3. on incrémente les temps de la nouvelle partie à partir du deuxième mot (pour éviter les erreurs de timestamp en début d'enregistrement)

In [9]:
def pasteWords(srt,whisper2):
    srtList=getList(srt)
    w2=getWords(whisper2)
    w2List=getList(w2)
    lenW2=len(w2)
    compSize=min(maxSize,lenW2)
    i=1
    print(compSize)
    while i<lenW2-compSize:
        sample=w2List[i:i+compSize]
        ref=srtList
        # print(sample)
        idx=findSample(sample,ref)
        if idx:
            print("found",i,idx)
            break
        else:
            i+=1
    if idx:
        print("found",i,idx,srtList[idx],srt[idx][0],w2List[i],w2[i][0])
        tDecal=timestamp2second(srt[idx+1][0])-timestamp2second(w2[i+1][0])
        print(second2timestamp(tDecal))
        newSrt=[]
        for n in range(i+1,lenW2):
            nWord=w2[n]
            newWord=(shiftTime(nWord[0],tDecal),shiftTime(nWord[1],tDecal),nWord[2],nWord[3])
            newSrt.append(newWord)
        print(srt[idx+1],newSrt[0])
        return srt[:idx]+newSrt
    else:
        return None

In [10]:
for i in range(1,100):
    print(i)
    json2=repJson+"extrait-%02d.json"%i
    with open(json2) as inFile:
        whisper2=json.load(inFile)
    newSrt=pasteWords(srt,whisper2)
    if not newSrt:
        break
    else:
        srt=newSrt
# print(srt)

1
8
found 6 148
found 6 148 ca 00:01:03,929 ca 00:00:04,089
00:01:00,001
('00:01:04,310', '00:01:04,469', 'ça', 0.856) ('00:01:04,310', '00:01:04,469', 'ça', 0.735)
2
8
found 1 326
found 1 326 ledesma 00:02:00,010 ledesma 00:00:00,120
00:02:00,010
('00:02:00,730', '00:02:01,031', 'Super', 0.527) ('00:02:00,730', '00:02:01,030', 'Super', 0.905)
3
8
found 7 458
found 7 458 hernandez 00:03:02,588 hernandez 00:00:02,819
00:03:00,050
('00:03:03,890', '00:03:03,989', 'Il', 0.988) ('00:03:03,889', '00:03:04,030', 'Il', 0.809)
4
8
found 1 588
found 1 588 lequipe 00:04:00,279 lequipe 00:00:00,340
00:04:00,039
('00:04:00,760', '00:04:00,839', 'de', 1.0) ('00:04:00,759', '00:04:00,840', 'de', 0.999)
5
8
found 10 726
found 10 726 et 00:05:04,050 et 00:00:04,000
00:05:00,028
('00:05:04,149', '00:05:04,308', "c'est", 0.925) ('00:05:04,149', '00:05:04,288', "c'est", 0.986)
6
8
found 6 891
found 6 891 fabrice  00:06:01,769 fabrice  00:00:01,240
00:06:00,309
('00:06:03,269', '00:06:03,507', 'Bonsoir', 

In [11]:
pause=0.200

srtTours=[]
srtMots=[]
bTour=True
prevEnd=0
pStart=second2timestamp(0)
pEnd=0

for i,(start,end,mot,confidence) in enumerate(srt):
    # print(start,end,mot,confidence)
    tStart=timestamp2second(start)
    tEnd=timestamp2second(end)
    # print(start,tStart,prevEnd)
    if tStart-prevEnd>pause and prevEnd!=0:
        pEnd=second2timestamp(prevEnd)
        pTour=" ".join(srtMots)
        srtTours.append((pStart,pEnd,pTour))
        pStart=start
        prevEnd=tEnd
        srtMots=[mot]
        bPhrase=True
    else:
        srtMots.append(mot)
        prevEnd=tEnd
pTour=" ".join(srtMots)
srtTours.append((pStart,pEnd,pTour))

In [12]:
srtLines=[]
for i,(start,end,mot) in enumerate(srtTours):
    print(start,end,mot)
    n=i+1
    srtLines.append(str(n))
    srtLines.append(start+" --> "+end)
    srtLines.append(mot)
    srtLines.append("")


00:00:00,000 00:00:11,009 Spreadbury, il sera encadré par un Australien, M. Dickinson, et par un Néo-Zélandais, [*] M.
00:00:11,269 00:00:11,650 Lawrence.
00:00:13,380 00:00:19,018 M. Spreadbury, c'est le plus âgé des arbitres de cette Coupe du Monde, le plus expérimenté,
00:00:20,609 00:00:29,268 et petit [*] signe peut-être du destin, c'est lui qui dirigeait le France-Argentine, gagné par l'équipe de France au mois de novembre dernier, [*] ici même au Stade de France.
00:00:29,510 00:00:30,149 [*]
00:00:32,689 00:00:38,990 Après avoir demandé si tout le monde était prêt, le coup d'envoi de la Coupe du Monde 2007 est donné par David Skrela.
00:00:40,490 00:00:42,969 Allez, de suite dans le camp, [*] Argentin.
00:00:43,189 00:00:47,950 Avec un départ de Roncero, [*] Pichot a appelé le ballon [*] pour [*] Hernandez,
00:00:48,450 00:00:52,990 [*] avec le pied droit, pour trouver une très jolie touche.
00:00:53,969 00:01:00,509 Direct, il en avait le droit, il était à l'intérieur de ses [

In [13]:
with codecs.open(json1.replace(".json","-turns.srt"),"w",encoding="utf8") as outFile:
    for srtLine in srtLines:
        outFile.write(srtLine+"\n")