# Imports

In [1]:
import moviepy.editor as mp
import speech_recognition as sr
from inaSpeechSegmenter import Segmenter
from inaSpeechSegmenter.export_funcs import seg2csv, seg2textgrid
import os
import pandas as pd
from tqdm import tqdm
tqdm.pandas()

# Process 1 : Extraire l'audio d'une video

In [2]:
path_to_the_video = os.path.abspath(os.path.join(os.getcwd(),os.pardir,'data','sample_videos','cousinade_court_métrage.mp4'))

In [3]:
my_clip = mp.VideoFileClip(path_to_the_video)

In [4]:
#Extract the audio
path_to_audio = os.path.abspath(os.path.join(path_to_the_video, os.pardir, 'cousinade_audio.wav'))
#my_clip.audio.write_audiofile(path_to_audio)

# Process 2 : Extraire le texte

In [5]:
def decodeSpeech(wavefile, start_time = None, end_time = None, language = None):
    
    r = sr.Recognizer()
    #r.pause_threshold = 3
    #r.dynamic_energy_adjustment_damping = 0.5

    with sr.WavFile(wavefile) as source:
      if start_time == None and end_time == None:
        audio_text = r.record(source)
      else:
        audio_text = r.record(source, duration = end_time - start_time, offset = start_time)
    
    if language == None: #default language is American English
      lg = "en-US"
    else:
      lg = language

      # recoginize_() method will throw a request error if the API is unreachable, hence using exception handling
      try:
          
          # using google speech recognition
          text = r.recognize_google(audio_text, language = lg)
          print('Converting audio transcripts into text ...')
          return text
      
      except:
          print('Sorry.. run again...')

In [6]:
language = "fr-FR"

In [7]:
seg = Segmenter()

In [8]:
segmentation = seg(path_to_audio)

In [17]:
mapper = {'source' : [], 'start_time' : [] , 'end_time' : [] }
for seg in segmentation:
    mapper['source'].append(seg[0])
    mapper['start_time'].append(seg[1])
    mapper['end_time'].append(seg[2])
    

In [19]:
df = pd.DataFrame(mapper)

In [21]:
df.source.value_counts()

female      29
male        24
noise       15
noEnergy     8
music        3
Name: source, dtype: int64

In [23]:
df['text'] = ''

In [31]:
df.loc[df['source']=='female','text'] =\
df.loc[df['source']=='female',:].progress_apply(lambda x : decodeSpeech(path_to_audio, x['start_time'], x['end_time'], language), axis=1)

  7%|█████▋                                                                             | 2/29 [00:10<02:19,  5.17s/it]

Converting audio transcripts into text ...


 10%|████████▌                                                                          | 3/29 [00:14<02:06,  4.85s/it]

Converting audio transcripts into text ...


 14%|███████████▍                                                                       | 4/29 [00:16<01:29,  3.58s/it]

Converting audio transcripts into text ...


 17%|██████████████▎                                                                    | 5/29 [00:16<01:00,  2.54s/it]

Converting audio transcripts into text ...


 21%|█████████████████▏                                                                 | 6/29 [00:17<00:45,  1.97s/it]

Converting audio transcripts into text ...


 24%|████████████████████                                                               | 7/29 [00:18<00:36,  1.66s/it]

Converting audio transcripts into text ...


 28%|██████████████████████▉                                                            | 8/29 [00:19<00:27,  1.29s/it]

Sorry.. run again...


 31%|█████████████████████████▊                                                         | 9/29 [00:19<00:22,  1.12s/it]

Converting audio transcripts into text ...


 34%|████████████████████████████▎                                                     | 10/29 [00:20<00:18,  1.03it/s]

Converting audio transcripts into text ...


 38%|███████████████████████████████                                                   | 11/29 [00:21<00:18,  1.03s/it]

Converting audio transcripts into text ...


 41%|█████████████████████████████████▉                                                | 12/29 [00:22<00:14,  1.14it/s]

Converting audio transcripts into text ...


 45%|████████████████████████████████████▊                                             | 13/29 [00:22<00:12,  1.27it/s]

Converting audio transcripts into text ...


 48%|███████████████████████████████████████▌                                          | 14/29 [00:23<00:13,  1.11it/s]

Converting audio transcripts into text ...


 52%|██████████████████████████████████████████▍                                       | 15/29 [00:24<00:12,  1.13it/s]

Converting audio transcripts into text ...


 55%|█████████████████████████████████████████████▏                                    | 16/29 [00:25<00:11,  1.11it/s]

Converting audio transcripts into text ...


 59%|████████████████████████████████████████████████                                  | 17/29 [00:26<00:10,  1.18it/s]

Converting audio transcripts into text ...


 62%|██████████████████████████████████████████████████▉                               | 18/29 [00:27<00:08,  1.28it/s]

Sorry.. run again...


 66%|█████████████████████████████████████████████████████▋                            | 19/29 [00:28<00:08,  1.18it/s]

Sorry.. run again...


 69%|████████████████████████████████████████████████████████▌                         | 20/29 [00:29<00:08,  1.05it/s]

Converting audio transcripts into text ...


 72%|███████████████████████████████████████████████████████████▍                      | 21/29 [00:31<00:09,  1.20s/it]

Converting audio transcripts into text ...


 76%|██████████████████████████████████████████████████████████████▏                   | 22/29 [00:32<00:08,  1.27s/it]

Converting audio transcripts into text ...


 79%|█████████████████████████████████████████████████████████████████                 | 23/29 [00:33<00:07,  1.30s/it]

Converting audio transcripts into text ...


 83%|███████████████████████████████████████████████████████████████████▊              | 24/29 [00:35<00:06,  1.31s/it]

Converting audio transcripts into text ...


 86%|██████████████████████████████████████████████████████████████████████▋           | 25/29 [00:37<00:06,  1.52s/it]

Converting audio transcripts into text ...


 90%|█████████████████████████████████████████████████████████████████████████▌        | 26/29 [00:38<00:04,  1.45s/it]

Converting audio transcripts into text ...


 93%|████████████████████████████████████████████████████████████████████████████▎     | 27/29 [00:39<00:02,  1.43s/it]

Converting audio transcripts into text ...


 97%|███████████████████████████████████████████████████████████████████████████████▏  | 28/29 [00:41<00:01,  1.58s/it]

Converting audio transcripts into text ...


100%|██████████████████████████████████████████████████████████████████████████████████| 29/29 [00:42<00:00,  1.35s/it]

Converting audio transcripts into text ...


100%|██████████████████████████████████████████████████████████████████████████████████| 29/29 [00:43<00:00,  1.50s/it]

Sorry.. run again...





In [39]:
df.loc[df['source']=='female',:]

Unnamed: 0,source,start_time,end_time,text
2,female,6.88,48.82,merci ma cousine
5,female,86.28,108.46,du coup ton copain ça fait une semaine amoureuse
8,female,118.48,122.2,numéro inconnu
10,female,126.16,127.84,ouais c'est moi c'est qui
12,female,129.82,131.82,Ouais je crois je me souviens
14,female,133.9,139.66,ça va bien remis
16,female,149.46,150.72,
18,female,152.72,157.26,du coup je crois que
21,female,159.34,161.26,Alex enchanté
23,female,162.86,167.62,c'est que des trucs chez moi


In [40]:
from pydub import AudioSegment
t1 = 168.94 * 1000 #Works in milliseconds
t2 = 170.40 * 1000
newAudio = AudioSegment.from_wav(path_to_audio)
newAudio = newAudio[t1:t2]


In [41]:
type(newAudio)

pydub.audio_segment.AudioSegment

In [43]:
newAudio.export(os.path.abspath(os.path.join(path_to_audio, os.pardir, 'tests.wav')), format="wav")

<_io.BufferedRandom name='C:\\Users\\mehdi\\Desktop\\data_for_good\\mm2_bechdelai-vision\\data\\sample_videos\\tests.wav'>

In [45]:

decodeSpeech(os.path.abspath(os.path.join(path_to_audio, os.pardir, 'tests.wav')), start_time = None, end_time = None, language = language)

Converting audio transcripts into text ...


'bah ouais je crois bien ouais'

In [46]:
#Resemblyzer