# Installation

In [None]:
# moviepy [MIT Licence] >>> extraction de son
# https://towardsdatascience.com/extracting-audio-from-video-using-python-58856a940fd

# inaSpeechSegmenter [MIT Licence] >>> reconnaissance du genre du locuteur

# SpeechRecognition [BSD License] >>> speech to text
# https://pypi.org/project/SpeechRecognition/2.1.3/
# https://openclassrooms.com/forum/sujet/reconnaissance-vocale-en-python-1
# https://medium.com/@garcjes/transcribe-a-podcast-in-python-with-vosk-api-35eba7d96c1f

In [22]:
# !pip install moviepy
# !pip install inaSpeechSegmenter
# !pip install SpeechRecognition

# 1. Extract the audio with moviepy

In [2]:
import moviepy.editor as mp

In [5]:
# The source video must already be present in the "./media" folder.
path_to_the_video = r"./media/video2.mp4"
# Open the video; the clip object is reused by the audio-extraction step.
my_clip = mp.VideoFileClip(filename=path_to_the_video)

In [7]:
# Extract the audio track of the clip and write it to a WAV file.
# A clip opened from a video without an audio stream has `audio` set to None;
# fail with an explicit message instead of an opaque AttributeError.
if my_clip.audio is None:
    raise ValueError("No audio track found in " + path_to_the_video)
my_clip.audio.write_audiofile(r"./media/extract.wav")

MoviePy - Writing audio in ./media/extract.wav


                                                                    

MoviePy - Done.




# 2. Extract the text with SpeechRecognition

In [8]:
import speech_recognition as sr

In [9]:
def decodeSpeech(wavefile, start_time = None, end_time = None, language = None):
    """Transcribe an audio file, or a time window of it, with Google speech recognition.

    Parameters
    ----------
    wavefile : str
        Path of the audio file to read.
    start_time : float, optional
        Offset (in seconds) at which the excerpt starts. None starts at the
        beginning of the file.
    end_time : float, optional
        Position (in seconds) at which the excerpt ends. None reads until the
        end of the file.
    language : str, optional
        Language tag handed to the recognizer (e.g. "fr-FR"). None falls back
        to American English ("en-US").

    Returns
    -------
    str or None
        The recognized text, or None when recognition failed (a message is
        printed in that case).
    """
    recognizer = sr.Recognizer()

    # Translate the (start, end) window into the (offset, duration) pair that
    # Recognizer.record expects. Leaving either bound out is allowed: a missing
    # start means "from the beginning", a missing end means "until the end".
    if end_time is None:
        duration = None
    else:
        duration = end_time - (start_time or 0)

    with sr.AudioFile(wavefile) as source:
        audio = recognizer.record(source, duration = duration, offset = start_time)

    # Default language is American English.
    lg = "en-US" if language is None else language

    # Recognition must run whatever the language is (it used to be nested in
    # the "explicit language" branch, so the default language never transcribed).
    # recognize_google() raises when the API is unreachable or when the audio
    # cannot be understood; keep the best-effort behaviour and return None, but
    # let KeyboardInterrupt / SystemExit propagate.
    try:
        text = recognizer.recognize_google(audio, language = lg)
    except Exception:
        print('Sorry.. run again...')
        return None

    print('Converting audio transcripts into text ...')
    return text

In [10]:
# Shared inputs for the speech-recognition and segmentation steps.
language = "fr-FR"               # language tag used for the transcription
WAVFILE = "./media/extract.wav"  # audio file to transcribe and segment

# 3. Gender Recognition with inaSpeechSegmenter

In [14]:
# Load the API
from inaSpeechSegmenter import Segmenter
from inaSpeechSegmenter.export_funcs import seg2csv, seg2textgrid

In [15]:
# Select the media to analyse; here it is the WAV file referenced by WAVFILE.
# Any media supported by ffmpeg may be used (video, audio, URLs).
media = WAVFILE

In [16]:
# Create an instance of the speech segmenter.
# This loads neural networks and may take a few seconds.
# Warnings have no effect on the results.
seg = Segmenter()

In [17]:
# Segmentation is performed by calling the segmenter instance (its __call__ method)
# on the selected media.
segmentation = seg(media)

2021-12-22 08:53:26.432257: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


In [18]:
# The result is a list of tuples; each tuple contains:
# * a label among 'male', 'female', 'music', 'noise', 'noEnergy'
#   ('noise' does occur in practice, as the output below shows)
# * the start time of the segment
# * the end time of the segment
print(segmentation)

[('male', 0.0, 3.24), ('female', 3.24, 4.28), ('noise', 4.28, 6.72), ('male', 6.72, 15.22), ('female', 15.22, 16.46), ('noEnergy', 16.46, 17.18), ('male', 17.18, 20.18), ('noise', 20.18, 24.16), ('male', 24.16, 29.1), ('female', 29.1, 31.46), ('male', 31.46, 34.46), ('noEnergy', 34.46, 35.46), ('female', 35.46, 43.52)]


In [19]:
# Run speech-to-text on every segment returned by the segmenter (whatever its
# label) and keep the label and time bounds next to the transcript.
# "text" is None for segments where the recognition did not succeed.
segmentation_bis = [
    {
        "type": label,
        "start_time": seg_start,
        "end_time": seg_end,
        "text": decodeSpeech(wavefile = WAVFILE, start_time = seg_start, end_time = seg_end, language = language),
    }
    for label, seg_start, seg_end in segmentation
]

segmentation_bis

Converting audio transcripts into text ...
Converting audio transcripts into text ...
Sorry.. run again...
Converting audio transcripts into text ...
Sorry.. run again...
Sorry.. run again...
Converting audio transcripts into text ...
Sorry.. run again...
Converting audio transcripts into text ...
Converting audio transcripts into text ...
Converting audio transcripts into text ...
Sorry.. run again...
Converting audio transcripts into text ...


[{'type': 'male', 'start_time': 0.0, 'end_time': 3.24, 'text': 'et alors'},
 {'type': 'female',
  'start_time': 3.24,
  'end_time': 4.28,
  'text': 'donne-moi mon'},
 {'type': 'noise', 'start_time': 4.28, 'end_time': 6.72, 'text': None},
 {'type': 'male',
  'start_time': 6.72,
  'end_time': 15.22,
  'text': "tu as fait quelque chose pour mériter ta nourriture ses céréales je sais payer avec mon argent j'ai gagné parce que j'ai travaillé"},
 {'type': 'female', 'start_time': 15.22, 'end_time': 16.46, 'text': None},
 {'type': 'noEnergy', 'start_time': 16.46, 'end_time': 17.18, 'text': None},
 {'type': 'male',
  'start_time': 17.18,
  'end_time': 20.18,
  'text': "j'ai déjà vu dans des dans le quartier"},
 {'type': 'noise', 'start_time': 20.18, 'end_time': 24.16, 'text': None},
 {'type': 'male',
  'start_time': 24.16,
  'end_time': 29.1,
  'text': 'choisis entre 1 et 6 tu es une voleuse tant pis pour toi'},
 {'type': 'female',
  'start_time': 29.1,
  'end_time': 31.46,
  'text': "qu'est-ce

In [20]:
import pandas as pd

# Put the enriched segments in a table and display only the rows labelled
# as speech ("male" or "female").
df = pd.DataFrame(segmentation_bis)
df[df["type"].isin(["male", "female"])]

Unnamed: 0,type,start_time,end_time,text
0,male,0.0,3.24,et alors
1,female,3.24,4.28,donne-moi mon
3,male,6.72,15.22,tu as fait quelque chose pour mériter ta nourr...
4,female,15.22,16.46,
6,male,17.18,20.18,j'ai déjà vu dans des dans le quartier
8,male,24.16,29.1,choisis entre 1 et 6 tu es une voleuse tant pi...
9,female,29.1,31.46,qu'est-ce que vous croyez qu'on choisit d'être...
10,male,31.46,34.46,tu es jeune si tu voulais vraiment bosser tu b...
12,female,35.46,43.52,ma mère était toxico ma grand-mère qui m'a éle...


# Idées pour la suite

In [None]:
# Détection de voix distinctes --> du type --> female1, female2, male1, etc.
# ---> resemblyzer ?
# ---> https://www.audiolabs-erlangen.de/resources/2017-CountNet

In [None]:
# !pip install resemblyzer

# from resemblyzer import VoiceEncoder, preprocess_wav
# from pathlib import Path
# import numpy as np

# WAVFILE = '/content/extraction_du_son.wav'

# fpath = Path(WAVFILE)
# wav = preprocess_wav(fpath)

# encoder = VoiceEncoder()
# embed = encoder.embed_utterance(wav)
# np.set_printoptions(precision=3, suppress=True)
# # print(embed)