In [1]:
from google.colab import drive
import os
# Mount Google Drive at /content/drive; the data folders configured further down live under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install transformers
!pip install speechbrain
!pip install inaSpeechSegmenter
!pip install asrecognition
!pip install SpeechRecognition
!pip install pydub
!pip install pyyaml==5.4.1

In [3]:
# Project data folders on the mounted Drive, in the order: videos, audio files, exported audio chunks.
folders = [
    '/content/drive/MyDrive/DataForGood/bechdel/data/video',
    '/content/drive/MyDrive/DataForGood/bechdel/data/audio',
    '/content/drive/MyDrive/DataForGood/bechdel/data/audio/chunks',
]
# Individual names for the three folders listed above.
video_dir, audio_dir, chunks = folders

In [4]:
# Create the data folders when they are missing.
# os.makedirs replaces the shell call `mkdir -p {folder}`: the path is no longer passed through
# a shell unquoted (which breaks on spaces or special characters), and exist_ok=True keeps the
# cell safely re-runnable.
for folder in folders:
  os.makedirs(folder, exist_ok=True)

In [None]:
!wget https://github.com/dataforgoodfr/mm2_bechdelai-vision/blob/main/data/sample_audios/LA_LETTRE.wav -P {audio_dir}

In [5]:
# Main recording analysed by the pipeline call further down.
audio_path = os.path.join(audio_dir, 'LA_LETTRE.wav')
# Second recording path; NOTE(review): nothing in this notebook downloads or uses this file — confirm it is still needed.
small_test = os.path.join(audio_dir, 'Enregistrement.wav')

In [6]:
import plotly.express as px
import datetime
import torch
from torchaudio.sox_effects import apply_effects_file
from transformers import AutoFeatureExtractor, AutoModelForAudioXVector
from speechbrain.pretrained import SepformerSeparation as separator
import torchaudio
from speechbrain.pretrained import EncoderASR
from asrecognition import ASREngine
from pydub import AudioSegment
import transformers
import speech_recognition as sr
from inaSpeechSegmenter import Segmenter
from inaSpeechSegmenter.export_funcs import seg2csv, seg2textgrid
import os
import pandas as pd
from tqdm import tqdm
import re
tqdm.pandas()

In [7]:
def similarity_fn(path1, path2,treshold):
    """Tell whether two audio files are similar enough to count as the same voice.

    Each file is loaded through ``apply_effects_file`` with the module-level
    ``EFFECTS`` chain, turned into model inputs by ``feature_extractor``
    (declared sampling rate 16000), embedded by ``model`` and L2-normalised.
    The cosine similarity of the two embeddings is then compared to
    ``treshold``.

    Relies on the module-level names ``EFFECTS``, ``feature_extractor``,
    ``model``, ``device`` and ``cosine_sim``.

    Args:
        path1: path of the first audio file.
        path2: path of the second audio file.
        treshold: minimum cosine similarity for a positive answer.

    Returns:
        True when the similarity is greater than or equal to ``treshold``,
        False otherwise.
    """
    def _normalised_embedding(path):
        # Load the file with the preprocessing effects applied.
        waveform, _ = apply_effects_file(path, EFFECTS)
        features = feature_extractor(
            waveform.squeeze(0), return_tensors="pt", sampling_rate=16000
        ).input_values.to(device)
        # Inference only: no gradient bookkeeping needed.
        with torch.no_grad():
            embedding = model(features).embeddings
        return torch.nn.functional.normalize(embedding, dim=-1).cpu()

    first = _normalised_embedding(path1)
    second = _normalised_embedding(path2)
    score = cosine_sim(first, second).numpy()[0]

    return True if score >= treshold else False


def speech_segmenter(segmenter, wavfile):
    """Run ``segmenter`` on ``wavfile`` and return its segments as a DataFrame.

    Args:
        segmenter: callable taking the audio path and returning an iterable of
            segments; the first three items of each segment are read as
            label, start and end.
        wavfile: path of the audio file to segment.

    Returns:
        pandas.DataFrame with the columns ``source``, ``start_time`` and
        ``end_time``, one row per segment, in the order produced by the
        segmenter.
    """
    segments = list(segmenter(wavfile))

    # One list per column, positions 0/1/2 of every segment.
    columns = {
        'source': [segment[0] for segment in segments],
        'start_time': [segment[1] for segment in segments],
        'end_time': [segment[2] for segment in segments],
    }

    return pd.DataFrame(columns)

def map_numbers(x,mapper):
  """Return the first key of ``mapper`` whose value contains ``x``.

  Args:
      x: value to look up.
      mapper: dict mapping a key to a container of values.

  Returns:
      The matching key (keys are tried in the dict's iteration order), or
      None when no container holds ``x``.
  """
  return next((key for key, members in mapper.items() if x in members), None)



def crop_and_save_segments(wavfile, chunks_path, index, start_time, end_time):
    """Export one time slice of ``wavfile`` as ``female{index}.wav``.

    Args:
        wavfile: path of the source audio file.
        chunks_path: folder in which the slice is written.
        index: identifier inserted in the output file name.
        start_time: slice start; multiplied by 1000 before slicing
            (presumably seconds converted to milliseconds).
        end_time: slice end, same unit as ``start_time``.

    Returns:
        Path of the exported WAV file.
    """
    segment_fname = os.path.join(chunks_path,f'female{index}.wav')

    recording = AudioSegment.from_file(wavfile)
    first_ms, last_ms = start_time*1000, end_time*1000
    recording[first_ms:last_ms].export(segment_fname, format="wav")

    return segment_fname
   
def indexing_persons(path1, path2, treshold, women_index):
    """Return ``women_index`` when both files are judged similar, else -1.

    The decision is delegated to ``similarity_fn(path1, path2, treshold)``.
    """
    if similarity_fn(path1, path2, treshold):
        return women_index
    return -1


def _segment_number(fname):
    """Return the integer N of a chunk path whose file name is ``female{N}.wav``."""
    # Only the file name is parsed: digits appearing in a directory name must not be picked up.
    return int(re.findall(r'\d+', os.path.basename(fname))[0])


def compare_voices_from_audio(wavfile, chunks_path, model, segmenter, treshold ) :
    """
    Segment an audio file, export every female segment and group the segments by voice.

    Steps:
        1. ``speech_segmenter`` labels the audio; the original row position is kept
           in an ``index`` column.
        2. Every segment labelled ``female`` is exported to ``chunks_path`` by
           ``crop_and_save_segments`` and its path stored in ``female_fname``.
        3. Each female chunk is compared with the earlier chunks, most recent first,
           skipping chunks that already received a match, and stopping at the first
           match. The number of comparisons is quadratic in the worst case.
        4. Matching pairs are merged into groups; the position of a group in the
           list becomes the person index.

    Args:
        wavfile: path of the audio file to analyse.
        chunks_path: folder receiving the exported female chunks.
        model: kept for interface compatibility; not read here (``similarity_fn``
            uses the module-level model).
        segmenter: callable passed to ``speech_segmenter``.
        treshold: similarity threshold forwarded to ``similarity_fn``.

    Returns:
        A 4-tuple ``(df, all_women, groups, dropped)``:
            df: all segments, with ``female_fname`` filled for female rows and
                ``person_indexer`` set to -1 on every row.
            all_women: female rows only, with ``number`` (segment number) and
                ``person_indexer`` (group index; missing for a segment that matched
                no other segment).
            groups: list of lists of segment numbers, one list per detected voice.
            dropped: positions in ``all_women`` of the chunks that were matched by
                a later chunk.
    """
    print(r'******************')
    print("Segmenting audio file...")
    print(r'******************')
    df = speech_segmenter(segmenter, wavfile)
    df.reset_index(drop=False,inplace=True)

    print(r'******************')
    print("Exporting cropped audio files...")
    print(r'******************')
    is_female = df['source'] == 'female'
    if is_female.any():
        df.loc[is_female, "female_fname"] = df.loc[is_female, :].progress_apply(
            lambda x: crop_and_save_segments(wavfile, chunks_path, x['index'], x['start_time'], x['end_time']),
            axis=1,
        )
    else:
        # No female segment at all: still create the column so callers can rely on it.
        df["female_fname"] = float("nan")

    df['person_indexer'] = -1
    # Independent copy: the columns added below must not write through to (or warn about) df.
    all_women = df.loc[is_female, :].copy().reset_index(drop=True)

    print(r'******************')
    print("Comparing Voices...")
    print(r'******************')
    pairs = []            # [later segment number, earlier segment number] per detected match
    dropped = []          # returned to the caller, in matching order
    already_matched = set()
    for i in tqdm(range(1, len(all_women))):
        current = all_women.loc[i, "female_fname"]
        # Walk back through the earlier chunks that have not been matched yet.
        for j in range(i - 1, -1, -1):
            if j in already_matched:
                continue
            previous = all_women.loc[j, "female_fname"]
            # female_fname already holds the full chunk path, so it is used as is;
            # joining it with chunks_path again would duplicate a relative folder.
            if similarity_fn(previous, current, treshold=treshold):
                pairs.append([_segment_number(current), _segment_number(previous)])
                dropped.append(j)
                already_matched.add(j)
                break

    print(r'******************')
    print("Indexing Persons...")
    print(r'******************')
    groups = []
    while pairs:
        # Start a group from the first remaining pair, then absorb every later pair
        # sharing a segment with it. One pass is enough because the pairs of one
        # voice form a chain listed in chronological order.
        group = list(set(pairs[0]))
        unmerged = []
        for pair in tqdm(pairs):
            if set(group) & set(pair):
                group = list(set(group + pair))
            else:
                unmerged.append(pair)
        groups.append(group)
        pairs = unmerged

    mapper = dict(enumerate(groups))

    all_women['number'] = all_women['female_fname'].progress_apply(_segment_number)

    all_women['person_indexer'] = all_women['number'].progress_apply(lambda x : map_numbers(x,mapper))

    return df, all_women , groups, dropped

In [8]:
# Minimum cosine similarity for two chunks to be treated as the same voice (see similarity_fn).
treshold = 0.80
# Effect chain handed to torchaudio's apply_effects_file before embedding.
# NOTE(review): the per-effect descriptions below follow the usual SoX semantics — confirm against the SoX manual.
EFFECTS = [
    ["remix", "-"],      # presumably mixes all input channels together
    ["channels", "1"],   # single output channel
    ["rate", "16000"],   # 16 kHz, matching sampling_rate=16000 in similarity_fn
    ["gain", "-1.0"],    # presumably a 1 dB attenuation
    ["silence", "1", "0.1", "0.1%", "-1", "0.1", "0.1%"],  # presumably strips silent parts
    ["trim", "0", "10"],  # presumably keeps only the first 10 seconds — longer chunks would be compared on that part only
]

In [9]:
# Segmenter from inaSpeechSegmenter; passed to compare_voices_from_audio as `segmenter`.
seg = Segmenter()
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Pretrained checkpoint (presumably a speaker-verification model, going by its "-sv" suffix — confirm).
model_name = "microsoft/wavlm-base-plus-sv"
# Feature extractor and x-vector model are loaded from the same checkpoint; both are read as globals by similarity_fn.
feature_extractor = AutoFeatureExtractor.from_pretrained(model_name)
model = AutoModelForAudioXVector.from_pretrained(model_name).to(device)
# Cosine similarity computed along the last dimension of the embeddings.
cosine_sim = torch.nn.CosineSimilarity(dim=-1)

Downloading:   0%|          | 0.00/57.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/215 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/386M [00:00<?, ?B/s]

In [10]:
# Run the whole pipeline on the sample recording: segmentation, export of the female chunks, voice matching.
df, all_women , groups, dropped = compare_voices_from_audio(wavfile=audio_path, chunks_path=chunks, model=model, segmenter=seg, treshold=treshold )

******************
Segmenting audio file...
******************


  for e, c in six.moves.zip(emission.T, consecutive)
  for e, c in six.moves.zip(constraint.T, consecutive)
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)


******************
Exporting cropped audio files...
******************


100%|██████████| 17/17 [00:02<00:00,  7.51it/s]


******************
Comparing Voices...
******************


100%|██████████| 16/16 [00:41<00:00,  2.58s/it]


******************
Indexing Persons...
******************


100%|██████████| 14/14 [00:00<00:00, 72137.91it/s]
100%|██████████| 1/1 [00:00<00:00, 6584.46it/s]
100%|██████████| 17/17 [00:00<00:00, 14217.98it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
100%|██████████| 17/17 [00:00<00:00, 23863.18it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [11]:
groups

[[2, 67, 37, 69, 7, 39, 9, 71, 74, 13, 15, 17, 22, 24], [20, 77]]

In [12]:
all_women

Unnamed: 0,index,source,start_time,end_time,female_fname,person_indexer,number
0,2,female,12.44,17.96,/content/drive/MyDrive/DataForGood/bechdel/dat...,0.0,2
1,7,female,54.68,66.58,/content/drive/MyDrive/DataForGood/bechdel/dat...,0.0,7
2,9,female,69.12,72.48,/content/drive/MyDrive/DataForGood/bechdel/dat...,0.0,9
3,13,female,76.82,84.88,/content/drive/MyDrive/DataForGood/bechdel/dat...,0.0,13
4,15,female,85.3,86.98,/content/drive/MyDrive/DataForGood/bechdel/dat...,0.0,15
5,17,female,88.34,89.08,/content/drive/MyDrive/DataForGood/bechdel/dat...,0.0,17
6,20,female,92.22,93.48,/content/drive/MyDrive/DataForGood/bechdel/dat...,1.0,20
7,22,female,98.68,100.24,/content/drive/MyDrive/DataForGood/bechdel/dat...,0.0,22
8,24,female,100.8,103.6,/content/drive/MyDrive/DataForGood/bechdel/dat...,0.0,24
9,37,female,134.98,137.06,/content/drive/MyDrive/DataForGood/bechdel/dat...,0.0,37


In [34]:
all_women.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17 entries, 0 to 16
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   index           17 non-null     int64         
 1   source          17 non-null     object        
 2   start_time      17 non-null     float64       
 3   end_time        17 non-null     float64       
 4   female_fname    17 non-null     object        
 5   person_indexer  16 non-null     float64       
 6   number          17 non-null     int64         
 7   start_datetime  17 non-null     datetime64[ns]
 8   end_datetime    17 non-null     datetime64[ns]
dtypes: datetime64[ns](2), float64(3), int64(2), object(2)
memory usage: 1.3+ KB


In [53]:
# Remove the placeholder person_indexer column (set to -1 on every row by compare_voices_from_audio).
# errors='ignore' keeps the cell re-runnable: a second run no longer raises KeyError.
df = df.drop(columns=['person_indexer'], errors='ignore')

In [54]:
# Keep only the chunk path and its person index.
all_women2 = all_women[['female_fname','person_indexer']]

In [92]:
# Left join on the chunk path: every segment of df is kept and the rows found in all_women2 receive their person index.
df_graph = df.merge(all_women2, how='left', on='female_fname')

In [93]:
# Remove the helper 'index' column, which is not needed for plotting.
# Reassigning (instead of inplace=True) together with errors='ignore' makes the cell re-runnable.
df_graph = df_graph.drop(columns=['index'], errors='ignore')

In [94]:
# px.timeline needs datetimes, so the second offsets are anchored to ONE fixed origin.
# Calling datetime.now() inside the lambda produced a different origin for every row and for
# start vs. end, so the bars were slightly misaligned and the result changed on each run.
# With a fixed origin the x axis simply reads as elapsed time since the start of the recording.
timeline_origin = pd.Timestamp("1970-01-01")
df_graph['start_datetime'] = timeline_origin + pd.to_timedelta(df_graph['start_time'], unit='s')
df_graph['end_datetime'] = timeline_origin + pd.to_timedelta(df_graph['end_time'], unit='s')

In [95]:
# Rows without a person index get -1; values are cast to int, then stored as objects so that text labels can replace them later.
df_graph['person_indexer'] = df_graph['person_indexer'].fillna(-1).astype('int').astype('object')

In [99]:
# Turn every person index into a display label such as "Female 0".
df_graph['person_indexer'] = 'Female ' + df_graph['person_indexer'].astype(str)

In [100]:
# Rows whose source is not a female voice take a fixed label derived from their source.
source_labels = {
    'male': 'male',
    'noEnergy': 'silence',
    'noise': 'bruit',
    'music': 'musique',
}
for source_name, label in source_labels.items():
    df_graph.loc[df_graph['source'] == source_name, 'person_indexer'] = label

In [101]:
df_graph.person_indexer.value_counts()

male         27
silence      20
bruit        15
Female 0     14
musique      13
Female 1      2
Female -1     1
Name: person_indexer, dtype: int64

In [102]:
# Discard the rows labelled 'Female -1' (female segments without a person index) and renumber the rows.
is_unmatched = df_graph.person_indexer == 'Female -1'
df_graph = df_graph[~is_unmatched].reset_index(drop=True)

In [106]:
# One horizontal bar per segment, one row and one colour per label.
# Only the non-default arguments are passed; all the others were explicit defaults.
fig = px.timeline(
    data_frame=df_graph,
    x_start='start_datetime',
    x_end='end_datetime',
    y='person_indexer',
    color='person_indexer',
)

# Row order: the non-speaker / male rows first, then the female rows from the highest index down to "Female 0".
# The female rows are read from the data instead of being computed as "number of labels - 4",
# which was wrong whenever one of the four other labels was absent from the recording.
other_rows = ['musique', 'bruit', 'silence', 'male']
female_rows = sorted(
    {label for label in df_graph['person_indexer'] if str(label).startswith('Female ')},
    key=lambda label: int(label.split()[-1]),
    reverse=True,
)
fig.update_yaxes(categoryorder='array', categoryarray=other_rows + female_rows)
fig.show()