In [1]:
import json
import os
import gcp_speech2text as sp2txt
import transcript as trs
import constants
from google.cloud import speech

# Path to the service-account key file that is passed to the Speech-to-Text calls below.
CREDENTIALS_FILE = "../gcp-credential/key.json"

# Conversation style to process. One of: biz-meeting, debating, interview, monologue
style = 'biz-meeting'
# Per-style entries looked up in `constants`; both are indexed with `idx` in later cells.
gcs_uri_wav_lst = constants.gcs_uri_wav[style]
speaker_info_lst = constants.speaker[style]

# For Test Run: position of the recording (within the lists above) to process.
idx = 0

In [2]:
# Short aliases for the selected recording; the step-by-step Speech-to-Text
# cell further down reads these three names.
credentials = CREDENTIALS_FILE
gcs_uri = gcs_uri_wav_lst[idx]
speakers = speaker_info_lst[idx]['min_max']  # indexed as [0] = min, [1] = max speaker count

In [3]:
# Run the packaged Speech-to-Text helper on the selected recording.
# NOTE(review): the meaning of the two return values is inferred from their
# names and from how `word_infos` is used below — confirm in gcp_speech2text.
script_conf_pairs, word_infos = sp2txt.speech_to_text(gcs_uri, 
                                                            CREDENTIALS_FILE, 
                                                            speaker_info_lst[idx]['min_max'])

Google Cloud Storage: gs://talking-dataset/biz-meeting/biz-result-oup-brainstorming-meeting_16k.wav
Waiting for Speech-to-Text to complete...
Speak diarization - min:2, max:6
Completed!



In [4]:
# Build the transcript from the word-level results and display it.
# Per the displayed output, each entry is a dict with
# 'speaker', 'contents', 'start_time' and 'end_time'.
transcript = sp2txt.generate_transcript_with_tag(word_infos)
transcript

Generated Transcript! 25



[{'speaker': 2,
  'contents': "improving our performance with customers he's all about changing the company culture so we need a message or slogan between even need an image that everyone sees every day on that desk to remind them of the go like a mission statement",
  'start_time': 0.3,
  'end_time': 19.2},
 {'speaker': 1, 'contents': 'sort of', 'start_time': 19.2, 'end_time': 20.8},
 {'speaker': 2,
  'contents': "something short and instantly recognizable mission statements tend to be forgotten all that too long but I don't want to rule out any ideas at this stage what does I'd like us to do this morning is spend an hour or so brainstorming some ideas to see what we can come up with just a reminder this is a rainstorm so all ideas are welcome we're not touch with valuate the ideas yet we just want to get as many as possible to get to stuff it should begin with the word customer in the center and see what comes from that okay",
  'start_time': 22.2,
  'end_time': 66.3},
 {'speaker': 1

In [5]:
# ---------------------------------------------------------------------
# Step-by-step Speech-to-Text request with speaker diarization.
# Reads `credentials`, `gcs_uri` and `speakers` (min/max speaker counts)
# from earlier cells; produces `script_conf_pairs` and `word_infos`.
client = speech.SpeechClient.from_service_account_file(credentials)

# ---------------------------------------------------------------------
audio = speech.RecognitionAudio(uri=gcs_uri)

print(f"Google Cloud Storage: {gcs_uri}\nWaiting for Speech-to-Text to complete...")
print(f"Speak diarization - min:{speakers[0]}, max:{speakers[1]}")

# A (1, 1) request is sent to the API as (2, 2) and every word is re-tagged
# as speaker 1 afterwards. The adjusted counts are kept in their own variable
# instead of overwriting `speakers`: overwriting it made a re-run of this cell
# see (2, 2), miss the monologue case and stop forcing the speaker tag to 1.
# tuple() lets a list-typed [1, 1] be recognised as well.
is_monologue = tuple(speakers) == (1, 1)
if is_monologue:
    diarization_speakers = (2, 2)
    print("It is Monologue")
else:
    diarization_speakers = (speakers[0], speakers[1])

diarization_config = speech.SpeakerDiarizationConfig(
    enable_speaker_diarization=True,
    min_speaker_count=diarization_speakers[0],
    max_speaker_count=diarization_speakers[1],
)
config = speech.RecognitionConfig(
    encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16, # For .wav with 16000 sampling rate
    sample_rate_hertz=16000,
    language_code="en-US",
    diarization_config=diarization_config,
)
operation = client.long_running_recognize(config=config, audio=audio)

# ---------------------------------------------------------------------
# Block until the long-running operation finishes (gives up after 300 seconds).
response = operation.result(timeout=300)

# ---------------------------------------------------------------------
# Each result is for a consecutive portion of the audio. Iterate through
# them to get the transcripts for the entire audio file.
# The first alternative is the most likely one for each portion.
script_conf_pairs = [
    {"Transcript": result.alternatives[0].transcript,
     "Confidence": result.alternatives[0].confidence}
    for result in response.results
]

# The transcript within each result is separate and sequential per result.
# However, the words list within an alternative includes all the words
# from all the results thus far. Thus, to get all the words with speaker
# tags, you only have to take the words list from the last result.
# An empty response yields an empty word list instead of an IndexError.
if response.results:
    infos = response.results[-1].alternatives[0].words
else:
    infos = []
    print("No recognition results were returned.")

word_infos = [
    {"word": inf.word,
     # Monologue recordings were requested as two speakers; collapse them to tag 1.
     "speaker_tag": 1 if is_monologue else inf.speaker_tag,
     "start_time": inf.start_time.total_seconds(),
     "end_time": inf.end_time.total_seconds()}
    for inf in infos
]

print("Completed!")
print()


Google Cloud Storage: gs://talking-dataset/biz-meeting/biz-result-oup-brainstorming-meeting_16k.wav
Waiting for Speech-to-Text to complete...
Speak diarization - min:2, max:6
Completed!



In [7]:
# Rebuild the transcript from the `word_infos` produced by the step-by-step cell above.
transcript = sp2txt.generate_transcript_with_tag(word_infos)


Generated Transcript! 23



In [8]:
# Display the transcript built in the previous cell.
transcript

[{'speaker': 2,
  'contents': "improving our performance with customers he's all about changing the company culture so we need a message or slogan between even need an image that everyone sees every day on that desk to remind them of the go like a mission statement",
  'start_time': 0.3,
  'end_time': 19.2},
 {'speaker': 1, 'contents': 'sort of', 'start_time': 19.2, 'end_time': 20.8},
 {'speaker': 2,
  'contents': "something short and instantly recognizable mission statements tend to be forgotten all that too long but I don't want to rule out any ideas at this stage what does I'd like us to do this morning is spend an hour or so brainstorming some ideas to see what we can come up with just a reminder this is a rainstorm so all ideas are welcome we're not touch with valuate the ideas yet we just want to get as many as possible to get to stuff it should begin with the word customer in the center and see what comes from that okay",
  'start_time': 22.2,
  'end_time': 66.3},
 {'speaker': 1

In [9]:

# NOTE: 'speaker' will eventually be set from user input through the front-end;
# speaker_sample is what that user input is meant to be collected with.
# The extraction is skipped when min_max is (1, 1), so in that case this cell
# does not assign speaker_sample at all.
if speaker_info_lst[idx]['min_max'] != (1,1):
    speaker_sample = trs.extract_speaker_sample(transcript)
# Until the front-end exists, the speaker info is read from the constants entry.
speaker = speaker_info_lst[idx]['speakers'] 

Extracted audio samples by each speaker tag!
1 : customer service sorry wake up at 1 already how ab... (28.80s)
2 : and it would be a good name if you wanted to offer... (87.70s)



In [10]:
# Convert the transcript to text, passing the speaker info chosen above.
# NOTE(review): the printed output suggests `speaker` maps tag -> name — confirm in transcript.py.
transcript_txt = trs.convert_transcript_json2txt(transcript, speaker)

Mapped speaker's name into the tag!

{1: 'Marcus', 2: 'Paul'}
Converted transcript json into txt! 3561



# Generate transcript.json

In [2]:

# -------------------------------------------------------------------
# Transcribe the selected recording and build the transcript from the
# word-level results.
# The URI is taken from `gcs_uri_wav_lst[idx]` (defined in the setup cell);
# the bare name `gcs_uri_wav` is not defined anywhere in this notebook and
# raises a NameError on a fresh kernel.
script_conf_pairs, word_infos = sp2txt.speech_to_text(gcs_uri_wav_lst[idx],
                                                            CREDENTIALS_FILE,
                                                            speaker_info_lst[idx]['min_max'])

transcript = sp2txt.generate_transcript_with_tag(word_infos)


Google Cloud Storage: gs://talking-dataset/biz-meeting/biz-result-oup-brainstorming-meeting_16k.wav
Waiting for Speech-to-Text to complete...
Speak diarization - min:2, max:6
Completed!
Generated Transcript! 43


In [3]:
# Save the word-level results next to the notebook as word_infos.json.
word_infos_path = os.path.join('./', 'word_infos.json')
trs.save_result(word_infos_path, word_infos)

Saved contents into the file! 
./word_infos.json


In [4]:
# Save the transcript next to the notebook as transcript.json
# (the file is read back in the "Extract Audio Samples" section).
transcript_json_path = os.path.join('./', 'transcript.json')
trs.save_result(transcript_json_path, transcript)

Saved contents into the file! 
./transcript.json


# Extract Audio Samples

In [10]:
import numpy as np

In [11]:
# Read the transcript back from ./transcript.json and parse it as JSON.
with open('./transcript.json') as f:
    transcript = json.loads(f.read())

In [12]:
# Display the transcript that was just loaded from ./transcript.json.
transcript

[{'speaker': 1,
  'contents': 'improving our performance with customers is all about changing the company culture so we need a message or a slogan perhaps we even need an image that everyone sees every day on their desks to remind them of the goal like',
  'start_time': 0.4,
  'end_time': 18.2},
 {'speaker': 2,
  'contents': 'a mission statement',
  'start_time': 18.2,
  'end_time': 19.2},
 {'speaker': 1,
  'contents': "well thought of recognizable mission statements tend to be forgotten or they're too long but I don't want to rule out any idea that this stage so what does I'd like us to do this morning is spend an hour or so brainstorming some ideas to see what we can come up with and just a reminder this is a brainstorm so all ideas are welcome we're not here to evaluate the ideas yet we just want to get as many as possible to get us started perhaps we should begin with",
  'start_time': 20.2,
  'end_time': 60.0},
 {'speaker': 3,
  'contents': 'the word customer in the center and see

[{'speaker': 1}, {'speaker': 2}, {'speaker': 3}, {'speaker': 4}, {'speaker': 5}]


In [24]:
# For every speaker tag in the transcript, keep the longest segment
# (end_time - start_time) as that speaker's audio sample.
speaker = np.unique([tr['speaker'] for tr in transcript])
# tolist() turns the numpy scalars into plain Python values so the result
# holds ordinary ints rather than numpy integer objects.
speaker_tags = speaker.tolist()
audio_sample_info = [{'speaker': tag} for tag in speaker_tags]
# print(audio_sample_info)

# Find each speaker's slot by tag. The previous `tr['speaker'] - 1` assumed the
# tags are exactly 1..N; with a gap (e.g. tags 1 and 3) it wrote into the wrong
# slot or raised IndexError.
slot_by_tag = {tag: slot for slot, tag in enumerate(speaker_tags)}

for tr in transcript:
    slot = slot_by_tag[tr['speaker']]
    sample = audio_sample_info[slot]
    new = tr['end_time'] - tr['start_time']

    if 'contents' in sample:
        old = sample['end_time'] - sample['start_time']
        if new <= old:
            # The stored segment is at least as long; keep it.
            continue
        print(f"{slot} updated!! {old}s -> {new}s")
    else:
        print(f"{slot} initialized! {tr['start_time']}, {tr['end_time']}")

    # Same three fields, in the same order, for both the first and a longer segment.
    sample.update(contents=tr['contents'],
                  start_time=tr['start_time'],
                  end_time=tr['end_time'])

audio_sample_info

0 initialized! 0.4, 18.2
1 initialized! 18.2, 19.2
0 updated!! 17.8s -> 39.8s
2 initialized! 60.0, 64.6
3 initialized! 85.7, 91.8
1 updated!! 1.0s -> 9.100000000000009s
4 initialized! 112.2, 115.7
4 updated!! 3.5s -> 5.999999999999986s
4 updated!! 5.999999999999986s -> 7.5s
2 updated!! 4.599999999999994s -> 22.5s
3 updated!! 6.099999999999994s -> 16.600000000000023s
1 updated!! 9.100000000000009s -> 19.19999999999999s


In [21]:
# Display the per-speaker samples selected in the cell above.
audio_sample_info

[{'speaker': 1,
  'contents': "well thought of recognizable mission statements tend to be forgotten or they're too long but I don't want to rule out any idea that this stage so what does I'd like us to do this morning is spend an hour or so brainstorming some ideas to see what we can come up with and just a reminder this is a brainstorm so all ideas are welcome we're not here to evaluate the ideas yet we just want to get as many as possible to get us started perhaps we should begin with",
  'start_time': 20.2,
  'end_time': 60.0},
 {'speaker': 2,
  'contents': 'uh matter uh listen want to help it makes me think that the customer',
  'start_time': 343.6,
  'end_time': 362.8},
 {'speaker': 3,
  'contents': "sorry yes I was just thinking that perhaps we need to think of where we are now and where we have to get to um can you elaborate I mean at the moment we don't even offer what some of our competitors can offer in terms of customer service we have to equal them",
  'start_time': 205.1,


In [None]:

# This rebinds `speaker`, replacing the np.unique array assigned in the sample-extraction cell.
speaker = speaker_info_lst[idx]['speakers'] # NOTE: Replace with User Input from Front-end later.
# Convert the loaded transcript to text using that speaker info.
transcript_txt = trs.convert_transcript_json2txt(transcript, speaker)

print()