# Load Dependencies

In [1]:
from google.cloud import speech_v1
from google.cloud.speech_v1 import enums
from pydub import AudioSegment
import numpy as np
import io

ImportError: cannot import name 'enums' from 'google.cloud.speech_v1' (/Users/kehyeong/miniconda3/lib/python3.8/site-packages/google/cloud/speech_v1/__init__.py)

# Speech to Text (STT)

In [3]:
def sample_recognize(local_file_path):
    client = speech_v1.SpeechClient()

    language_code = "ko-KR"
    sample_rate_hertz = 44100

    encoding = enums.RecognitionConfig.AudioEncoding.ENCODING_UNSPECIFIED
    config = {
        "language_code": language_code,
        "sample_rate_hertz": sample_rate_hertz,
        "encoding": encoding,
        "enable_word_time_offsets": True,
        "use_enhanced": True,
    }
    
    with io.open(local_file_path, "rb") as f:
        content = f.read()
    
    audio = {"content": content}

    response = client.recognize(config, audio)
    
    timeline, swear_timeline, words = [], [], []

    for result in response.results:
        alternative = result.alternatives[0]
        print(u"Transcript: {}".format(alternative.transcript))
        
        for word in alternative.words:
            timeline.append([
                int(word.start_time.seconds * 1000 + word.start_time.nanos * (10**-6)),
                int(word.end_time.seconds * 1000 + word.end_time.nanos * (10**-6))
            ])
            
            words.append(word.word)

            if '씨발' in word.word:
                swear_timeline.append([
                    int(word.start_time.seconds * 1000 + word.start_time.nanos * (10**-6)),
                    int(word.end_time.seconds * 1000 + word.end_time.nanos * (10**-6))
                ])
                
    return timeline, swear_timeline, words

# execute
timeline, swear_timeline, words = sample_recognize('sound/short.mp3')

print(timeline)
print(words)
print(swear_timeline)

Transcript: 일어나이 씨발놈아 개새끼야 야이 개 같은 년아 씨발년아 씨발년아
[[0, 1000], [1000, 1100], [1100, 2000], [2000, 3000], [3000, 3900], [3900, 4100], [4100, 7400], [7400, 7500], [7500, 7900], [7900, 9800], [9800, 11400]]
['일어나', '이', '씨발놈아', '개새끼야', '야', '이', '개', '같은', '년아', '씨발년아', '씨발년아']
[[1100, 2000], [7900, 9800], [9800, 11400]]


# Load Original Audio File

In [4]:
sound = AudioSegment.from_file('sound/short.mp3', format='mp3')

print(len(sound))
sound

12422


# Create Beep Sound

In [5]:
def create_beep(duration):
    sps = 44100
    freq_hz = 1000.0
    vol = 0.5

    esm = np.arange(duration / 1000 * sps)
    wf = np.sin(2 * np.pi * esm * freq_hz / sps)
    wf_quiet = wf * vol
    wf_int = np.int16(wf_quiet * 32767)

    beep = AudioSegment(
        wf_int.tobytes(), 
        frame_rate=sps,
        sample_width=wf_int.dtype.itemsize, 
        channels=1
    )

    return beep

beep = create_beep(duration=1000)
beep

# Overlay Partially

In [6]:
i = 0
mixed = sound.overlay(beep, position=swear_timeline[i][0], gain_during_overlay=-20)
mixed

# Result

In [7]:
mixed_final = sound

for i in range(len(swear_timeline)):
    beep = create_beep(duration=swear_timeline[i][1] - swear_timeline[i][0])
    mixed_final = mixed_final.overlay(beep, position=swear_timeline[i][0], gain_during_overlay=-20)
    
mixed_final

# Export

In [8]:
mixed_final.export('sound/result.mp3', format='mp3')

<_io.BufferedRandom name='sound/result.mp3'>