In [11]:
from google.cloud import speech
from pydub import AudioSegment
import numpy as np
import io

## 빵형 제작당시(20.07.06)보다 라이브러리 업뎃이 진행되어 아래는 맞지 않는다.
# from google.cloud import speech_v1
# from google.cloud.speech_v1 import enums
# from pydub import AudioSegment
# import numpy as np
# import io

## STT 실행하여 욕 단어를 찾는다.
* Speech to Text 실행
* 각 단어의 시작과 끝 초를 조사
* 이때 욕이 있는 단어만 따로 저장한다.

In [12]:
def sample_recognize(local_file_path):
    client = speech.SpeechClient()

    language_code = "ko-KR"
    sample_rate_hertz = 44100

    encoding = speech.RecognitionConfig.AudioEncoding.ENCODING_UNSPECIFIED
    config = speech.RecognitionConfig(
        encoding=encoding,
        sample_rate_hertz=sample_rate_hertz,
        language_code=language_code,
        enable_word_time_offsets=True,
        use_enhanced=True
    )
    
    ## Old ver.
#     encoding = enums.RecognitionConfig.AudioEncoding.ENCODING_UNSPECIFIED
#     config = {
#         "language_code": language_code,
#         "sample_rate_hertz": sample_rate_hertz,
#         "encoding": encoding,
#         "enable_word_time_offsets": True,
#         "use_enhanced": True,
#     }
    
    with io.open(local_file_path, "rb") as f:
        content = f.read()
    
    ## Old ver.
#     audio = {"content": content}
    audio = speech.RecognitionAudio(content=content)

    response = client.recognize(config=config, audio=audio)
    
    timeline, swear_timeline, words = [], [], []

    for result in response.results:
        alternative = result.alternatives[0]
        print(u"Transcript: {}".format(alternative.transcript))
        
#         print(alternative)
#         raise AssertionError("die")
        
        for word in alternative.words:
            """
                alternative를 찍어보니, word.start_time.nanos가 없는 단어들도 있었음
                어짜피 나노단위의 second를 표시하는 걸테니 없으면 0을 더하도록 변경
            """
            try:
                start_nanos = word.start_time.nanos
            except:
                start_nanos = 0
            
            try:
                end_nanos = word.end_time.nanos
            except:
                end_nanos = 0

            timeline.append([
                int(word.start_time.seconds * 1000 + start_nanos * (10**-6)),
                int(word.end_time.seconds * 1000 + end_nanos * (10**-6))
            ])
            
            words.append(word.word)

            if '씨발' in word.word:
                swear_timeline.append([
                    int(word.start_time.seconds * 1000 + start_nanos * (10**-6)),
                    int(word.end_time.seconds * 1000 + end_nanos * (10**-6))
                ])
                
            ## Old ver.
#             timeline.append([
#                 int(word.start_time.seconds * 1000 + word.start_time.nanos * (10**-6)),
#                 int(word.end_time.seconds * 1000 + word.end_time.nanos * (10**-6))
#             ])
#             if '씨발' in word.word:
#                 swear_timeline.append([
#                     int(word.start_time.seconds * 1000 + word.start_time.nanos * (10**-6)),
#                     int(word.end_time.seconds * 1000 + word.end_time.nanos * (10**-6))
#                 ])
                
    return timeline, swear_timeline, words

# execute
timeline, swear_timeline, words = sample_recognize('sound/short.mp3')

print(timeline)
print(words)
print(swear_timeline)

Transcript: 일어나이 씨발놈아 개새끼야 야이 개 같은 년아 씨발년아 씨발년아
[[0, 1000], [1000, 1000], [1000, 2000], [2000, 3000], [3000, 3000], [3000, 4000], [4000, 7000], [7000, 7000], [7000, 7000], [7000, 9000], [9000, 11000]]
['일어나', '이', '씨발놈아', '개새끼야', '야', '이', '개', '같은', '년아', '씨발년아', '씨발년아']
[[1000, 2000], [7000, 9000], [9000, 11000]]


## Load original audio
* pydub 의 AudioSegment api를 사용하여 mp3의 오디오를 불러온다.

In [13]:
sound = AudioSegment.from_file('sound/short.mp3', format='mp3')

print(len(sound))
sound

12422


## Create Beep sound
* 빵형도 따온거라고 한다.
* 잘은 모르겠지만, 크게 자세히 알고싶진 않아서 패스

In [14]:
def create_beep(duration):
    sps = 44100
    freq_hz = 1000.0
    vol = 0.5

    esm = np.arange(duration / 1000 * sps)
    wf = np.sin(2 * np.pi * esm * freq_hz / sps)
    wf_quiet = wf * vol
    wf_int = np.int16(wf_quiet * 32767)

    beep = AudioSegment(
        wf_int.tobytes(), 
        frame_rate=sps,
        sample_width=wf_int.dtype.itemsize, 
        channels=1
    )

    return beep

beep = create_beep(duration=1000)  # 1초간 삐~
beep

## 음성 삐~합성 테스트
* 처음 욕설 부분 1초만 합성해본다.
* position : 처음 시작 초
* gain_during_overlay : overlay시, 원본 소리 크기

In [15]:
i = 0
mixed = sound.overlay(beep, position=swear_timeline[i][0], gain_during_overlay=-20)
mixed

## Final. 모든 음성의 욕설을 삐~ 처리 해본다.

In [16]:
mixed_final = sound

for i in range(len(swear_timeline)):
    beep = create_beep(duration=swear_timeline[i][1] - swear_timeline[i][0])
    mixed_final = mixed_final.overlay(beep, position=swear_timeline[i][0], gain_during_overlay=-50)
    
mixed_final

## 만들어진 사운드를 mp3로 저장

In [23]:
mixed_final.export('sound/result.mp3', format='mp3')

<_io.BufferedRandom name='sound/result.mp3'>

In [24]:
type(mixed_final)

pydub.audio_segment.AudioSegment