# 資料前處理

## 錄音檔分割

In [None]:
!pip install pydub

In [None]:
from pydub import AudioSegment
import os

from google.colab import drive
# Mount Google Drive at /content/drive; the paths used below live under it.
drive.mount('/content/drive')

# Cut one recording into consecutive fixed-length MP3 chunks.
def split_audio(input_file, output_folder, split_time, file_prefix):
    """Split ``input_file`` into chunks of ``split_time`` seconds each.

    Args:
        input_file: Path of the audio file, loaded with
            ``AudioSegment.from_file``.
        output_folder: Directory that receives the chunks; it is created
            when it does not exist yet.
        split_time: Chunk length in seconds (converted to milliseconds
            here, the unit used for slicing the loaded audio).
        file_prefix: Prefix of every output name; chunks are written as
            ``<file_prefix>_<n>.mp3`` with ``n`` counting from 1.

    The last chunk is shorter when the recording length is not a multiple
    of the chunk length.
    """
    recording = AudioSegment.from_file(input_file)

    # Make sure the destination directory exists before exporting.
    os.makedirs(output_folder, exist_ok=True)

    chunk_ms = split_time * 1000
    total_ms = len(recording)

    for number, offset in enumerate(range(0, total_ms, chunk_ms), start=1):
        # Clamp the final chunk to the end of the recording.
        stop = min(offset + chunk_ms, total_ms)
        piece = recording[offset:stop]

        # Each chunk is exported in MP3 format.
        target = os.path.join(output_folder, f"{file_prefix}_{number}.mp3")
        piece.export(target, format="mp3")


# Source recording, destination folder and the prefix of the output names.
input_file = "/content/drive/MyDrive/AIHW1/Lec-03-2_網路總整2.m4a"
output_folder = "/content/drive/MyDrive/AIHW1"
file_prefix = "Lec-03-2"
split_time = 600  # chunk length in seconds (10 minutes)

# Run the split.
split_audio(
    input_file=input_file,
    output_folder=output_folder,
    split_time=split_time,
    file_prefix=file_prefix,
)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## 使用WhisperModel進行語音轉文字

In [None]:
!apt install libcublas11

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
libcublas11 is already the newest version (11.7.4.6~11.5.1-1ubuntu1).
0 upgraded, 0 newly installed, 0 to remove and 38 not upgraded.


In [None]:
!pip install faster-whisper

In [None]:
from faster_whisper import WhisperModel
import os
from google.colab import files

model_size = "large-v2"  # Whisper checkpoint to load (the large "large-v2" model)
mode = "normal"

# The model runs on the CPU here. To run on a GPU in FP16 mode instead,
# construct it with:
#   WhisperModel(model_size, device="cuda", compute_type="float16")
model = WhisperModel(model_size, device="cpu")

# Path of the audio file to transcribe. Only a single file is handled here;
# the author notes that a loop over a whole folder was also used later.
audio_path = "/content/drive/MyDrive/AIHW1/Lec-03-2_1.mp3"

# Transcribe the recording. The initial prompt "繁體" ("Traditional
# [Chinese]") is presumably meant to nudge the output toward Traditional
# Chinese characters.
segments, info = model.transcribe(audio_path, beam_size=5, initial_prompt="繁體")

transcription = ""

# Assemble the transcript according to the selected mode.
if mode == "normal":
    # Join the text of every segment with a full-width comma.
    transcription_segments = [segment.text for segment in segments]
    transcription = "，".join(transcription_segments)

# Print the transcript for a quick sanity check.
print(transcription)

# Name the output after the audio file, without its extension.
file_name = os.path.splitext(os.path.basename(audio_path))[0]
output_path = f"{file_name}.txt"

# Write the transcript with an explicit UTF-8 encoding so the Chinese text
# does not depend on the platform default.
with open(output_path, "w", encoding="utf-8") as file:
    file.write(transcription)

# Download only after the ``with`` block has flushed and closed the file;
# downloading while it is still open can deliver an empty or partial file.
files.download(output_path)


## 逐字稿轉為 JSON 資料格式

In [None]:
import json

# Read the transcript and strip surrounding whitespace from every line.
with open('Lec-03-4_1.txt', 'r', encoding='utf-8') as file:
    lines = [line.strip() for line in file]

# Collect the phrases that pass the length filter.
result = []

for line in lines:
    # Phrases are separated by full-width commas.
    split_line = line.split('，')
    # Keep only phrases that are at least 3 characters long.
    result.extend([word for word in split_line if len(word) >= 3])

# Write the phrases to NSC4.txt as a JSON array, one element per line.
# json.dump is used instead of hand-written brackets so that the output has
# no trailing comma after the last element and quotes/backslashes inside a
# phrase are escaped; ensure_ascii=False keeps the Chinese text readable.
with open('NSC4.txt', 'w', encoding='utf-8') as output_file:
    json.dump(result, output_file, ensure_ascii=False, indent=4)


In [None]:
# Read the transcript and strip surrounding whitespace from every line.
# UTF-8 is given explicitly to match the reader of NSC4.txt further below.
with open('Lec-03-4_1.txt', 'r', encoding='utf-8') as file:
    lines = [line.strip() for line in file]

# Collect every phrase of the transcript.
result = []

for line in lines:
    # Phrases are separated by full-width commas.
    split_line = line.split('，')
    # Skip empty fragments (blank lines, leading/trailing or doubled commas);
    # they would otherwise become blank lines in the output file.
    result.extend(piece for piece in split_line if piece.strip())

# Write the phrases to NSC4.txt, one per line.
with open('NSC4.txt', 'w', encoding='utf-8') as output_file:
    output_file.writelines(f"{item}\n" for item in result)


# 訓練以 Simple Transformers 為基底的文本分類模型

In [None]:
pip install torch==1.7.1+cu110 torchvision==0.8.2+cu110 torchaudio==0.7.2 -f https://download.pytorch.org/whl/torch_stable.html

In [None]:
pip install simpletransformers

In [None]:
from google.colab import files
import json
import pandas as pd
from simpletransformers.classification import ClassificationModel, ClassificationArgs
import time

# Load the training data.
def getDataFrame():
    """Read ``train.json`` and return it as a DataFrame.

    The JSON content is loaded into a DataFrame with the columns ``text``
    and ``labels``; the ``labels`` column is converted with
    ``pd.to_numeric`` before the frame is returned.
    """
    with open("train.json", mode="r", encoding="utf8") as handle:
        raw_text = handle.read()

    records = json.loads(raw_text)
    frame = pd.DataFrame(records, columns=["text", "labels"])
    frame["labels"] = pd.to_numeric(frame["labels"])
    return frame

# Train the classifier.
def train(df):
    """Train a ``bert-base-chinese`` classification model on ``df``.

    Args:
        df: Training data handed straight to ``model.train_model``
            (in this file it comes from ``getDataFrame``).

    The model is configured for 2 labels, batch size 16, one epoch and no
    CUDA; ``output_dir`` is set to
    ``outputs/bert-base-chinese-bs-16-epo-1``.
    """
    dir_name = 'bert-base-chinese-bs-16-epo-1'

    # Settings copied onto the args object, in this order.
    settings = {
        "train_batch_size": 16,
        "num_train_epochs": 1,
        "output_dir": f"outputs/{dir_name}",
        "use_cuda": False,
        "model_type": "bert",
        "model_name": "bert-base-chinese",
        "num_labels": 2,
    }
    model_args = ClassificationArgs()
    for option, value in settings.items():
        setattr(model_args, option, value)

    classifier = ClassificationModel(
        model_type=model_args.model_type,
        model_name=model_args.model_name,
        use_cuda=model_args.use_cuda,
        num_labels=model_args.num_labels,
        args=model_args,
    )

    classifier.train_model(df)

# Entry point: train the model and report how long it took.
if __name__ == "__main__":
    tStart = time.time()  # start of the timed section

    train(getDataFrame())

    tEnd = time.time()  # end of the timed section
    elapsed_seconds = tEnd - tStart

    # Report the elapsed wall-clock time in seconds.
    print(f"執行時間: {elapsed_seconds} 秒")


In [None]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs
import time

def predict(listTestData):
    """Classify the given texts with the previously trained model.

    Args:
        listTestData: The texts to classify, passed unchanged to
            ``model.predict``.

    Returns:
        The first element of the pair returned by ``model.predict``
        (the predictions); the raw model outputs are discarded.
    """
    # Name of the directory under outputs/ that holds the trained model.
    dir_name = 'bert-base-chinese-bs-16-epo-1'
    saved_model_path = f"outputs/{dir_name}"

    # Custom arguments handed to the model.
    model_args = ClassificationArgs()
    model_args.num_train_epochs = 1
    model_args.train_batch_size = 32

    # Load the classification model from the saved output directory.
    classifier = ClassificationModel(
        'bert',
        saved_model_path,
        use_cuda=False,
        cuda_device=0,
        num_labels=2,
        args=model_args,
    )

    # Only the predictions are of interest here.
    return classifier.predict(listTestData)[0]

# Entry point: classify every phrase in NSC4.txt and count the labels.
if __name__ == "__main__":
    import numpy as np

    tStart = time.time()  # start of the timed section

    # Load the test phrases, one per line, dropping surrounding whitespace
    # and skipping blank lines so that no empty string is classified.
    with open("NSC4.txt", "r", encoding="utf-8") as file:
        listTestData = [line.strip() for line in file if line.strip()]

    # Show the test set.
    print(listTestData)

    # Run the prediction.
    result = predict(listTestData)
    print(result)

    # Count how many predictions are 1 and how many are 0. The result is
    # normalised to an array first: comparing a plain list with ``== 1``
    # yields a single False, which would make both counts 0.
    labels = np.asarray(result)
    count_1 = np.sum(labels == 1)
    count_0 = np.sum(labels == 0)
    print("結果中有", count_1, "個1，", count_0, "個0。")

    # End of the timed section.
    tEnd = time.time()
    # Report the elapsed wall-clock time in seconds.
    print(f"執行時間: {tEnd - tStart} 秒")


後面可以再計算一些評估模型性能的指標

# 使用非深度學習模型分類文本