<a href="https://colab.research.google.com/github/fjtm/deep-fake-voice-recognition/blob/feature%2Fstart/02_data_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data preprocessing

Introduction:

This notebook prepares the deep fake voice recognition dataset for modelling. The initial steps clone the project repository, install the necessary packages, and download the dataset from Kaggle. The subsequent sections cover audio data preprocessing, a check of whether multithreading speeds up that processing, and preprocessing of the data before it is saved. The notebook concludes by loading the processed data, performing the train/test split, and giving an overview of the resulting datasets.

In [None]:
# Work from the root of the project checkout (presumably so that the project-local
# packages imported later in this notebook, `preprocess` and `common`, resolve).
# On a fresh runtime the clone below must be uncommented and run first: the %cd
# needs the `deep-fake-voice-recognition/` directory to exist.
# !git clone -b feature/start https://github.com/fjtm/deep-fake-voice-recognition.git
%cd deep-fake-voice-recognition/

/content/deep-fake-voice-recognition


In [None]:
! pip install -q kaggle
! pip install playsound
! pip install pydub

In [None]:
import IPython
from scipy.io import wavfile
from pydub import AudioSegment
import pandas as pd
import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt


In [None]:
from google.colab import files
files.upload()
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
! kaggle datasets download -d birdy654/deep-voice-deepfake-voice-recognition
! unzip deep-voice-deepfake-voice-recognition.zip

# Build a base module to extract audio properties

In [None]:
from preprocess.AudioProperties import process_audio_files

# Check multithreading optimization

In [None]:
import os

# Every file found under KAGGLE/AUDIO/ (recursively), as a path including that prefix.
# os.walk yields entries in arbitrary, filesystem-dependent order, so the list is
# sorted: slices such as audio_files[0:16] and the batch numbering used when saving
# then refer to the same files on every run.
audio_files = sorted(
    os.path.join(path, file)
    for path, directories, files in os.walk("KAGGLE/AUDIO/")
    for file in files
)

In [None]:
# Timing baseline: process the first 16 files with num_threads=1.
audio_results = process_audio_files(
    audio_files[:16],
    num_threads=1,
)

Progress: 16/16 (100.00%)
Processing completed.
Total time taken: 570.17 seconds


In [None]:
# Process the same first 16 files with num_threads=None and compare the reported time.
# NOTE(review): None presumably lets process_audio_files choose its own thread
# count — confirm in preprocess.AudioProperties.
audio_results = process_audio_files(
    audio_files[:16],
    num_threads=None,
)

Progress: 16/16 (100.00%)
Processing completed.
Total time taken: 449.71 seconds


# Preprocess data

In [None]:
# Small trial run on the first two files only; the result is kept in `see`.
see = process_audio_files(
    audio_files[:2],
    num_threads=None,
)

Progress: 2/2 (100.00%)
Processing completed.
Total time taken: 47.59 seconds


# Save data

In [None]:
from common.SaveLoad import save_data_zip

In [None]:
# Make Google Drive available under /content/drive.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import time
from IPython.display import clear_output

# Number of audio files handed to each save_data_zip call.
CHUNK_SIZE = 16

# perf_counter is a monotonic clock, so the measured duration of this long-running
# loop cannot be skewed by wall-clock adjustments.
start_time = time.perf_counter()

# Walk the file list in consecutive slices of CHUNK_SIZE; each slice is saved
# together with its running batch number (0, 1, 2, ...). The last slice may be shorter.
for chunk_index, chunk_start in enumerate(range(0, len(audio_files), CHUNK_SIZE)):
    sub_audio_files = audio_files[chunk_start:chunk_start + CHUNK_SIZE]
    save_data_zip(sub_audio_files, chunk_index)

end_time = time.perf_counter()
execution_time = end_time - start_time

# Drop whatever the loop printed so that only the timing summary remains.
clear_output(wait=True)
print(f'Execution time: {execution_time:.2f} seconds')

Execution time: 1393.96 seconds


# Loading data

In [None]:
from common.SaveLoad import read_data_zip

In [None]:
import pandas as pd
import os

# Drive folder holding the saved archives.
zip_files_path = '/content/drive/My Drive/deep-fake-voice-recognition/data/'

# os.listdir returns entries in arbitrary order; sorting keeps the row order of
# `df` (and anything derived from it) reproducible across runs.
# Every entry of the folder is passed to read_data_zip, so the folder is expected
# to contain nothing but those archives.
zip_file_names = sorted(os.listdir(zip_files_path))

# Read each archive and stack the results into one frame with a fresh 0..n-1 index.
df = pd.concat(
    [
        read_data_zip(os.path.join(zip_files_path, file_name), csv_encoding='utf-8')
        for file_name in zip_file_names
    ],
    axis=0,
    ignore_index=True,
    sort=False,
)

# Train/test split

In [None]:
from preprocess.TrainTestPrepare import add_index_label, train_test_split

In [None]:
# Column that is selected as y when the train/test frames are split into X and y.
target_column = "target"

# NOTE(review): not referenced elsewhere in the visible cells — presumably the
# per-recording index column; confirm.
index_columns = "ind_num"

# Columns dropped from the train/test frames to obtain the feature matrices.
not_data_columns = ["label", "ind", target_column, index_columns]

In [None]:
# add_index_label and train_test_split are the project helpers imported from
# preprocess.TrainTestPrepare (this train_test_split is not scikit-learn's).
# NOTE(review): `df` is rebound to the result of add_index_label, so re-running this
# cell passes the already-processed frame back in — confirm the helper tolerates that.
df = add_index_label(df)
train, test = train_test_split(df)

In [None]:
# Features: everything except the bookkeeping/label columns listed in not_data_columns.
X_train = train.drop(columns=not_data_columns)
X_test = test.drop(columns=not_data_columns)

# Targets: the single column named by target_column.
y_train = train[target_column]
y_test = test[target_column]

In [None]:
# Shapes of the four resulting datasets, in the order X_train, y_train, X_test, y_test.
tuple(part.shape for part in (X_train, y_train, X_test, y_test))

((1857240, 58), (1857240,), (716918, 58), (716918,))