In [None]:
# Remmember to duplicate this notebook to your own drive
!git clone -b dev https://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@github.com/duytran1332002/vlr.git

# Prepare environment

## Install conda

In [None]:
# Set PYTHONPATH to an empty value for this session.
# NOTE(review): presumably so the runtime's preset PYTHONPATH does not leak into the
# Miniconda Python installed below — confirm.
%env PYTHONPATH=

In [None]:
%%bash
# Download the Miniconda installer (Python 3.9 build) and install it into /usr/local.
# Abort on the first failing command so a failed or partial download is never executed.
set -euo pipefail
MINICONDA_INSTALLER_SCRIPT=Miniconda3-py39_23.11.0-2-Linux-x86_64.sh
MINICONDA_PREFIX=/usr/local
# -O overwrites any earlier download; without it a re-run makes wget save
# "<name>.1" while the stale first copy is the one that gets executed.
wget -O "$MINICONDA_INSTALLER_SCRIPT" "https://repo.anaconda.com/miniconda/$MINICONDA_INSTALLER_SCRIPT"
chmod +x "$MINICONDA_INSTALLER_SCRIPT"
# -b: batch mode (no prompts), -f: do not fail if the prefix already exists, -p: install prefix.
"./$MINICONDA_INSTALLER_SCRIPT" -b -f -p "$MINICONDA_PREFIX"

In [None]:
# Sanity check: the shell should now resolve conda and python from the Miniconda
# install under /usr/local. Expected values are given next to each command.
!which conda # should return /usr/local/bin/conda
!conda --version # should return 23.11.0
!python --version # should return 3.9.18

In [None]:
%%bash
# Install conda and Python 3.9 from the defaults channel without prompting (--yes).
conda install --channel defaults conda python=3.9 --yes
# Then update every installed package from the defaults channel.
# NOTE(review): this update is unpinned, so resulting versions depend on when it runs.
conda update --channel defaults --all --yes

In [None]:
import sys

# Make the packages installed into the Miniconda prefix importable from this kernel.
# Miniconda was installed as a Python 3.9 build, so its packages live under
# python3.9; the python3.7 directory used previously does not match that install.
CONDA_SITE_PACKAGES = "/usr/local/lib/python3.9/site-packages"
if CONDA_SITE_PACKAGES not in sys.path:  # keep the cell idempotent on re-run
    sys.path.append(CONDA_SITE_PACKAGES)

## Install requirements

In [None]:
# Restart your session after running this
# Installs the project's data-processing requirements, upgrades `datasets` to the
# latest release, then pins fsspec to 2023.9.2.
# NOTE(review): the fsspec pin presumably has to run last so the `datasets` upgrade
# cannot replace it — confirm. `!pip` is assumed to resolve to the Miniconda pip.
!pip install -r /content/vlr/vlr/data/requirements.txt
!pip install -U datasets
!pip install fsspec==2023.9.2

## Install hf-transfer

In [None]:
# Speed up downloading process with hf-transfer
# Installs the hf-transfer package and sets HF_HUB_ENABLE_HF_TRANSFER=1 for this
# session so that commands launched later see the flag.
!pip install hf-transfer
%env HF_HUB_ENABLE_HF_TRANSFER=1

## Install CocCocTokenizer

In [None]:
# Install Python binding on Ubuntu 20.04
# Clones the tokenizer sources under /content and enters a new build/ directory.
# `%cd` changes the kernel's working directory, so the cmake cell that follows
# runs inside /content/coccoc-tokenizer/build.
%cd /content
!git clone https://github.com/coccoc/coccoc-tokenizer.git
%cd coccoc-tokenizer
!mkdir build
%cd build

In [None]:
# Build in a sandbox
# Configures the project from the parent directory (`..`) with BUILD_PYTHON=1 and
# /usr/local as the install prefix, then builds and installs it.
# Must be run from the build/ directory entered in the previous cell.
!cmake -DBUILD_PYTHON=1 -DCMAKE_INSTALL_PREFIX=/usr/local ..
!make install

In [None]:
# Copy package into your environment
# Copies the CocCocTokenizer.* files out of the installed .egg directory directly
# into the Python 3.9 site-packages directory, then checks that conda lists it.
!cp /usr/local/lib/python3.9/site-packages/CocCocTokenizer-1.4-py3.9-linux-x86_64.egg/CocCocTokenizer.* /usr/local/lib/python3.9/site-packages
!conda list | grep coccoctokenizer   # should show coccoctokenizer 1.4

# Process data

In [None]:
%cd /content/vlr
# Put your access token (with write permission) in here
%env HF_TOKEN=

In [None]:
import os

# Source ("src") and destination ("dest") repository ids for every processing task.
# Each task's "dest" is the "src" of the task that follows it.
repo_id_dict = {
    "slice": {"src": "fptu/vietnamese-speaker-video", "dest": "fptu/vietnamese-speaker-clip"},
    "crop": {"src": "fptu/vietnamese-speaker-clip", "dest": "fptu/vietnamese-speaker-lip-clip"},
    "denoise": {"src": "fptu/vietnamese-speaker-lip-clip", "dest": "fptu/denoised-vietnamese-audio"},
    "transcribe": {"src": "fptu/denoised-vietnamese-audio", "dest": "fptu/purified-vietnamese-audio"},
}

# Create one local working directory under /content per repository, named after
# the last path component of its repo id (existing directories are left alone).
for task_repos in repo_id_dict.values():
    for role in ("src", "dest"):
        repo_name = os.path.basename(task_repos[role])
        os.makedirs(os.path.join("/content", repo_name), exist_ok=True)

## Prepare channel list to process

In [None]:
# List the channels that still need processing for the chosen task: config names
# present in the task's source repo but not yet in its destination repo.
# NOTE: per the original author this cell does not run in this environment;
# run it somewhere else and paste the resulting names into the next cell.
from datasets import get_dataset_config_names

task = "slice"
task_repos = repo_id_dict[task]

# The aggregate "all" config is excluded on both sides.
available_channels = set(get_dataset_config_names(task_repos["src"])).difference({"all"})
existing_channels = set(get_dataset_config_names(task_repos["dest"])).difference({"all"})
available_channels.difference(existing_channels)

In [None]:
# Paste your channel names (whitespace- or newline-separated) between the triple
# quotes. They are written one per line to /content/channels.txt.
channel_block = """
khatienganh
"""
channels = channel_block.split()
with open("/content/channels.txt", "w") as channel_file:
    channel_file.write("\n".join(channels) + "\n")

## Process

### 1. Slicing

In [None]:
# Run the slice task for the channels listed in /content/channels.txt, writing to
# /content/vietnamese-speaker-clip. The script path is relative: run from /content/vlr.
!python vlr/data/tasks/process.py --task slice --output-dir /content/vietnamese-speaker-clip --channel-names-path /content/channels.txt

In [None]:
# Same slice command with --upload-to-hub, --clean-input and --clean-output added.
# NOTE(review): presumably uploads the results to the Hub and removes the local
# input/output data afterwards — confirm against process.py.
!python vlr/data/tasks/process.py --task slice --output-dir /content/vietnamese-speaker-clip --channel-names-path /content/channels.txt --upload-to-hub --clean-input --clean-output

### 2. Cropping

In [None]:
# Run the crop task for the channels listed in /content/channels.txt, writing to
# /content/vietnamese-speaker-lip-clip. The script path is relative: run from /content/vlr.
!python vlr/data/tasks/process.py --task crop --output-dir /content/vietnamese-speaker-lip-clip --channel-names-path /content/channels.txt

In [None]:
# Same crop command with --upload-to-hub, --clean-input and --clean-output added.
# NOTE(review): presumably uploads the results to the Hub and removes the local
# input/output data afterwards — confirm against process.py.
!python vlr/data/tasks/process.py --task crop --output-dir /content/vietnamese-speaker-lip-clip --channel-names-path /content/channels.txt --upload-to-hub --clean-input --clean-output

### 3. Denoising

In [None]:
# Remember to change runtime to GPU
# Run the denoise task for the channels listed in /content/channels.txt, writing to
# /content/denoised-vietnamese-audio. The script path is relative: run from /content/vlr.
!python vlr/data/tasks/process.py --task denoise --output-dir /content/denoised-vietnamese-audio --channel-names-path /content/channels.txt

In [None]:
# Same denoise command with --upload-to-hub, --clean-input and --clean-output added.
# NOTE(review): presumably uploads the results to the Hub and removes the local
# input/output data afterwards — confirm against process.py.
!python vlr/data/tasks/process.py --task denoise --output-dir /content/denoised-vietnamese-audio --channel-names-path /content/channels.txt --upload-to-hub --clean-input --clean-output

### 4. Transcribing

In [None]:
# Remember to change runtime to GPU
# Run the transcribe task for the channels listed in /content/channels.txt, writing to
# /content/purified-vietnamese-audio. The script path is relative: run from /content/vlr.
!python vlr/data/tasks/process.py --task transcribe --output-dir /content/purified-vietnamese-audio --channel-names-path /content/channels.txt

In [None]:
!python vlr/data/tasks/process.py --task transcribe --output-dir /content/purified-vietnamese-audio --channel-names-path /content/channels.txt --upload-to-hub--clean-input --clean-output