In [2]:
!pip install openai python-dotenv backoff arxiv

Collecting openai
  Obtaining dependency information for openai from https://files.pythonhosted.org/packages/ae/59/911d6e5f1d7514d79c527067643376cddcf4cb8d1728e599b3b03ab51c69/openai-0.28.0-py3-none-any.whl.metadata
  Using cached openai-0.28.0-py3-none-any.whl.metadata (13 kB)
Collecting python-dotenv
  Using cached python_dotenv-1.0.0-py3-none-any.whl (19 kB)
Collecting backoff
  Using cached backoff-2.2.1-py3-none-any.whl (15 kB)
Collecting arxiv
  Obtaining dependency information for arxiv from https://files.pythonhosted.org/packages/f0/06/9b9d553d93e25ae27ec5ba794216afb1af248e43d85a35e922a85cbb396a/arxiv-1.4.8-py3-none-any.whl.metadata
  Using cached arxiv-1.4.8-py3-none-any.whl.metadata (8.1 kB)
Collecting tqdm (from openai)
  Obtaining dependency information for tqdm from https://files.pythonhosted.org/packages/00/e5/f12a80907d0884e6dff9c16d0c0114d81b8cd07dc3ae54c5e962cc83037e/tqdm-4.66.1-py3-none-any.whl.metadata
  Using cached tqdm-4.66.1-py3-none-any.whl.metadata (57 kB)
Coll

In [4]:
!pip install Cython==0.29.21
!pip install librosa==0.8.0
!pip install phonemizer==2.2.1
!pip install scipy
!pip install numpy
!pip install torch
!pip install torchvision
!pip install matplotlib
!pip install Unidecode==1.1.1
!pip install python3-commons



In [None]:
%pwd
!git clone https://github.com/jaywalnut310/vits.git
!python --version
%cd vits/


# Fix numpy deprecation by removing "np.""
%cd monotonic_align/
%mkdir monotonic_align
!python setup.py build_ext --inplace
%cd ../
%pwd

In [4]:
%cd vits

/home/iwe30/Github/paper2speech/vits


In [7]:
import os
import subprocess
import locale

locale.getpreferredencoding = lambda: "UTF-8"


def download(lang, tgt_dir="./"):
    lang_fn, lang_dir = os.path.join(tgt_dir, lang + ".tar.gz"), os.path.join(
        tgt_dir, lang
    )
    cmd = ";".join(
        [
            f"wget https://dl.fbaipublicfiles.com/mms/tts/{lang}.tar.gz -O {lang_fn}",
            f"tar zxvf {lang_fn}",
        ]
    )
    print(f"Download model for language: {lang}")
    subprocess.check_output(cmd, shell=True)
    print(f"Model checkpoints in {lang_dir}: {os.listdir(lang_dir)}")
    return lang_dir


LANG = "eng"
ckpt_dir = download(LANG)

from IPython.display import Audio
import os
import re
import glob
import json
import tempfile
import math
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader
import numpy as np
import commons
#import utils
import argparse
import subprocess
from data_utils import (
    TextAudioLoader,
    TextAudioCollate,
    TextAudioSpeakerLoader,
    TextAudioSpeakerCollate,
)
from models import SynthesizerTrn
from scipy.io.wavfile import write
import torch
from scipy.io.wavfile import write
from IPython.display import Audio


def preprocess_char(text, lang=None):
    """
    Special treatement of characters in certain languages
    """
    print(lang)
    if lang == "ron":
        text = text.replace("ț", "ţ")
    return text


class TextMapper(object):
    def __init__(self, vocab_file):
        self.symbols = [
            x.replace("\n", "") for x in open(vocab_file, encoding="utf-8").readlines()
        ]
        self.SPACE_ID = self.symbols.index(" ")
        self._symbol_to_id = {s: i for i, s in enumerate(self.symbols)}
        self._id_to_symbol = {i: s for i, s in enumerate(self.symbols)}

    def text_to_sequence(self, text, cleaner_names):
        """Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
        Args:
        text: string to convert to a sequence
        cleaner_names: names of the cleaner functions to run the text through
        Returns:
        List of integers corresponding to the symbols in the text
        """
        sequence = []
        clean_text = text.strip()
        for symbol in clean_text:
            symbol_id = self._symbol_to_id[symbol]
            sequence += [symbol_id]
        return sequence

    def uromanize(self, text, uroman_pl):
        iso = "xxx"
        with tempfile.NamedTemporaryFile() as tf, tempfile.NamedTemporaryFile() as tf2:
            with open(tf.name, "w") as f:
                f.write("\n".join([text]))
            cmd = f"perl " + uroman_pl
            cmd += f" -l {iso} "
            cmd += f" < {tf.name} > {tf2.name}"
            os.system(cmd)
            outtexts = []
            with open(tf2.name) as f:
                for line in f:
                    line = re.sub(r"\s+", " ", line).strip()
                    outtexts.append(line)
            outtext = outtexts[0]
        return outtext

    def get_text(self, text, hps):
        text_norm = self.text_to_sequence(text, hps.data.text_cleaners)
        if hps.data.add_blank:
            text_norm = commons.intersperse(text_norm, 0)
        text_norm = torch.LongTensor(text_norm)
        return text_norm

    def filter_oov(self, text):
        val_chars = self._symbol_to_id
        txt_filt = "".join(list(filter(lambda x: x in val_chars, text)))
        print(f"text after filtering OOV: {txt_filt}")
        return txt_filt


def preprocess_text(txt, text_mapper, hps, uroman_dir=None, lang=None):
    txt = preprocess_char(txt, lang=lang)
    is_uroman = hps.data.training_files.split(".")[-1] == "uroman"
    if is_uroman:
        with tempfile.TemporaryDirectory() as tmp_dir:
            if uroman_dir is None:
                cmd = f"git clone git@github.com:isi-nlp/uroman.git {tmp_dir}"
                print(cmd)
                subprocess.check_output(cmd, shell=True)
                uroman_dir = tmp_dir
            uroman_pl = os.path.join(uroman_dir, "bin", "uroman.pl")
            print(f"uromanize")
            txt = text_mapper.uromanize(txt, uroman_pl)
            print(f"uroman text: {txt}")
    txt = txt.lower()
    txt = text_mapper.filter_oov(txt)
    return txt


if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

print(f"Run inference with {device}")
vocab_file = f"{ckpt_dir}/vocab.txt"
config_file = f"{ckpt_dir}/config.json"
assert os.path.isfile(config_file), f"{config_file} doesn't exist"
import utils
hps = utils.get_hparams_from_file(config_file)
text_mapper = TextMapper(vocab_file)
net_g = SynthesizerTrn(
    len(text_mapper.symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    **hps.model,
)
net_g.to(device)
_ = net_g.eval()

g_pth = f"{ckpt_dir}/G_100000.pth"
print(f"load {g_pth}")

_ = utils.load_checkpoint(g_pth, net_g, None)





Download model for language: eng


--2023-09-18 22:41:27--  https://dl.fbaipublicfiles.com/mms/tts/eng.tar.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 18.64.119.54, 18.64.119.56, 18.64.119.3, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|18.64.119.54|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 134859962 (129M) [application/x-tar]
Saving to: ‘./eng.tar.gz’

     0K .......... .......... .......... .......... ..........  0% 3.77M 34s
    50K .......... .......... .......... .......... ..........  0% 17.0M 21s
   100K .......... .......... .......... .......... ..........  0% 5.77M 21s
   150K .......... .......... .......... .......... ..........  0% 28.9M 17s
   200K .......... .......... .......... .......... ..........  0% 26.7M 15s
   250K .......... .......... .......... .......... ..........  0% 34.2M 13s
   300K .......... .......... .......... .......... ..........  0% 9.11M 13s
   350K .......... .......... .......... .......... ..........  0% 

Model checkpoints in ./eng: ['G_100000.pth', 'eng', 'eng.tar.gz', 'vocab.txt', 'config.json']
Run inference with cuda
load ./eng/G_100000.pth


In [30]:
import re
import openai
import os
from dotenv import load_dotenv
import logging
import json
import concurrent.futures
import backoff
import time
from scipy.io.wavfile import write
import os
import subprocess
import arxiv

load_dotenv()

openai.api_key = os.getenv("OPENAI_API_KEY")



MAX_WORKERS = 10  # Maximum number of parallel tasks
MAX_TRIES = 10  # Maximum number of tries for a single request




def pdf_to_markdown(input, output_dir="outputs", is_arxiv_id=False, nougat_path="/home/iwe30/anaconda3/envs/nougat/bin"):
    """_summary_

    Args:
        input (_type_): _description_
        output_dir (str, optional): _description_. Defaults to "outputs".
        is_arxiv_id (bool, optional): _description_. Defaults to False.
    """
    
    # Check if the output directory exists, if not create it
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # If the input is an Arxiv ID, download the paper first
    if is_arxiv_id:
        paper = next(arxiv.Search(id_list=[input]).results())
        pdf_path = f"{output_dir}/{input}/{input}.pdf"
        # Create a directory for the output
        markdown_dir = f"{output_dir}/{input}"
        if not os.path.exists(markdown_dir):
            os.makedirs(markdown_dir)
        paper.download_pdf(dirpath=f"{output_dir}/{input}", filename=f"{input}.pdf")
    else:
        pdf_path = input
        input = os.path.splitext(os.path.basename(input))[0]



    # Run the nougat command
    cmd = f"{nougat_path}/nougat {pdf_path} -o {markdown_dir} --markdown"
    subprocess.run(cmd, shell=True)

@backoff.on_exception(backoff.expo, (Exception,), max_tries=MAX_TRIES)
def translate(latex_formula):
    response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": "You are a helpful assistant. If the formula is just a letter keep the letter or letters. examples: \(C_{\text{gen}}\)  = C sub gen.  you always reply with the translation nothing else!!!"},
            {"role": "user", "content": f"Read this latex formula in plain english :  {latex_formula}"},
        ]
    )

    logging.info(f"Translated LaTeX: {latex_formula} to English: {response['choices'][0]['message']['content']}")
    return response['choices'][0]['message']['content']

def latex_to_english(latex_formulas, debug=False):
    """
    Translate a list of LaTeX formulas to plain English using OpenAI API.

    :param latex_formulas: list, a list of LaTeX formulas
    :param debug: bool, if True, print debug information
    :return: dict, a dictionary where key is the LaTeX formula and value is the translated English text
    """
    if debug:
        logging.basicConfig(level=logging.DEBUG)
    else:
        pass
        #logging.basicConfig(level=logging.INFO)

    translated_formulas = {}

    with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        future_to_formula = {executor.submit(translate, formula): formula for formula in latex_formulas}

        for future in concurrent.futures.as_completed(future_to_formula):
            formula = future_to_formula[future]
            try:
                translation = future.result()
                translated_formulas[formula] = translation
            except Exception as exc:
                logging.error(f"{formula} generated an exception: {exc}")

    return translated_formulas

def process_markdown_file(file_path, dry_run=False, debug=False, tag="_processed"):
    """
    Process a markdown file, translating LaTeX formulas to plain English.

    :param file_path: str, path to the markdown file
    :param dry_run: bool, if True, don't write changes back to the file
    :param debug: bool, if True, print debug information
    :param tag: str, tag to append to the filename of the processed file
    """
    # Dictionary to store LaTeX formulas
    latex_formulas = {}

    with open(file_path, "r", encoding="utf8") as f:
        lines = f.readlines()

    for line in lines:
        matches = re.findall(r"\\\[(.*?)\\\]|\\\((.*?)\\\)", line)  # find both inline and display LaTeX formulas
        for match in matches:
            match = [x for x in match if x]  # remove empty string 
            if match:
                formula = match[0]  # result is in a list
                if debug:
                    print(f"Found LaTeX formula: {formula}")
                if formula not in latex_formulas:
                    latex_formulas[formula] = None  # Add new formula to the dictionary

    print(f"We found {len(latex_formulas)} formulas in your document")

    # Translate all LaTeX formulas
    latex_formulas = latex_to_english(list(latex_formulas.keys()), debug)

    new_lines = []
    for line in lines:
        for formula, translation in latex_formulas.items():
            # Replace the formula with its translation
            line = line.replace(f"\\[{formula}\\]", translation)
            line = line.replace(f"\\({formula}\\)", translation)

        new_lines.append(line)

    if not dry_run:
        base, ext = os.path.splitext(file_path)
        new_file_path = f"{base}{tag}{ext}"
        with open(new_file_path, "w", encoding="utf8") as f:
            f.writelines(new_lines)

def markdown_to_wav(file_path, output_path='output.wav'):
    # Read the file
    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    audio_result = []
    for line in lines:
        # Preprocess the text
        text = preprocess_text(line, text_mapper, hps, lang=LANG)

        # Convert text to tensor
        stn_tst = text_mapper.get_text(text, hps)
        
        # Convert tensor to PyTorch tensor and perform inference
        with torch.no_grad():
            x_tst = stn_tst.unsqueeze(0).to(device)
            x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(device)
            hyp = net_g.infer(
                x_tst, x_tst_lengths, noise_scale=.667,
                noise_scale_w=0.8, length_scale=1.0
            )[0][0,0].cpu().float().numpy()
        
        audio_result.append(hyp)
    
    audio_result = np.concatenate(audio_result)

    # Save the audio to a WAV file
    write(output_path, hps.data.sampling_rate, audio_result)

    print(f"Generated audio: {output_path}") 
    return Audio(audio_result, rate=hps.data.sampling_rate)

def pdf_to_speech(pdf_input, output_dir="outputs", is_arxiv_id=False, dry_run=False, debug=False, wav_output_path='output.wav'):
    """
    Convert a PDF file (or an ArXiv ID) to speech.

    :param pdf_input: str, path to the PDF file or an ArXiv ID
    :param output_dir: str, path to the output directory
    :param is_arxiv_id: bool, if True, pdf_input is treated as an ArXiv ID
    :param dry_run: bool, if True, don't write changes back to the file
    :param debug: bool, if True, print debug information
    :param wav_output_path: str, path to the output WAV file
    """
    # Step 1: Convert PDF to markdown
    pdf_to_markdown(pdf_input, output_dir=output_dir, is_arxiv_id=is_arxiv_id)
    
    # Determine the markdown file path
    if is_arxiv_id:
        input_name = pdf_input
    else:
        input_name = os.path.splitext(os.path.basename(pdf_input))[0]
    markdown_file_path = f"{output_dir}/{input_name}/{input_name}.mmd"
    
    # Step 2: Process the markdown file
    process_markdown_file(markdown_file_path, dry_run=dry_run, debug=debug)
    
    # Determine the processed markdown file path
    processed_markdown_file_path = f"{output_dir}/{input_name}/{input_name}_processed.mmd"
    
    # Step 3: Convert markdown to speech
    wav_output_path = f"{output_dir}/{input_name}/{input_name}.wav"
    markdown_to_wav(processed_markdown_file_path, output_path=wav_output_path)
    
    print(f"Process completed. The speech audio is saved as {wav_output_path}.")

## arxiv to speech

In [31]:
torch.cuda.empty_cache()

In [32]:
pdf_to_speech("2309.07124", is_arxiv_id=True)

  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]
INFO:root:Skipping 2309.07124.pdf, already computed. Run with --recompute to convert again.


We found 55 formulas in your document
eng
text after filtering OOV:  rain your language models can align themselves without finetuning
eng
text after filtering OOV: 
eng
text after filtering OOV:  yuhui lispadesuit fangyun weidagger superscript jinjing zhaodagger chao zhangspadesuit hongyang zhangspadesuit
eng
text after filtering OOV: 
eng
text after filtering OOV: spadesuitpeking university dagger superscriptmicrosoft research asia daggerthe university of sydney spadesuituniversity of waterloo
eng
text after filtering OOV: 
eng
text after filtering OOV: correspondence to hongyangzhanguwaterlooca
eng
text after filtering OOV: 
eng
text after filtering OOV:  abstract
eng
text after filtering OOV: 
eng
text after filtering OOV: large language models llms often demonstrate inconsistencies with human preferences previous research gathered human preference data and then aligned the pre-trained models using reinforcement learning or instruction tuning the so-called finetuning step in contra