<a href="https://colab.research.google.com/github/diegofrl/BT---Cross-Lingual-Classification/blob/main/POC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install the third-party packages imported by the cells below.
!pip install gradio
!pip install transformers
!pip install gtts
!pip install pydub
# Create the placeholder audio file that gradio_call returns when no input text is given.
!touch empty.mp3
# Fetch the project repository; the classifier weights are loaded from this checkout.
!git clone https://github.com/diegofrl/BT---Cross-Lingual-Classification.git

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gradio
  Downloading gradio-3.34.0-py3-none-any.whl (20.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m69.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting aiofiles (from gradio)
  Downloading aiofiles-23.1.0-py3-none-any.whl (14 kB)
Collecting aiohttp (from gradio)
  Downloading aiohttp-3.8.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m57.4 MB/s[0m eta [36m0:00:00[0m
Collecting fastapi (from gradio)
  Downloading fastapi-0.97.0-py3-none-any.whl (56 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.0/57.0 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ffmpy (from gradio)
  Downloading ffmpy-0.3.0.tar.gz (4.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting gradio-client>

In [2]:
import torch
import pandas as pd
import numpy as np
import re
from transformers import AutoTokenizer, AutoModel
import string
from tensorflow import keras
from keras.models import load_model
import pickle

# Tokenizer and encoder are loaded from the same pretrained checkpoint id.
BERT_CHECKPOINT = 'bert-base-multilingual-uncased'
tokenizer = AutoTokenizer.from_pretrained(BERT_CHECKPOINT, num_labels=4)
model = AutoModel.from_pretrained(BERT_CHECKPOINT, num_labels=4)

def vectorize_text(text):
    """Turn ``text`` into one embedding vector per word.

    The text is lower-cased, tokenized with the module-level ``tokenizer`` and
    wrapped in ``[CLS]`` / ``[SEP]``.  The token ids are fed to the
    module-level ``model`` and the first model output is used as the per-token
    embedding.  Tokens that start with ``##`` are treated as continuation
    pieces: their text is appended to the current word and their embedding is
    added to that word's vector, so a multi-piece word is represented by the
    sum of its pieces.

    Inputs that produce more tokens than the encoder has position embeddings
    for are truncated to the part that fits instead of failing in the forward
    pass.

    Args:
        text: Input string.

    Returns:
        pandas.DataFrame with a ``word`` column (reassembled words) and a
        ``vector`` column (one numpy array per word).  The frame has no rows
        when the tokenizer yields no tokens.
    """
    def is_subword(token):
        return token[:2] == "##"

    text = text.lower()
    tokens = tokenizer.tokenize(text)

    # The encoder can only embed a fixed number of positions; two of them are
    # reserved for the [CLS] and [SEP] markers added below.
    max_positions = getattr(model.config, 'max_position_embeddings', 512)
    tokens = tokens[:max_positions - 2]

    tokens = ['[CLS]'] + tokens + ['[SEP]']
    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    with torch.no_grad():
        # First model output, batch item 0: one row per token.
        token_vectors = model(torch.tensor([input_ids]))[0][0].numpy()

    word_vectors = []
    words = []
    current_word = ""
    current_word_vector = None

    # Skip the [CLS] (first) and [SEP] (last) positions.
    for i in range(1, len(tokens) - 1):
        token = tokens[i]
        if is_subword(token):
            current_word += token[2:]
            current_word_vector += token_vectors[i]
        else:
            if current_word:
                words.append(current_word)
                word_vectors.append(current_word_vector)
            current_word = token
            # Copy so the in-place accumulation above never writes into the
            # model output, and so each word does not keep a view of the
            # whole output array alive.
            current_word_vector = token_vectors[i].copy()

    if current_word:
        words.append(current_word)
        word_vectors.append(current_word_vector)

    df = pd.DataFrame({'word': words, 'vector': word_vectors})
    return df


def tf_get_predictions(text, model):
    """Label every word of ``text`` with a language and merge same-language runs.

    The words and their vectors come from ``vectorize_text``.  All word
    vectors are classified with a single ``model.predict`` call (one row per
    word) rather than one call per word; the arg-max of each output row is
    mapped to a language code.

    Args:
        text: Input string.
        model: Classifier exposing ``predict(samples, verbose=0)`` and
            returning one score row per sample.

    Returns:
        A list of ``[segment_text, language_code]`` lists in reading order,
        where consecutive words with the same label are joined by a single
        space.  The code is ``None`` for a class index outside the label map.
        An empty list is returned when the text yields no words.
    """
    labels_str = {0: 'de', 1: 'en', 2: 'fr', 3: 'it'}

    df = vectorize_text(text)
    if df.empty:
        return []

    # One row per word, flattened the same way the per-word reshape did.
    samples = np.vstack([np.array(vector).reshape(1, -1) for vector in df['vector']])
    y_pred_one_hot = model.predict(samples, verbose=0)
    labels = np.argmax(y_pred_one_hot, axis=1)

    # Merge neighbouring words that share a language into one segment.
    merged_tuples = []
    for word, label in zip(df['word'], labels):
        label_str = labels_str.get(label)
        if merged_tuples and merged_tuples[-1][1] == label_str:
            merged_tuples[-1][0] += ' ' + word
        else:
            merged_tuples.append([word, label_str])

    return merged_tuples

# Load the neural network model
# (the path points into the repository cloned by the setup cell)
nn_model = load_model('/content/BT---Cross-Lingual-Classification/ML_Models/transformer_nn_model.h5')

# Smoke test: classify one sentence and print the merged [text, language] segments.
text = "How do you say 'Ich arbeite in Zürich' in English?"
# `predictions` stays in the notebook namespace; the HTML preview cell below reads it.
predictions = tf_get_predictions(text, nn_model)
print(predictions)


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/872k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.72M [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/672M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[['how', 'en'], ['do', 'it'], ["you say '", 'en'], ['ich arbeite in zurich', 'de'], ["' in english ?", 'en']]


In [3]:
from gtts import gTTS
from pydub import AudioSegment
import tempfile

def predictions_to_html(predictions):
    """Render language-tagged segments as colour-coded HTML.

    The result starts with a fixed legend line and is followed by one
    ``<span>`` per segment, coloured by language code (``en`` red, ``fr``
    blue, ``de`` yellow, ``it`` green).  Segments with any other code get an
    unstyled ``<span>``.

    Segment text is HTML-escaped (``&``, ``<``, ``>``) before it is embedded,
    so text containing markup characters is displayed literally instead of
    being interpreted by the browser.

    Args:
        predictions: Iterable of ``[segment_text, language_code]`` items.

    Returns:
        The HTML string.
    """
    # Local import: this cell does not import the stdlib ``html`` module.
    from html import escape

    colours = {
        "en": "#DB4437",  # red
        "fr": "#4285F4",  # blue
        "de": "#F4B400",  # yellow
        "it": "#0F9D58",  # green
    }

    parts = ["<span style='color: #DB4437; font-size:16pt'>■</span> English <span style='color: #4285F4; font-size:16pt; margin-left:6pt'>■</span> French <span style='color: #F4B400; font-size:16pt; margin-left:6pt'>■</span> German  <span style='color: #0F9D58; font-size:16pt; margin-left:6pt'>■</span> Italian <br>"]
    for prediction in predictions:
        colour = colours.get(prediction[1])
        if colour is None:
            opening = "<span>"
        else:
            opening = "<span style='color: " + colour + "'>"
        # quote=False: the text lands in element content, not in an attribute,
        # so quotes can stay as they are.
        parts.append(opening + escape(prediction[0], quote=False) + "</span> ")
    return "".join(parts)

def crosslingual_tts(predictions, output_path):
    """Speak each segment in its own language and write the joined audio as mp3.

    Args:
        predictions: Iterable of items whose element 0 is the text to speak
            and element 1 is the language code handed to gTTS.
        output_path: Destination passed to the final mp3 export.
    """
    def synthesize(segment_text, segment_lang):
        # gTTS writes an mp3 to a named temporary file, which is read back
        # into an AudioSegment before the file is removed on exit.
        speech = gTTS(text=segment_text, lang=segment_lang)
        with tempfile.NamedTemporaryFile(delete=True) as mp3_file:
            speech.save(mp3_file.name)
            return AudioSegment.from_file(mp3_file.name, format='mp3')

    # Start from silence and append every spoken segment in order.
    combined = AudioSegment.empty()
    for prediction in predictions:
        combined += synthesize(prediction[0], prediction[1])

    combined.export(output_path, format='mp3')


# Preview the raw HTML produced for the sample predictions computed in the previous cell.
print(predictions_to_html(predictions))

<span style='color: #DB4437; font-size:16pt'>■</span> English <span style='color: #4285F4; font-size:16pt; margin-left:6pt'>■</span> French <span style='color: #F4B400; font-size:16pt; margin-left:6pt'>■</span> German  <span style='color: #0F9D58; font-size:16pt; margin-left:6pt'>■</span> Italian <br><span style='color: #DB4437'>how</span> <span style='color: #0F9D58'>do</span> <span style='color: #DB4437'>you say '</span> <span style='color: #F4B400'>ich arbeite in zurich</span> <span style='color: #DB4437'>' in english ?</span> 


In [4]:
import gradio as gr
from gradio import themes

def gradio_call(input_text):
    """Gradio handler: classify the input text and produce HTML plus speech.

    Punctuation (every character that is neither a word character nor
    whitespace) is removed before classification.  When the input is missing,
    or nothing but whitespace remains after that clean-up, an error message
    and the placeholder audio file are returned instead of running the
    pipeline on zero segments.

    Args:
        input_text: Text from the UI textbox (may be ``None`` or empty).

    Returns:
        A ``(html, audio_path)`` tuple: the colour-coded HTML and the path of
        the generated mp3 (``"output.mp3"``), or the error HTML and
        ``"empty.mp3"``.
    """
    error_result = (
        "<span style='color: #DB4437; font-weight: bold;'>ERROR: Please provide an input text</span>",
        "empty.mp3",
    )
    if not input_text:
        return error_result

    # Raw string: same pattern as before, without the invalid-escape warning.
    input_text = re.sub(r'[^\w\s]', '', input_text)
    if not input_text.strip():
        # Nothing left to classify or speak (e.g. "?!" or only spaces).
        return error_result

    predictions = tf_get_predictions(input_text, nn_model)
    crosslingual_tts(predictions, "output.mp3")
    html = predictions_to_html(predictions)
    return html, "output.mp3"

# Custom look for the demo: Gradio's Base theme with a blue primary hue, neutral
# secondary/neutral hues, medium text, large corner radius and explicit font stacks.
theme = gr.themes.Base(
    primary_hue=gr.themes.colors.blue,
    secondary_hue=gr.themes.colors.neutral,
    neutral_hue=gr.themes.colors.neutral,
    text_size=gr.themes.sizes.text_md,
    radius_size=gr.themes.sizes.radius_lg,
    font=[gr.themes.GoogleFont('Plus Jakarta Sans'), gr.themes.GoogleFont('Source Sans Pro'), 'ui-sans-serif', 'sans-serif'],
    font_mono=[gr.themes.GoogleFont('Source Code Pro'), gr.themes.GoogleFont('IBM Plex Mono'), 'ui-monospace', 'Consolas'],
)


# Wire gradio_call into the UI: one textbox in, the colour-coded HTML and the
# generated audio file out. The `examples` entries are offered as sample inputs.
iface = gr.Interface(
    fn=gradio_call,
    examples=["How do you say ich arbeite in Zürich in English?", "As we say in french: pierre qui roule n'amasse pas mousse!",  "I have attended a haute couture show last week in Montpellier."],
    inputs=[
        gr.components.Textbox(label="Input", placeholder="Do you sprichst Français?", lines=4),
    ],
    # Order matches the (html, audio_path) tuple returned by gradio_call.
    outputs=[
        gr.components.HTML(),
        gr.components.Audio(label="Audio", type="filepath")
    ],
    title="Do you sprichst français?",
    description='''
### This app can perform cross-lingual language detection and text-to-speech.\n
### Disclaimer:\n
It is built as a proof of concept, that uses machine learning. Specific training data has been created for this project, which might impact the performances with some text categories due to a limited amount of data. This might cause prediction errors when it encounters very short or unfamiliar inputs.\n
Examples that work well:\n
    "How do you say ich arbeite in Zürich in English?",
    "As we say in french: pierre qui roule n'amasse pas mousse!",
    "I have attended a haute couture show last week in Montpellier."
Examples that might not work:\n
    "Danke!",
    "Une thèse de Diego Fraile et Hugo Perotto.",
    "Thx u 2!"
    ''',
    theme=theme,
    allow_flagging="never"
)
# Start serving the interface from this notebook.
iface.launch()

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Note: opening Chrome Inspector may crash demo inside Colab notebooks.

To create a public link, set `share=True` in `launch()`.


<IPython.core.display.Javascript object>

