In [1]:
!pip install --upgrade gradio

Collecting gradio
  Downloading gradio-5.24.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<25.0,>=22.0 (from gradio)
  Downloading aiofiles-24.1.0-py3-none-any.whl.metadata (10 kB)
Collecting anyio<5.0,>=3.0 (from gradio)
  Downloading anyio-4.9.0-py3-none-any.whl.metadata (4.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.8.0 (from gradio)
  Downloading gradio_client-1.8.0-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting httpx>=0.24.1 (from gradio)
  Using cached httpx-0.28.1-py3-none-any.whl.metadata (7.1 kB)
Collecting huggingface-hub>=0.28.1 (from gradio)
  Downloading huggingface_hub-0.30.2-py3-none-any.whl.metadata (13 kB)
Collecting orjson~=3.0 (from gradio)
  Downloading orjson-3.10.16-cp

In [1]:
from typing import List
import re
def find_matching_column(columns: List[str], patterns: List[str]) -> str:
    """Auto select columns from a dataframe based on patterns."""
    cols_lower = [col.lower() for col in columns]

    for pattern in patterns:
        for col in cols_lower:
            if re.search(pattern.lower(), col):
                return col
    return columns[0] if columns else None

find_matching_column(["a","sequencia", "C"], ["sequence"])

'a'

In [None]:
import torch
import gradio as gr
import pandas as pd
import re
from typing import List
from logger import get_logger
from preprocessing.embeddings import ProteinEmbedder

# Encoding strategies offered in the preprocessing UI's encoding dropdown.
ENCODING_TYPES = [
    "FFT",
    "Frequency",
    "KMer",
    "One-Hot",
    "Ordinal",
    "Physicochemical",
    "Embedding"
]
# Display name -> model identifier handed to ProteinEmbedder as `model_name`.
# NOTE(review): most values look like Hugging Face hub ids; "esmc_600" does
# not follow that shape -- confirm how ProteinEmbedder resolves it.
EMBEDDING_MODELS = {
    "Ankh2": "ElnaggarLab/ankh2-ext1",
    "Bert": "Rostlab/prot_bert",
    "ESM2": "facebook/esm2_t6_8M_UR50D",
    "ESMC": "esmc_600",
    "Mistral": "RaphaelMourad/Mistral-Prot-v1-134M",
    "Prot T5": "Rostlab/prot_t5_xl_uniref50"
}

# Logger for this cell, obtained from the project's `logger.get_logger` helper.
logger = get_logger(__name__)

### Section: Interface tools ###

def find_matching_column(columns: List[str], patterns: List[str]) -> str:
    """Auto select columns from a dataframe based on patterns."""
    for pattern in patterns:
        for col in columns:
            if re.search(pattern.lower(), col, re.IGNORECASE):
                return col
    return columns[0] if columns else None

def load_csv(file) -> pd.DataFrame:
    """Read an uploaded CSV file into a DataFrame.

    Args:
        file: Either an upload handle exposing the file path as ``.name`` or
            a plain path string.

    Returns:
        The parsed DataFrame.

    Raises:
        gr.Error: If the file cannot be read or parsed. The error is raised
            (not returned) so callers never receive an exception object where
            they expect a DataFrame.
    """
    # Fall back to the object itself when it has no `.name` (plain path).
    path = getattr(file, "name", file)
    try:
        return pd.read_csv(path)
    except Exception as e:
        raise gr.Error(f"Error loading file: {str(e)}") from e
    
def get_gpu_devices():
    """Map a display label for every visible CUDA device to its device string.

    Returns:
        dict: ``{label: "cuda:<index>"}``. The label is the device name
        reported by torch; when several devices report the same name, the
        device string is appended so that none of them overwrites another.
        Empty when CUDA is unavailable.
    """
    if not torch.cuda.is_available():
        logger.info("No GPU devices found.")
        return {}

    names = [torch.cuda.get_device_name(i) for i in range(torch.cuda.device_count())]

    gpu_devices = {}
    for index, gpu_name in enumerate(names):
        device = f"cuda:{index}"
        # Identical cards share a name; keep each one selectable.
        label = gpu_name if names.count(gpu_name) == 1 else f"{gpu_name} ({device})"
        gpu_devices[label] = device

    return gpu_devices
            
def get_properties_names(path: str = "../input_config/aaindex_encoders.csv"):
    """Get the names of the properties listed in the encoders CSV.

    Only the header row is read (``nrows=0``); the first column is skipped
    and every remaining column name is treated as a property name.

    Args:
        path: Location of the CSV. The default is relative, so it is resolved
            against the current working directory.

    Returns:
        list: Column names of the CSV without the first one.
    """
    header = pd.read_csv(path, nrows=0)
    return header.columns.tolist()[1:]
    

def filter_records(records, gender):
    """Keep only the rows of ``records`` whose "gender" value equals ``gender``."""
    gender_mask = records["gender"] == gender
    return records[gender_mask]


### Section: Data Retrieval ###

def init_data_retrieving():
    """Build the data-retrieval tab: a small editable table filtered by gender."""
    # Three fixed columns, five starting rows.
    records_table = gr.Dataframe(
        headers=["name", "age", "gender"],
        datatype=["str", "number", "str"],
        row_count=5,
        col_count=(3, "fixed"),
    )
    gender_picker = gr.Dropdown(["M", "F", "O"])

    interface = gr.Interface(
        fn=filter_records,
        inputs=[records_table, gender_picker],
        outputs="dataframe",
        description="Enter gender as 'M', 'F', or 'O' for other.",
    )
    return interface

### Section: Data Preprocessing ###

def use_embedding(
        file,
        sequence_col: str,
        response_col: str,
        device: str,
        model: str,
        max_length: int):
    """Use a pre-trained model to encode sequences.

    Args:
        file: Despite the name, the sliceable dataset itself (``encode_data``
            passes a DataFrame). Only ``file[:50]`` is handed to the embedder.
        sequence_col: Name of the column holding the sequences.
        response_col: Name of the label column; also passed as the only entry
            of ``columns_ignore``.
        device: Device string forwarded to ``ProteinEmbedder``.
        model: Model identifier forwarded as ``model_name``.
        max_length: Currently unused by this function.

    Returns:
        Whatever ``ProteinEmbedder.getDataFrame()`` returns.

    Raises:
        gr.Error: If any step of the embedding pipeline fails. The error is
            raised (not returned) so callers never receive an exception object
            in place of the embedded data.
    """
    try:
        embedder = ProteinEmbedder(
            device=device,
            model_name=model,
            # NOTE(review): hard-coded cap of 50 rows -- looks like a
            # development shortcut; confirm before using on full datasets.
            dataset=file[:50],
            column_seq=sequence_col,
            column_label=response_col,
            columns_ignore=[response_col]
        )
        embedder.loadModelTokenizer()
        embedder.embeddingProcess(batch_size=10)
        embedder.showEmbeddings()
        return embedder.getDataFrame()
    except Exception as e:
        logger.error(f"Error in embedding process: {str(e)}")
        raise gr.Error(f"Error in embedding process: {str(e)}") from e


def encode_data(
        file, 
        sequence_col: str, 
        response_col: str, 
        device: str, 
        encoding: str, 
        max_length: int, 
        name_property: str, 
        size_kmer: int, 
        embedding_model: str):
    """Process the input dataframe and return the result."""
    try:
        df = load_csv(file)
        
        if sequence_col not in df.columns or response_col not in df.columns:
            raise ValueError("Selected columns are not in the dataframe.")
        
        sequences = df[sequence_col].tolist()
        responses = df[response_col].tolist()
        device = get_gpu_devices().get(device, "cpu")
        logger.info(f"Using device: {device}")
        model = EMBEDDING_MODELS[embedding_model]

        match encoding:
            case "One-Hot":
                # Implement One-Hot Encoding
                pass
            case "Ordinal":
                # Implement Ordinal Encoding
                pass
            case "Frequency":
                # Implement Frequency Encoding
                pass
            case "KMer":
                # Implement K-mer Encoding
                pass
            case "FFT":
                # Implement FFT Encoding
                pass
            case "Physicochemical":
                # Implement Physicochemical Encoding
                pass
            case "Embedding":
                encoded = use_embedding(df, sequence_col, response_col, device, model, max_length)
            
    except Exception as e:
        raise gr.Error(f"Error processing data: {str(e)}")
    
    return gr.update(
        value=encoded.head(),
        headers=encoded.columns.tolist(),
        row_count=min(10, len(encoded)),
        col_count=(len(encoded.columns), "fixed"),
        interactive=True,
        visible=True
    )

def update_on_upload(file):
    """Update dropdowns and preview based on the uploaded file.

    Args:
        file: Upload handle exposing the path as ``.name`` (a plain path
            string is accepted as well).

    Returns:
        A 3-tuple of ``gr.update`` objects, in this order: sequence-column
        dropdown, response-column dropdown, preview table.

    Raises:
        gr.Error: If the file cannot be read or has no columns. Raising keeps
            the number of returned values consistent with the three outputs.
    """
    # Load partially to get column names
    try:
        df = pd.read_csv(getattr(file, "name", file), nrows=5)
    except Exception as e:
        raise gr.Error(f"Error loading file: {str(e)}") from e

    columns = df.columns.tolist()
    if not columns:
        raise gr.Error("Error loading file: no columns found.")

    sequence_col = find_matching_column(columns, ["sequence", "seq"])
    response_col = find_matching_column(columns, ["response", "label"])

    # Both guesses can fall back to the same column; show it only once.
    preview_cols = list(dict.fromkeys([sequence_col, response_col]))

    sequence_update = gr.update(
        choices=columns,
        value=sequence_col,
        visible=True
    )
    response_update = gr.update(
        choices=columns,
        value=response_col,
        visible=True
    )
    preview_update = gr.update(
        value=df[preview_cols].head(),
        visible=True
    )
    return sequence_update, response_update, preview_update

def update_encoding_parameters(encoding_type):
    """Update encoding parameters based on the selected encoding type.

    Args:
        encoding_type: One of ``ENCODING_TYPES``.

    Returns:
        A 4-tuple of ``gr.update`` objects, in this order: max-length input,
        property dropdown, k-mer size input, embedding-model dropdown. Every
        control not relevant to ``encoding_type`` is hidden.
    """
    max_length_input = gr.update(visible=False)
    name_property_input = gr.update(visible=False)
    size_kmer_input = gr.update(visible=False)
    model_selector = gr.update(visible=False)

    if encoding_type in ["One-Hot", "Ordinal", "Frequency"]:
        max_length_input = gr.update(visible=True)
    elif encoding_type == "KMer":
        # Must match the spelling used in ENCODING_TYPES ("KMer").
        size_kmer_input = gr.update(visible=True)
    elif encoding_type in ["Physicochemical", "FFT"]:
        # Read the property list once; it comes from a CSV on disk.
        properties = get_properties_names()
        name_property_input = gr.update(
            choices=properties,
            value=properties[0] if properties else None,
            visible=True
        )
        max_length_input = gr.update(visible=True)
    elif encoding_type == "Embedding":
        model_names = list(EMBEDDING_MODELS.keys())
        model_selector = gr.update(
            choices=model_names,
            value=model_names[0],
            visible=True
        )

    return max_length_input, name_property_input, size_kmer_input, model_selector

    

def init_preprocessing():
    """Initialize the data preprocessing interface.

    Builds a ``gr.Blocks`` layout with a CSV upload, column selectors, device
    and encoding controls, and a result table, then wires the events:

    * upload  -> ``update_on_upload`` fills the column dropdowns and preview
    * CUDA checkbox -> shows/hides the device dropdown
    * encoding change -> ``update_encoding_parameters`` toggles parameter inputs
    * button click -> ``encode_data`` fills the result preview

    Returns:
        The assembled ``gr.Blocks`` instance.
    """
    gpu_devices = get_gpu_devices()
    has_gpu = bool(gpu_devices)
    # Always offer at least "cpu" so the dropdown's value is one of its
    # choices, including on machines without a GPU.
    device_choices = list(gpu_devices.keys()) or ["cpu"]

    ## Interface ##

    with gr.Blocks() as preprocessing:
        gr.Markdown("## Data Preprocessing")
        gr.Markdown("Upload a CSV file to display its contents.")
        with gr.Row():

            file_input = gr.File(
                label="Upload CSV File",
                file_types=[".csv"]
            )

        gr.Markdown("### Data Preview")

        preview = gr.Dataframe(
            visible=False,
            interactive=False
        )

        # TODO remove option selected in one dropdown from the other dropdown
        with gr.Row():
            dropdown_sequence_col = gr.Dropdown(
                label="Sequence Column",
                interactive=True,
                info="Select the column containing sequences",
                visible=False
            )
            dropdown_response_col = gr.Dropdown(
                label="Response Column",
                interactive=True,
                info="Select the column containing responses",
                visible=False
            )

        with gr.Row():
            checkbox_cuda = gr.Checkbox(
                label="Use CUDA",
                value=has_gpu,
                info="Use GPU acceleration if available.",
                interactive=has_gpu
            )
            device_selector = gr.Dropdown(
                label="Device",
                choices=device_choices,
                value=device_choices[0],
                info="Select the device for processing.",
                interactive=True
            )

        with gr.Row():
            # This dropdown picks the encoding, not a model; the model
            # selector further down keeps the "Select Model" label.
            encoding_selector = gr.Dropdown(
                label="Select Encoding",
                choices=ENCODING_TYPES,
                value=ENCODING_TYPES[0],
                info="Select the encoding for processing.",
                interactive=True
            )
            max_length_input = gr.Number(
                label="Max Length",
                value=50,
                info="Maximum length for encoding.",
                interactive=True,
                visible=False
            )
            name_property_input = gr.Dropdown(
                label="Select Property",
                info="Select the property for encoding.",
                interactive=True,
                visible=False
            )
            size_kmer_input = gr.Number(
                label="K-mer Size",
                value=3,
                info="Size of the k-mer for encoding.",
                interactive=True,
                visible=False
            )
            model_selector = gr.Dropdown(
                label="Select Model",
                info="Select the model for encoding.",
                interactive=True,
                visible=False
            )

        with gr.Row():
            process_btn = gr.Button("Process Data", variant="primary", interactive=True)

        gr.Markdown("### Result Preview")

        with gr.Row():
            result_preview = gr.Dataframe(
                visible=False,
                interactive=True
            )

        ## Logic ##

        file_input.upload(
            fn=update_on_upload,
            inputs=file_input,
            outputs=[dropdown_sequence_col, dropdown_response_col, preview]
        )

        # Only toggles visibility; the dropdown's value is still submitted.
        checkbox_cuda.change(
            fn=lambda x: gr.update(visible=x),
            inputs=checkbox_cuda,
            outputs=device_selector
        )

        encoding_selector.change(
            fn=update_encoding_parameters,
            inputs=encoding_selector,
            outputs=[
                max_length_input,
                name_property_input,
                size_kmer_input,
                model_selector
            ]
        )
        # Input order must match the parameter order of encode_data.
        process_btn.click(
            fn=encode_data,
            inputs=[
                file_input,
                dropdown_sequence_col,
                dropdown_response_col,
                device_selector,
                encoding_selector,
                max_length_input,
                name_property_input,
                size_kmer_input,
                model_selector
            ],
            outputs=[
                result_preview
            ]
        )

    return preprocessing
        
        
### Section: Machine Learning Tools ###

def init_ml_tools():
    """Build the placeholder machine-learning tab: a text box echoed back unchanged."""
    interface_options = {
        "fn": lambda x: x,
        "inputs": "text",
        "outputs": "text",
        "title": "Machine Learning Tools",
        "description": "This is a demo for machine learning tools.",
    }
    return gr.Interface(**interface_options)


### Section: Main Function ###

if __name__ == "__main__":
    # Build the three tabs in the order they are displayed.
    data_retrieving, preprocessing, ml_tools = (
        init_data_retrieving(),
        init_preprocessing(),
        init_ml_tools(),
    )

    # Combine them into one tabbed app and start the local server.
    demo = gr.TabbedInterface(
        interface_list=[data_retrieving, preprocessing, ml_tools],
        tab_names=["Data Retrieving", "Preprocessing", "Machine Learning Tools"],
    )

    demo.launch()

  from .autonotebook import tqdm as notebook_tqdm


* Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.


[37m2025-04-11 17:55:56,098 - __main__ - INFO - Using device: cuda:0 (2446207300.py:136)[39m
[37m2025-04-11 17:55:56,099 - ProteinEmbedder_Rostlab/prot_t5_xl_uniref50 - INFO - Using device: cuda:0 (ProteinEmbedder.py:58)[39m
[37m2025-04-11 17:55:56,758 - ProteinEmbedder_Rostlab/prot_t5_xl_uniref50 - INFO - Loading model and tokenizer for Rostlab/prot_t5_xl_uniref50 (ProteinEmbedder.py:62)[39m
[37m2025-04-11 17:55:56,758 - ProteinEmbedder_Rostlab/prot_t5_xl_uniref50 - INFO - Model architecture: T5ForConditionalGeneration (ProteinEmbedder.py:63)[39m
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/

In [15]:
import gradio as gr

def greet(name, intensity):
    """Return "Hello, <name>" followed by ``int(intensity)`` exclamation marks."""
    exclamations = "!" * int(intensity)
    return "".join(["Hello, ", name, exclamations])

# Minimal demo: a text box and a slider feed `greet`; one text box shows the result.
demo = gr.Interface(greet, ["text", "slider"], ["text"])

demo.launch()

* Running on local URL:  http://127.0.0.1:7870

To create a public link, set `share=True` in `launch()`.


