In [2]:
!pip install transformers
!pip install scikit-learn
!pip install torch torchvision torchaudio



In [3]:
import sklearn
print(sklearn.__version__)
import torch
print(torch.__version__)
print("CUDA Available:", torch.cuda.is_available())


1.5.2
2.5.1+cu124
CUDA Available: True


In [4]:
import requests
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

# Load the data from the GitHub link
github_url = "https://raw.githubusercontent.com/gracwng/LLM-Bioinformatic-Pipeline-Generation/refs/heads/main/cwl_documents/workflowhub/transformed_data/transformed_workflow_cwl_documents.json"
response = requests.get(github_url)

if response.status_code == 200:
    try:
        data = response.json()
    except ValueError as e:
        raise Exception(f"Error parsing JSON: {e}\nResponse text: {response.text[:500]}")
else:
    raise Exception(f"Failed to fetch data. Status code: {response.status_code}")

# Convert JSON data to a DataFrame
df = pd.DataFrame(data)

# Check if required fields exist
required_columns = ['description', 'inputs', 'outputs']
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    raise Exception(f"Missing required columns: {missing_columns}")

# Extract features and labels
df['features'] = df[['description', 'inputs', 'outputs']].apply(lambda x: ' '.join(map(str, x)), axis=1)
df['label'] = df['outputs']

# Drop rows with missing or invalid labels
df = df[df['label'].notnull()]

# Ensure labels are discrete classes (convert to string for classification)
df['label'] = df['label'].astype(str)

# Split the data into training and testing sets
X = df['features']
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Load the Alpaca model and tokenizer
model_name = "declare-lab/flan-alpaca-large"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Function to predict using the Alpaca model
def alpaca_predict(input_texts):
    inputs = tokenizer(input_texts, return_tensors="pt", padding=True, truncation=True, max_length=512)
    outputs = model.generate(**inputs, max_length=50)
    predictions = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    # Clean up predictions
    return [pred.strip().split('\n')[0] for pred in predictions]

# Predict on the test set
print("Predicting on test set...")
test_predictions = alpaca_predict(X_test.tolist())
print("Predictions complete.")

# Debug: Inspect predictions and true labels
print("Sample predictions:")
for input_text, true_label, prediction in zip(X_test[:5], y_test[:5], test_predictions[:5]):
    print(f"Input: {input_text}")
    print(f"True Label: {true_label}")
    print(f"Predicted: {prediction}")

# Calculate accuracy
correct_predictions = sum(
    1 for true, pred in zip(y_test, test_predictions)
    if true.strip().lower() in pred.strip().lower()  # Case-insensitive substring match
)
total_predictions = len(y_test)
accuracy_percentage = (correct_predictions / total_predictions) * 100

print(f"Model Accuracy: {accuracy_percentage:.2f}%")


Predicting on test set...
Predictions complete.
Sample predictions:
Input: transfer file from a remote FTP/HTTP server to the TES {'curl_config_file': {'type': 'File', 'inputBinding': {'prefix': '-K', 'separate': True, 'position': 1}}} {'known_sites_file': {'type': 'File', 'outputBinding': {'glob': '*.gz'}}}
True Label: {'known_sites_file': {'type': 'File', 'outputBinding': {'glob': '*.gz'}}}
Predicted: curl_config_file curl_config_file.input() curl_config_file.output() curl_config_file.output()
Input: None {'illumina_accessions': 'string[]', 'ref_human_genome': 'File'} {'original_fastq1': {'type': 'File[]', 'outputSource': 'main/original_fastq1'}, 'processed_fastq': {'type': 'File', 'outputSource': 'samtools_fastq/fastq'}, 'fastqc_summary': {'type': 'File[]', 'outputSource': 'main/fastqc_summary'}, 'fastqc_zip': {'type': 'File[]', 'outputSource': 'main/fastqc_zip'}, 'multiqc_htmls': {'type': 'File[]', 'outputSource': 'main/multiqc_html'}, 'multiqc_zips': {'type': 'File[]', 'outputSour