<a href="https://colab.research.google.com/github/eamirhn/Kaggle-Projects/blob/main/ChatBot_Medical.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip -q install gpt4all

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m121.6/121.6 MB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from gpt4all import GPT4All
# Bind the GPT4All model to the global `model`; the generation cells below call it by that name.
model = GPT4All("Meta-Llama-3-8B-Instruct.Q4_0.gguf") # downloads / loads a 4.66GB LLM

Downloading: 100%|██████████| 4.66G/4.66G [02:38<00:00, 29.5MiB/s]
Verifying: 100%|██████████| 4.66G/4.66G [00:14<00:00, 315MiB/s]


In [None]:
# Radiomic feature values embedded in the prompt as plain text.
text = """
Analyze the following radiomic features of a brain tumor and provide a brief explanation:
    'original_shape_Elongation': 0.7532,
    'original_firstorder_Energy': 1256789.3245,
    'original_glcm_Correlation': 0.9876,
"""
# One-off chat session; the reply is capped at 210 generated tokens, so a long
# answer is cut off mid-explanation.
with model.chat_session():
    print(model.generate(f"Are you familiar with brain tumors? {text}", max_tokens=210))

A fascinating topic!

Yes, I'm familiar with brain tumors and radiomic features.

Let's break down the three radiomic features you provided:

1. **Original_Shape_Elongation**: This feature measures the elongation of a tumor shape in its original (native) space. Elongation is defined as the ratio of the longest axis to the shortest axis of an object, which can be thought of as a measure of how "stretched out" or irregularly shaped the tumor is. A value close to 1 indicates that the tumor has a more spherical shape, while values closer to 0 indicate a more elongated or linear shape.

Value: 0.7532 ( moderate elongation)

2. **Original_Firstorder_Energy**: This feature represents the first-order statistical moment of an image's intensity histogram, which is related to the distribution of pixel intensities within the tumor region. In this case, the value is extremely high (~1.25 million), indicating a very broad and dispersed intensity distribution in the original space.




# Simple RAG Method

In [None]:
import os
from gpt4all import GPT4All

# Define the RetrievalSystem class
class RetrievalSystem:
    """Minimal document store backing the demo RAG pipeline.

    The given text files are read once at construction time and their
    contents are kept in ``self.data`` (one string per file that was read).
    """

    def __init__(self, file_paths):
        """Load every file named in ``file_paths`` (an iterable of paths)."""
        self.data = self.load_files(file_paths)

    def load_files(self, file_paths):
        """Return the contents of each readable file, in the order given.

        Files are decoded explicitly as UTF-8 so the result does not depend
        on the platform's default encoding. A missing file is reported on
        stdout and skipped; it does not abort the load.
        """
        data = []
        for file_path in file_paths:
            try:
                with open(file_path, 'r', encoding='utf-8') as file:
                    data.append(file.read())
            except FileNotFoundError:
                print(f"Error: File not found - {file_path}")
        return data

    def retrieve(self, query):
        """Return all loaded text joined by single spaces.

        ``query`` is accepted but not used: this demo retriever performs no
        ranking or filtering and always returns every loaded document.
        """
        return ' '.join(self.data)

# Initialize the retrieval system with the paths to the .txt files
file_paths = ['file1.txt', 'file2.txt', 'file3.txt']
retrieval_system = RetrievalSystem(file_paths)

# Load the Llama 3 8B Instruct model through GPT4All (a local model, not GPT-4)
model = GPT4All("Meta-Llama-3-8B-Instruct.Q4_0.gguf")

# Function to generate diagnosis using RAG
def generate_diagnosis(query):
    """Answer ``query`` with the model, using the retrieved text as context.

    The prompt is the query, a newline, and then whatever
    ``retrieval_system.retrieve`` returns for it. Generation runs inside a
    chat session and is capped at 210 tokens.
    """
    context = retrieval_system.retrieve(query)
    prompt = "{}\n{}".format(query, context)

    with model.chat_session():
        return model.generate(prompt, max_tokens=210)

# Example usage: ask one question and print the model's reply
output = generate_diagnosis("What are the symptoms of brain tumors?")
print(output)

# Improved RAG Version

In [None]:
import os

# List to store the content of the files
# (re-created on every run of this cell, so re-running does not duplicate entries)
documents = []

# List of file names
file_names = ['Correlation.txt', 'Energy.txt', 'Elongation.txt']

# Directory where the files are located
# If the files are in the same directory as the script, you can leave this as an empty string
# NOTE(review): '/content/' is presumably the Colab working directory — adjust when running elsewhere
directory = '/content/'

# Loop through each file
for file_name in file_names:
    # Construct the full file path
    file_path = os.path.join(directory, file_name)

    try:
        # Open and read the file
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
            # Append the content to the documents list
            documents.append(content)
        print(f"Successfully read {file_name}")
    except FileNotFoundError:
        # A missing file is reported and skipped; the remaining files are still read
        print(f"Error: File {file_name} not found.")
    except IOError:
        print(f"Error: Could not read file {file_name}")

# Print the number of documents read
print(f"\nTotal documents read: {len(documents)}")

# Optionally, print a preview of each document
for i, doc in enumerate(documents, 1):
    print(f"\nDocument {i} preview:")
    print(doc[:50] + "...")  # Print first 50 characters

Successfully read Correlation.txt
Successfully read Energy.txt
Successfully read Elongation.txt

Total documents read: 3

Document 1 preview:
The Gray-Level Co-Occurrence Matrix (GLCM) correla...

Document 2 preview:
First-order energy is a texture feature derived fr...

Document 3 preview:
Elongation is a shape feature that describes the r...


In [None]:
import numpy as np
from collections import Counter

# Sample documents from .txt files
# Replace these strings with the actual text read from your .txt files
# documents = [
#     'Text from file 1. Symptoms of brain tumors include headaches, seizures, and vision problems.',
#     'Text from file 2. Diagnosis methods involve MRI scan, CT scan, and biopsy.',
#     'Text from file 3. Treatment options can include surgery, chemotherapy, and radiation therapy.'
# ]

def compute_bm25(query, documents, k1=1.5, b=0.75):
    """Score every document against ``query`` with Okapi BM25.

    Args:
        query: Search string.
        documents: List of document strings.
        k1: Term-frequency saturation parameter.
        b: Document-length normalisation strength (0 disables it).

    Returns:
        A list with one float score per document, in the same order as
        ``documents``. A document sharing no term with the query scores 0.0.
        An empty ``documents`` list yields an empty list.
    """
    import re

    def _tokenize(text):
        # Lower-cased word tokens; punctuation is dropped so that a query
        # token like "radiomics?" still matches "radiomics" in a document.
        return re.findall(r"\w+", text.lower())

    if not documents:
        return []

    tokenized_docs = [_tokenize(doc) for doc in documents]
    query_terms = _tokenize(query)
    doc_freqs = [Counter(tokens) for tokens in tokenized_docs]
    doc_lengths = [len(tokens) for tokens in tokenized_docs]
    avg_doc_length = np.mean(doc_lengths)
    n_docs = len(documents)

    # Number of documents containing each query term, computed once per term
    # instead of once per (document, term) pair.
    docs_with_term = {
        term: sum(1 for freq in doc_freqs if term in freq)
        for term in set(query_terms)
    }

    scores = []
    for doc_freq, doc_length in zip(doc_freqs, doc_lengths):
        # avg_doc_length is 0 only when every document is empty; no term can
        # match then, so the ratio value is irrelevant — just avoid 0/0.
        length_ratio = doc_length / avg_doc_length if avg_doc_length > 0 else 0.0
        length_norm = k1 * (1 - b + b * length_ratio)

        score = 0.0
        for term in query_terms:
            term_freq = doc_freq.get(term, 0)
            if term_freq == 0:
                continue
            n_containing = docs_with_term[term]
            # The "+ 1" inside the log keeps the IDF positive even for terms
            # that occur in most documents.
            idf = np.log((n_docs - n_containing + 0.5) / (n_containing + 0.5) + 1)
            score += idf * (term_freq * (k1 + 1)) / (term_freq + length_norm)
        scores.append(float(score))

    return scores

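# `model` is expected to be the GPT4All Llama 3 model loaded in an earlier cell; uncomment the next line to load it here instead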
# model = GPT4All("Meta-Llama-3-8B-Instruct.Q4_0.gguf")

def chunk_text(text, max_chunk_size=500):
    """Split ``text`` into whitespace-normalised chunks of bounded length.

    Words are never split. Each chunk is a run of consecutive words joined by
    single spaces and is at most ``max_chunk_size`` characters long, except
    when a single word is longer than the limit, in which case that word
    forms a chunk of its own.

    Args:
        text: Input string; any run of whitespace separates words.
        max_chunk_size: Maximum chunk length in characters.

    Returns:
        List of non-empty chunk strings (empty for blank input).
    """
    chunks = []
    current_words = []
    current_size = 0  # always equals len(' '.join(current_words))

    for word in text.split():
        # The first word of a chunk needs no separating space.
        added = len(word) + 1 if current_words else len(word)
        # Only flush a chunk that actually holds words, so an oversized first
        # word cannot produce an empty chunk.
        if current_words and current_size + added > max_chunk_size:
            chunks.append(' '.join(current_words))
            current_words = [word]
            current_size = len(word)
        else:
            current_words.append(word)
            current_size += added

    if current_words:
        chunks.append(' '.join(current_words))
    return chunks

def generate_diagnosis(query, top_k_chunks=3, max_tokens=300):
    """Answer ``query`` with the model, grounded in BM25-retrieved context.

    This definition replaces the earlier ``generate_diagnosis`` from the
    simple RAG cell. Retrieval runs in two stages: the two best-scoring
    documents are selected with BM25, split into chunks, and the chunks are
    ranked against the query again. The best chunks are sent to the model in
    a single prompt, so the result is one coherent answer rather than several
    truncated per-chunk answers joined together.

    Args:
        query: The user's question.
        top_k_chunks: Maximum number of chunks placed in the prompt.
        max_tokens: Generation cap for the single model call.

    Returns:
        The model's response text. When no document matches the query, the
        model is asked the bare question without context.
    """
    # Stage 1: keep at most the two best documents that match at all.
    doc_scores = compute_bm25(query, documents)
    ranked_docs = sorted(enumerate(doc_scores), key=lambda pair: pair[1], reverse=True)
    top_docs = [documents[index] for index, score in ranked_docs if score > 0][:2]

    # Stage 2: chunk those documents and keep the chunks most relevant to the
    # query. Empty chunks carry no context and are dropped.
    chunks = [chunk for chunk in chunk_text('\n'.join(top_docs)) if chunk]

    if chunks:
        chunk_scores = compute_bm25(query, chunks)
        best = sorted(range(len(chunks)), key=lambda i: chunk_scores[i], reverse=True)
        # Restore original order so the context reads like the source text.
        selected = sorted(best[:top_k_chunks])
        context = '\n'.join(chunks[i] for i in selected)
        augmented_query = f"{query}\nContext: {context}"
    else:
        # Nothing relevant was retrieved; still answer instead of returning ''.
        augmented_query = query

    with model.chat_session():
        return model.generate(augmented_query, max_tokens=max_tokens)

# Example usage: relies on `documents` (loaded in the previous cell) and on a `model` that is already loaded
output = generate_diagnosis("What are the key features used in brain tumor radiomics?")
print(output)

Exception ignored on calling ctypes callback function: <function LLModel._prompt_callback at 0x7871f01348b0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/gpt4all/_pyllmodel.py", line 614, in _prompt_callback
    @staticmethod
KeyboardInterrupt: 
Exception ignored on calling ctypes callback function: <function LLModel._prompt_callback at 0x7871f01348b0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/gpt4all/_pyllmodel.py", line 614, in _prompt_callback
    @staticmethod
KeyboardInterrupt: 
Exception ignored on calling ctypes callback function: <function LLModel._prompt_callback at 0x7871f01348b0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/gpt4all/_pyllmodel.py", line 614, in _prompt_callback
    @staticmethod
KeyboardInterrupt: 


In brain tumor radiomics, some key features used include:

1. **First-order statistics**: These features describe the distribution of pixel intensities in an image. Examples include:
	* Mean intensity
	* Standard deviation (SD)
	* Skewness
	* Kurtosis
2. **Texture features**:
	* First-Order Energy (also known as Angular Second Moment or Uniformity): measures uniformity of pixel intensity distribution, with higher values indicating more homogeneous regions.
	* Brain tumor radiomics involves extracting quantitative features from medical images, such as MRI or CT scans, to analyze and characterize brain tumors. The following are some common key features used in brain tumor radiomics:

1. **Original_Firstorder_Energy**: As you mentioned, it's a feature derived from first-order statistics of image intensity values. It represents the magnitude of the energy contained within an image voxel.
2. **Gray Level Co-Occurrence Matrix (GLCM) Features**: These features capture In brain tumor radiomics

# Fine-Tune

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from datasets import Dataset

# Load the pre-trained model and tokenizer
model_name = "nomic-ai/gpt4all-j"  # Replace with your specific GPT4All model
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Tokenization below pads with padding="max_length", which requires a pad
# token. Reuse EOS when the checkpoint's tokenizer does not define one.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# NOTE: this rebinds the global `model`, replacing the GPT4All instance used
# by the earlier cells with a Hugging Face model object.
model = AutoModelForCausalLM.from_pretrained(model_name)

# Prepare your custom dataset
# This is a simplified example. Replace with your actual data.
# Each record is one prompt/answer pair: "input" is the question, "output" the target text.
train_data = [
    {"input": "Describe the symptoms of a brain tumor.", "output": "Common symptoms of brain tumors include headaches, seizures, vision problems, and cognitive changes."},
    {"input": "What is the significance of GLCM correlation in brain tumor analysis?", "output": "GLCM correlation in brain tumor analysis helps identify texture patterns in MRI scans, potentially distinguishing between tumor types and healthy tissue."},
    {"input": "Explain the importance of shape elongation in tumor characterization.", "output": "Shape elongation in tumor characterization quantifies how stretched a tumor is, which can indicate growth patterns and potential aggressiveness."},
    # Add more examples...
]

# Function to tokenize and format the data
def tokenize_function(examples):
    """Tokenize a batch of records for causal-LM fine-tuning.

    Intended for ``Dataset.map(..., batched=True)``, where ``examples`` is a
    mapping from column name to a list of values (not a list of row dicts).

    Args:
        examples: Batch with parallel ``"input"`` and ``"output"`` lists.

    Returns:
        The tokenizer's encoding, padded/truncated to 512 tokens, with an
        added ``"labels"`` entry so the Trainer can compute a loss. Labels
        copy ``input_ids`` but use -100 at padded positions, which the loss
        ignores.
    """
    prompts = [
        f"Input: {question}\nOutput: {answer}"
        for question, answer in zip(examples["input"], examples["output"])
    ]
    encoded = tokenizer(prompts, padding="max_length", truncation=True, max_length=512)
    # Mask by attention_mask rather than by token id: the pad token may be the
    # same id as EOS, and real tokens must stay in the loss.
    encoded["labels"] = [
        [token_id if keep == 1 else -100 for token_id, keep in zip(ids, mask)]
        for ids, mask in zip(encoded["input_ids"], encoded["attention_mask"])
    ]
    return encoded

# Create a Hugging Face Dataset
dataset = Dataset.from_list(train_data)
# batched=True: tokenize_function receives a batch of rows per call rather than one row at a time
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    # NOTE(review): 500 warmup steps looks far larger than the number of optimizer
    # steps the three sample rows above can produce — confirm once real data is in place.
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
)

# Create Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model
# Model weights and tokenizer files go to the same directory
model.save_pretrained("./fine_tuned_gpt4all")
tokenizer.save_pretrained("./fine_tuned_gpt4all")

# Function to generate responses using the fine-tuned model
def generate_response(input_text):
    """Generate a completion for ``input_text`` with the fine-tuned model.

    The prompt uses the same "Input: ...\\nOutput:" layout as the training
    text.

    Args:
        input_text: The question to ask.

    Returns:
        The decoded sequence (prompt followed by the generated continuation)
        with special tokens removed.
    """
    # Move the encoded prompt to the model's device; after training the model
    # may live on a GPU while freshly created tensors are on the CPU.
    inputs = tokenizer(f"Input: {input_text}\nOutput:", return_tensors="pt").to(model.device)

    # Training leaves the model in train mode; switch to eval so dropout is
    # disabled during generation.
    model.eval()
    output = model.generate(
        **inputs,
        # Budget counts generated tokens only, so a long prompt cannot use up
        # the whole allowance.
        max_new_tokens=150,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        pad_token_id=tokenizer.eos_token_id,
    )
    return tokenizer.decode(output[0], skip_special_tokens=True)

# Example usage: query the fine-tuned model and print the decoded text
print(generate_response("What are the key features used in brain tumor radiomics?"))