In [1]:
# Cell 1: Install required packages

!pip install pandas scikit-learn torch transformers[torch] datasets rouge-score nltk gensim networkx plotly



In [2]:
# Cell 2: Import required libraries and download NLTK data

import pandas as pd
from sklearn.model_selection import train_test_split
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from datasets import Dataset
from rouge_score import rouge_scorer
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from gensim.models import Word2Vec
import networkx as nx
import plotly.graph_objects as go


In [3]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
# 4: Define data preprocessing function for CSV input

def load_and_preprocess_data(file_path):
    # Load data from CSV file
    df = pd.read_csv(file_path)

    # Ensure the required columns are present
    required_columns = ['Report Name', 'History', 'Observation', 'Impression']
    if not all(col in df.columns for col in required_columns):
        raise ValueError(f"CSV must contain the following columns: {required_columns}")

    # Process each row
    processed_data = []
    for _, row in df.iterrows():
        processed_data.append({
            'input': f"Report Name: {row['Report Name']} History: {row['History']} Observation: {row['Observation']}",
            'output': f"Impression: {row['Impression']}"
        })

    # Convert to DataFrame
    processed_df = pd.DataFrame(processed_data)

    # Split into train and eval sets
    train_data, eval_data = train_test_split(processed_df, test_size=30, random_state=42)

    # Save processed data
    train_data.to_csv('/content/medical_nlp_project/train_data.csv', index=False)
    eval_data.to_csv('/content/medical_nlp_project/eval_data.csv', index=False)

    return train_data, eval_data


In [16]:
# 5: Define model fine-tuning function with proper data preparation for language modeling

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from datasets import Dataset
import gc

def fine_tune_model(train_data, eval_data):
    # Free up memory
    gc.collect()

    # Load a smaller model: GPT-2 small
    model_name = "gpt2"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)

    # Ensure the tokenizer has a padding token
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        model.config.pad_token_id = model.config.eos_token_id

    # Prepare datasets
    def tokenize_and_prepare_lm(examples):
        inputs = [f"{inp} {out}" for inp, out in zip(examples["input"], examples["output"])]
        tokenized = tokenizer(inputs, truncation=True, padding="max_length", max_length=128)

        # Prepare labels for language modeling (shift input_ids right)
        tokenized["labels"] = tokenized["input_ids"].copy()

        return tokenized

    train_dataset = Dataset.from_pandas(train_data)
    eval_dataset = Dataset.from_pandas(eval_data)

    tokenized_train = train_dataset.map(tokenize_and_prepare_lm, batched=True, remove_columns=train_dataset.column_names)
    tokenized_eval = eval_dataset.map(tokenize_and_prepare_lm, batched=True, remove_columns=eval_dataset.column_names)

    # Set up training arguments for CPU
    training_args = TrainingArguments(
        output_dir="./results",
        num_train_epochs=1,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir="./logs",
        logging_steps=10,
        no_cuda=True,  # Force CPU usage
        dataloader_num_workers=0,
        logging_strategy="epoch",
        disable_tqdm=False,
    )

    # Fine-tune the model
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_eval
    )

    trainer.train()

    # Save the fine-tuned model
    model.save_pretrained("./fine_tuned_model")
    tokenizer.save_pretrained("./fine_tuned_model")

    return model, tokenizer

In [20]:
# 6: Define optimized model evaluation function

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from rouge_score import rouge_scorer
import numpy as np
from tqdm import tqdm

def fast_evaluate_model(model, tokenizer, eval_data, num_samples=30):
    model.eval()
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

    # Sample a subset of the evaluation data
    eval_subset = eval_data.sample(n=min(num_samples, len(eval_data)), random_state=42)

    total_loss = 0
    rouge_scores = {'rouge1': 0, 'rouge2': 0, 'rougeL': 0}

    for _, row in tqdm(eval_subset.iterrows(), total=len(eval_subset), desc="Evaluating"):
        input_text = row['input']
        true_impression = row['output'].replace("Impression: ", "")

        # Tokenize input
        inputs = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=512)
        input_ids = inputs.input_ids
        attention_mask = inputs.attention_mask

        # Calculate loss
        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=input_ids)
            total_loss += outputs.loss.item()

        # Generate impression (simplified)
        with torch.no_grad():
            output_sequences = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_new_tokens=50,  # Reduced for speed
                num_return_sequences=1,
                no_repeat_ngram_size=2,
            )

        generated_impression = tokenizer.decode(output_sequences[0], skip_special_tokens=True)
        generated_impression = generated_impression.split(input_text)[-1].strip()

        # Calculate ROUGE scores
        scores = scorer.score(true_impression, generated_impression)
        for metric in rouge_scores:
            rouge_scores[metric] += scores[metric].fmeasure

    # Calculate averages
    avg_loss = total_loss / len(eval_subset)
    perplexity = np.exp(avg_loss)
    avg_rouge_scores = {metric: score / len(eval_subset) for metric, score in rouge_scores.items()}

    return perplexity, avg_rouge_scores

# Example usage:
# perplexity, rouge_scores = fast_evaluate_model(model, tokenizer, eval_data, num_samples=30)
# print(f"Perplexity: {perplexity}")
# print(f"ROUGE scores: {rouge_scores}")

In [7]:
# 7: Define text analysis functions

def process_text(text):
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    words = nltk.word_tokenize(text.lower())
    words = [word for word in words if word.isalnum() and word not in stop_words]

    # Apply stemming and lemmatization
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    processed_words = [lemmatizer.lemmatize(stemmer.stem(word)) for word in words]

    return processed_words

def get_top_word_pairs(all_processed_words, top_n=100):
    # Generate word embeddings
    model = Word2Vec(all_processed_words, vector_size=100, window=5, min_count=1, workers=4)

    # Calculate word similarities
    word_similarities = []
    vocab = list(model.wv.key_to_index.keys())

    for i, word1 in enumerate(vocab):
        for word2 in vocab[i+1:]:
            similarity = model.wv.similarity(word1, word2)
            word_similarities.append((word1, word2, similarity))

    # Sort and get top pairs
    word_similarities.sort(key=lambda x: x[2], reverse=True)
    top_pairs = word_similarities[:top_n]

    return top_pairs


In [8]:
# 8: Define visualization function

def create_word_pair_visualization(word_pairs):
    G = nx.Graph()

    for word1, word2, similarity in word_pairs:
        G.add_edge(word1, word2, weight=similarity)

    pos = nx.spring_layout(G)

    edge_x = []
    edge_y = []
    for edge in G.edges():
        x0, y0 = pos[edge[0]]
        x1, y1 = pos[edge[1]]
        edge_x.extend([x0, x1, None])
        edge_y.extend([y0, y1, None])

    edge_trace = go.Scatter(
        x=edge_x, y=edge_y,
        line=dict(width=0.5, color='#888'),
        hoverinfo='none',
        mode='lines')

    node_x = []
    node_y = []
    for node in G.nodes():
        x, y = pos[node]
        node_x.append(x)
        node_y.append(y)

    node_trace = go.Scatter(
        x=node_x, y=node_y,
        mode='markers',
        hoverinfo='text',
        marker=dict(
            showscale=True,
            colorscale='YlGnBu',
            size=10,
            colorbar=dict(
                thickness=15,
                title='Node Connections',
                xanchor='left',
                titleside='right'
            )
        )
    )

    node_adjacencies = []
    node_text = []
    for node, adjacencies in enumerate(G.adjacency()):
        node_adjacencies.append(len(adjacencies[1]))
        node_text.append(f'{adjacencies[0]}: {len(adjacencies[1])} connections')

    node_trace.marker.color = node_adjacencies
    node_trace.text = node_text

    fig = go.Figure(data=[edge_trace, node_trace],
                    layout=go.Layout(
                        title='Word Pair Similarity Network',
                        titlefont_size=16,
                        showlegend=False,
                        hovermode='closest',
                        margin=dict(b=20,l=5,r=5,t=40),
                        annotations=[ dict(
                            text="",
                            showarrow=False,
                            xref="paper", yref="paper",
                            x=0.005, y=-0.002 ) ],
                        xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                        yaxis=dict(showgrid=False, zeroline=False, showticklabels=False))
                    )

    fig.write_html("word_pair_visualization.html")


In [17]:
# 9: Main execution

# Load and preprocess data
file_path = '/content/impression_300_llm.csv'  # Adjust this path
train_data, eval_data = load_and_preprocess_data(file_path)

# Fine-tune model
model, tokenizer = fine_tune_model(train_data, eval_data)

# Evaluate model
perplexity, rouge_scores = evaluate_model(model, tokenizer, eval_data)
print(f"Perplexity: {perplexity}")
print(f"ROUGE scores: {rouge_scores}")

# Perform text analysis
all_text = ' '.join(train_data['input'] + ' ' + train_data['output'] + ' ' +
                    eval_data['input'] + ' ' + eval_data['output'])
processed_words = process_text(all_text)
top_word_pairs = get_top_word_pairs(processed_words)

# Create visualization
create_word_pair_visualization(top_word_pairs)

# 10: Save results (optional)

# Save fine-tuned model
model.save_pretrained("/content/medical_nlp_project/fine_tuned_model")
tokenizer.save_pretrained("/content/medical_nlp_project/fine_tuned_model")

# Save visualization
from google.colab import files
files.download('word_pair_visualization.html')




Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Map:   0%|          | 0/30 [00:00<?, ? examples/s]



Step,Training Loss
75,4.7657


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


ValueError: Input length of input_ids is 443, but `max_length` is set to 150. This can lead to unexpected behavior. You should consider increasing `max_length` or, better yet, setting `max_new_tokens`.

In [21]:

# Load your evaluation data
eval_data = pd.read_csv('/content/medical_nlp_project/eval_data.csv')  # Adjust the path as needed

# Cell 2: Load the pre-trained model and tokenizer

model_path = "./fine_tuned_model"  # Adjust this path to where you saved your fine-tuned model
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)

# Cell 3: Evaluate the model

perplexity, rouge_scores = evaluate_model(model, tokenizer, eval_data)

# Cell 4: Print the results

print(f"Perplexity: {perplexity}")
print("ROUGE Scores:")
for metric, score in rouge_scores.items():
    print(f"  {metric}: {score}")

# Optional: Generate a sample impression
sample_input = eval_data['input'].iloc[0]  # Take the first input as a sample
inputs = tokenizer(sample_input, return_tensors="pt", truncation=True, max_length=512)
sample_output = model.generate(inputs.input_ids, max_new_tokens=150, num_beams=4, no_repeat_ngram_size=2, early_stopping=True)
sample_impression = tokenizer.decode(sample_output[0], skip_special_tokens=True)

print("\nSample Input:")
print(sample_input)
print("\nGenerated Impression:")
print(sample_impression.split(sample_input)[-1].strip())  # Print only the generated part

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


KeyboardInterrupt: 