In [1]:
import pandas as pd

# Read the CSV files
zero_shot_df = pd.read_csv('Data/llama-3.2/llm_responses_final_zero_shot.csv')
few_shot_df = pd.read_csv('Data/llama-3.2/llm_responses_final_few_shot.csv')
chain_of_thoughts_df = pd.read_csv('Data/llama-3.2/llm_responses_final_chain_of_thoughts.csv')

# Create the combined text column for original title and body
zero_shot_df['Title_Body'] = '<<' + zero_shot_df['Title'] + '>>\n<<' + zero_shot_df['Body'] + '>>'

# Create combined response columns for each LLM
zero_shot_df['llm_response_combined'] = '<<' + zero_shot_df['llm_title_response'] + '>>\n<<' + zero_shot_df['llm_body_response'] + '>>'
few_shot_df['llm_response_combined'] = '<<' + few_shot_df['llm_title_response'] + '>>\n<<' + few_shot_df['llm_body_response'] + '>>'
chain_of_thoughts_df['llm_response_combined'] = '<<' + chain_of_thoughts_df['llm_title_response'] + '>>\n<<' + chain_of_thoughts_df['llm_body_response'] + '>>'

# Create the final dataframe with all columns
combined_df = pd.DataFrame({
    'Id': zero_shot_df['Id'],
    'Title': zero_shot_df['Title'],
    'Body': zero_shot_df['Body'],
    'Title_Body': zero_shot_df['Title_Body'],
    'ImageURLs': zero_shot_df['ImageURLs'],
    # Zero-shot responses
    'llm_zero_shot_title': zero_shot_df['llm_title_response'],
    'llm_zero_shot_body': zero_shot_df['llm_body_response'],
    'llm_zero_shot_combined': zero_shot_df['llm_response_combined'],
    # Few-shot responses
    'llm_few_shot_title': few_shot_df['llm_title_response'],
    'llm_few_shot_body': few_shot_df['llm_body_response'],
    'llm_few_shot_combined': few_shot_df['llm_response_combined'],
    # Chain of thoughts responses
    'llm_cot_title': chain_of_thoughts_df['llm_title_response'],
    'llm_cot_body': chain_of_thoughts_df['llm_body_response'],
    'llm_cot_combined': chain_of_thoughts_df['llm_response_combined']
})

# Save the combined dataframe to a new CSV file
combined_df.to_csv('Data/llama-3.2/llm_responses_combined.csv', index=False)

# Print basic information about the combined file
print(f"Combined file created with {len(combined_df)} rows")
print("\nColumns in the combined file:")
for col in combined_df.columns:
    print(f"- {col}")

# Print a sample row to verify the structure
print("\nSample of first row:")
for col in combined_df.columns:
    print(f"\n{col}:")
    print(str(combined_df[col].iloc[0])[:100] + "..." if len(str(combined_df[col].iloc[0])) > 100 else str(combined_df[col].iloc[0]))

Combined file created with 143 rows

Columns in the combined file:
- Id
- Title
- Body
- Title_Body
- ImageURLs
- llm_zero_shot_title
- llm_zero_shot_body
- llm_zero_shot_combined
- llm_few_shot_title
- llm_few_shot_body
- llm_few_shot_combined
- llm_cot_title
- llm_cot_body
- llm_cot_combined

Sample of first row:

Id:
79146548

Title:
GitHub Copilot responds to 'Hey Code' but dictation doesn't work

Body:
As the title explains, I can start an inline chat session using the 'hey code' voice command in VS C...

Title_Body:
<<GitHub Copilot responds to 'Hey Code' but dictation doesn't work>>
<<As the title explains, I can ...

ImageURLs:
['https://i.sstatic.net/MgGjdapB.png']

llm_zero_shot_title:
** Unexpected Null Pointer Exception in Event Handler Setup

llm_zero_shot_body:
I'm experiencing an issue with my code where the `SetupEventHandlers` method is not being called as ...

llm_zero_shot_combined:
<<** Unexpected Null Pointer Exception in Event Handler Setup>>
<<I'm experiencing an

In [2]:
combined_df.head()

Unnamed: 0,Id,Title,Body,Title_Body,ImageURLs,llm_zero_shot_title,llm_zero_shot_body,llm_zero_shot_combined,llm_few_shot_title,llm_few_shot_body,llm_few_shot_combined,llm_cot_title,llm_cot_body,llm_cot_combined
0,79146548,GitHub Copilot responds to 'Hey Code' but dict...,"As the title explains, I can start an inline c...",<<GitHub Copilot responds to 'Hey Code' but di...,['https://i.sstatic.net/MgGjdapB.png'],** Unexpected Null Pointer Exception in Event ...,I'm experiencing an issue with my code where t...,<<** Unexpected Null Pointer Exception in Even...,"** ""Error when accessing nested object propert...",**\n\nI'm having trouble with my code and I'm ...,"<<** ""Error when accessing nested object prope...","Error in SetupEventHandlers() function: ""GPT4o...",I'm experiencing an issue with my code in the ...,"<<Error in SetupEventHandlers() function: ""GPT..."
1,79146419,How can I fix my Workflow file to successfully...,I am trying to use Github Actions with Azure S...,<<How can I fix my Workflow file to successful...,['https://i.sstatic.net/THwNK2Jj.png'],Unhandled Exception in ASP.NET Core Web API - ...,I'm experiencing an issue with my project wher...,<<Unhandled Exception in ASP.NET Core Web API ...,"**\n""Error in SQL Query: 'Invalid column name'...",**\n\nI am experiencing an issue with my code ...,"<<**\n""Error in SQL Query: 'Invalid column nam...","Error in RStudio: ""Error in file(con, ""r"") : c...",I'm trying to implement a custom authenticatio...,"<<Error in RStudio: ""Error in file(con, ""r"") :..."
2,79146412,LINQPad 8 Dump Property Order different that L...,"In LINQPad 5 with Linq-to-Sql DataContext, if ...",<<LINQPad 8 Dump Property Order different that...,['https://i.sstatic.net/efq4SfvI.png'],Unhandled Exception in ASP.NET MVC Application...,I'm experiencing an issue with my project wher...,<<Unhandled Exception in ASP.NET MVC Applicati...,"""How to fix 'No module named 'gpt2' error in P...",**\n\nI'm experiencing an issue with my Python...,"<<""How to fix 'No module named 'gpt2' error in...",Error in SkiaSharp NuGet Package: 'SkiaSharp' ...,I'm trying to implement a feature in my applic...,<<Error in SkiaSharp NuGet Package: 'SkiaSharp...
3,79146127,SyntaxError: Cannot use import statement outsi...,"I'm using TypeScript, ESM, npm, and ts-jest. U...",<<SyntaxError: Cannot use import statement out...,['https://i.sstatic.net/Jp5wj6k2.png'],Error in Wena-Test-Runner: Command Failed with...,I'm experiencing an issue with my project wher...,<<Error in Wena-Test-Runner: Command Failed wi...,"** ""Syntax Error: Cannot use import statement ...",**\n\nI'm having trouble with a Python script ...,"<<** ""Syntax Error: Cannot use import statemen...",JavaScript Syntax Error on Line 2: Unexpected ...,I'm having trouble with my WENA test runner. W...,<<JavaScript Syntax Error on Line 2: Unexpecte...
4,79145758,Typescript Polymorphic Component Event Handler,I have written a strongly-typed Polymorphic Ty...,<<Typescript Polymorphic Component Event Handl...,['https://i.sstatic.net/19LCKEF3.png'],"** Unhandled Exception: ""Property 'currentTarg...",I'm experiencing an issue with my code where t...,"<<** Unhandled Exception: ""Property 'currentTa...","""How to Create a Dynamic Dropdown List in Exce...",**\n\nI'm experiencing an issue with my Python...,"<<""How to Create a Dynamic Dropdown List in Ex...","Error in HTML Element Reference: ""Property 'cu...",I'm experiencing an issue with my code in the ...,"<<Error in HTML Element Reference: ""Property '..."


In [3]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import seaborn as sns

def create_embeddings_and_analyze(df, model_name='sentence-transformers/all-mpnet-base-v1'):
    """
    Create embeddings using SentenceTransformer and analyze similarities for both
    separate title/body responses and combined responses
    """
    # Initialize the model
    model = SentenceTransformer(model_name)
    
    # Generate embeddings for original content
    print("Generating embeddings for original content...")
    title_embeddings = model.encode(df['Title'].tolist(), show_progress_bar=True)
    body_embeddings = model.encode(df['Body'].tolist(), show_progress_bar=True)
    title_body_embeddings = model.encode(df['Title_Body'].tolist(), show_progress_bar=True)
    
    # Generate embeddings for LLM responses
    print("\nGenerating embeddings for LLM responses...")
    response_embeddings = {
        # Title responses
        'zero_shot_title': model.encode(df['llm_zero_shot_title'].tolist(), show_progress_bar=True),
        'few_shot_title': model.encode(df['llm_few_shot_title'].tolist(), show_progress_bar=True),
        'cot_title': model.encode(df['llm_cot_title'].tolist(), show_progress_bar=True),
        
        # Body responses
        'zero_shot_body': model.encode(df['llm_zero_shot_body'].tolist(), show_progress_bar=True),
        'few_shot_body': model.encode(df['llm_few_shot_body'].tolist(), show_progress_bar=True),
        'cot_body': model.encode(df['llm_cot_body'].tolist(), show_progress_bar=True),
        
        # Combined responses
        'zero_shot_combined': model.encode(df['llm_zero_shot_combined'].tolist(), show_progress_bar=True),
        'few_shot_combined': model.encode(df['llm_few_shot_combined'].tolist(), show_progress_bar=True),
        'cot_combined': model.encode(df['llm_cot_combined'].tolist(), show_progress_bar=True)
    }
    
    # Calculate similarities
    similarities = {
        'title': {},
        'body': {},
        'combined': {}
    }
    
    # Calculate title similarities
    for response_type in ['zero_shot', 'few_shot', 'cot']:
        similarities['title'][response_type] = np.diagonal(
            cosine_similarity(title_embeddings, response_embeddings[f'{response_type}_title'])
        )
    
    # Calculate body similarities
    for response_type in ['zero_shot', 'few_shot', 'cot']:
        similarities['body'][response_type] = np.diagonal(
            cosine_similarity(body_embeddings, response_embeddings[f'{response_type}_body'])
        )
    
    # Calculate combined similarities
    for response_type in ['zero_shot', 'few_shot', 'cot']:
        similarities['combined'][response_type] = np.diagonal(
            cosine_similarity(title_body_embeddings, response_embeddings[f'{response_type}_combined'])
        )
    
    # Add similarity scores to dataframe
    for response_type in ['zero_shot', 'few_shot', 'cot']:
        df[f'similarity_{response_type}_title'] = similarities['title'][response_type]
        df[f'similarity_{response_type}_body'] = similarities['body'][response_type]
        df[f'similarity_{response_type}_combined'] = similarities['combined'][response_type]
    
    # Save embeddings with updated paths
    np.save('Data/llama-3.2/title_embeddings_st.npy', title_embeddings)
    np.save('Data/llama-3.2/body_embeddings_st.npy', body_embeddings)
    np.save('Data/llama-3.2/title_body_embeddings_st.npy', title_body_embeddings)
    
    for response_type, embeddings in response_embeddings.items():
        np.save(f'Data/llama-3.2/{response_type}_embeddings_st.npy', embeddings)
    
    return df, similarities

def analyze_similarities(similarities):
    """
    Analyze and visualize similarity distributions for title, body, and combined responses
    """
    categories = {
        'Very High': (0.8, 1.0),
        'High': (0.6, 0.8),
        'Moderate': (0.4, 0.6),
        'Low': (0.2, 0.4),
        'Very Low': (0.0, 0.2)
    }
    
    results = {
        'title': {},
        'body': {},
        'combined': {}
    }
    
    # Create figures for each type of response
    for response_category in ['title', 'body', 'combined']:
        fig, axes = plt.subplots(1, 3, figsize=(15, 5))
        fig.suptitle(f'Similarity Analysis for {response_category.capitalize()} Responses')
        
        # Plot distributions and calculate statistics for each LLM type
        for idx, response_type in enumerate(['zero_shot', 'few_shot', 'cot']):
            scores = similarities[response_category][response_type]
            
            # Calculate statistics
            stats = {
                'mean': np.mean(scores),
                'median': np.median(scores),
                'std': np.std(scores),
                'min': np.min(scores),
                'max': np.max(scores)
            }
            
            # Calculate distribution across categories
            distribution = {}
            for category, (low, high) in categories.items():
                count = np.sum((scores >= low) & (scores < high))
                percentage = (count / len(scores)) * 100
                distribution[category] = percentage
                
            results[response_category][response_type] = {
                'statistics': stats,
                'distribution': distribution
            }
            
            # Plot histogram
            ax = axes[idx]
            sns.histplot(scores, bins=30, ax=ax)
            ax.set_title(f'{response_type.replace("_", " ").title()}')
            ax.set_xlabel('Similarity Score')
            ax.set_ylabel('Count')
            
            # Add category boundaries
            for category, (low, high) in categories.items():
                if low > 0:  # Don't plot the lowest boundary
                    ax.axvline(x=low, color='r', linestyle='--', alpha=0.3)
        
        plt.tight_layout()
        # Updated path for saving plots
        plt.savefig(f'Data/llama-3.2/similarity_distributions_{response_category}_st.png')
        plt.close()
    
    # Print analysis
    for response_category in ['title', 'body', 'combined']:
        print(f"\n{response_category.upper()} Response Analysis:")
        for response_type, result in results[response_category].items():
            print(f"\n{response_type.upper()}:")
            print("\nBasic Statistics:")
            for metric, value in result['statistics'].items():
                print(f"{metric}: {value:.4f}")
            
            print("\nDistribution across categories:")
            for category, percentage in result['distribution'].items():
                print(f"{category}: {percentage:.2f}%")
    
    return results

# Usage example:
if __name__ == "__main__":
    # Read the combined CSV file with updated path
    df = pd.read_csv('Data/llama-3.2/llm_responses_combined.csv')
    
    # Generate embeddings and calculate similarities
    df_with_similarities, similarities = create_embeddings_and_analyze(df)
    
    # Analyze and visualize results
    analysis_results = analyze_similarities(similarities)
    
    # Save updated dataframe with similarities using new path
    df_with_similarities.to_csv('Data/llama-3.2/llm_responses_with_similarities_st.csv', index=False)

Generating embeddings for original content...


Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]


Generating embeddings for LLM responses...


Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]


TITLE Response Analysis:

ZERO_SHOT:

Basic Statistics:
mean: 0.2018
median: 0.1416
std: 0.2105
min: -0.1047
max: 0.9072

Distribution across categories:
Very High: 1.40%
High: 3.50%
Moderate: 11.89%
Low: 23.78%
Very Low: 46.15%

FEW_SHOT:

Basic Statistics:
mean: 0.0433
median: 0.0312
std: 0.1076
min: -0.1458
max: 0.5320

Distribution across categories:
Very High: 0.00%
High: 0.00%
Moderate: 2.80%
Low: 2.80%
Very Low: 61.54%

COT:

Basic Statistics:
mean: 0.2100
median: 0.1468
std: 0.2110
min: -0.0901
max: 0.9859

Distribution across categories:
Very High: 1.40%
High: 5.59%
Moderate: 12.59%
Low: 20.98%
Very Low: 48.95%

BODY Response Analysis:

ZERO_SHOT:

Basic Statistics:
mean: 0.2535
median: 0.2073
std: 0.2123
min: -0.1515
max: 0.8100

Distribution across categories:
Very High: 0.70%
High: 6.99%
Moderate: 17.48%
Low: 27.27%
Very Low: 39.86%

FEW_SHOT:

Basic Statistics:
mean: 0.0500
median: 0.0331
std: 0.0978
min: -0.1428
max: 0.4029

Distribution across categories:
Very High: 0.0

In [4]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import seaborn as sns

def create_embeddings_and_analyze(df, model_name='sentence-transformers/all-MiniLM-L6-v2'):
    """
    Create embeddings using SentenceTransformer and analyze similarities for both
    separate title/body responses and combined responses
    """
    # Initialize the model
    model = SentenceTransformer(model_name)
    
    # Generate embeddings for original content
    print("Generating embeddings for original content...")
    title_embeddings = model.encode(df['Title'].tolist(), show_progress_bar=True)
    body_embeddings = model.encode(df['Body'].tolist(), show_progress_bar=True)
    title_body_embeddings = model.encode(df['Title_Body'].tolist(), show_progress_bar=True)
    
    # Generate embeddings for LLM responses
    print("\nGenerating embeddings for LLM responses...")
    response_embeddings = {
        # Title responses
        'zero_shot_title': model.encode(df['llm_zero_shot_title'].tolist(), show_progress_bar=True),
        'few_shot_title': model.encode(df['llm_few_shot_title'].tolist(), show_progress_bar=True),
        'cot_title': model.encode(df['llm_cot_title'].tolist(), show_progress_bar=True),
        
        # Body responses
        'zero_shot_body': model.encode(df['llm_zero_shot_body'].tolist(), show_progress_bar=True),
        'few_shot_body': model.encode(df['llm_few_shot_body'].tolist(), show_progress_bar=True),
        'cot_body': model.encode(df['llm_cot_body'].tolist(), show_progress_bar=True),
        
        # Combined responses
        'zero_shot_combined': model.encode(df['llm_zero_shot_combined'].tolist(), show_progress_bar=True),
        'few_shot_combined': model.encode(df['llm_few_shot_combined'].tolist(), show_progress_bar=True),
        'cot_combined': model.encode(df['llm_cot_combined'].tolist(), show_progress_bar=True)
    }
    
    # Calculate similarities
    similarities = {
        'title': {},
        'body': {},
        'combined': {}
    }
    
    # Calculate title similarities
    for response_type in ['zero_shot', 'few_shot', 'cot']:
        similarities['title'][response_type] = np.diagonal(
            cosine_similarity(title_embeddings, response_embeddings[f'{response_type}_title'])
        )
    
    # Calculate body similarities
    for response_type in ['zero_shot', 'few_shot', 'cot']:
        similarities['body'][response_type] = np.diagonal(
            cosine_similarity(body_embeddings, response_embeddings[f'{response_type}_body'])
        )
    
    # Calculate combined similarities
    for response_type in ['zero_shot', 'few_shot', 'cot']:
        similarities['combined'][response_type] = np.diagonal(
            cosine_similarity(title_body_embeddings, response_embeddings[f'{response_type}_combined'])
        )
    
    # Add similarity scores to dataframe
    for response_type in ['zero_shot', 'few_shot', 'cot']:
        df[f'similarity_{response_type}_title'] = similarities['title'][response_type]
        df[f'similarity_{response_type}_body'] = similarities['body'][response_type]
        df[f'similarity_{response_type}_combined'] = similarities['combined'][response_type]
    
    # Save embeddings with updated paths
    np.save('Data/llama-3.2/title_embeddings_st_minilm.npy', title_embeddings)
    np.save('Data/llama-3.2/body_embeddings_st_minilm.npy', body_embeddings)
    np.save('Data/llama-3.2/title_body_embeddings_st_minilm.npy', title_body_embeddings)
    
    for response_type, embeddings in response_embeddings.items():
        np.save(f'Data/llama-3.2/{response_type}_embeddings_st_minilm.npy', embeddings)
    
    return df, similarities

def analyze_similarities(similarities):
    """
    Analyze and visualize similarity distributions for title, body, and combined responses
    """
    categories = {
        'Very High': (0.8, 1.0),
        'High': (0.6, 0.8),
        'Moderate': (0.4, 0.6),
        'Low': (0.2, 0.4),
        'Very Low': (0.0, 0.2)
    }
    
    results = {
        'title': {},
        'body': {},
        'combined': {}
    }
    
    # Create figures for each type of response
    for response_category in ['title', 'body', 'combined']:
        fig, axes = plt.subplots(1, 3, figsize=(15, 5))
        fig.suptitle(f'Similarity Analysis for {response_category.capitalize()} Responses')
        
        # Plot distributions and calculate statistics for each LLM type
        for idx, response_type in enumerate(['zero_shot', 'few_shot', 'cot']):
            scores = similarities[response_category][response_type]
            
            # Calculate statistics
            stats = {
                'mean': np.mean(scores),
                'median': np.median(scores),
                'std': np.std(scores),
                'min': np.min(scores),
                'max': np.max(scores)
            }
            
            # Calculate distribution across categories
            distribution = {}
            for category, (low, high) in categories.items():
                count = np.sum((scores >= low) & (scores < high))
                percentage = (count / len(scores)) * 100
                distribution[category] = percentage
                
            results[response_category][response_type] = {
                'statistics': stats,
                'distribution': distribution
            }
            
            # Plot histogram
            ax = axes[idx]
            sns.histplot(scores, bins=30, ax=ax)
            ax.set_title(f'{response_type.replace("_", " ").title()}')
            ax.set_xlabel('Similarity Score')
            ax.set_ylabel('Count')
            
            # Add category boundaries
            for category, (low, high) in categories.items():
                if low > 0:  # Don't plot the lowest boundary
                    ax.axvline(x=low, color='r', linestyle='--', alpha=0.3)
        
        plt.tight_layout()
        # Updated path for saving plots
        plt.savefig(f'Data/llama-3.2/similarity_distributions_{response_category}_st_minilm.png')
        plt.close()
    
    # Print analysis
    for response_category in ['title', 'body', 'combined']:
        print(f"\n{response_category.upper()} Response Analysis:")
        for response_type, result in results[response_category].items():
            print(f"\n{response_type.upper()}:")
            print("\nBasic Statistics:")
            for metric, value in result['statistics'].items():
                print(f"{metric}: {value:.4f}")
            
            print("\nDistribution across categories:")
            for category, percentage in result['distribution'].items():
                print(f"{category}: {percentage:.2f}%")
    
    return results

# Usage example:
if __name__ == "__main__":
    # Read the combined CSV file with updated path
    df = pd.read_csv('Data/llama-3.2/llm_responses_combined.csv')
    
    # Generate embeddings and calculate similarities
    df_with_similarities, similarities = create_embeddings_and_analyze(df)
    
    # Analyze and visualize results
    analysis_results = analyze_similarities(similarities)
    
    # Save updated dataframe with similarities using new path
    df_with_similarities.to_csv('Data/llama-3.2/llm_responses_with_similarities_st_minilm.csv', index=False)

Generating embeddings for original content...


Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]


Generating embeddings for LLM responses...


Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]


TITLE Response Analysis:

ZERO_SHOT:

Basic Statistics:
mean: 0.2106
median: 0.1645
std: 0.2167
min: -0.1702
max: 0.9559

Distribution across categories:
Very High: 1.40%
High: 4.90%
Moderate: 9.79%
Low: 23.08%
Very Low: 46.85%

FEW_SHOT:

Basic Statistics:
mean: 0.0436
median: 0.0371
std: 0.1158
min: -0.1474
max: 0.6026

Distribution across categories:
Very High: 0.00%
High: 0.70%
Moderate: 1.40%
Low: 4.90%
Very Low: 53.85%

COT:

Basic Statistics:
mean: 0.2195
median: 0.1624
std: 0.2191
min: -0.1978
max: 0.9854

Distribution across categories:
Very High: 1.40%
High: 5.59%
Moderate: 13.29%
Low: 22.38%
Very Low: 46.15%

BODY Response Analysis:

ZERO_SHOT:

Basic Statistics:
mean: 0.2753
median: 0.2574
std: 0.2205
min: -0.0921
max: 0.8441

Distribution across categories:
Very High: 0.70%
High: 8.39%
Moderate: 22.38%
Low: 25.17%
Very Low: 34.97%

FEW_SHOT:

Basic Statistics:
mean: 0.0554
median: 0.0444
std: 0.1113
min: -0.1731
max: 0.4849

Distribution across categories:
Very High: 0.00

In [5]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import matplotlib.pyplot as plt
import seaborn as sns

def create_embeddings_and_analyze(df, model_name='OrlikB/KartonBERT-USE-base-v1'):
    """
    Create embeddings using KartonBERT and analyze similarities with normalized embeddings
    """
    # Initialize the model
    model = SentenceTransformer(model_name)
    
    # Generate embeddings for original content
    print("Generating embeddings for original content...")
    title_embeddings = model.encode(df['Title'].tolist(), normalize_embeddings=True, show_progress_bar=True)
    body_embeddings = model.encode(df['Body'].tolist(), normalize_embeddings=True, show_progress_bar=True)
    title_body_embeddings = model.encode(df['Title_Body'].tolist(), normalize_embeddings=True, show_progress_bar=True)
    
    # Generate embeddings for LLM responses
    print("\nGenerating embeddings for LLM responses...")
    response_embeddings = {
        # Title responses
        'zero_shot_title': model.encode(df['llm_zero_shot_title'].tolist(), normalize_embeddings=True, show_progress_bar=True),
        'few_shot_title': model.encode(df['llm_few_shot_title'].tolist(), normalize_embeddings=True, show_progress_bar=True),
        'cot_title': model.encode(df['llm_cot_title'].tolist(), normalize_embeddings=True, show_progress_bar=True),
        
        # Body responses
        'zero_shot_body': model.encode(df['llm_zero_shot_body'].tolist(), normalize_embeddings=True, show_progress_bar=True),
        'few_shot_body': model.encode(df['llm_few_shot_body'].tolist(), normalize_embeddings=True, show_progress_bar=True),
        'cot_body': model.encode(df['llm_cot_body'].tolist(), normalize_embeddings=True, show_progress_bar=True),
        
        # Combined responses
        'zero_shot_combined': model.encode(df['llm_zero_shot_combined'].tolist(), normalize_embeddings=True, show_progress_bar=True),
        'few_shot_combined': model.encode(df['llm_few_shot_combined'].tolist(), normalize_embeddings=True, show_progress_bar=True),
        'cot_combined': model.encode(df['llm_cot_combined'].tolist(), normalize_embeddings=True, show_progress_bar=True)
    }
    
    # Calculate similarities using dot product
    similarities = {
        'title': {},
        'body': {},
        'combined': {}
    }
    
    # Calculate title similarities
    for response_type in ['zero_shot', 'few_shot', 'cot']:
        # Using dot product for normalized vectors
        similarities['title'][response_type] = np.sum(
            title_embeddings * response_embeddings[f'{response_type}_title'], axis=1
        )
    
    # Calculate body similarities
    for response_type in ['zero_shot', 'few_shot', 'cot']:
        similarities['body'][response_type] = np.sum(
            body_embeddings * response_embeddings[f'{response_type}_body'], axis=1
        )
    
    # Calculate combined similarities
    for response_type in ['zero_shot', 'few_shot', 'cot']:
        similarities['combined'][response_type] = np.sum(
            title_body_embeddings * response_embeddings[f'{response_type}_combined'], axis=1
        )
    
    # Add similarity scores to dataframe
    for response_type in ['zero_shot', 'few_shot', 'cot']:
        df[f'similarity_{response_type}_title'] = similarities['title'][response_type]
        df[f'similarity_{response_type}_body'] = similarities['body'][response_type]
        df[f'similarity_{response_type}_combined'] = similarities['combined'][response_type]
    
    # Save embeddings with updated paths
    np.save('Data/llama-3.2/title_embeddings_kartonbert.npy', title_embeddings)
    np.save('Data/llama-3.2/body_embeddings_kartonbert.npy', body_embeddings)
    np.save('Data/llama-3.2/title_body_embeddings_kartonbert.npy', title_body_embeddings)
    
    for response_type, embeddings in response_embeddings.items():
        np.save(f'Data/llama-3.2/{response_type}_embeddings_kartonbert.npy', embeddings)
    
    return df, similarities

def analyze_similarities(similarities):
    """
    Analyze and visualize similarity distributions for title, body, and combined responses
    """
    categories = {
        'Very High': (0.8, 1.0),
        'High': (0.6, 0.8),
        'Moderate': (0.4, 0.6),
        'Low': (0.2, 0.4),
        'Very Low': (0.0, 0.2)
    }
    
    results = {
        'title': {},
        'body': {},
        'combined': {}
    }
    
    # Create figures for each type of response
    for response_category in ['title', 'body', 'combined']:
        fig, axes = plt.subplots(1, 3, figsize=(15, 5))
        fig.suptitle(f'Similarity Analysis for {response_category.capitalize()} Responses (KartonBERT)')
        
        # Plot distributions and calculate statistics for each LLM type
        for idx, response_type in enumerate(['zero_shot', 'few_shot', 'cot']):
            scores = similarities[response_category][response_type]
            
            # Calculate statistics
            stats = {
                'mean': np.mean(scores),
                'median': np.median(scores),
                'std': np.std(scores),
                'min': np.min(scores),
                'max': np.max(scores)
            }
            
            # Calculate distribution across categories
            distribution = {}
            for category, (low, high) in categories.items():
                count = np.sum((scores >= low) & (scores < high))
                percentage = (count / len(scores)) * 100
                distribution[category] = percentage
                
            results[response_category][response_type] = {
                'statistics': stats,
                'distribution': distribution
            }
            
            # Plot histogram
            ax = axes[idx]
            sns.histplot(scores, bins=30, ax=ax)
            ax.set_title(f'{response_type.replace("_", " ").title()}')
            ax.set_xlabel('Similarity Score')
            ax.set_ylabel('Count')
            
            # Add category boundaries
            for category, (low, high) in categories.items():
                if low > 0:  # Don't plot the lowest boundary
                    ax.axvline(x=low, color='r', linestyle='--', alpha=0.3)
        
        plt.tight_layout()
        plt.savefig(f'Data/llama-3.2/similarity_distributions_{response_category}_kartonbert.png')
        plt.close()
    
    # Print analysis
    for response_category in ['title', 'body', 'combined']:
        print(f"\n{response_category.upper()} Response Analysis (KartonBERT):")
        for response_type, result in results[response_category].items():
            print(f"\n{response_type.upper()}:")
            print("\nBasic Statistics:")
            for metric, value in result['statistics'].items():
                print(f"{metric}: {value:.4f}")
            
            print("\nDistribution across categories:")
            for category, percentage in result['distribution'].items():
                print(f"{category}: {percentage:.2f}%")
    
    return results

# Usage example:
if __name__ == "__main__":
    # Read the combined CSV file with updated path
    df = pd.read_csv('Data/llama-3.2/llm_responses_combined.csv')
    
    # Generate embeddings and calculate similarities
    df_with_similarities, similarities = create_embeddings_and_analyze(df)
    
    # Analyze and visualize results
    analysis_results = analyze_similarities(similarities)
    
    # Save updated dataframe with similarities using new path
    df_with_similarities.to_csv('Data/llama-3.2/llm_responses_with_similarities_kartonbert.csv', index=False)

Generating embeddings for original content...


Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]


Generating embeddings for LLM responses...


Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]


TITLE Response Analysis (KartonBERT):

ZERO_SHOT:

Basic Statistics:
mean: 0.4826
median: 0.4599
std: 0.1451
min: 0.2217
max: 0.9529

Distribution across categories:
Very High: 3.50%
High: 16.78%
Moderate: 46.85%
Low: 32.87%
Very Low: 0.00%

FEW_SHOT:

Basic Statistics:
mean: 0.3909
median: 0.3729
std: 0.0986
min: 0.2035
max: 0.8134

Distribution across categories:
Very High: 0.70%
High: 3.50%
Moderate: 37.06%
Low: 58.74%
Very Low: 0.00%

COT:

Basic Statistics:
mean: 0.4788
median: 0.4556
std: 0.1485
min: 0.2276
max: 0.9752

Distribution across categories:
Very High: 2.80%
High: 16.08%
Moderate: 45.45%
Low: 35.66%
Very Low: 0.00%

BODY Response Analysis (KartonBERT):

ZERO_SHOT:

Basic Statistics:
mean: 0.6003
median: 0.5901
std: 0.1229
min: 0.2488
max: 0.9212

Distribution across categories:
Very High: 2.80%
High: 45.45%
Moderate: 47.55%
Low: 4.20%
Very Low: 0.00%

FEW_SHOT:

Basic Statistics:
mean: 0.4575
median: 0.4596
std: 0.0822
min: 0.2203
max: 0.6455

Distribution across categ