In [10]:
import pandas as pd

# Read the original combined file for Id mapping
original_df = pd.read_csv('Data/llm_responses_combined.csv')

# Create a mapping dictionary from Id to other columns
id_mapping = original_df.set_index('Id')[['Title', 'Body', 'ImageURLs']].to_dict('index')

# Column mapping for renaming
column_mapping = {
    'ID': 'Id',
    'TITLE': 'llm_title_response',
    'BODY': 'llm_body_response'
}

# Read and update each GPT-4 file
# Zero-shot file
zero_shot_df = pd.read_csv('Data/GPT-4o/GPT4o_incontext_res.csv')
print(f"Processing zero-shot file with {len(zero_shot_df)} rows")
zero_shot_df = zero_shot_df.rename(columns=column_mapping)
for column in ['Title', 'Body', 'ImageURLs']:
    zero_shot_df[column] = zero_shot_df['Id'].map(lambda x: id_mapping[x][column])
zero_shot_df.to_csv('Data/GPT-4o/GPT4o_incontext_res.csv', index=False)

# Few-shot file
few_shot_df = pd.read_csv('Data/GPT-4o/GPT4o_fewshot_res.csv')
print(f"Processing few-shot file with {len(few_shot_df)} rows")
few_shot_df = few_shot_df.rename(columns=column_mapping)
for column in ['Title', 'Body', 'ImageURLs']:
    few_shot_df[column] = few_shot_df['Id'].map(lambda x: id_mapping[x][column])
few_shot_df.to_csv('Data/GPT-4o/GPT4o_fewshot_res.csv', index=False)

# Chain of thoughts file
chain_of_thoughts_df = pd.read_csv('Data/GPT-4o/GPT4o_cot_res.csv')
print(f"Processing chain of thoughts file with {len(chain_of_thoughts_df)} rows")
chain_of_thoughts_df = chain_of_thoughts_df.rename(columns=column_mapping)
for column in ['Title', 'Body', 'ImageURLs']:
    chain_of_thoughts_df[column] = chain_of_thoughts_df['Id'].map(lambda x: id_mapping[x][column])
chain_of_thoughts_df.to_csv('Data/GPT-4o/GPT4o_cot_res.csv', index=False)

# Print verification information
print("\nVerification of updated files:")
for name, df in [("Zero-shot", zero_shot_df), 
                 ("Few-shot", few_shot_df), 
                 ("Chain of thoughts", chain_of_thoughts_df)]:
    print(f"\n{name} file:")
    print("Columns:", list(df.columns))
    print("Number of rows:", len(df))
    print("\nSample of first row:")
    for col in df.columns:
        val = str(df[col].iloc[0])
        print(f"{col}:", val[:100] + "..." if len(val) > 100 else val)

Processing zero-shot file with 143 rows
Processing few-shot file with 143 rows
Processing chain of thoughts file with 143 rows

Verification of updated files:

Zero-shot file:
Columns: ['Id', 'llm_title_response', 'llm_body_response', 'Title', 'Body', 'ImageURLs']
Number of rows: 143

Sample of first row:
Id: 79041624
llm_title_response: Issue with Asynchronous Behavior in Node.js Function
llm_body_response: I'm working on a Node.js project and encountering an issue with the `convertGrammar` function in my ...
Title: How to use switch branch button only seems to work for single repo in multi-root workspace
Body: I am used to looking at the source control checkout button in vscode as a quick indication of what b...
ImageURLs: ['https://code.visualstudio.com/assets/docs/editor/multi-root-workspaces/hero.png']

Few-shot file:
Columns: ['Id', 'llm_title_response', 'llm_body_response', 'Title', 'Body', 'ImageURLs']
Number of rows: 143

Sample of first row:
Id: 79041624
llm_title_response: H

In [11]:
import pandas as pd

# Read the CSV files
zero_shot_df = pd.read_csv('Data/GPT-4o/GPT4o_incontext_res.csv')
few_shot_df = pd.read_csv('Data/GPT-4o/GPT4o_fewshot_res.csv')
chain_of_thoughts_df = pd.read_csv('Data/GPT-4o/GPT4o_cot_res.csv')

# Create the combined text column for original title and body
zero_shot_df['Title_Body'] = '<<' + zero_shot_df['Title'] + '>>\n<<' + zero_shot_df['Body'] + '>>'

# Create combined response columns for each LLM
zero_shot_df['llm_response_combined'] = '<<' + zero_shot_df['llm_title_response'] + '>>\n<<' + zero_shot_df['llm_body_response'] + '>>'
few_shot_df['llm_response_combined'] = '<<' + few_shot_df['llm_title_response'] + '>>\n<<' + few_shot_df['llm_body_response'] + '>>'
chain_of_thoughts_df['llm_response_combined'] = '<<' + chain_of_thoughts_df['llm_title_response'] + '>>\n<<' + chain_of_thoughts_df['llm_body_response'] + '>>'

# Create the final dataframe with all columns
combined_df = pd.DataFrame({
    'Id': zero_shot_df['Id'],
    'Title': zero_shot_df['Title'],
    'Body': zero_shot_df['Body'],
    'Title_Body': zero_shot_df['Title_Body'],
    'ImageURLs': zero_shot_df['ImageURLs'],
    # Zero-shot responses
    'llm_zero_shot_title': zero_shot_df['llm_title_response'],
    'llm_zero_shot_body': zero_shot_df['llm_body_response'],
    'llm_zero_shot_combined': zero_shot_df['llm_response_combined'],
    # Few-shot responses
    'llm_few_shot_title': few_shot_df['llm_title_response'],
    'llm_few_shot_body': few_shot_df['llm_body_response'],
    'llm_few_shot_combined': few_shot_df['llm_response_combined'],
    # Chain of thoughts responses
    'llm_cot_title': chain_of_thoughts_df['llm_title_response'],
    'llm_cot_body': chain_of_thoughts_df['llm_body_response'],
    'llm_cot_combined': chain_of_thoughts_df['llm_response_combined']
})

# Save the combined dataframe to a new CSV file in the GPT-4o folder
combined_df.to_csv('Data/GPT-4o/GPT-4o_responses_combined.csv', index=False)

# Print basic information about the combined file
print(f"Combined file created with {len(combined_df)} rows")
print("\nColumns in the combined file:")
for col in combined_df.columns:
    print(f"- {col}")

# Print a sample row to verify the structure
print("\nSample of first row:")
for col in combined_df.columns:
    print(f"\n{col}:")
    print(str(combined_df[col].iloc[0])[:100] + "..." if len(str(combined_df[col].iloc[0])) > 100 else str(combined_df[col].iloc[0]))

Combined file created with 143 rows

Columns in the combined file:
- Id
- Title
- Body
- Title_Body
- ImageURLs
- llm_zero_shot_title
- llm_zero_shot_body
- llm_zero_shot_combined
- llm_few_shot_title
- llm_few_shot_body
- llm_few_shot_combined
- llm_cot_title
- llm_cot_body
- llm_cot_combined

Sample of first row:

Id:
79041624

Title:
How to use switch branch button only seems to work for single repo in multi-root workspace

Body:
I am used to looking at the source control checkout button in vscode as a quick indication of what b...

Title_Body:
<<How to use switch branch button only seems to work for single repo in multi-root workspace>>
<<I a...

ImageURLs:
['https://code.visualstudio.com/assets/docs/editor/multi-root-workspaces/hero.png']

llm_zero_shot_title:
Issue with Asynchronous Behavior in Node.js Function

llm_zero_shot_body:
I'm working on a Node.js project and encountering an issue with the `convertGrammar` function in my ...

llm_zero_shot_combined:
<<Issue with Asynchro

In [12]:
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 143 entries, 0 to 142
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Id                      143 non-null    int64 
 1   Title                   143 non-null    object
 2   Body                    143 non-null    object
 3   Title_Body              143 non-null    object
 4   ImageURLs               143 non-null    object
 5   llm_zero_shot_title     143 non-null    object
 6   llm_zero_shot_body      142 non-null    object
 7   llm_zero_shot_combined  142 non-null    object
 8   llm_few_shot_title      143 non-null    object
 9   llm_few_shot_body       143 non-null    object
 10  llm_few_shot_combined   143 non-null    object
 11  llm_cot_title           143 non-null    object
 12  llm_cot_body            143 non-null    object
 13  llm_cot_combined        143 non-null    object
dtypes: int64(1), object(13)
memory usage: 15.8+ KB


In [13]:
combined_df.head()

Unnamed: 0,Id,Title,Body,Title_Body,ImageURLs,llm_zero_shot_title,llm_zero_shot_body,llm_zero_shot_combined,llm_few_shot_title,llm_few_shot_body,llm_few_shot_combined,llm_cot_title,llm_cot_body,llm_cot_combined
0,79041624,How to use switch branch button only seems to ...,I am used to looking at the source control che...,<<How to use switch branch button only seems t...,['https://code.visualstudio.com/assets/docs/ed...,Issue with Asynchronous Behavior in Node.js Fu...,I'm working on a Node.js project and encounter...,<<Issue with Asynchronous Behavior in Node.js ...,How to Properly Use 'require' in Node.js for M...,I'm working on a Node.js project and using the...,<<How to Properly Use 'require' in Node.js for...,How to Properly Use Promises in Node.js for Fi...,I'm working on a Node.js project using Visual ...,<<How to Properly Use Promises in Node.js for ...
1,79078823,LangGraph Error - Invalid Tool Calls when usin...,I'm following the LangGraph tutorial (from Lan...,<<LangGraph Error - Invalid Tool Calls when us...,['https://i.sstatic.net/lGCQGu49.png'],JSONDecodeError When Using Local Tool in Node....,I'm encountering an issue with a Node.js scrip...,<<JSONDecodeError When Using Local Tool in Nod...,JSONDecodeError when using local tool in Node.js,I'm trying to perform a simple arithmetic oper...,<<JSONDecodeError when using local tool in Nod...,How to Resolve JSONDecodeError in Node.js When...,I'm trying to perform a simple arithmetic oper...,<<How to Resolve JSONDecodeError in Node.js Wh...
2,79120816,Why Is Intellesense & Colour Formatting Not Wo...,Intellisense and text colour formatting is not...,<<Why Is Intellesense & Colour Formatting Not ...,['https://i.sstatic.net/9SW9hTKN.png'],Nullable DateTime Property Causing Issues in A...,I'm working on an ASP.NET project and have def...,<<Nullable DateTime Property Causing Issues in...,Nullable DateTime Property in C# Model Class,I am working on a C# project and have a model ...,<<Nullable DateTime Property in C# Model Class...,How to handle nullable DateTime properties in ...,I'm working on a C# project and have a model c...,<<How to handle nullable DateTime properties i...
3,79041900,Http 429 is not returnable in status code from...,"[Route(""{route}"")]\n[HttpPost]\npublic async T...",<<Http 429 is not returnable in status code fr...,['https://i.sstatic.net/jtsyQ5SF.png'],"Why am I receiving a 429 ""Too Many Requests"" e...",I'm working on an API integration using a REST...,"<<Why am I receiving a 429 ""Too Many Requests""...",Handling 429 Too Many Requests Error Despite 2...,I'm encountering an issue where my HTTP reques...,<<Handling 429 Too Many Requests Error Despite...,"Why am I receiving a 429 ""Too Many Requests"" e...","I'm using Postman to test an API, and I'm enco...","<<Why am I receiving a 429 ""Too Many Requests""..."
4,79081114,PhpStorm is marking variables given by Control...,PhpStorm is marking variables given by Control...,<<PhpStorm is marking variables given by Contr...,['https://i.sstatic.net/AFTN318J.png'],Yii2 Password Input Field Not Rendering Correctly,I'm working on a Yii2 project and trying to re...,<<Yii2 Password Input Field Not Rendering Corr...,PHP Error: Undefined Variable in Form Field,I'm encountering an 'undefined variable' error...,<<PHP Error: Undefined Variable in Form Field>...,"How to resolve ""undefined variable"" error in P...",I'm working on a PHP project using a form help...,"<<How to resolve ""undefined variable"" error in..."


In [14]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import seaborn as sns

def create_embeddings_and_analyze(df, model_name='sentence-transformers/all-mpnet-base-v1'):
    """
    Create embeddings using SentenceTransformer and analyze similarities for both
    separate title/body responses and combined responses
    """
    # Initialize the model
    model = SentenceTransformer(model_name)
    
    # Generate embeddings for original content
    print("Generating embeddings for original content...")
    title_embeddings = model.encode(df['Title'].tolist(), show_progress_bar=True)
    body_embeddings = model.encode(df['Body'].tolist(), show_progress_bar=True)
    title_body_embeddings = model.encode(df['Title_Body'].tolist(), show_progress_bar=True)
    
    # Generate embeddings for LLM responses
    print("\nGenerating embeddings for LLM responses...")
    response_embeddings = {
        # Title responses
        'zero_shot_title': model.encode(df['llm_zero_shot_title'].tolist(), show_progress_bar=True),
        'few_shot_title': model.encode(df['llm_few_shot_title'].tolist(), show_progress_bar=True),
        'cot_title': model.encode(df['llm_cot_title'].tolist(), show_progress_bar=True),
        
        # Body responses
        'zero_shot_body': model.encode(df['llm_zero_shot_body'].tolist(), show_progress_bar=True),
        'few_shot_body': model.encode(df['llm_few_shot_body'].tolist(), show_progress_bar=True),
        'cot_body': model.encode(df['llm_cot_body'].tolist(), show_progress_bar=True),
        
        # Combined responses
        'zero_shot_combined': model.encode(df['llm_zero_shot_combined'].tolist(), show_progress_bar=True),
        'few_shot_combined': model.encode(df['llm_few_shot_combined'].tolist(), show_progress_bar=True),
        'cot_combined': model.encode(df['llm_cot_combined'].tolist(), show_progress_bar=True)
    }
    
    # Calculate similarities
    similarities = {
        'title': {},
        'body': {},
        'combined': {}
    }
    
    # Calculate title similarities
    for response_type in ['zero_shot', 'few_shot', 'cot']:
        similarities['title'][response_type] = np.diagonal(
            cosine_similarity(title_embeddings, response_embeddings[f'{response_type}_title'])
        )
    
    # Calculate body similarities
    for response_type in ['zero_shot', 'few_shot', 'cot']:
        similarities['body'][response_type] = np.diagonal(
            cosine_similarity(body_embeddings, response_embeddings[f'{response_type}_body'])
        )
    
    # Calculate combined similarities
    for response_type in ['zero_shot', 'few_shot', 'cot']:
        similarities['combined'][response_type] = np.diagonal(
            cosine_similarity(title_body_embeddings, response_embeddings[f'{response_type}_combined'])
        )
    
    # Add similarity scores to dataframe
    for response_type in ['zero_shot', 'few_shot', 'cot']:
        df[f'similarity_{response_type}_title'] = similarities['title'][response_type]
        df[f'similarity_{response_type}_body'] = similarities['body'][response_type]
        df[f'similarity_{response_type}_combined'] = similarities['combined'][response_type]
    
    # Save embeddings
    np.save('Data/title_embeddings_st.npy', title_embeddings)
    np.save('Data/body_embeddings_st.npy', body_embeddings)
    np.save('Data/title_body_embeddings_st.npy', title_body_embeddings)
    
    for response_type, embeddings in response_embeddings.items():
        np.save(f'Data/{response_type}_embeddings_st.npy', embeddings)
    
    return df, similarities

def analyze_similarities(similarities):
    """
    Analyze and visualize similarity distributions for title, body, and combined responses
    """
    categories = {
        'Very High': (0.8, 1.0),
        'High': (0.6, 0.8),
        'Moderate': (0.4, 0.6),
        'Low': (0.2, 0.4),
        'Very Low': (0.0, 0.2)
    }
    
    results = {
        'title': {},
        'body': {},
        'combined': {}
    }
    
    # Create figures for each type of response
    for response_category in ['title', 'body', 'combined']:
        fig, axes = plt.subplots(1, 3, figsize=(15, 5))
        fig.suptitle(f'Similarity Analysis for {response_category.capitalize()} Responses')
        
        # Plot distributions and calculate statistics for each LLM type
        for idx, response_type in enumerate(['zero_shot', 'few_shot', 'cot']):
            scores = similarities[response_category][response_type]
            
            # Calculate statistics
            stats = {
                'mean': np.mean(scores),
                'median': np.median(scores),
                'std': np.std(scores),
                'min': np.min(scores),
                'max': np.max(scores)
            }
            
            # Calculate distribution across categories
            distribution = {}
            for category, (low, high) in categories.items():
                count = np.sum((scores >= low) & (scores < high))
                percentage = (count / len(scores)) * 100
                distribution[category] = percentage
                
            results[response_category][response_type] = {
                'statistics': stats,
                'distribution': distribution
            }
            
            # Plot histogram
            ax = axes[idx]
            sns.histplot(scores, bins=30, ax=ax)
            ax.set_title(f'{response_type.replace("_", " ").title()}')
            ax.set_xlabel('Similarity Score')
            ax.set_ylabel('Count')
            
            # Add category boundaries
            for category, (low, high) in categories.items():
                if low > 0:  # Don't plot the lowest boundary
                    ax.axvline(x=low, color='r', linestyle='--', alpha=0.3)
        
        plt.tight_layout()
        plt.savefig(f'Data/similarity_distributions_{response_category}_st.png')
        plt.close()
    
    # Print analysis
    for response_category in ['title', 'body', 'combined']:
        print(f"\n{response_category.upper()} Response Analysis:")
        for response_type, result in results[response_category].items():
            print(f"\n{response_type.upper()}:")
            print("\nBasic Statistics:")
            for metric, value in result['statistics'].items():
                print(f"{metric}: {value:.4f}")
            
            print("\nDistribution across categories:")
            for category, percentage in result['distribution'].items():
                print(f"{category}: {percentage:.2f}%")
    
    return results

# Usage example:
if __name__ == "__main__":
    # Read the combined CSV file
    df = pd.read_csv('Data/GPT-4o/GPT-4o_responses_combined.csv')
    
    # Generate embeddings and calculate similarities
    df_with_similarities, similarities = create_embeddings_and_analyze(df)
    
    # Analyze and visualize results
    analysis_results = analyze_similarities(similarities)
    
    # Save updated dataframe with similarities
    df_with_similarities.to_csv('Data/GPT-4o/GPT-4o_responses_with_similarities_st.csv', index=False)



Generating embeddings for original content...


Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]


Generating embeddings for LLM responses...


Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]


TITLE Response Analysis:

ZERO_SHOT:

Basic Statistics:
mean: 0.4350
median: 0.4398
std: 0.2173
min: -0.0140
max: 0.8970

Distribution across categories:
Very High: 2.80%
High: 23.78%
Moderate: 31.47%
Low: 25.17%
Very Low: 16.08%

FEW_SHOT:

Basic Statistics:
mean: 0.4248
median: 0.4154
std: 0.2217
min: -0.0189
max: 0.9739

Distribution across categories:
Very High: 5.59%
High: 16.78%
Moderate: 30.77%
Low: 28.67%
Very Low: 17.48%

COT:

Basic Statistics:
mean: 0.4370
median: 0.4300
std: 0.2252
min: -0.0189
max: 0.9138

Distribution across categories:
Very High: 4.90%
High: 23.08%
Moderate: 29.37%
Low: 23.78%
Very Low: 18.18%

BODY Response Analysis:

ZERO_SHOT:

Basic Statistics:
mean: 0.5046
median: 0.5201
std: 0.2097
min: -0.0391
max: 0.8924

Distribution across categories:
Very High: 4.90%
High: 32.87%
Moderate: 31.47%
Low: 22.38%
Very Low: 7.69%

FEW_SHOT:

Basic Statistics:
mean: 0.4724
median: 0.4682
std: 0.2043
min: -0.0369
max: 0.8933

Distribution across categories:
Very High

In [15]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import seaborn as sns

def create_embeddings_and_analyze(df, model_name='sentence-transformers/all-MiniLM-L6-v2'):
    """
    Create embeddings using SentenceTransformer and analyze similarities for both
    separate title/body responses and combined responses
    """
    # Initialize the model
    model = SentenceTransformer(model_name)
    
    # Generate embeddings for original content
    print("Generating embeddings for original content...")
    title_embeddings = model.encode(df['Title'].tolist(), show_progress_bar=True)
    body_embeddings = model.encode(df['Body'].tolist(), show_progress_bar=True)
    title_body_embeddings = model.encode(df['Title_Body'].tolist(), show_progress_bar=True)
    
    # Generate embeddings for LLM responses
    print("\nGenerating embeddings for LLM responses...")
    response_embeddings = {
        # Title responses
        'zero_shot_title': model.encode(df['llm_zero_shot_title'].tolist(), show_progress_bar=True),
        'few_shot_title': model.encode(df['llm_few_shot_title'].tolist(), show_progress_bar=True),
        'cot_title': model.encode(df['llm_cot_title'].tolist(), show_progress_bar=True),
        
        # Body responses
        'zero_shot_body': model.encode(df['llm_zero_shot_body'].tolist(), show_progress_bar=True),
        'few_shot_body': model.encode(df['llm_few_shot_body'].tolist(), show_progress_bar=True),
        'cot_body': model.encode(df['llm_cot_body'].tolist(), show_progress_bar=True),
        
        # Combined responses
        'zero_shot_combined': model.encode(df['llm_zero_shot_combined'].tolist(), show_progress_bar=True),
        'few_shot_combined': model.encode(df['llm_few_shot_combined'].tolist(), show_progress_bar=True),
        'cot_combined': model.encode(df['llm_cot_combined'].tolist(), show_progress_bar=True)
    }
    
    # Calculate similarities
    similarities = {
        'title': {},
        'body': {},
        'combined': {}
    }
    
    # Calculate title similarities
    for response_type in ['zero_shot', 'few_shot', 'cot']:
        similarities['title'][response_type] = np.diagonal(
            cosine_similarity(title_embeddings, response_embeddings[f'{response_type}_title'])
        )
    
    # Calculate body similarities
    for response_type in ['zero_shot', 'few_shot', 'cot']:
        similarities['body'][response_type] = np.diagonal(
            cosine_similarity(body_embeddings, response_embeddings[f'{response_type}_body'])
        )
    
    # Calculate combined similarities
    for response_type in ['zero_shot', 'few_shot', 'cot']:
        similarities['combined'][response_type] = np.diagonal(
            cosine_similarity(title_body_embeddings, response_embeddings[f'{response_type}_combined'])
        )
    
    # Add similarity scores to dataframe
    for response_type in ['zero_shot', 'few_shot', 'cot']:
        df[f'similarity_{response_type}_title'] = similarities['title'][response_type]
        df[f'similarity_{response_type}_body'] = similarities['body'][response_type]
        df[f'similarity_{response_type}_combined'] = similarities['combined'][response_type]
    
    # Save embeddings
    np.save('Data/title_embeddings_st.npy', title_embeddings)
    np.save('Data/body_embeddings_st.npy', body_embeddings)
    np.save('Data/title_body_embeddings_st.npy', title_body_embeddings)
    
    for response_type, embeddings in response_embeddings.items():
        np.save(f'Data/{response_type}_embeddings_st.npy', embeddings)
    
    return df, similarities

def analyze_similarities(similarities):
    """
    Analyze and visualize similarity distributions for title, body, and combined responses
    """
    categories = {
        'Very High': (0.8, 1.0),
        'High': (0.6, 0.8),
        'Moderate': (0.4, 0.6),
        'Low': (0.2, 0.4),
        'Very Low': (0.0, 0.2)
    }
    
    results = {
        'title': {},
        'body': {},
        'combined': {}
    }
    
    # Create figures for each type of response
    for response_category in ['title', 'body', 'combined']:
        fig, axes = plt.subplots(1, 3, figsize=(15, 5))
        fig.suptitle(f'Similarity Analysis for {response_category.capitalize()} Responses')
        
        # Plot distributions and calculate statistics for each LLM type
        for idx, response_type in enumerate(['zero_shot', 'few_shot', 'cot']):
            scores = similarities[response_category][response_type]
            
            # Calculate statistics
            stats = {
                'mean': np.mean(scores),
                'median': np.median(scores),
                'std': np.std(scores),
                'min': np.min(scores),
                'max': np.max(scores)
            }
            
            # Calculate distribution across categories
            distribution = {}
            for category, (low, high) in categories.items():
                count = np.sum((scores >= low) & (scores < high))
                percentage = (count / len(scores)) * 100
                distribution[category] = percentage
                
            results[response_category][response_type] = {
                'statistics': stats,
                'distribution': distribution
            }
            
            # Plot histogram
            ax = axes[idx]
            sns.histplot(scores, bins=30, ax=ax)
            ax.set_title(f'{response_type.replace("_", " ").title()}')
            ax.set_xlabel('Similarity Score')
            ax.set_ylabel('Count')
            
            # Add category boundaries
            for category, (low, high) in categories.items():
                if low > 0:  # Don't plot the lowest boundary
                    ax.axvline(x=low, color='r', linestyle='--', alpha=0.3)
        
        plt.tight_layout()
        plt.savefig(f'Data/similarity_distributions_{response_category}_st.png')
        plt.close()
    
    # Print analysis
    for response_category in ['title', 'body', 'combined']:
        print(f"\n{response_category.upper()} Response Analysis:")
        for response_type, result in results[response_category].items():
            print(f"\n{response_type.upper()}:")
            print("\nBasic Statistics:")
            for metric, value in result['statistics'].items():
                print(f"{metric}: {value:.4f}")
            
            print("\nDistribution across categories:")
            for category, percentage in result['distribution'].items():
                print(f"{category}: {percentage:.2f}%")
    
    return results

# Usage example:
if __name__ == "__main__":
    # Read the combined CSV file
    df = pd.read_csv('Data/GPT-4o/GPT-4o_responses_combined.csv')
    
    # Generate embeddings and calculate similarities
    df_with_similarities, similarities = create_embeddings_and_analyze(df)
    
    # Analyze and visualize results
    analysis_results = analyze_similarities(similarities)
    
    # Save updated dataframe with similarities
    df_with_similarities.to_csv('Data/GPT-4o/GPT-4o_responses_with_similarities_st.csv', index=False)



Generating embeddings for original content...


Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]


Generating embeddings for LLM responses...


Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]


TITLE Response Analysis:

ZERO_SHOT:

Basic Statistics:
mean: 0.4561
median: 0.4492
std: 0.2215
min: -0.0331
max: 0.9435

Distribution across categories:
Very High: 6.29%
High: 26.57%
Moderate: 25.17%
Low: 27.27%
Very Low: 13.99%

FEW_SHOT:

Basic Statistics:
mean: 0.4453
median: 0.4525
std: 0.2297
min: -0.0264
max: 0.9809

Distribution across categories:
Very High: 6.29%
High: 19.58%
Moderate: 34.27%
Low: 21.68%
Very Low: 16.78%

COT:

Basic Statistics:
mean: 0.4539
median: 0.4557
std: 0.2376
min: -0.0296
max: 0.9612

Distribution across categories:
Very High: 9.09%
High: 20.98%
Moderate: 27.97%
Low: 24.48%
Very Low: 16.78%

BODY Response Analysis:

ZERO_SHOT:

Basic Statistics:
mean: 0.5215
median: 0.5378
std: 0.1935
min: -0.0131
max: 0.9503

Distribution across categories:
Very High: 7.69%
High: 30.07%
Moderate: 37.06%
Low: 18.88%
Very Low: 5.59%

FEW_SHOT:

Basic Statistics:
mean: 0.4907
median: 0.4961
std: 0.1933
min: -0.0568
max: 0.9164

Distribution across categories:
Very High

In [16]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import matplotlib.pyplot as plt
import seaborn as sns

def create_embeddings_and_analyze(df, model_name='OrlikB/KartonBERT-USE-base-v1'):
    """
    Create embeddings using KartonBERT and analyze similarities with normalized embeddings
    """
    # Initialize the model
    model = SentenceTransformer(model_name)
    
    # Generate embeddings for original content
    print("Generating embeddings for original content...")
    title_embeddings = model.encode(df['Title'].tolist(), normalize_embeddings=True, show_progress_bar=True)
    body_embeddings = model.encode(df['Body'].tolist(), normalize_embeddings=True, show_progress_bar=True)
    title_body_embeddings = model.encode(df['Title_Body'].tolist(), normalize_embeddings=True, show_progress_bar=True)
    
    # Generate embeddings for LLM responses
    print("\nGenerating embeddings for LLM responses...")
    response_embeddings = {
        # Title responses
        'zero_shot_title': model.encode(df['llm_zero_shot_title'].tolist(), normalize_embeddings=True, show_progress_bar=True),
        'few_shot_title': model.encode(df['llm_few_shot_title'].tolist(), normalize_embeddings=True, show_progress_bar=True),
        'cot_title': model.encode(df['llm_cot_title'].tolist(), normalize_embeddings=True, show_progress_bar=True),
        
        # Body responses
        'zero_shot_body': model.encode(df['llm_zero_shot_body'].tolist(), normalize_embeddings=True, show_progress_bar=True),
        'few_shot_body': model.encode(df['llm_few_shot_body'].tolist(), normalize_embeddings=True, show_progress_bar=True),
        'cot_body': model.encode(df['llm_cot_body'].tolist(), normalize_embeddings=True, show_progress_bar=True),
        
        # Combined responses
        'zero_shot_combined': model.encode(df['llm_zero_shot_combined'].tolist(), normalize_embeddings=True, show_progress_bar=True),
        'few_shot_combined': model.encode(df['llm_few_shot_combined'].tolist(), normalize_embeddings=True, show_progress_bar=True),
        'cot_combined': model.encode(df['llm_cot_combined'].tolist(), normalize_embeddings=True, show_progress_bar=True)
    }
    
    # Calculate similarities using dot product
    similarities = {
        'title': {},
        'body': {},
        'combined': {}
    }
    
    # Calculate title similarities
    for response_type in ['zero_shot', 'few_shot', 'cot']:
        # Using dot product for normalized vectors
        similarities['title'][response_type] = np.sum(
            title_embeddings * response_embeddings[f'{response_type}_title'], axis=1
        )
    
    # Calculate body similarities
    for response_type in ['zero_shot', 'few_shot', 'cot']:
        similarities['body'][response_type] = np.sum(
            body_embeddings * response_embeddings[f'{response_type}_body'], axis=1
        )
    
    # Calculate combined similarities
    for response_type in ['zero_shot', 'few_shot', 'cot']:
        similarities['combined'][response_type] = np.sum(
            title_body_embeddings * response_embeddings[f'{response_type}_combined'], axis=1
        )
    
    # Add similarity scores to dataframe
    for response_type in ['zero_shot', 'few_shot', 'cot']:
        df[f'similarity_{response_type}_title'] = similarities['title'][response_type]
        df[f'similarity_{response_type}_body'] = similarities['body'][response_type]
        df[f'similarity_{response_type}_combined'] = similarities['combined'][response_type]
    
    # Save embeddings
    np.save('Data/title_embeddings_kartonbert.npy', title_embeddings)
    np.save('Data/body_embeddings_kartonbert.npy', body_embeddings)
    np.save('Data/title_body_embeddings_kartonbert.npy', title_body_embeddings)
    
    for response_type, embeddings in response_embeddings.items():
        np.save(f'Data/{response_type}_embeddings_kartonbert.npy', embeddings)
    
    return df, similarities

def analyze_similarities(similarities):
    """
    Analyze and visualize similarity distributions for title, body, and combined responses
    """
    categories = {
        'Very High': (0.8, 1.0),
        'High': (0.6, 0.8),
        'Moderate': (0.4, 0.6),
        'Low': (0.2, 0.4),
        'Very Low': (0.0, 0.2)
    }
    
    results = {
        'title': {},
        'body': {},
        'combined': {}
    }
    
    # Create figures for each type of response
    for response_category in ['title', 'body', 'combined']:
        fig, axes = plt.subplots(1, 3, figsize=(15, 5))
        fig.suptitle(f'Similarity Analysis for {response_category.capitalize()} Responses (KartonBERT)')
        
        # Plot distributions and calculate statistics for each LLM type
        for idx, response_type in enumerate(['zero_shot', 'few_shot', 'cot']):
            scores = similarities[response_category][response_type]
            
            # Calculate statistics
            stats = {
                'mean': np.mean(scores),
                'median': np.median(scores),
                'std': np.std(scores),
                'min': np.min(scores),
                'max': np.max(scores)
            }
            
            # Calculate distribution across categories
            distribution = {}
            for category, (low, high) in categories.items():
                count = np.sum((scores >= low) & (scores < high))
                percentage = (count / len(scores)) * 100
                distribution[category] = percentage
                
            results[response_category][response_type] = {
                'statistics': stats,
                'distribution': distribution
            }
            
            # Plot histogram
            ax = axes[idx]
            sns.histplot(scores, bins=30, ax=ax)
            ax.set_title(f'{response_type.replace("_", " ").title()}')
            ax.set_xlabel('Similarity Score')
            ax.set_ylabel('Count')
            
            # Add category boundaries
            for category, (low, high) in categories.items():
                if low > 0:  # Don't plot the lowest boundary
                    ax.axvline(x=low, color='r', linestyle='--', alpha=0.3)
        
        plt.tight_layout()
        plt.savefig(f'Data/similarity_distributions_{response_category}_kartonbert.png')
        plt.close()
    
    # Print analysis
    for response_category in ['title', 'body', 'combined']:
        print(f"\n{response_category.upper()} Response Analysis (KartonBERT):")
        for response_type, result in results[response_category].items():
            print(f"\n{response_type.upper()}:")
            print("\nBasic Statistics:")
            for metric, value in result['statistics'].items():
                print(f"{metric}: {value:.4f}")
            
            print("\nDistribution across categories:")
            for category, percentage in result['distribution'].items():
                print(f"{category}: {percentage:.2f}%")
    
    return results

# Usage example:
if __name__ == "__main__":
    # Read the combined CSV file
    df = pd.read_csv('Data/GPT-4o/GPT-4o_responses_combined.csv')
    
    # Generate embeddings and calculate similarities
    df_with_similarities, similarities = create_embeddings_and_analyze(df)
    
    # Analyze and visualize results
    analysis_results = analyze_similarities(similarities)
    
    # Save updated dataframe with similarities
    df_with_similarities.to_csv('Data/GPT-4o/GPT-4o_responses_with_similarities_st.csv', index=False)



Generating embeddings for original content...


Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]


Generating embeddings for LLM responses...


Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]


TITLE Response Analysis (KartonBERT):

ZERO_SHOT:

Basic Statistics:
mean: 0.5971
median: 0.5963
std: 0.1495
min: 0.2466
max: 0.9380

Distribution across categories:
Very High: 8.39%
High: 41.26%
Moderate: 39.86%
Low: 10.49%
Very Low: 0.00%

FEW_SHOT:

Basic Statistics:
mean: 0.5879
median: 0.5895
std: 0.1542
min: 0.2504
max: 0.9715

Distribution across categories:
Very High: 9.79%
High: 36.36%
Moderate: 40.56%
Low: 13.29%
Very Low: 0.00%

COT:

Basic Statistics:
mean: 0.5909
median: 0.6016
std: 0.1585
min: 0.2224
max: 0.9297

Distribution across categories:
Very High: 8.39%
High: 43.36%
Moderate: 37.06%
Low: 11.19%
Very Low: 0.00%

BODY Response Analysis (KartonBERT):

ZERO_SHOT:

Basic Statistics:
mean: 0.7142
median: 0.7265
std: 0.1182
min: 0.2825
max: 0.9454

Distribution across categories:
Very High: 21.68%
High: 61.54%
Moderate: 15.38%
Low: 1.40%
Very Low: 0.00%

FEW_SHOT:

Basic Statistics:
mean: 0.6974
median: 0.7024
std: 0.1097
min: 0.3284
max: 0.9280

Distribution across cat