# Final Data Analysis

## Imports and Setup

In [12]:
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from collections import Counter

pd.set_option('display.max_rows', None)  # Show all rows
pd.set_option('display.max_columns', None)  # Show all columns

In [22]:
df_relevant = pd.read_excel('results/final/test.xlsx', sheet_name='relevant_papers')
df_contamination = pd.read_excel('results/final/test.xlsx', sheet_name='papers_about_contamination')
df_all_papers = pd.read_csv('results/ICSE_all_papers.csv')

print("Relevant Shape:", df_relevant.shape)
print("Papers About Contamination Shape:", df_contamination.shape)
print("All Papers Shape:", df_all_papers.shape)

Relevant Shape: (173, 27)
Papers About Contamination Shape: (3, 45)
All Papers Shape: (692, 11)


In [44]:
df_contamination

Unnamed: 0,reviewer,relevant,year,title,authors,url,abstract,artifact_available,artifact_reusable,artifact_functional,ai,ignore_task,task,non_llm_approaches,ignore_budget_fair,ignore_open_commercial,models_open_closed,ignore_num_models,num_models,ignore_model_names,model_families,model_scale,model_size_free_text,model_sizes_reported,ignore_model_versions,ignore_model_config,model_config,dataset_type,ignore_programming_language,programming_language,ignore_num_datasets,ignore_dataset_names,ignore_size_dataset,ignore_size_dataset_free_text,ignore_training_same_as_eval,ignore_data_version,ignore_cost,cost,cost_free_text,ignore_artefact_manual,artefact_manual,ignore_artefact_license,contamination,contamination_free_text,Unnamed: 44
0,DW,SPECIAL,2025,Decoding Secret Memorization in Code LLMs Thro...,"Nie, Yuqing, Wang, Chong, Wang, Kailong, Xu, G...",https://doi.org/10.1109/ICSE55347.2025.00229,Code Large Language Models (LLMs) have demonst...,False,False,False,True,secret memorization analysis,code memorisation detection,False,,OPEN SOURCE,open,5,5,,,MEDIUM,"StableCode-3B,3B;CodeGen2.5-7B-multi,7B;DeepSe...",TODAVE,1.0,"In the online decoding, the beam search size B...",inference,code,"HTML, Java, JavaScript, PHP, Python",HTML; Java; JavaScript; Python,,,,,,,GPU Usage,gpu,All the experiments are conducted on a server ...,https://github.com/jiangsha97/DESEC,1.0,False,1.0,Training Data Decontamination. Data cleaning s...,This paper is about contamination/memorisation
1,DW,SPECIAL,2024,Traces of Memorisation in Large Language Model...,"Al-Kaswan, Ali, Izadi, Maliheh, van Deursen, Arie",https://doi.org/10.1145/3597503.3639133,Large language models have gained significant ...,False,False,False,True,(Memorisation of) Code Generation,code memorisation detection,False,-,OPEN SOURCE,open,,SKIP,,,,,TODAVE,,,-,,,,,,,,,,,,,,,,,,This paper is investigating contamiation/leaka...
2,MK,SPECIAL,2024,Unveiling Memorization in Code Models,"Yang, Zhou, Zhao, Zhipeng, Wang, Chenyu, Shi, ...",https://doi.org/10.1145/3597503.3639074,"The availability of large-scale datasets, adva...",False,False,False,True,code generation (evaluation of the memorizatio...,code generation,False,-,OPEN SOURCE (see table 1 evaluation with CodeP...,open,"4 (Table 1: CodeParrot, CodeParrot-small, Poly...",6,,CodeParrot; PolyCoder; GPT-NEO; InCoder; StarC...,LARGE,CodeParrot is a GPT-2 model with 1.5 billion p...,TODAVE,1.0,True,inference,"code, documentation (text), configuration (tex...",Python,Python,2.0,"OpenAI’s HumanEval benchmark [21], https://hug...",LARGE,"after data cleaning, the processed\ndataset is...",1.0,0.0,"Environment with GPU, running time, temperature",time; gpu,... we run them an NVIDIA GeForce\nA5000 GPU w...,True,1.0,TRUE (MIT License),1.0,"To mitigate this threat, we choose a state-of-...",


### Creating Combined and Non-Relevant Dataframes

In [46]:
# Relevant Paper List
relevant_papers = df_relevant['title'].tolist()

# Remove relevant papers from all papers to create final non-relevant set
df_non_relevant = df_all_papers[~df_all_papers['title'].isin(relevant_papers)].copy()

# Make sure all non-relevant papers are marked as such
df_non_relevant['relevant'] = False

# Re-order columns to match relevant dataframe
common_columns = ['reviewer', 'relevant', 'year', 'title', 'authors', 'url', 'abstract', 'artifact_available', 'artifact_reusable', 'artifact_functional', 'ai']
df_non_relevant = df_non_relevant[common_columns]

# Add extra columns: 'task', 'non_llm_approaches', 'models_open_closed', 'num_models', 'model_families', 'model_scale', 'model_size_free_text', 'model_sizes_reported', 'model_config', 'dataset_type', 'programming_language', 'cost', 'cost_free_text', 'artefact_manual', 'contamination', 'contamination_free_text'
extra_columns = ['task', 'non_llm_approaches', 'models_open_closed', 'num_models', 'model_families', 'model_scale', 'model_size_free_text', 'model_sizes_reported', 'model_config', 'dataset_type', 'programming_language', 'cost', 'cost_free_text', 'artefact_manual', 'contamination', 'contamination_free_text']
for col in extra_columns:
    df_non_relevant[col] = None

# Contamination Paper List
contamination_papers = df_contamination['title'].tolist()

# Clean up contamination dataframe
df_contamination = df_contamination[common_columns + extra_columns]

# Remove contamination papers from non-relevant set to avoid duplication
df_non_relevant = df_non_relevant[~df_non_relevant['title'].isin(contamination_papers)].copy()

# Combine relevant, non-relevant and contamination dataframes
df_combined = pd.concat([df_relevant, df_non_relevant], ignore_index=True)
df_combined = pd.concat([df_combined, df_contamination], ignore_index=True)

In [55]:
print("Combined Shape:", df_combined.shape)
print("Num Unique Papers:", df_combined['title'].nunique())
print("Num Relevant Papers in Combined DF:", df_combined[df_combined['relevant'] == True].shape[0])
print("Num Unique Papers in Relevant DF:", df_relevant['title'].nunique()) # Should match number above
print("Num Contamination Papers in Combined DF:", df_combined[df_combined['relevant'] == "SPECIAL"].shape[0])
print("Num Unique Papers in Contamination DF:", df_contamination['title'].nunique()) # Should match number above

Combined Shape: (692, 27)
Num Unique Papers: 692
Num Relevant Papers in Combined DF: 173
Num Unique Papers in Relevant DF: 173
Num Contamination Papers in Combined DF: 3
Num Unique Papers in Contamination DF: 3


# Analysis

We now have access to four dataframes for analysis:

- `df_combined`: contains all papers and final columns from our spreadsheet (non-relevant papers just have None values in the fields we completed for the relevant papers)
- `df_relevant`: contains all relevant papers as rows and the final columns we intend to use for analysis
- `df_contamination`: contains the three papers explicitly about investigating memorisation/contamination
- `df_non_relevant`: contains all non-relevant papers. Our finals columns are present but all filled with None values as we didn't perform data extraction for these papers.

## Initial Analysis

### Number of Papers at Each Stage

In [57]:
# Total numbers
print("Total Papers in Combined DF:", df_combined.shape[0])
print("\nTotal Papers from AI Keywords:")
print(df_combined['ai'].value_counts())

print("\nTotal Papers from Relevant Keywords:")
print(df_combined['relevant'].value_counts())

Total Papers in Combined DF: 692

Total Papers from AI Keywords:
ai
False    395
True     297
Name: count, dtype: int64

Total Papers from Relevant Keywords:
relevant
False      516
True       173
SPECIAL      3
Name: count, dtype: int64


In [59]:
# Per Year
print("Papers Per Year:")
print(df_combined.groupby('year')['title'].nunique())

print("\nAI Papers Per Year:")
print(df_combined[df_combined['ai'] == True].groupby('year')['title'].nunique())

print("\nRelevant Papers Per Year:")
print(df_combined[df_combined['relevant'] == True].groupby('year')['title'].nunique())

Papers Per Year:
year
2023    210
2024    236
2025    246
Name: title, dtype: int64

AI Papers Per Year:
year
2023     55
2024     96
2025    146
Name: title, dtype: int64

Relevant Papers Per Year:
year
2023    30
2024    53
2025    90
Name: title, dtype: int64


### Numbers of Relevant Papers (Papers with LLM-based Empirical Studies)

In [61]:
print("Overall Relevant vs Non-Relevant Counts:")
print(df_combined["relevant"].value_counts())

print("\n\nRelevant Papers by Year:")
print(df_relevant["year"].value_counts())

Overall Relevant vs Non-Relevant Counts:
relevant
False      516
True       173
SPECIAL      3
Name: count, dtype: int64


Relevant Papers by Year:
year
2025    90
2024    53
2023    30
Name: count, dtype: int64


### Geo-location of SE Research Institutions

# RQ1 - What Tasks are being tackled in LLM SE studies, and are they fairly evaluated against existing non-LLM techniques?

DF Columns to use:
- 'task' (short-text)
- 'non_llm_approaches' (bool)
- 'dataset_type' (short-text)
- programming_language (short-text)

In [None]:
df_combined['task_list'] = df_combined['task'].apply(
    lambda x: [task.strip() for task in str(x).split(';')] if pd.notna(x) else []
)

print(df_combined['task_list'].explode().value_counts())

task_list
code generation                                   26
test generation                                   24
program repair                                    24
vulnerability detection                           15
bug detection                                     10
code translation                                   6
code completion                                    6
log parsing                                        5
code search                                        5
type detection                                     5
clone detection                                    5
fuzzing                                            4
bug reproduction                                   3
code summarisation                                 3
code summarization                                 3
formal verification                                2
test repair                                        2
code reasoning                                     2
traceability link recovery          

In [74]:
print(df_combined['non_llm_approaches'].value_counts())

non_llm_approaches
True                                                                                                                                                                                                              91
False                                                                                                                                                                                                             59
FALSE (says TransCoder is not llm but a language modeling appraoch using transformers and seq2seq)                                                                                                                 1
TRUE (FTLR, COMET, VSM, LSI,ArDoCode)                                                                                                                                                                              1
TRUE (AEL, Drain)                                                                                                                

# RQ2 - What models are being used?

DF Columns to use:
- 'models_open_closed' (open/closed/both)
- 'num_models' (int)
- 'model_families' (list of short text)
- 'model_sizes_reported' (NA/none/some/full - currently unfinished)
- 'model_scale' (currently unfinished)

# RQ3 - How well do authors tackle the problem of data leakage/contamination?

DF Columns to use:
- contamination (bool)
- contamination_free_text (free text)

# RQ4 - How replicable are LLM-based studies?

DF Columns to use:
- 'model_config' (short-text list)
- 'artifact_available' (bool)
- 'artifact_reusable' (bool)
- 'artifact_functional' (bool)
- 'artefact_manual' (bool)

## ACM Badge Artifact Availability - Relevant vs. Non-Relevant

In [67]:
# Calculate proportions in the combined dataset
total_relevant = df_relevant.shape[0]
total_non_relevant = df_non_relevant.shape[0]

relevant_with_artifact = df_relevant['artifact_available'].sum()
non_relevant_with_artifact = df_non_relevant['artifact_available'].sum()

prop_relevant = relevant_with_artifact / total_relevant
prop_non_relevant = non_relevant_with_artifact / total_non_relevant

print(f"Proportion of relevant papers with artifact available: {prop_relevant:.2%} ({relevant_with_artifact}/{total_relevant})")
print(f"Proportion of other papers with artifact available: {prop_non_relevant:.2%} ({non_relevant_with_artifact}/{total_non_relevant})")

Proportion of relevant papers with artifact available: 19.08% (33/173)
Proportion of other papers with artifact available: 41.28% (213/516)


In [68]:
# Proportion of relevant papers with artifacts available per year
total_relevant_2023 = df_relevant[df_relevant['year'] == 2023].shape[0]
artifact_avail_2023 = df_relevant[(df_relevant['year'] == 2023) & (df_relevant['artifact_available'])].shape[0]
prop_relevant_2023 = artifact_avail_2023 / total_relevant_2023
print(f"2023 - Proportion of relevant papers with artifact available: {prop_relevant_2023:.2%} ({artifact_avail_2023}/{total_relevant_2023})")

total_relevant_2024 = df_relevant[df_relevant['year'] == 2024].shape[0]
artifact_avail_2024 = df_relevant[(df_relevant['year'] == 2024) & (df_relevant['artifact_available'])].shape[0]
prop_relevant_2024 = artifact_avail_2024 / total_relevant_2024
print(f"2024 - Proportion of relevant papers with artifact available: {prop_relevant_2024:.2%} ({artifact_avail_2024}/{total_relevant_2024})")

total_relevant_2025 = df_relevant[df_relevant['year'] == 2025].shape[0]
artifact_avail_2025 = df_relevant[(df_relevant['year'] == 2025) & (df_relevant['artifact_available'])].shape[0]
prop_relevant_2025 = artifact_avail_2025 / total_relevant_2025
print(f"2025 - Proportion of relevant papers with artifact available: {prop_relevant_2025:.2%} ({artifact_avail_2025}/{total_relevant_2025})")

2023 - Proportion of relevant papers with artifact available: 16.67% (5/30)
2024 - Proportion of relevant papers with artifact available: 16.98% (9/53)
2025 - Proportion of relevant papers with artifact available: 21.11% (19/90)


## Manual Artefact Availability

In [71]:
print(df_relevant["artefact_manual"].value_counts())
print("10 instances of dead links in the 173 relevant papers")

artefact_manual
True     139
False     24
DEAD      10
Name: count, dtype: int64
10 instances of dead links in the 173 relevant papers


# RQ5 - How sustainable is LLM-based SE research?

DF Columns to use:
- 'cost' (short-text list)
- 'cost_free_text' (free text)

# (IGNORE - OLD)

In [2]:
df_complete = pd.read_excel('results/final/complete.xlsx', sheet_name='AI_ICSE2025_papers')

df_relevant = df_complete[df_complete['relevant'] == True]

print("Columns")
df_complete.columns

Columns


Index(['reviewer', 'relevant', 'year', 'title', 'authors', 'url', 'abstract',
       'artifact_available', 'artifact_reusable', 'artifact_functional', 'ai',
       'Task being solved  (CODE GENERATION;APR;TESTING;COMMENT GENERATION/...)',
       'AA Task Consolidated',
       'Do the papers evaluate with approaches other than LLM? (TRUE/FALSE)',
       'Is the budget fair (on hold)',
       'Do they evaluate with open source/commercial/open weight models? (COMMERCIAL/OPEN SOURCE/OPEN WEIGHT)',
       'DW Open/Closed Consolidated ',
       'How many models overall, benchmarks and approach? (integer)',
       'DW Num. Models Conslidated', 'Name of models',
       'DW Scale of models (LARGE >50B/MEDIUM>1B/SMALL<1B)',
       'Size of models from the paper (free text from the paper)',
       'DW Model Sizes Reported (none, partial, full, N/A for commercial)',
       'Do they report on the model versions? (TRUE/FALSE)',
       'Configuration of models (TRUE/FALSE, or parameters)',
       'AA

In [3]:
df_complete['relevant'].value_counts()

relevant
True       164
False       50
SPECIAL      3
Name: count, dtype: int64

In [4]:
# Number of relevant papers per year
df_relevant['year'].value_counts()

year
2025.0    89
2024.0    53
2023.0    22
Name: count, dtype: int64

In [5]:
print(df_relevant["AA Task Consolidated"].unique())

['vulnerability detection' 'test generation' 'test repair'
 'code refinement' 'code generation' 'comment repair' 'program repair'
 'SO post editing' 'code reasoning; code translation' 'code reasoning'
 'code translation' 'bug detection' 'security patch detection'
 'model completion' 'commit message generation'
 'traceability link recovery' 'log parsing' 'AI generated code detection'
 'code generation; code summarization' 'program analysis'
 'regression testing' 'code search' 'fuzzing'
 'inconsistency prediction in decentralised apps' 'formal verification'
 'bug report comprehension' 'UI design repair' 'type detection'
 'smart contract auditing' 'configuration validation'
 'detection of code design issues' 'code understanding; code generation'
 'code completion' 'clone detection; vulnerability detection'
 'code adaptation' 'fault localisation; bug detection'
 'code summarization' 'code optimisation' 'security injection'
 'machine-generated code detection' 'root cause analysis'
 'code su

In [6]:
# Split the 'AA Task Consolidated' column into lists
df_relevant['AA Task List'] = df_relevant['AA Task Consolidated'].apply(
    lambda x: [task.strip() for task in str(x).split(';')] if pd.notna(x) else []
)

df_relevant[['AA Task Consolidated', 'AA Task List']].head(50)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_relevant['AA Task List'] = df_relevant['AA Task Consolidated'].apply(


Unnamed: 0,AA Task Consolidated,AA Task List
0,vulnerability detection,[vulnerability detection]
1,test generation,[test generation]
2,test repair,[test repair]
4,code refinement,[code refinement]
6,test generation,[test generation]
7,code generation,[code generation]
9,test generation,[test generation]
10,code generation,[code generation]
11,test generation,[test generation]
13,code generation,[code generation]


In [7]:
from collections import Counter

all_tasks = []
for task_list in df_relevant['AA Task List']:
    all_tasks.extend(task_list)

task_counts = Counter(all_tasks)

task_frequency_df = pd.DataFrame(task_counts.items(), columns=['Task', 'Paper_Count'])
task_frequency_df = task_frequency_df.sort_values('Paper_Count', ascending=False, ignore_index=True)

print(f"Total unique tasks: {len(task_frequency_df)}")
task_frequency_df

Total unique tasks: 68


Unnamed: 0,Task,Paper_Count
0,code generation,25
1,test generation,24
2,program repair,22
3,vulnerability detection,13
4,bug detection,8
5,fuzzing,5
6,log parsing,5
7,type detection,5
8,code completion,5
9,code translation,5


In [None]:
# Looking at how many papers cover multiple tasks
df_relevant['num_tasks'] = df_relevant['AA Task List'].apply(len)

print("Distribution of number of tasks per paper:")
print(df_relevant['num_tasks'].value_counts().sort_index())
print(f"\nPapers covering multiple tasks: {(df_relevant['num_tasks'] > 1).sum()}")
print(f"Percentage covering multiple tasks: {(df_relevant['num_tasks'] > 1).mean()*100:.1f}%")

print(df_relevant[df_relevant['num_tasks'] > 5][['title', 'AA Task List']])

Distribution of number of tasks per paper:
num_tasks
1     149
2       9
3       5
13      1
Name: count, dtype: int64

Papers covering multiple tasks: 15
Percentage covering multiple tasks: 9.1%
                                                 title  \
205  An Empirical Comparison of Pre-Trained Models ...   

                                          AA Task List  
205  [bug detection, clone detection, type detectio...  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_relevant['num_tasks'] = df_relevant['AA Task List'].apply(len)


# Artifact Badges vs Our Counts

## Badges from ACM Data Collection

In [9]:
# Overall
print(df_relevant['artifact_available'].value_counts())
print()
print(df_relevant['artifact_reusable'].value_counts())
print()
print(df_relevant['artifact_functional'].value_counts())

artifact_available
False    132
True      32
Name: count, dtype: int64

artifact_reusable
False    143
True      21
Name: count, dtype: int64

artifact_functional
False    150
True      14
Name: count, dtype: int64


In [10]:
# Per year
print("Artifact Available (ACM Badges)")
print("2023")
print(df_relevant[df_relevant['year'] == 2023]['artifact_available'].value_counts())
print("\n2024")
print(df_relevant[df_relevant['year'] == 2024]['artifact_available'].value_counts())
print("\n2025")
print(df_relevant[df_relevant['year'] == 2025]['artifact_available'].value_counts())


print("\n\nArtifact Reusable (ACM Badges)")
print("2023")
print(df_relevant[df_relevant['year'] == 2023]['artifact_reusable'].value_counts())
print("\n2024")
print(df_relevant[df_relevant['year'] == 2024]['artifact_reusable'].value_counts())
print("\n2025")
print(df_relevant[df_relevant['year'] == 2025]['artifact_reusable'].value_counts())


print("\n\nArtifact Functional (ACM Badges)")
print("2023")
print(df_relevant[df_relevant['year'] == 2023]['artifact_functional'].value_counts())
print("\n2024")
print(df_relevant[df_relevant['year'] == 2024]['artifact_functional'].value_counts())
print("\n2025")
print(df_relevant[df_relevant['year'] == 2025]['artifact_functional'].value_counts())

Artifact Available (ACM Badges)
2023
artifact_available
False    17
True      5
Name: count, dtype: int64

2024
artifact_available
False    45
True      8
Name: count, dtype: int64

2025
artifact_available
False    70
True     19
Name: count, dtype: int64


Artifact Reusable (ACM Badges)
2023
artifact_reusable
False    20
True      2
Name: count, dtype: int64

2024
artifact_reusable
False    45
True      8
Name: count, dtype: int64

2025
artifact_reusable
False    78
True     11
Name: count, dtype: int64


Artifact Functional (ACM Badges)
2023
artifact_functional
False    21
True      1
Name: count, dtype: int64

2024
artifact_functional
False    53
Name: count, dtype: int64

2025
artifact_functional
False    76
True     13
Name: count, dtype: int64


## Artifact Availability from Manual Checking

In [11]:
# Overall
print(df_relevant['MK Artefact available consolidated'].value_counts())

MK Artefact available consolidated
True     129
False     24
DEAD      11
Name: count, dtype: int64


In [12]:
# Per year
print("2023 Artifacts (Manual Checking)")
print(df_relevant[df_relevant['year'] == 2023]['MK Artefact available consolidated'].value_counts())
print("\n2024 Artifacts (Manual Checking)")
print(df_relevant[df_relevant['year'] == 2024]['MK Artefact available consolidated'].value_counts())
print("\n2025 Artifacts (Manual Checking)")
print(df_relevant[df_relevant['year'] == 2025]['MK Artefact available consolidated'].value_counts())

2023 Artifacts (Manual Checking)
MK Artefact available consolidated
True     15
False     4
DEAD      3
Name: count, dtype: int64

2024 Artifacts (Manual Checking)
MK Artefact available consolidated
True     40
False     9
DEAD      4
Name: count, dtype: int64

2025 Artifacts (Manual Checking)
MK Artefact available consolidated
True     74
False    11
DEAD      4
Name: count, dtype: int64
