# Empirical and Sustainability Aspects of Software Engineering Research in the Era of Large Language Models: A Reflection - Analysis Notebook

In [1]:
import re

import matplotlib.pyplot as plt
import pandas as pd

# Show complete tables in cell output instead of pandas' truncated preview.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

# Inputs: the 'relevant_papers' sheet of the final results workbook and the
# CSV listing all ICSE papers.
df_relevant = pd.read_excel(
    'results/final/final_results.xlsx', sheet_name='relevant_papers')
df_all_papers = pd.read_csv('results/ICSE_all_papers.csv')

# Columns kept from the full paper list ...
common_columns = ['reviewer', 'relevant', 'year', 'title', 'authors', 'url',
                  'abstract', 'artifact_available', 'artifact_reusable', 'artifact_functional', 'ai']
# ... and columns that are filled with None for the non-relevant papers.
extra_columns = ['task', 'non_llm_approaches', 'models_open_closed', 'num_models', 'model_families', 'model_scale', 'model_size_free_text', 'model_sizes_reported',
                 'model_config', 'dataset_type', 'programming_language', 'cost', 'cost_free_text', 'artefact_manual', 'contamination', 'contamination_free_text']

# A paper is treated as non-relevant when its title is absent from the
# relevant sheet (the match is on the exact title string).
relevant_papers = df_relevant['title'].tolist()
title_is_relevant = df_all_papers['title'].isin(relevant_papers)
df_non_relevant = df_all_papers[~title_is_relevant].copy()
df_non_relevant['relevant'] = False
df_non_relevant = df_non_relevant[common_columns].assign(
    **dict.fromkeys(extra_columns))

# One frame holding every paper: relevant rows first, non-relevant rows after.
df_combined = pd.concat([df_relevant, df_non_relevant], ignore_index=True)

# Number of Papers at Each Stage

In [2]:
# Size of the combined corpus, then the share of papers flagged in each
# boolean stage column.
n_combined = df_combined.shape[0]
print("Total Papers:", n_combined)

for stage_label, flag_column in (("Papers matching AI Keywords", 'ai'),
                                 ("Relevant Papers", 'relevant')):
    n_flagged = df_combined[flag_column].sum()
    print(
        f"Number of {stage_label}: {n_flagged}/{n_combined} ({(n_flagged/n_combined)*100:.2f}%)")

Total Papers: 692
Number of Papers matching AI Keywords: 304/692 (43.93%)
Number of Relevant Papers: 177/692 (25.58%)


In [3]:
# Per Year: distinct titles per publication year for all papers, for the
# AI-keyword matches, and for the relevant papers.
per_year_views = [
    ("Papers Per Year:", df_combined),
    ("\nAI Keyword Papers Per Year:", df_combined[df_combined['ai'] == True]),
    ("\nRelevant Papers Per Year:",
     df_combined[df_combined['relevant'] == True]),
]

for heading, papers in per_year_views:
    print(heading)
    print(papers.groupby('year')['title'].nunique())

Papers Per Year:
year
2023    210
2024    236
2025    246
Name: title, dtype: int64

AI Keyword Papers Per Year:
year
2023     59
2024     99
2025    146
Name: title, dtype: int64

Relevant Papers Per Year:
year
2023    32
2024    55
2025    90
Name: title, dtype: int64


# RQ1 - Which LLMs are used in SE research and how are they benchmarked?

## Open vs. Closed (Commercial) Models

In [4]:
# Number of relevant papers per `models_open_closed` label.
df_relevant['models_open_closed'].value_counts()

models_open_closed
open      71
both      65
closed    41
Name: count, dtype: int64

In [5]:
# A paper uses open models when it is labelled 'open' or 'both', and closed
# models when it is labelled 'closed' or 'both'. Explicit membership tests are
# used instead of negating the opposite label, so that a missing or unexpected
# label is not silently counted as both open and closed.
uses_open = df_relevant['models_open_closed'].isin(['open', 'both'])
uses_closed = df_relevant['models_open_closed'].isin(['closed', 'both'])

total = df_relevant.shape[0]
number_open = int(uses_open.sum())
number_closed = int(uses_closed.sum())

print(
    f"Open models in {number_open} out of {total} papers ({(number_open/total)*100:.1f}%)")
print(
    f"Closed models in {number_closed} out of {total} papers ({(number_closed/total)*100:.1f}%)")

Open models in 136 out of 177 papers (76.8%)
Closed models in 106 out of 177 papers (59.9%)


In [6]:
def open_vs_closed_per_year(year):
    """Print how many relevant papers of one year use open and/or closed models.

    Reads the notebook-level ``df_relevant`` frame and classifies each paper of
    ``year`` by its ``models_open_closed`` label ('open', 'closed' or 'both').
    Papers labelled 'both' count towards the open *and* the closed totals.
    Rows with a missing or unexpected label stay in the denominator but are
    not attributed to either group (negating the opposite label would count
    them as both open and closed).

    Args:
        year: Value matched against the ``year`` column.

    Returns:
        None. The statistics are printed.
    """
    labels = df_relevant.loc[df_relevant["year"] == year, 'models_open_closed']
    total_year = labels.shape[0]

    print(f"{year} Papers:")
    if total_year == 0:
        # No papers for this year: report it instead of dividing by zero.
        print("No relevant papers for this year.")
        return

    number_open_only_year = int((labels == 'open').sum())
    number_closed_only_year = int((labels == 'closed').sum())
    number_both_year = int((labels == 'both').sum())
    number_open_year = number_open_only_year + number_both_year
    number_closed_year = number_closed_only_year + number_both_year

    def describe(count):
        """Format ``count`` as 'N out of TOTAL papers (P%)' for this year."""
        return f"{count} out of {total_year} papers ({(count/total_year)*100:.1f}%)"

    print(f"Only open models in {describe(number_open_only_year)}")
    print(f"Only closed models in {describe(number_closed_only_year)}")
    print(f"Open models in {describe(number_open_year)}")
    print(f"Closed models in {describe(number_closed_year)}")
    print(f"Both model types in {describe(number_both_year)}")


# Yearly breakdowns in chronological order, separated by a blank line.
for position, report_year in enumerate((2023, 2024, 2025)):
    if position > 0:
        print()
    open_vs_closed_per_year(report_year)

2023 Papers:
Only open models in 22 out of 32 papers (68.8%)
Only closed models in 4 out of 32 papers (12.5%)
Open models in 28 out of 32 papers (87.5%)
Closed models in 10 out of 32 papers (31.2%)
Both model types in 6 out of 32 papers (18.8%)

2024 Papers:
Only open models in 25 out of 55 papers (45.5%)
Only closed models in 13 out of 55 papers (23.6%)
Open models in 42 out of 55 papers (76.4%)
Closed models in 30 out of 55 papers (54.5%)
Both model types in 17 out of 55 papers (30.9%)

2025 Papers:
Only open models in 24 out of 90 papers (26.7%)
Only closed models in 24 out of 90 papers (26.7%)
Open models in 66 out of 90 papers (73.3%)
Closed models in 66 out of 90 papers (73.3%)
Both model types in 42 out of 90 papers (46.7%)


## Model Families

In [7]:
def _split_model_families(cell_value):
    """Split one ';'-separated `model_families` cell into stripped names ([] when missing)."""
    if pd.isna(cell_value):
        return []
    return [family.strip() for family in str(cell_value).split(';')]


df_relevant['model_families_list'] = df_relevant['model_families'].apply(
    _split_model_families)

# NOTE(review): names are counted exactly as spelled, so variants such as
# 'UniXcoder' / 'UnixCoder' appear as separate entries in the counts below.
print("Overall - Number of Papers per Model Family:")
print(df_relevant['model_families_list'].explode().value_counts())

Overall - Number of Papers per Model Family:
model_families_list
GPT-4                           47
GPT-3.5                         44
CodeBERT                        34
CodeLlama                       26
CodeT5                          22
CodeGen                         19
StarCoder                       18
GraphCodeBERT                   18
Llama                           17
RoBERTa                         15
ChatGPT                         14
BERT                            13
DeepSeekCoder                   12
UniXcoder                       11
Codex                           10
InCoder                          9
T5                               8
Claude                           7
Gemini                           7
ChatGLM                          6
DeepSeek                         6
CodeQwen                         6
GPT-3                            6
UnixCoder                        6
PLBART                           6
CodeGPT                          5
CodeParrot               

In [8]:
def model_families_per_year(year):
    """Print how often each model family is listed by relevant papers of `year`.

    Reads the notebook-level ``df_relevant`` frame and its
    ``model_families_list`` column (one list of family names per paper).
    """
    published_in_year = df_relevant['year'] == year
    print(f"{year} - Number of Papers per Model Family:")
    families_in_year = df_relevant.loc[published_in_year, 'model_families_list']
    print(families_in_year.explode().value_counts())


# Model family counts for the papers published in 2023.
model_families_per_year(2023)

2023 - Number of Papers per Model Family:
model_families_list
CodeBERT            11
RoBERTa              8
BERT                 7
CodeT5               7
Codex                5
T5                   5
GraphCodeBERT        4
DistilBERT           3
PLBART               2
GPT-J                2
CodeGen              2
BART                 2
InCoder              2
GPT-Neo              2
XLNet                1
MiniLM               1
ELECTRA              1
ALBERT               1
RepresentThemAll     1
seBERT               1
Code-davinci         1
Curie                1
Davinci              1
T5-learning          1
CodeGPT              1
JavaBERT             1
DOBF                 1
CuBERT               1
ProphetNet-Code      1
SPT-Code             1
CoTexT               1
C-BERT               1
GPT-C                1
CugLM                1
TreeBERT             1
GPT-2                1
SynCoBERT            1
DeepDebug            1
UniXcoder            1
GPT-NeoX             1
PLBart            

In [9]:
# Model family counts for the papers published in 2024.
model_families_per_year(2024)

2024 - Number of Papers per Model Family:
model_families_list
CodeBERT           11
GPT-3.5             9
GPT-4               8
CodeT5              8
ChatGPT             8
CodeGen             8
GraphCodeBERT       7
UniXcoder           5
BERT                5
Codex               4
RoBERTa             4
InCoder             4
StarCoder           4
Llama               3
ChatGLM             3
UnixCoder           3
PLBART              3
T5                  3
GPT-3               3
GPT-2               3
text-davinci        3
PolyCoder           3
CodeGeeX            2
CodeParrot          2
UniLog              2
GPT-NEO             2
Copilot             2
Vicuna              2
Pythia              2
CodeGPT             2
Airboros            1
code-davinci        1
SantaCoder          1
WizardCoder         1
Sentence-BERT       1
GPTBigCode          1
seBERT              1
VulBERTa            1
PDBERT              1
ALBERT              1
KeyBERT             1
LSTM                1
TFix          

In [10]:
# Model family counts for the papers published in 2025.
model_families_per_year(2025)

2025 - Number of Papers per Model Family:
model_families_list
GPT-4                           39
GPT-3.5                         35
CodeLlama                       25
Llama                           14
StarCoder                       14
CodeBERT                        12
DeepSeekCoder                   12
CodeGen                          9
Claude                           7
Gemini                           7
CodeT5                           7
GraphCodeBERT                    7
ChatGPT                          6
CodeQwen                         6
DeepSeek                         6
UniXcoder                        5
UnixCoder                        3
LineVul                          3
WizardCoder                      3
ChatGLM                          3
Mistral                          3
Gemma                            3
RoBERTa                          3
InCoder                          3
Codestral                        2
Qwen                             2
CodeT5+                     

## Targeted Programming Languages

### Which Programming Languages are Evaluated

In [11]:
def _split_programming_languages(cell_value):
    """Split one ';'-separated `programming_language` cell into stripped names ([] when missing)."""
    if pd.isna(cell_value):
        return []
    return [language.strip() for language in str(cell_value).split(';')]


df_relevant["programming_languages_list"] = df_relevant['programming_language'].apply(
    _split_programming_languages)

# NOTE(review): names are counted exactly as spelled, so 'C#' and 'CSharp'
# appear as separate entries in the counts below.
print("Overall - Number of Papers per Programming Language:")
df_relevant["programming_languages_list"].explode().value_counts()

Overall - Number of Papers per Programming Language:


programming_languages_list
Java           67
Python         56
C              30
C++            23
JavaScript     13
PHP            10
Go              9
NM              8
Rust            7
Ruby            7
C#              5
Kotlin          3
SQL             3
R               2
TypeScript      2
Haskell         2
Objective-C     2
Scala           2
Swift           2
Prolog          1
Erlang          1
Solidity        1
Bash            1
CSharp          1
Perl            1
SCRATCH         1
HTML            1
Name: count, dtype: int64

In [12]:
def programming_languages_per_year(year):
    """Print how often each programming language is listed by relevant papers of `year`.

    Reads the notebook-level ``df_relevant`` frame and its
    ``programming_languages_list`` column (one list of names per paper).
    """
    published_in_year = df_relevant['year'] == year
    print(f"{year} - Number of Papers per Programming Language:")
    languages_in_year = df_relevant.loc[
        published_in_year, 'programming_languages_list']
    print(languages_in_year.explode().value_counts())


# Programming language counts for the papers published in 2023.
programming_languages_per_year(2023)

2023 - Number of Papers per Programming Language:
programming_languages_list
Java          14
Python         9
C              7
JavaScript     4
PHP            4
C++            2
NM             2
Ruby           2
Go             2
SCRATCH        1
C#             1
Name: count, dtype: int64


In [13]:
# Programming language counts for the papers published in 2024.
programming_languages_per_year(2024)

2024 - Number of Papers per Programming Language:
programming_languages_list
Java           22
Python         18
C              11
C++             9
Go              4
Ruby            3
JavaScript      3
PHP             3
Rust            3
C#              3
Kotlin          3
Scala           2
SQL             2
NM              1
CSharp          1
Solidity        1
Bash            1
Swift           1
Objective-C     1
Perl            1
R               1
TypeScript      1
Name: count, dtype: int64


In [14]:
# Programming language counts for the papers published in 2025.
programming_languages_per_year(2025)

2025 - Number of Papers per Programming Language:
programming_languages_list
Java           31
Python         29
C              12
C++            12
JavaScript      6
NM              5
Rust            4
Go              3
PHP             3
Ruby            2
Haskell         2
C#              1
TypeScript      1
R               1
Objective-C     1
Swift           1
Erlang          1
Prolog          1
SQL             1
HTML            1
Name: count, dtype: int64


### Number of Programming Languages Evaluated per Paper

In [15]:
print("Overall - Distribution of Number of Programming Languages per Paper:")
# Length of each paper's language list; papers without a recorded language
# have an empty list and therefore count as 0.
languages_per_paper = df_relevant["programming_languages_list"].apply(len)
print(languages_per_paper.value_counts())

covers_multiple_languages = languages_per_paper > 1
print(
    f"\nPapers covering multiple programming languages: {covers_multiple_languages.sum()}")
print(
    f"Percentage covering multiple programming languages: {covers_multiple_languages.mean()*100:.1f}%")

Overall - Distribution of Number of Programming Languages per Paper:
programming_languages_list
1     81
0     48
2     24
3     10
6      4
4      3
10     2
5      2
16     1
7      1
13     1
Name: count, dtype: int64

Papers covering multiple programming languages: 48
Percentage covering multiple programming languages: 27.1%


In [16]:
def programming_language_distribution_and_multi_language_stats(year):
    """Print the languages-per-paper distribution for relevant papers of `year`.

    Reads the notebook-level ``df_relevant`` frame. Prints how many papers
    list 0, 1, 2, ... programming languages, followed by the number and the
    percentage of papers that list more than one language.
    """
    published_in_year = df_relevant['year'] == year
    print(f"{year} - Distribution of Number of Programming Languages per Paper:")
    languages_per_paper = df_relevant.loc[
        published_in_year, "programming_languages_list"].apply(len)
    print(languages_per_paper.value_counts())

    covers_multiple_languages = languages_per_paper > 1
    print(
        f"\nPapers covering multiple programming languages: {covers_multiple_languages.sum()}")
    print(
        f"Percentage covering multiple programming languages: {covers_multiple_languages.mean()*100:.1f}%")


# Languages-per-paper distribution for the papers published in 2023.
programming_language_distribution_and_multi_language_stats(2023)

2023 - Distribution of Number of Programming Languages per Paper:
programming_languages_list
1    13
0     9
2     5
4     1
6     1
7     1
5     1
3     1
Name: count, dtype: int64

Papers covering multiple programming languages: 10
Percentage covering multiple programming languages: 31.2%


In [17]:
# Languages-per-paper distribution for the papers published in 2024.
programming_language_distribution_and_multi_language_stats(2024)

2024 - Distribution of Number of Programming Languages per Paper:
programming_languages_list
1     22
0     15
2     10
3      3
5      1
6      1
4      1
16     1
13     1
Name: count, dtype: int64

Papers covering multiple programming languages: 18
Percentage covering multiple programming languages: 32.7%


In [18]:
# Languages-per-paper distribution for the papers published in 2025.
programming_language_distribution_and_multi_language_stats(2025)

2025 - Distribution of Number of Programming Languages per Paper:
programming_languages_list
1     46
0     24
2      9
3      6
6      2
10     2
4      1
Name: count, dtype: int64

Papers covering multiple programming languages: 20
Percentage covering multiple programming languages: 22.2%


# RQ2 - How well do authors tackle the problem of data leakage/contamination?

In [19]:
# Summing the `contamination` column counts the papers flagged in it.
n_relevant_papers = df_relevant.shape[0]
contamination_reported = df_relevant['contamination'].sum()
contamination_share = (contamination_reported/n_relevant_papers)*100

print(
    f"Overall - Contamination reported in {contamination_reported} out of {n_relevant_papers} papers ({contamination_share:.1f}%)")

Overall - Contamination reported in 58 out of 177 papers (32.8%)


In [20]:
def contamination_reported_per_year(year):
    """Print how many relevant papers of `year` are flagged in the `contamination` column.

    Reads the notebook-level ``df_relevant`` frame; the count is the sum of
    the ``contamination`` column over the papers of that year.
    """
    papers_in_year = df_relevant.loc[df_relevant['year'] == year]
    n_in_year = papers_in_year.shape[0]
    n_reported = papers_in_year['contamination'].sum()
    share = (n_reported/n_in_year)*100
    print(
        f"{year} - Contamination reported in {n_reported} out of {n_in_year} papers ({share:.1f}%)")


# One summary line per publication year, in chronological order.
for report_year in (2023, 2024, 2025):
    contamination_reported_per_year(report_year)

2023 - Contamination reported in 6 out of 32 papers (18.8%)
2024 - Contamination reported in 14 out of 55 papers (25.5%)
2025 - Contamination reported in 38 out of 90 papers (42.2%)


# RQ3 - How replicable are LLM-based studies?

## Model Configuration Reporting

In [21]:
def _split_model_config(cell_value):
    """Split one ';'-separated `model_config` cell into stripped entries ([] when missing)."""
    if pd.isna(cell_value):
        return []
    return [entry.strip() for entry in str(cell_value).split(';')]


def _lists_inference_config(config_entries):
    """Return True when 'inference' (any letter case) is one of the entries."""
    return any(entry.lower() == 'inference' for entry in config_entries)


df_relevant["model_config_list"] = df_relevant['model_config'].apply(
    _split_model_config)

print("Overall - Number of Papers Reporting on Inference (Generation) Configuration/Parameters:")
reports_inference_config = df_relevant['model_config_list'].apply(
    _lists_inference_config)
num_reporting_inference_config = df_relevant[reports_inference_config].shape[0]
n_relevant_total = df_relevant.shape[0]
print(
    f"{num_reporting_inference_config} out of {n_relevant_total} papers ({(num_reporting_inference_config/n_relevant_total)*100:.1f}%)")

Overall - Number of Papers Reporting on Inference (Generation) Configuration/Parameters:
89 out of 177 papers (50.3%)


In [22]:
def _print_inference_config_reporting(year):
    """Print and return how many relevant papers of `year` list 'inference' in `model_config_list`.

    Reads the notebook-level ``df_relevant`` frame; the match on 'inference'
    ignores letter case.
    """
    reports_inference_config = df_relevant['model_config_list'].apply(
        lambda entries: 'inference' in [entry.lower() for entry in entries])
    published_in_year = df_relevant['year'] == year
    n_reporting = df_relevant[reports_inference_config &
                              published_in_year].shape[0]
    n_in_year = df_relevant[published_in_year].shape[0]
    print(
        f"{year} - Number of Papers Reporting on Inference (Generation) Configuration/Parameters: {n_reporting} out of {n_in_year} papers ({(n_reporting/n_in_year)*100:.1f}%)")
    return n_reporting


# The per-year counts stay available under their original names.
num_reporting_inference_config_2023 = _print_inference_config_reporting(2023)
num_reporting_inference_config_2024 = _print_inference_config_reporting(2024)
num_reporting_inference_config_2025 = _print_inference_config_reporting(2025)

2023 - Number of Papers Reporting on Inference (Generation) Configuration/Parameters: 10 out of 32 papers (31.2%)
2024 - Number of Papers Reporting on Inference (Generation) Configuration/Parameters: 27 out of 55 papers (49.1%)
2025 - Number of Papers Reporting on Inference (Generation) Configuration/Parameters: 52 out of 90 papers (57.8%)


## Artefact Availability

### Artifact Badges

In [23]:
def compare_artifact_badge_relevant_vs_non_relevant(col, name):
    """Print the share of papers holding a badge, for relevant and non-relevant papers.

    Reads the notebook-level ``df_relevant`` and ``df_non_relevant`` frames.

    Args:
        col: Badge column to evaluate; a paper holds the badge when the value
            equals True.
        name: Human-readable badge name used in the printed lines.
    """
    paper_groups = (("Relevant", df_relevant),
                    ("Non-Relevant", df_non_relevant))
    # Count both groups first, then print, so nothing is printed if a lookup fails.
    badge_counts = [
        (group_label, (papers[col] == True).sum(), papers.shape[0])
        for group_label, papers in paper_groups
    ]
    for group_label, n_with_badge, n_papers in badge_counts:
        print(
            f"{group_label} Papers with {name}: {n_with_badge} out of {n_papers} papers ({(n_with_badge/n_papers)*100:.1f}%)")


# (column, display name) for each artifact badge.
badge_columns = [
    ('artifact_available', 'Artifact Available Badge'),
    ('artifact_reusable', 'Artifact Reusable Badge'),
    ('artifact_functional', 'Artifact Functional Badge'),
]

# One relevant/non-relevant comparison per badge, each followed by a blank line.
for badge in badge_columns:
    compare_artifact_badge_relevant_vs_non_relevant(*badge)
    print()

Relevant Papers with Artifact Available Badge: 33 out of 177 papers (18.6%)
Non-Relevant Papers with Artifact Available Badge: 213 out of 515 papers (41.4%)

Relevant Papers with Artifact Reusable Badge: 21 out of 177 papers (11.9%)
Non-Relevant Papers with Artifact Reusable Badge: 150 out of 515 papers (29.1%)

Relevant Papers with Artifact Functional Badge: 13 out of 177 papers (7.3%)
Non-Relevant Papers with Artifact Functional Badge: 70 out of 515 papers (13.6%)



In [24]:
def artifact_badges_per_year(year, col, name):
    """Print how many relevant papers of `year` hold the badge stored in `col`.

    Reads the notebook-level ``df_relevant`` frame; the count is the sum of
    the badge column over the papers of that year.

    Args:
        year: Value matched against the ``year`` column.
        col: Badge column to sum.
        name: Human-readable badge name used in the printed line.
    """
    papers_in_year = df_relevant.loc[df_relevant['year'] == year]
    n_in_year = papers_in_year.shape[0]
    n_with_badge = papers_in_year[col].sum()
    share = (n_with_badge/n_in_year)*100
    print(
        f"{year} - {name} for {n_with_badge} out of {n_in_year} Relevant Papers ({share:.1f}%)")


years = [2023, 2024, 2025]

# (column, display name) for each artifact badge.
badge_columns = [
    ('artifact_available', 'Artifact Available Badge'),
    ('artifact_reusable', 'Artifact Reusable Badge'),
    ('artifact_functional', 'Artifact Functional Badge'),
]

# All badges for one year, then a blank line before the next year.
for year in years:
    for badge in badge_columns:
        artifact_badges_per_year(year, *badge)
    print()

2023 - Artifact Available Badge for 6 out of 32 Relevant Papers (18.8%)
2023 - Artifact Reusable Badge for 3 out of 32 Relevant Papers (9.4%)
2023 - Artifact Functional Badge for 1 out of 32 Relevant Papers (3.1%)

2024 - Artifact Available Badge for 9 out of 55 Relevant Papers (16.4%)
2024 - Artifact Reusable Badge for 8 out of 55 Relevant Papers (14.5%)
2024 - Artifact Functional Badge for 0 out of 55 Relevant Papers (0.0%)

2025 - Artifact Available Badge for 18 out of 90 Relevant Papers (20.0%)
2025 - Artifact Reusable Badge for 10 out of 90 Relevant Papers (11.1%)
2025 - Artifact Functional Badge for 12 out of 90 Relevant Papers (13.3%)



### Manual Artefact Evaluation

In [25]:
# `artefact_manual` is compared against three outcomes: True, False and the
# string 'DEAD'. One summary line is printed per outcome.
manual_check_outcomes = [
    ("\nOverall - Number of Papers with Artefacts (Manual Check)", True),
    ("Overall - Number of Papers with no Artefact (Manual Check)", False),
    ("Overall - Number of Papers with Dead Links (Manual Check)", 'DEAD'),
]

n_relevant_checked = df_relevant.shape[0]
for outcome_heading, outcome_value in manual_check_outcomes:
    n_with_outcome = df_relevant[df_relevant['artefact_manual']
                                 == outcome_value].shape[0]
    print(
        f"{outcome_heading}: {n_with_outcome} out of {n_relevant_checked} ({(n_with_outcome / n_relevant_checked) * 100:.1f}%)")


Overall - Number of Papers with Artefacts (Manual Check): 144 out of 177 (81.4%)
Overall - Number of Papers with no Artefact (Manual Check): 24 out of 177 (13.6%)
Overall - Number of Papers with Dead Links (Manual Check): 9 out of 177 (5.1%)


### Number of Papers with Artifact Badges that have Dead Links

In [26]:
# Papers whose manual check found a dead link, cross-checked against each
# artifact badge column.
has_dead_link = df_relevant['artefact_manual'] == 'DEAD'

for badge_kind, badge_column in (("Available", 'artifact_available'),
                                 ("Reusable", 'artifact_reusable'),
                                 ("Functional", 'artifact_functional')):
    n_dead_with_badge = df_relevant[has_dead_link &
                                    (df_relevant[badge_column])].shape[0]
    print(
        f"Number of papers with Artifact {badge_kind} Badges but DEAD links:", n_dead_with_badge)

Number of papers with Artifact Available Badges but DEAD links: 2
Number of papers with Artifact Reusable Badges but DEAD links: 1
Number of papers with Artifact Functional Badges but DEAD links: 0


# RQ4 - What are the costs of LLM-based SE research?

In [27]:
print("Frequency of Different Costs Reported Across All Papers:")


def _split_cost_labels(cell_value):
    """Split one `cost` cell on ';' or ',' into stripped labels ([] when missing)."""
    if pd.isna(cell_value):
        return []
    # The standard-library `re` module (imported in the first cell) handles
    # this pattern; the third-party `regex` package is not needed.
    return [label.strip() for label in re.split(';|,', str(cell_value))]


df_relevant['cost_list'] = df_relevant['cost'].apply(_split_cost_labels)

print(df_relevant['cost_list'].explode().value_counts())

Frequency of Different Costs Reported Across All Papers:
cost_list
gpu            78
time           68
-              48
money          18
content        13
hw             10
memory          6
invocations     3
tpu             2
tokens          1
operations      1
Name: count, dtype: int64


In [28]:
# Count the papers whose `cost` text mentions time together with at least one
# hardware keyword. Missing `cost` values are treated as empty text, so they
# count as "nothing reported" instead of raising a TypeError on the substring
# test. Matching is on case-sensitive substrings of the raw `cost` text.
cost_text = df_relevant["cost"].fillna("").astype(str)
reports_time = cost_text.str.contains("time", regex=False)
reports_hardware = cost_text.str.contains("gpu|hw|tpu|hardware")
both = int((reports_time & reports_hardware).sum())

print("Number of Papers Reporting both Time and Hardware:", both)

Number of Papers Reporting both Time and Hardware: 36
