In [117]:
# draw_table
from matplotlib import pyplot as plt


def draw_table(group, filename):
    plt.figure(figsize=(20, 6))  # Initialize figure size
    ax = plt.subplot(111, frame_on=False)  # Create a subplot without frame
    ax.xaxis.set_visible(False)  # Hide x-axis
    ax.yaxis.set_visible(False)  # Hide y-axis
    ax.table(
        cellText=group.values,
        colLabels=group.columns,
        loc="center",
    )

    # Adjust table layout to fit cell content
    cell_text = ax.table(
        cellText=group.values,
        colLabels=group.columns,
        loc="center",
    )
    cell_text.auto_set_font_size(False)
    cell_text.set_fontsize(10)  # Set initial font size

    # Adjust font size based on content to fit cells
    for key, cell in cell_text.get_celld().items():
        cell.set_text_props(wrap=True)
        cell.set_height(0.1)
        cell.set_width(0.1)
        if key[0] == 0:  # Column headers
            cell.set_fontsize(10)
        elif key[1] == 0:
            cell.set_width(0.15)
        else:  # Data cells
            cell.set_fontsize(12)

    # plt.title(f"Group ")  # Set title
    plt.tight_layout()  # Adjust layout
    plt.savefig(filename, bbox_inches="tight", pad_inches=0.5)
    plt.close()

In [118]:
# get_processed_first_year_pub
import csv


def get_processed_first_year_pub(filenames):
    first_year_pubs_with_author_name = dict()

    for filename in filenames:
        with open(filename, "r") as file:
            csv_reader = csv.DictReader(file)
            for row in csv_reader:
                first_year_pubs_with_author_name[row["dblp_author_name"]] = row[
                    "first_publication_year"
                ]

    return first_year_pubs_with_author_name

In [119]:
# calculate_research_age
def calculate_research_age(year):
    try:
        if int(year) >= 2013:
            return "0"  # early-staged
        elif int(year) >= 2008:
            return "1"  # consolidators
        else:
            return "2"  # experienced
    except:
        return "-1"

In [120]:
# get data for first year calculation
import pandas as pd

df_articles_all = pd.read_csv(
    "../data/output_csv/output_article.csv", sep=";", low_memory=False
)
df_inproceedings_all = pd.read_csv(
    "../data/output_csv/output_inproceedings.csv", sep=";", low_memory=False
)

df_articles_all = df_articles_all[
    df_articles_all["author"].apply(lambda x: isinstance(x, str))
]
df_inproceedings_all = df_inproceedings_all[
    df_inproceedings_all["author"].apply(lambda x: isinstance(x, str))
]

In [121]:
# calculate_first_year_of_pub
import csv
from tqdm import tqdm
from math import isnan
from datetime import datetime


def calculate_first_year_of_pub(
    values, filename_to_write, filenames_existing_first_year_pub
):
    first_year_pubs_with_author_name = get_processed_first_year_pub(
        filenames_existing_first_year_pub
    )

    with open(
        filename_to_write,
        "w",
        newline="",
    ) as output_csv:
        csv_writer = csv.DictWriter(
            output_csv,
            fieldnames=["first_publication_year", "dblp_author_name"],
        )

        csv_writer.writeheader()
        for index, raw_author in tqdm(values.iterrows()):
            # print(raw_author)
            dblp_author_name = raw_author["dblp_author_name"]
            if not (dblp_author_name in first_year_pubs_with_author_name):
                author_row = dict()
                df_result_all_articles = df_articles_all[
                    df_articles_all["author"].str.contains(dblp_author_name)
                ]
                df_result_all_inproceedings = df_inproceedings_all[
                    df_inproceedings_all["author"].str.contains(dblp_author_name)
                ]

                first_year_pub_articles = (
                    df_result_all_articles["year"].min()
                    if not df_result_all_articles.empty
                    else datetime.now().year + 1
                )
                first_year_pub_inproceedings = (
                    df_result_all_inproceedings["year"].min()
                    if not df_result_all_inproceedings.empty
                    else datetime.now().year + 1
                )

                first_year_pub = (
                    first_year_pub_inproceedings
                    if first_year_pub_articles > first_year_pub_inproceedings
                    else first_year_pub_articles
                )

                author_row["first_publication_year"] = first_year_pub
                author_row["dblp_author_name"] = raw_author["dblp_author_name"]
                csv_writer.writerow(author_row)
            else:
                pass

In [122]:
# get data dblp author
df_authors_0 = pd.read_csv("../data/analysis/filtered_types/authors/authors_0.csv")
df_authors_1 = pd.read_csv("../data/analysis/filtered_types/authors/authors_1.csv")


df_authors_0_inproc = pd.read_csv(
    "../data/analysis/filtered_types/authors/authors_0_inproc.csv"
)
df_authors_1_inproc = pd.read_csv(
    "../data/analysis/filtered_types/authors/authors_1_inproc.csv"
)

df_journals_dblp = pd.concat(
    [
        df_authors_0[df_authors_0["publication_year"] < 2024],
        df_authors_1[df_authors_1["publication_year"] < 2024],
    ]
)
df_conferences_dblp = pd.concat(
    [
        df_authors_0_inproc[df_authors_0_inproc["publication_year"] < 2024],
        df_authors_1_inproc[df_authors_1_inproc["publication_year"] < 2024],
    ]
)

In [123]:
# group_by_author_name_and_calculate_first_year_of_pub
import pandas as pd


def group_by_author_name_and_calculate_first_year_of_pub(
    df, num_of_first_authors, file_to_save_first_year, filenames_existing_first_year_pub
):
    df_dblp_grouped_by_author_name = (
        df.groupby(["dblp_author_name"])
        .size()
        .reset_index(name="count")
        .sort_values(by="count", ascending=False)
    )

    df_dblp_grouped_by_author_name_500 = df_dblp_grouped_by_author_name.head(
        num_of_first_authors
    ).copy()
    calculate_first_year_of_pub(
        df_dblp_grouped_by_author_name_500,
        file_to_save_first_year,
        filenames_existing_first_year_pub,
    )

    return df_dblp_grouped_by_author_name_500

In [124]:
def save_research_age(
    filenames_existing_first_year_pubs, df_grouped_by_author_name, file_to_save
):
    first_year_pubs_with_author_name = get_processed_first_year_pub(
        filenames_existing_first_year_pubs
    )

    print(df_grouped_by_author_name)

    def calculate_research_age_apply(row):
        return calculate_research_age(
            first_year_pubs_with_author_name[row["dblp_author_name"]]
        )

    df_grouped_by_author_name["research_age"] = df_grouped_by_author_name.apply(
        calculate_research_age_apply, axis=1
    )

    df_grouped_by_author_name.to_csv(file_to_save)

In [126]:
# calculate_first_year_of_pub for journals

df_journals_dblp_grouped_by_author_name_500 = group_by_author_name_and_calculate_first_year_of_pub(
    df_journals_dblp,
    500,
    "../data/analysis/filtered_types/authors_first_year_pub_selected_journals.csv",
    [
        "../data/analysis/author_dblp/authors_first_year_pub_all_authors.csv",
        "../data/analysis/author_dblp/authors_first_year_pub_all_citations.csv",
        "../data/analysis/author_dblp/authors_first_year_pub_all_citations_r.csv",
        "../data/analysis/author_dblp/authors_first_year_pub_all_citations_months.csv",
    ],
)

# dblp save research age

save_research_age(
    [
        "../data/analysis/author_dblp/authors_first_year_pub_all_authors.csv",
        "../data/analysis/author_dblp/authors_first_year_pub_all_citations.csv",
        "../data/analysis/author_dblp/authors_first_year_pub_all_citations_r.csv",
        "../data/analysis/author_dblp/authors_first_year_pub_all_citations_months.csv",
        "../data/analysis/filtered_types/authors_first_year_pub_selected_journals.csv",
    ],
    df_journals_dblp_grouped_by_author_name_500,
    "../data/research_age/index_dblp_journals_first_500_filtered_types.csv",
)

500it [00:07, 69.83it/s] 

          dblp_author_name  count
206        Ahmed E. Hassan    112
2502         David Lo 0001    107
12045         Xin Xia 0001     91
3464          Foutse Khomh     50
6534      Lionel C. Briand     44
...                    ...    ...
9975        Sandro Morasca      7
4383         Hong Mei 0001      7
11963  Xiaohong Zhang 0002      7
11964     Xiaohu Yang 0001      7
1226         Bahman Zamani      7

[500 rows x 2 columns]





In [127]:
# calculate_first_year_of_pub for all

df_all_dblp_grouped_by_author_name_500 = group_by_author_name_and_calculate_first_year_of_pub(
    pd.concat([df_journals_dblp, df_conferences_dblp]),
    500,
    "../data/analysis/filtered_types/authors_first_year_pub_all_rerun.csv",
    [
        "../data/analysis/author_dblp/authors_first_year_pub_all_authors.csv",
        "../data/analysis/author_dblp/authors_first_year_pub_all_citations.csv",
        "../data/analysis/author_dblp/authors_first_year_pub_all_citations_r.csv",
        "../data/analysis/author_dblp/authors_first_year_pub_all_citations_months.csv",
        "../data/analysis/filtered_types/authors_first_year_pub_selected_journals.csv",
    ],
)

# dblp save research age

save_research_age(
    [
        "../data/analysis/author_dblp/authors_first_year_pub_all_authors.csv",
        "../data/analysis/author_dblp/authors_first_year_pub_all_citations.csv",
        "../data/analysis/author_dblp/authors_first_year_pub_all_citations_r.csv",
        "../data/analysis/author_dblp/authors_first_year_pub_all_citations_months.csv",
        "../data/analysis/filtered_types/authors_first_year_pub_selected_journals.csv",
        "../data/analysis/filtered_types/authors_first_year_pub_all_rerun.csv",
    ],
    df_all_dblp_grouped_by_author_name_500,
    "../data/research_age/index_dblp_all_first_500_filtered_types.csv",
)

500it [00:03, 131.34it/s]

              dblp_author_name  count
3953             David Lo 0001    232
19186             Xin Xia 0001    175
318            Ahmed E. Hassan    145
19499            Yang Liu 0003    120
20669           Zhenchang Xing     91
...                        ...    ...
2595            Carlo A. Furia     14
5436             Foyzul Hassan     14
12466     Mohammad R. Alshayeb     14
1330       Anh Nguyen-Duc 0001     14
14932  Ricardo Colomo Palacios     14

[500 rows x 2 columns]





In [128]:
def draw_table_and_save_to_csv(group, filename_to_save_picture, filename_to_save_csv):
    group.reset_index(drop=True, inplace=True)
    group["Row_Numbers"] = group.reset_index().index + 1
    group = group.drop(columns=["Unnamed: 0"])

    draw_table(group, filename_to_save_picture)

    group.to_csv(filename_to_save_csv)

In [129]:
# analysis for journals
from matplotlib import pyplot as plt
import pandas as pd


df = pd.read_csv(
    "../data/research_age/index_dblp_journals_first_500_filtered_types.csv"
)

# Grouping the DataFrame by 'Group'
grouped = df.groupby("research_age")

# Sorting each group based on the 'Values' column
sorted_groups = grouped.apply(lambda x: x.sort_values(by="count", ascending=False))

early_staged_group = sorted_groups[sorted_groups["research_age"] == 0]
draw_table_and_save_to_csv(
    early_staged_group,
    "../data/analysis/filtered_types/tables/journals_only_author/group_0_dblp_table.png",
    "../data/analysis/filtered_types/tables/journals_only_author/group_0_dblp.csv",
)

consolidators_group = sorted_groups[sorted_groups["research_age"] == 1]
draw_table_and_save_to_csv(
    consolidators_group,
    "../data/analysis/filtered_types/tables/journals_only_author/group_1_dblp_table.png",
    "../data/analysis/filtered_types/tables/journals_only_author/group_1_dblp.csv",
)

experienced_group = sorted_groups[sorted_groups["research_age"] == 2]
draw_table_and_save_to_csv(
    experienced_group,
    "../data/analysis/filtered_types/tables/journals_only_author/group_2_dblp_table.png",
    "../data/analysis/filtered_types/tables/journals_only_author/group_2_dblp.csv",
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  group["Row_Numbers"] = group.reset_index().index + 1
  plt.tight_layout()  # Adjust layout
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  group["Row_Numbers"] = group.reset_index().index + 1
  plt.tight_layout()  # Adjust layout
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  group["Row_Numbers"] = 

In [130]:
# analysis for all

all_dblp = pd.concat(
    [
        df_journals_dblp,
        df_conferences_dblp.rename(
            columns={"booktitle": "journal", "journal": "real_journal"}
        ),
    ],
    ignore_index=True,
)

all_dblp.loc[all_dblp["journal"].str.endswith(" FSE"), "journal"] = "FSE"

# Grouping by author_name, journal, and research_age, then summing the appearance_count
grouped = (
    all_dblp.groupby(["dblp_author_name", "journal"])
    .size()
    .reset_index(name="appearance_count")
)

pivot_table = grouped.pivot_table(
    index=["dblp_author_name"],
    columns="journal",
    values="appearance_count",
    fill_value=0,
).reset_index()

temp_df = pivot_table.copy()
temp_df["papers"] = temp_df[conf_names].sum(axis=1)

temp_df_sorted_results = temp_df.sort_values(
    by="papers",
    ascending=False,
)

research_age_all = pd.read_csv("../data/research_age/dblp_all_merged_final.csv")
sorted_results = (
    temp_df_sorted_results.head(500)
    .copy()
    .merge(research_age_all, how="left", on="dblp_author_name")
)

early_staged_group = sorted_results[sorted_results["research_age"] == 0]
draw_table_and_save_to_csv(
    early_staged_group,
    "../data/analysis/filtered_types/tables/all_only_authors_by_venue/group_0_dblp.png",
    "../data/analysis/filtered_types/tables/all_only_authors_by_venue/group_0_dblp.csv",
)

consolidators_group = sorted_results[sorted_results["research_age"] == 1]
draw_table_and_save_to_csv(
    consolidators_group,
    "../data/analysis/filtered_types/tables/all_only_authors_by_venue/group_1_dblp.png",
    "../data/analysis/filtered_types/tables/all_only_authors_by_venue/group_1_dblp.csv",
)


experienced_group = sorted_results[sorted_results["research_age"] == 2]
draw_table_and_save_to_csv(
    experienced_group,
    "../data/analysis/filtered_types/tables/all_only_authors_by_venue/group_2_dblp.png",
    "../data/analysis/filtered_types/tables/all_only_authors_by_venue/group_2_dblp.csv",
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  group["Row_Numbers"] = group.reset_index().index + 1
  plt.tight_layout()  # Adjust layout
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  group["Row_Numbers"] = group.reset_index().index + 1
  plt.tight_layout()  # Adjust layout
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  group["Row_Numbers"] = 

In [131]:
# analysis avg citations
from datetime import datetime


def count_average_citation_for_paper(row):
    current_date = datetime.now().date()

    # print(row)
    publication_date = datetime.strptime(row["publication_date"], "%Y-%m-%d")

    # difference_in_years = abs(current_date.year - publication_date.year)
    difference_in_months = (current_date.year - publication_date.year) * 12 + (
        current_date.month - publication_date.month
    )

    if difference_in_months <= 1:
        return row["cited_by_count"]
    else:
        return row["cited_by_count"] / difference_in_months


df_all_dblp = pd.concat([df_journals_dblp, df_conferences_dblp])
df_all_dblp["average_citation_count"] = df_all_dblp.apply(
    count_average_citation_for_paper, axis=1
)

df_average_citations = (
    df_all_dblp.groupby(["dblp_author_name"])["average_citation_count"]
    .agg(["mean"])
    .reset_index()
)

df_average_citations_sorted = df_average_citations.sort_values(
    by="mean", ascending=False
)

df_average_citations_by_author_name_500 = df_average_citations_sorted.head(500).copy()

calculate_first_year_of_pub(
    df_average_citations_by_author_name_500,
    "../data/analysis/filtered_types/authors_first_year_pub_all_citations_months.csv",
    [
        "../data/analysis/author_dblp/authors_first_year_pub_all_authors.csv",
        "../data/analysis/author_dblp/authors_first_year_pub_all_citations.csv",
        "../data/analysis/author_dblp/authors_first_year_pub_all_citations_r.csv",
        "../data/analysis/author_dblp/authors_first_year_pub_all_citations_months.csv",
        "../data/analysis/filtered_types/authors_first_year_pub_selected_journals.csv",
        "../data/analysis/filtered_types/authors_first_year_pub_all_rerun.csv",
    ],
)

# dblp save research age

save_research_age(
    [
        "../data/analysis/author_dblp/authors_first_year_pub_all_authors.csv",
        "../data/analysis/author_dblp/authors_first_year_pub_all_citations.csv",
        "../data/analysis/author_dblp/authors_first_year_pub_all_citations_r.csv",
        "../data/analysis/author_dblp/authors_first_year_pub_all_citations_months.csv",
        "../data/analysis/filtered_types/authors_first_year_pub_selected_journals.csv",
        "../data/analysis/filtered_types/authors_first_year_pub_all_rerun.csv",
        "../data/analysis/filtered_types/authors_first_year_pub_all_citations_months.csv",
    ],
    df_average_citations_by_author_name_500,
    "../data/research_age/index_dblp_all_citations_first_500_month_filtered_types.csv",
)

               dblp_author_name      mean
0             A. A. Zaidan 0001  0.773529
1              A. Charan Kumari  0.438904
2            A. César C. França  0.179498
3                A. Eashaan Rao  0.005682
4      A. J. M. Imtiajur Rahman  0.296296
...                         ...       ...
21125           Özden Özcan Top  0.083333
21126           Özden Özcan-Top  1.200000
21127              Özgür Kafali  0.182997
21128              Özgür Ulusoy  0.000000
21129            Øystein Haugen  0.086686

[21130 rows x 2 columns]


500it [00:00, 51120.12it/s]

           dblp_author_name       mean
6615          Harshit Gupta  11.846154
928    Amir Vahid Dastjerdi   6.506410
11702          Maverick Woo   6.400000
7136          HyungSeok Han   6.400000
19424           Yadong Wang   6.158730
...                     ...        ...
4054    Dayang N. A. Jawawi   1.175595
17127  Suresh Thummalapenta   1.174897
18227       Victor Sobreira   1.173913
19324          Xu Wang 0007   1.173313
7545     Jaeseung Choi 0002   1.169091

[500 rows x 2 columns]





In [68]:
df_average_citations_sorted[
    df_average_citations_sorted["dblp_author_name"] == "Shane McIntosh"
]

Unnamed: 0,dblp_author_name,mean
16115,Shane McIntosh,0.656672


In [132]:
authors_avg_citations = pd.read_csv(
    "../data/research_age/index_dblp_all_citations_first_500_month_filtered_types.csv"
)

authors_avg_citations["mean"] = authors_avg_citations["mean"].round(3)
# Grouping the DataFrame by 'Group'
authors_avg_citations_grouped = authors_avg_citations.groupby("research_age")
authors_avg_citations_grouped_sorted = authors_avg_citations_grouped.apply(
    lambda x: x.sort_values(by="mean", ascending=False)
)

early_staged_group = authors_avg_citations_grouped_sorted[
    authors_avg_citations_grouped_sorted["research_age"] == 0
]
draw_table_and_save_to_csv(
    early_staged_group,
    "../data/analysis/filtered_types/tables/authors_avg_citations/group_0_dblp.png",
    "../data/analysis/filtered_types/tables/authors_avg_citations/group_0_dblp.csv",
)

consolidators_group = authors_avg_citations_grouped_sorted[
    authors_avg_citations_grouped_sorted["research_age"] == 1
]
draw_table_and_save_to_csv(
    consolidators_group,
    "../data/analysis/filtered_types/tables/authors_avg_citations/group_1_dblp.png",
    "../data/analysis/filtered_types/tables/authors_avg_citations/group_1_dblp.csv",
)


experienced_group = authors_avg_citations_grouped_sorted[
    authors_avg_citations_grouped_sorted["research_age"] == 2
]
draw_table_and_save_to_csv(
    experienced_group,
    "../data/analysis/filtered_types/tables/authors_avg_citations/group_2_dblp.png",
    "../data/analysis/filtered_types/tables/authors_avg_citations/group_2_dblp.csv",
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  group["Row_Numbers"] = group.reset_index().index + 1
  plt.tight_layout()  # Adjust layout
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  group["Row_Numbers"] = group.reset_index().index + 1
  plt.tight_layout()  # Adjust layout
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  group["Row_Numbers"] = 