In [None]:
import pandas as pd
import json
from tqdm import tqdm
from itertools import islice
import multiprocessing as mp


def divide_dict(d, n):
    """Split a mapping into consecutive sub-dicts holding at most ``n`` items.

    Keys are drawn, in the mapping's own iteration order, from one shared
    iterator, so each key ends up in exactly one chunk and only the last
    chunk can be shorter than ``n``.

    Args:
        d: Mapping to split; values are looked up with ``d[key]``.
        n: Maximum number of items per chunk.

    Yields:
        dict: The next chunk of ``d``.
    """
    remaining_keys = iter(d)
    # One pass per chunk start offset; the offset value itself is not needed.
    for _ in range(0, len(d), n):
        chunk = {}
        for key in islice(remaining_keys, n):
            chunk[key] = d[key]
        yield chunk


# df_articles = df_articles.reset_index()  # make sure indexes pair with number of rows
def _first_publication_year(display_name, df_articles_all, df_inproceedings_all):
    """Return the earliest ``year`` among rows whose ``author`` contains ``display_name``.

    Both reference frames are searched and the smaller of the two per-frame
    minima is returned. A frame without a matching row contributes nothing;
    NaN is returned when neither frame has a match.
    """
    per_frame_minima = []
    for frame in (df_articles_all, df_inproceedings_all):
        # regex=False: the name is matched as a literal substring, so
        # characters such as "." or "(" in a name are not read as a pattern.
        is_match = frame["author"].str.contains(display_name, regex=False)
        earliest = frame[is_match]["year"].min()
        # min() of an empty selection is NaN; skip it so it cannot decide the
        # result (every ordering comparison against NaN is False).
        if pd.notna(earliest):
            per_frame_minima.append(earliest)
    return min(per_frame_minima) if per_frame_minima else float("nan")


def proccess_features(info):
    """Build author and institution rows for one chunk of works and write them to CSV.

    For every authorship of every work in the chunk, one author row is
    produced, including the author's first publication year as found in the
    two reference tables. For every institution of those authors, one
    institution row is produced, at most once per institution name per work.

    Args:
        info: Task description, a dict with the keys
            ``"values"``: mapping whose values are work records (dicts read
                with the keys ``authorships``, ``title``, ``id``,
                ``booktitle``, ``journal``, ``publication_date``,
                ``publication_year`` and ``cited_by_count``);
            ``"df_articles_all"`` / ``"df_inproceedings_all"``: DataFrames
                with an ``author`` (string) and a ``year`` column;
            ``"index"`` or ``"num_of_file"``: suffix used in the output file
                names. ``"index"`` wins when both are present.

    Returns:
        None. Writes ``../data/analysis/institutions_<suffix>.csv`` and
        ``../data/analysis/authors_<suffix>.csv``.

    Raises:
        KeyError: If a required key is missing from ``info`` or from a record.
    """
    works = info["values"].values()
    df_articles_all = info["df_articles_all"]
    df_inproceedings_all = info["df_inproceedings_all"]
    # Resolved before the expensive loop so that a missing key fails fast.
    # main() passes "num_of_file"; "index" is still honoured for older callers.
    file_suffix = info["index"] if "index" in info else info["num_of_file"]

    # Rows are collected in plain lists and turned into DataFrames once at the
    # end; concatenating a DataFrame per row copies the whole frame each time.
    author_rows = []
    institution_rows = []

    # display_name -> first publication year, so each author is looked up in
    # the (large) reference tables only once per chunk.
    first_year_by_author = {}

    for row in tqdm(works):
        # Institution names already emitted for this work.
        seen_institutions = set()
        for author in row["authorships"]:
            display_name = author["author"]["display_name"]
            if display_name not in first_year_by_author:
                first_year_by_author[display_name] = _first_publication_year(
                    display_name, df_articles_all, df_inproceedings_all
                )

            author_rows.append(
                {
                    "first_publication_year": first_year_by_author[display_name],
                    "raw_author_name": author["raw_author_name"],
                    "display_name": display_name,
                    "title": row["title"],
                    "id": row["id"],
                    "booktitle": row["booktitle"],
                    "journal": row["journal"],
                    "publication_date": row["publication_date"],
                    "publication_year": row["publication_year"],
                    "cited_by_count": row["cited_by_count"],
                }
            )

            for institution in author["institutions"]:
                institution_name = institution["display_name"]
                if institution_name in seen_institutions:
                    continue
                seen_institutions.add(institution_name)
                institution_rows.append(
                    {
                        "country_code": institution["country_code"],
                        "type": institution["type"],
                        "title": row["title"],
                        "id": row["id"],
                        "booktitle": row["booktitle"],
                        "journal": row["journal"],
                        "cited_by_count": row["cited_by_count"],
                    }
                )

    pd.DataFrame(institution_rows).to_csv(
        f"../data/analysis/institutions_{file_suffix}.csv", index=False
    )
    pd.DataFrame(author_rows).to_csv(
        f"../data/analysis/authors_{file_suffix}.csv", index=False
    )

In [None]:
def main():
    """Load the reference tables and the article JSON, then process the articles in parallel.

    The article mapping is cut into chunks of 4 works with ``divide_dict``;
    each chunk becomes one task dict that is handed to ``proccess_features``
    through a pool of 4 worker processes.

    Returns:
        None. Output is whatever ``proccess_features`` writes.
    """
    df_articles_all = pd.read_csv(
        "../data/output_csv/output_article.csv", sep=";", low_memory=False
    )
    df_inproceedings_all = pd.read_csv(
        "../data/output_csv/output_inproceedings.csv", sep=";", low_memory=False
    )

    # Keep only rows whose "author" is a string; the worker applies
    # ``.str.contains`` to this column as a boolean mask.
    df_articles_all = df_articles_all[
        df_articles_all["author"].apply(lambda x: isinstance(x, str))
    ]
    df_inproceedings_all = df_inproceedings_all[
        df_inproceedings_all["author"].apply(lambda x: isinstance(x, str))
    ]

    with open(
        "../data/output_filter_venue_2016_added_data/output_article.json", "r"
    ) as file:
        article_json = json.load(file)

    # NOTE(review): both reference frames are embedded in every task dict, so
    # they travel to the workers once per 4-work chunk. Consider a larger
    # chunk size or a pool initializer if this turns out to be slow.
    list_for_proccess = [
        {
            "num_of_file": index,
            "values": value,
            "df_inproceedings_all": df_inproceedings_all,
            "df_articles_all": df_articles_all,
        }
        for index, value in enumerate(divide_dict(article_json, 4))
    ]

    # The context manager shuts the pool down even if a task raises;
    # map() blocks until every task has finished, so no work is cut short.
    with mp.Pool(4) as pool:
        pool.map(proccess_features, list_for_proccess)

# Entry point for this cell: run the pipeline only when the code executes as
# the top-level module (``__name__ == "__main__"``).
if __name__ == "__main__":
    main()

In [None]:
import pandas as pd
import json
from tqdm import tqdm
from itertools import islice
import multiprocessing as mp


def divide_dict(d, n):
    """Yield successive sub-dicts of ``d`` with at most ``n`` items each.

    All chunks pull their keys from a single iterator over ``d``, so the
    chunks partition the mapping in iteration order; only the final chunk may
    contain fewer than ``n`` items.

    Args:
        d: Mapping to split; values are looked up with ``d[key]``.
        n: Maximum number of items per chunk.

    Yields:
        dict: The next chunk of ``d``.
    """
    key_stream = iter(d)
    # The range only counts how many chunks are needed.
    for _ in range(0, len(d), n):
        yield dict((key, d[key]) for key in islice(key_stream, n))


# df_articles = df_articles.reset_index()  # make sure indexes pair with number of rows
def proccess_features(info):
    """Build DBLP-author and institution rows for one chunk of works and write them to CSV.

    A work is kept only if its publication year is before 2024 and its
    ``type`` is not one of the excluded kinds (editorial, position paper,
    keynote, opinion, tutorial, poster, panel). For each kept work:

    * one author row is produced per name in the ``|``-separated ``author``
      field;
    * one institution row is produced per institution of every authorship,
      carrying ``score = 1 / number of names in the "author" field``.

    Args:
        info: Task description, a dict with the keys
            ``"values"``: mapping whose values are work records (dicts read
                with the keys ``author``, ``publication_year``, ``type``,
                ``authorships``, ``title``, ``id``, ``publication_date``,
                ``cited_by_count`` and, optionally, ``booktitle`` and
                ``journal``);
            ``"num_of_file"``: suffix used in the output file names.

    Returns:
        None. Writes
        ``../data/analysis/filtered_types/institutions/institutions_<n>_inproc.csv``
        and ``../data/analysis/filtered_types/authors/authors_<n>_inproc.csv``.

    Raises:
        KeyError: If a required key is missing from ``info`` or from a record.
    """
    excluded_types = [
        "editorial",
        "position paper",
        "keynote",
        "opinion",
        "tutorial",
        "poster",
        "panel",
    ]

    # Rows are collected in plain lists and converted once at the end;
    # concatenating a DataFrame per row copies the whole frame each time.
    # (The authors accumulator was previously never initialised.)
    author_rows = []
    institution_rows = []

    for row in tqdm(info["values"].values()):
        if int(row["publication_year"]) >= 2024 or row["type"] in excluded_types:
            continue

        dblp_author_names = row["author"].split("|")
        # Each work is worth one point, shared equally among its DBLP authors.
        point_per_author = 1 / len(dblp_author_names)

        # "booktitle" / "journal" may be absent from a record; both row kinds
        # fall back to an empty string in that case.
        booktitle = row.get("booktitle", "")
        journal = row.get("journal", "")

        for author_name_dblp in dblp_author_names:
            author_rows.append(
                {
                    "dblp_author_name": author_name_dblp,
                    "title": row["title"],
                    "id": row["id"],
                    "booktitle": booktitle,
                    "journal": journal,
                    "publication_date": row["publication_date"],
                    "publication_year": row["publication_year"],
                    "cited_by_count": row["cited_by_count"],
                }
            )

        for author in row["authorships"]:
            for institution in author["institutions"]:
                institution_rows.append(
                    {
                        "display_name": institution["display_name"],
                        "country_code": institution["country_code"],
                        "type": institution["type"],
                        "title": row["title"],
                        "id": row["id"],
                        "booktitle": booktitle,
                        "score": point_per_author,
                        "journal": journal,
                    }
                )

    pd.DataFrame(institution_rows).to_csv(
        f"../data/analysis/filtered_types/institutions/institutions_{info['num_of_file']}_inproc.csv",
        index=False,
    )

    pd.DataFrame(author_rows).to_csv(
        f"../data/analysis/filtered_types/authors/authors_{info['num_of_file']}_inproc.csv",
        index=False,
    )


def main():
    """Load the inproceedings JSON and process it in parallel chunks.

    The mapping read from the JSON file is cut into chunks of 3700 works with
    ``divide_dict``; each chunk becomes one task dict that is handed to
    ``proccess_features`` through a pool of 2 worker processes.

    Returns:
        None. Output is whatever ``proccess_features`` writes.
    """
    # NOTE(review): the two reference tables below are loaded, filtered and
    # attached to every task, but the ``proccess_features`` defined in this
    # cell never reads them. They are kept so the task dict keeps its shape;
    # drop them if no other worker needs them.
    df_articles_all = pd.read_csv(
        "../data/output_csv/output_article.csv", sep=";", low_memory=False
    )
    df_inproceedings_all = pd.read_csv(
        "../data/output_csv/output_inproceedings.csv", sep=";", low_memory=False
    )

    # Keep only rows whose "author" value is a string.
    df_articles_all = df_articles_all[
        df_articles_all["author"].apply(lambda x: isinstance(x, str))
    ]
    df_inproceedings_all = df_inproceedings_all[
        df_inproceedings_all["author"].apply(lambda x: isinstance(x, str))
    ]

    with open(
        "../data/output_filter_venue_2016_added_data/with_type/output_inproceedings.json",
        "r",
    ) as file:
        article_json = json.load(file)

    list_for_proccess = [
        {
            "num_of_file": index,
            "values": value,
            "df_inproceedings_all": df_inproceedings_all,
            "df_articles_all": df_articles_all,
        }
        for index, value in enumerate(divide_dict(article_json, 3700))
    ]

    # The context manager shuts the pool down even if a task raises;
    # map() blocks until every task has finished, so no work is cut short.
    with mp.Pool(2) as pool:
        pool.map(proccess_features, list_for_proccess)


# Entry point for this cell: run the pipeline only when the code executes as
# the top-level module (``__name__ == "__main__"``).
if __name__ == "__main__":
    main()
