In [2]:
import pandas as pd
import numpy as np
from statsmodels.stats.proportion import proportion_confint
import matplotlib.ticker as mticker
from sklearn.metrics import cohen_kappa_score
import matplotlib.pyplot as plt


# Manual annotation sheets, one per annotator; all four are semicolon-delimited
# and are read in the order jason, franek, albert, jonathan.
jason, franek, albert, jonathan = (
    pd.read_csv(annotation_path, sep=";")
    for annotation_path in (
        "manual/annotations_jason.csv",
        "manual/annotations_franek.csv",
        "manual/annotations_albert.csv",
        "manual/annotations_jonathan.csv",
    )
)

# Main export of works; unlike the annotation sheets this one is comma-delimited.
data = pd.read_csv("../data/data_export.csv", sep=",")

In [3]:
# Citation window kept for the analysis: MIN_CITATIONS <= cited_by_count < MAX_CITATIONS_EXCLUSIVE.
MIN_CITATIONS = 14
MAX_CITATIONS_EXCLUSIVE = 15

citation_counts = data['cited_by_count']
in_window = (citation_counts >= MIN_CITATIONS) & (citation_counts < MAX_CITATIONS_EXCLUSIVE)

# .copy() makes the filtered frame independent of the frame it was sliced from,
# so later column assignments on `data` do not raise SettingWithCopyWarning.
data = data[in_window].copy()
print(f"Number of rows in data: {data.shape[0]}")

Number of rows in data: 176


In [4]:
def _most_frequent_category(categories):
    """Return the most frequent non-null value in `categories`.

    Returns NaN when the group holds no non-null category at all, instead of
    raising IndexError on the empty `value_counts()` result.
    """
    counts = categories.value_counts()
    if counts.empty:
        return np.nan
    return counts.index[0]


# Split authors so only 1 author per row.
# Each `authors` cell is treated as text: single quotes and square brackets are
# stripped, the text is lower-cased and then split on ', '.  The vectorised .str
# methods leave missing cells as NaN, where calling .replace on them would raise.
data['authors'] = (
    data['authors']
    .str.replace(r"['\[\]]", "", regex=True)
    .str.lower()
    .str.split(', ')
)
# NOTE: from here on `data` holds one row per (work, author) pair, not one per work.
data = data.explode('authors')

# Get most frequent category per author
df_categories = (
    data.groupby('authors')['predicted_category']
    .agg(_most_frequent_category)
    .reset_index()
)

# Map string categories to integers
category_map = {
    "Psychology": 0, "Innovative Technologies": 1, "Physical Attributes": 2, "Scouting / Finance": 3,
    "Other": 4, "Medical / Injuries": 5, "Tactics analysis": 6
}

# Categories absent from category_map (and missing categories) become NaN here.
df_categories['category_int'] = df_categories['predicted_category'].map(category_map)

# Rename columns
df_categories = df_categories.rename(columns={'authors': 'author_name', 'predicted_category': 'category'})

df_categories.to_csv("author_categories.csv", index=False)

In [5]:
# Read the map table; its `label` column is the join key against author names.
map_df = pd.read_csv("map.csv")

# Left join keeps every map row.  `cluster` is then set to the author's integer
# category (NaN where the label matched no author), and the helper columns
# brought in by the join are dropped again.
merged_df = (
    pd.merge(
        map_df,
        df_categories,
        how='left',
        left_on='label',
        right_on='author_name',
    )
    .assign(cluster=lambda joined: joined['category_int'])
    .drop(columns=['author_name', 'category', 'category_int'])
)

# Writing the result back out is intentionally left disabled:
# # Save the updated map.csv
# merged_df.to_csv("map_updated.csv", index=False)

In [6]:
# Number of map rows left without a cluster value after the merge.
pd.isna(merged_df['cluster']).sum()


274

In [12]:
# Inspect which columns `data` carries (the cell's last expression is displayed).
data.columns

Index(['id', 'doi', 'title', 'relevance_score', 'publication_year',
       'publication_date', 'language', 'type', 'type_crossref', 'indexed_in',
       'countries_distinct_count', 'referenced_works_count', 'cited_by_count',
       'predicted_category', 'authors', 'institutions', 'countries',
       'journal_name', 'avg_author_h_index', 'max_author_h_index',
       'avg_institution_h_index', 'max_institution_h_index', 'journal_h_index',
       'avg_author_citations_past_year', 'max_author_citations_past_year',
       'avg_institution_citations_past_year',
       'max_institution_citations_past_year', 'num_authors',
       'num_institutions'],
      dtype='object')

In [20]:
import copy
import json

# Values shared by the three placeholder authorships below.
_PLACEHOLDER_INSTITUTION_ID = "https://openalex.org/I28166907"
_PLACEHOLDER_AFFILIATION = "Department of Clinical Neuroscience, Karolinska Institutet, Stockholm, Sweden"


def _json_safe(value):
    """Return `value` in a form `json.dump` writes as valid JSON.

    Scalar missing values (None, NaN, NaT, pd.NA) become None, so they are
    written as `null` rather than the bare `NaN` token, which is not valid
    JSON.  NumPy scalars are unwrapped to the matching Python type.  Anything
    else is returned unchanged.
    """
    if value is None:
        return None
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return None
    if isinstance(value, np.generic):
        return value.item()
    return value


def _placeholder_authorship(orcid, author_id, name, is_corresponding, position):
    """Build one fixed authorship entry attached to the placeholder institution.

    `is_corresponding` is passed through as given; the export uses the strings
    "true" / "false" for it.
    """
    return {
        "institutions": [{
            "lineage": [_PLACEHOLDER_INSTITUTION_ID],
            "country_code": "SE",
            "ror": "https://ror.org/056d84691",
            "id": _PLACEHOLDER_INSTITUTION_ID,
            "display_name": "Karolinska Institutet",
            "type": "education"
        }],
        "raw_affiliation_strings": [_PLACEHOLDER_AFFILIATION],
        "author": {
            "orcid": orcid,
            "id": author_id,
            "display_name": name
        },
        "is_corresponding": is_corresponding,
        "affiliations": [{
            "raw_affiliation_string": _PLACEHOLDER_AFFILIATION,
            "institution_ids": [_PLACEHOLDER_INSTITUTION_ID]
        }],
        "raw_author_name": name,
        "countries": ["SE"],
        "author_position": position
    }


# Fields that are NOT read from `data`: every exported record carries the same
# hard-coded value for these keys.
_PLACEHOLDER_FIELDS = {
    "biblio": {
        "volume": "17",
        "issue": "6",
        "first_page": "1073",
        "last_page": "1084"
    },
    "authorships": [
        _placeholder_authorship(
            "https://orcid.org/0000-0003-0523-6621",
            "https://openalex.org/A5011101802",
            "Torbjörn Vestberg",  # was mojibake ("TorbjÃ¶rn") in the earlier version
            "true",
            "first",
        ),
        _placeholder_authorship(
            "https://orcid.org/0000-0001-8444-8686",
            "https://openalex.org/A5008450842",
            "Gustaf Reinebo",
            "false",
            "middle",
        ),
        _placeholder_authorship(
            "https://orcid.org/0000-0002-1968-4321",
            "https://openalex.org/A5074275643",
            "Liselotte Maurex",
            "false",
            "middle",
        ),
    ],
    "referenced_works": [
        "https://openalex.org/W1596056394",
        "https://openalex.org/W1685870920",
        "https://openalex.org/W1734126789",
        "https://openalex.org/W1830481607",
        "https://openalex.org/W1965112907",
        "https://openalex.org/W1988488214",
        "https://openalex.org/W1993478293",
        "https://openalex.org/W2001045706",
        "https://openalex.org/W2001578070"],
    "concepts": [
        {
            "score": 0.62389123,
            "level": 3,
            "id": "https://openalex.org/C2775968953",
            "display_name": "Executive functions",
            "wikidata": "https://www.wikidata.org/wiki/Q783092"
        }],
}

# Key order of every exported record.  Keys found in _PLACEHOLDER_FIELDS take
# the fixed value above; all others are looked up in the current row and fall
# back to None when `data` has no such column.
_RECORD_KEYS = (
    "apc_list",
    "is_retracted",
    "countries_distinct_count",
    "fulltext_origin",
    "is_paratext",
    "keywords",
    "counts_by_year",
    "biblio",
    "primary_location",
    "primary_topic",
    "language",
    "related_works",
    "type",
    "cited_by_percentile_year",
    "authorships",
    "id",
    "abstract_inverted_index",
    "institution_assertions",
    "best_oa_location",
    "cited_by_count",
    "referenced_works",
    "display_name",
    "publication_year",
    "concepts",
    "versions",
    "ids",
    "corresponding_author_ids",
    "doi",
    "cited_by_api_url",
    "fwci",
    "datasets",
    "title",
    "corresponding_institution_ids",
    "publication_date",
    "open_access",
    "institutions_distinct_count",
    "has_fulltext",
    "sustainable_development_goals",
    "mesh",
    "grants",
    "indexed_in",
    "topics",
    "type_crossref",
    "referenced_works_count",
    "relevance_score",
    "citation_normalized_percentile",
    "locations_count",
    "locations",
    "updated_date",
    "created_date",
    "apc_paid",
)

my_export = {
    "results": []
}

# NOTE(review): an earlier cell explodes `data` to one row per author, so a work
# with several authors is exported once per author row here -- confirm that this
# is intended; otherwise de-duplicate on `id` before exporting.
for index, row in data.iterrows():
    record = {
        # deepcopy keeps each record's placeholder structures independent objects
        key: copy.deepcopy(_PLACEHOLDER_FIELDS[key])
        if key in _PLACEHOLDER_FIELDS
        else _json_safe(row.get(key, None))
        for key in _RECORD_KEYS
    }
    my_export["results"].append(record)

with open("../data/data_export.json", "w", encoding="utf-8") as json_file:
    json.dump(my_export, json_file, indent=4)
