In [1]:
import json
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm import tqdm
import math
import os
import itertools
import warnings
import glob
import sys
staff = pd.read_csv("people_flt.csv")
staff_flt = staff[(~staff["MTMT"].isnull()) & (staff["MTMT"]!=11111111) & (staff["MTMT"]!=0) & (staff["Munkakör"].str.lower()!="doktorandusz") & (staff["Munkakör"].str.lower()!="sh doktorandusz")]
staff_flt["MTMT"] = staff_flt["MTMT"].astype("int")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  staff_flt["MTMT"] = staff_flt["MTMT"].astype("int")


In [2]:
def load_pub_for_author(mtid):
    page = 1
    size = 1000
    tries = 1
    author_pubs = []
    while True:
        os.system(f'wget -O tmp.json "https://m2.mtmt.hu/api/publication?cond=published;eq;true&cond=core;eq;true&cond=authors.mtid;eq;{mtid}&sort=publishedYear,desc&sort=firstAuthor,asc&size={size}&fields=citations,pubStats&labelLang=hun&page={page}&format=json" >wget.out 2>wget.err')
        with open("tmp.json", "r") as f:
            try:
                dta = json.load(f)
                if page == 1 and len(dta["content"]) == 0:
                    return []
                author_pubs += dta["content"]
                if not dta["paging"]["last"]:
                    page += 1
                else:
                    break
            except:
                tries += 1
                print("Error, retrying")
                if tries > 30:
                    return []
    return author_pubs

In [3]:
for mtid in tqdm(staff_flt["MTMT"]):
    if not os.path.exists(f"big/{mtid}.json"):
        json.dump(load_pub_for_author(mtid), open(f"big/{mtid}.json", "w", encoding='utf8'), indent=4, ensure_ascii=False)

100%|██████████████████████████████████████| 119/119 [00:00<00:00, 33039.13it/s]


In [4]:
ifdf = pd.read_csv('./pubgraph/ifdf_v8_2021.csv')
ifdf.loc[ifdf["if"].isnull(), "if"] = 0.0
maxIFYear = ifdf.year.max()
ifdf_e = ifdf.copy().set_index(["year", "eissn"]).sort_index()
ifdf_p = ifdf.copy().set_index(["year", "pissn"]).sort_index()

def getif(pub):
    if "journal" not in pub:
        return 0.0, 1.0
    year = min(pub["publishedYear"], maxIFYear)
    if "pIssn" in pub["journal"]:
        issn = pub["journal"]["pIssn"]
        if (year,issn) in ifdf_p.index:
            ifval = ifdf_p.loc[(year,issn),"if"]
            catif = ifdf_p.loc[(year,issn),"categoryMedianIf"]
            if len(ifval) == 1 and float(ifval) > 0.0:
                if len(catif) == 1 and float(catif) > 0.0:
                    return float(ifval), float(catif)
                else:
                    return float(ifval), float(ifval) # ha if-es az újság, de még nem elég régóta. nincs sok ilyen.
    if "eIssn" in pub["journal"]:
        issn = pub["journal"]["eIssn"]
        if (year,issn) in ifdf_e.index:
            ifval = ifdf_e.loc[(year,issn),"if"]
            catif = ifdf_e.loc[(year,issn),"categoryMedianIf"]
            if len(ifval) == 1 and float(ifval) > 0.0:
                if len(catif) == 1 and float(catif) > 0.0:
                    return float(ifval), float(catif)
                else:
                    return float(ifval), float(ifval) # ha if-es az újság, de még nem elég régóta. nincs sok ilyen.
    return 0.0, 1.0

# a függvény visszaadja a folyóirat rangját (D1, Q1, Q2, ...)
def getrating(pub):
    if "ratings" not in pub:
        return ""
    for r in pub["ratings"]:
        if r["otype"] == "SjrRating" and "ranking" in r:
            return r["ranking"]
    return ""


def score_person(mtid,auth_year,cit_year):
    if not os.path.exists(f'big/{mtid}.json'):
        raise Exception("Nincs letöltve a publikációs lista!")
    pubs = json.load(open(f'big/{mtid}.json'))
    score = score_publist([mtid], pubs,auth_year,cit_year)
    score["mtid"] = int(mtid)
    return score



def score_publist(mtid_list, pubs,auth_year,cit_year):
    score = {"pub_count": 0, "q_paper": 0, "q_n_paper": 0, "q_book": 0, "if": 0, "relif": 0, "ifcnt": 0, "if_norm": 0, "relif_norm": 0, "h": 0, "i": 0, "first_pub": 2100, "last_pub": 0}
    cit_list = []
    useShare = False
    ix = 0
    for pub in pubs:
        if "publishedYear" in pub:
            if  pub["publishedYear"] >=auth_year:
                ix += 1
                if "error" in pub and pub["error"]!="VALIDATION_ERROR":
                    continue
                if "category" not in pub or pub["category"]["label"]!="Tudományos":
                    continue
                # compute share of the authos
                authors = 0
                share = len(mtid_list)/len(pub["authorships"]) if len(pub["authorships"])>0 else 0
                for a in pub["authorships"]:
                    if "author" in a and a["author"]["mtid"] in mtid_list and a["authorTyped"]:
                        authors += 1
                if authors < len(mtid_list):
                    continue
                # compute number of pages
                plength = 0
                try:
                    if "pageLength" in pub:
                        plength = int(pub["pageLength"])
                    elif "firstPage" in pub and "lastPage" in pub:
                        plength = int(pub["lastPage"]) - int(pub["firstPage"]) + 1            
                except:
                    pass
                # compute impact factor
                ifct, nrm = getif(pub)        
                # compute Q score
                if pub["otype"] == "JournalArticle" and "journal" in pub:
                    if "fullPublication" in pub and pub["fullPublication"] and "reviewType" in pub["journal"] and pub["journal"]["reviewType"]=="REVIEWED" and "subType" in pub and \
                        pub["subType"]["name"] in ["Szakcikk", "Összefoglaló cikk", "Konferenciaközlemény", "Rövid közlemény", "Sokszerzős vagy csoportos szerzőségű szakcikk"]:                
                        if ifct > 0:
                            totalscr = max(0.6,ifct)
                        elif "foreignEdition" in pub and pub["foreignEdition"]:
                            totalscr = 0.4
                        else:
                            totalscr = 0.3
                        score["q_paper"] += totalscr * share
                        score["q_n_paper"] += totalscr * share / nrm if nrm > 0 else totalscr * share
                        if ifct > 0:
                            score["if"] += ifct
                            score["relif"] += totalscr * share
                            score["ifcnt"] += 1
                            score["if_norm"] += ifct / nrm if nrm > 0 else ifct
                            score["relif_norm"] += totalscr * share / nrm if nrm > 0 else totalscr * share
                elif "fullPublication" in pub and pub["fullPublication"] and \
                    ((pub["type"]["label"]=="Könyvrészlet" and "subType" in pub and pub["subType"]["label"]=="Konferenciaközlemény (Könyvrészlet)") or pub["type"]["label"]=="Egyéb konferenciaközlemény"):                
                    if plength >= 4:
                        totalscr = 0.2 if "foreignLanguage" in pub and pub["foreignLanguage"] else 0.1
                        score["q_paper"] += totalscr * share
                        score["q_n_paper"] += totalscr * share
                elif pub["type"]["label"] == "Könyv" or \
                    (pub["type"]["label"] == "Könyvrészlet" and "subType" in pub and pub["subType"]["label"] in ["Könyvfejezet (Könyvrészlet)", "Szaktanulmány (Könyvrészlet)"]):
                    if plength >= 10:
                        if plength >= 100:
                            totalscr = 2 if "foreignLanguage" in pub and pub["foreignLanguage"] else 1
                        else:
                            totalscr = (0.2 if "foreignLanguage" in pub and pub["foreignLanguage"] else 0.1) * math.floor(plength/10)
                        score["q_book"] += totalscr * share
                # Process citations
                independentCitation=0
                if "pubStats" in pub:
                    for a in pub["pubStats"]["years"]:
                        if a["year"]>=cit_year:
                            independentCitation+=a["independentCitationCount"]
                if "independentCitationCount" in pub:
                    score["i"] += independentCitation
                    cit_list.append(independentCitation)
            
            
                if "publishedYear" in pub:
                    score["last_pub"] = max(score["last_pub"], pub["publishedYear"])
                    score["first_pub"] = min(score["first_pub"], pub["publishedYear"])
                score["pub_count"] += 1
    score["q"] = score["q_paper"] + np.minimum(score["q_book"], 3.0)
    score["qn"] = score["q_n_paper"] + np.minimum(score["q_book"], 3.0)
    # h-index
    cit_list.sort(reverse=True)
    while score["h"] < len(cit_list) and score["h"] < cit_list[score["h"]]:
        score["h"] += 1
        
    return score




def get_h_score(mtid,auth_year,cit_year):
    cit_list = []
    if os.path.exists(f'big/{mtid}.json'):
        with open(f'big/{mtid}.json') as f:
            try:
                js = json.load(f)
                for pub in js:
                    if "publishedYear" in pub:
                        if  pub["publishedYear"] >=auth_year:
                            independentCitation=0
                            if "pubStats" in pub:
                                for a in pub["pubStats"]["years"]:
                                    if a["year"]>=cit_year:
                                        independentCitation+=a["independentCitationCount"]
                            cit_list.append(pub[independentCitation])
            except:
                pass
    cit_list.sort(reverse=True)
    hindex = 0
    while hindex < len(cit_list) and hindex < cit_list[hindex]:
        hindex += 1
    return hindex

In [None]:
dta = {"MTMT": [], "pubCount": [], "qScore": [], "qnScore": [], "ifCount": [], "ifScore": [], "relifScore": [], "ifNormScore": [], "relifNormScore": [], "citations": [], "hIndex": [], "firstPub": [], "lastPub": [], "auth_year": [], "cit_year": []}
for auth_year in tqdm(range(1990,2024)):
    for cit_year in range(auth_year,2024):
        staff_flt["H index"] = [0 if np.isnan(mtid) else get_h_score(int(mtid),auth_year,cit_year) for mtid in staff_flt["MTMT"]]
        for idx, prs in staff_flt.iterrows():
            scr = score_person(prs["MTMT"],auth_year,cit_year)
            dta["MTMT"].append(scr["mtid"])
            dta["pubCount"].append(scr["pub_count"])
            dta["qScore"].append(scr["q"])
            dta["qnScore"].append(scr["qn"])
            dta["ifCount"].append(scr["ifcnt"])
            dta["ifScore"].append(scr["if"])
            dta["relifScore"].append(scr["relif"])
            dta["ifNormScore"].append(scr["if_norm"])
            dta["relifNormScore"].append(scr["relif_norm"])
            dta["citations"].append(scr["i"])
            dta["hIndex"].append(scr["h"])
            dta["firstPub"].append(scr["first_pub"])
            dta["lastPub"].append(scr["last_pub"])
            dta["auth_year"].append(auth_year)
            dta["cit_year"].append(cit_year)
node_person = pd.DataFrame(dta)
node_person

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  staff_flt["H index"] = [0 if np.isnan(mtid) else get_h_score(int(mtid),auth_year,cit_year) for mtid in staff_flt["MTMT"]]
 18%|███████▏                                 | 6/34 [41:56<3:09:23, 405.84s/it]

In [None]:
result_no_aff.to_csv("big_no aff.csv")

In [None]:
result_no_aff.to_csv("big_no aff.csv")

In [None]:
def load_pub_for_author(mtid):
    page = 1
    size = 1000
    tries = 1
    author_pubs = []
    while True:
        os.system(f'wget -O tmp.json "https://m2.mtmt.hu/api/publication?cond=published;eq;true&cond=institutes;inia;10856&cond=core;eq;true&cond=authors.mtid;eq;{mtid}&sort=publishedYear,desc&sort=firstAuthor,asc&size={size}&fields=citations,pubStats&labelLang=hun&page={page}&format=json" >wget.out 2>wget.err')
        with open("tmp.json", "r") as f:
            try:
                dta = json.load(f)
                if page == 1 and len(dta["content"]) == 0:
                    return []
                author_pubs += dta["content"]
                if not dta["paging"]["last"]:
                    page += 1
                else:
                    break
            except:
                tries += 1
                print("Error, retrying")
                if tries > 30:
                    return []
    return author_pubs

In [None]:
for mtid in tqdm(staff_flt["MTMT"]):
    if not os.path.exists(f"big_ins/{mtid}.json"):
        json.dump(load_pub_for_author(mtid), open(f"big_ins/{mtid}.json", "w", encoding='utf8'), indent=4, ensure_ascii=False)

In [None]:
ifdf = pd.read_csv('./pubgraph/ifdf_v8_2021.csv')
ifdf.loc[ifdf["if"].isnull(), "if"] = 0.0
maxIFYear = ifdf.year.max()
ifdf_e = ifdf.copy().set_index(["year", "eissn"]).sort_index()
ifdf_p = ifdf.copy().set_index(["year", "pissn"]).sort_index()

def getif(pub):
    if "journal" not in pub:
        return 0.0, 1.0
    year = min(pub["publishedYear"], maxIFYear)
    if "pIssn" in pub["journal"]:
        issn = pub["journal"]["pIssn"]
        if (year,issn) in ifdf_p.index:
            ifval = ifdf_p.loc[(year,issn),"if"]
            catif = ifdf_p.loc[(year,issn),"categoryMedianIf"]
            if len(ifval) == 1 and float(ifval) > 0.0:
                if len(catif) == 1 and float(catif) > 0.0:
                    return float(ifval), float(catif)
                else:
                    return float(ifval), float(ifval) # ha if-es az újság, de még nem elég régóta. nincs sok ilyen.
    if "eIssn" in pub["journal"]:
        issn = pub["journal"]["eIssn"]
        if (year,issn) in ifdf_e.index:
            ifval = ifdf_e.loc[(year,issn),"if"]
            catif = ifdf_e.loc[(year,issn),"categoryMedianIf"]
            if len(ifval) == 1 and float(ifval) > 0.0:
                if len(catif) == 1 and float(catif) > 0.0:
                    return float(ifval), float(catif)
                else:
                    return float(ifval), float(ifval) # ha if-es az újság, de még nem elég régóta. nincs sok ilyen.
    return 0.0, 1.0

# a függvény visszaadja a folyóirat rangját (D1, Q1, Q2, ...)
def getrating(pub):
    if "ratings" not in pub:
        return ""
    for r in pub["ratings"]:
        if r["otype"] == "SjrRating" and "ranking" in r:
            return r["ranking"]
    return ""


def score_person(mtid,auth_year,cit_year):
    if not os.path.exists(f'big_ins/{mtid}.json'):
        raise Exception("Nincs letöltve a publikációs lista!")
    pubs = json.load(open(f'big_ins/{mtid}.json'))
    score = score_publist([mtid], pubs,auth_year,cit_year)
    score["mtid"] = int(mtid)
    return score



def score_publist(mtid_list, pubs,auth_year,cit_year):
    score = {"pub_count": 0, "q_paper": 0, "q_n_paper": 0, "q_book": 0, "if": 0, "relif": 0, "ifcnt": 0, "if_norm": 0, "relif_norm": 0, "h": 0, "i": 0, "first_pub": 2100, "last_pub": 0}
    cit_list = []
    useShare = False
    ix = 0
    for pub in pubs:
        if "publishedYear" in pub:
            if  pub["publishedYear"] >=auth_year:
                ix += 1
                if "error" in pub and pub["error"]!="VALIDATION_ERROR":
                    continue
                if "category" not in pub or pub["category"]["label"]!="Tudományos":
                    continue
                # compute share of the authos
                authors = 0
                share = len(mtid_list)/len(pub["authorships"]) if len(pub["authorships"])>0 else 0
                for a in pub["authorships"]:
                    if "author" in a and a["author"]["mtid"] in mtid_list and a["authorTyped"]:
                        authors += 1
                if authors < len(mtid_list):
                    continue
                # compute number of pages
                plength = 0
                try:
                    if "pageLength" in pub:
                        plength = int(pub["pageLength"])
                    elif "firstPage" in pub and "lastPage" in pub:
                        plength = int(pub["lastPage"]) - int(pub["firstPage"]) + 1            
                except:
                    pass
                # compute impact factor
                ifct, nrm = getif(pub)        
                # compute Q score
                if pub["otype"] == "JournalArticle" and "journal" in pub:
                    if "fullPublication" in pub and pub["fullPublication"] and "reviewType" in pub["journal"] and pub["journal"]["reviewType"]=="REVIEWED" and "subType" in pub and \
                        pub["subType"]["name"] in ["Szakcikk", "Összefoglaló cikk", "Konferenciaközlemény", "Rövid közlemény", "Sokszerzős vagy csoportos szerzőségű szakcikk"]:                
                        if ifct > 0:
                            totalscr = max(0.6,ifct)
                        elif "foreignEdition" in pub and pub["foreignEdition"]:
                            totalscr = 0.4
                        else:
                            totalscr = 0.3
                        score["q_paper"] += totalscr * share
                        score["q_n_paper"] += totalscr * share / nrm if nrm > 0 else totalscr * share
                        if ifct > 0:
                            score["if"] += ifct
                            score["relif"] += totalscr * share
                            score["ifcnt"] += 1
                            score["if_norm"] += ifct / nrm if nrm > 0 else ifct
                            score["relif_norm"] += totalscr * share / nrm if nrm > 0 else totalscr * share
                elif "fullPublication" in pub and pub["fullPublication"] and \
                    ((pub["type"]["label"]=="Könyvrészlet" and "subType" in pub and pub["subType"]["label"]=="Konferenciaközlemény (Könyvrészlet)") or pub["type"]["label"]=="Egyéb konferenciaközlemény"):                
                    if plength >= 4:
                        totalscr = 0.2 if "foreignLanguage" in pub and pub["foreignLanguage"] else 0.1
                        score["q_paper"] += totalscr * share
                        score["q_n_paper"] += totalscr * share
                elif pub["type"]["label"] == "Könyv" or \
                    (pub["type"]["label"] == "Könyvrészlet" and "subType" in pub and pub["subType"]["label"] in ["Könyvfejezet (Könyvrészlet)", "Szaktanulmány (Könyvrészlet)"]):
                    if plength >= 10:
                        if plength >= 100:
                            totalscr = 2 if "foreignLanguage" in pub and pub["foreignLanguage"] else 1
                        else:
                            totalscr = (0.2 if "foreignLanguage" in pub and pub["foreignLanguage"] else 0.1) * math.floor(plength/10)
                        score["q_book"] += totalscr * share
                # Process citations
                independentCitation=0
                if "pubStats" in pub:
                    for a in pub["pubStats"]["years"]:
                        if a["year"]>=cit_year:
                            independentCitation+=a["independentCitationCount"]
                if "independentCitationCount" in pub:
                    score["i"] += independentCitation
                    cit_list.append(independentCitation)
            
            
                if "publishedYear" in pub:
                    score["last_pub"] = max(score["last_pub"], pub["publishedYear"])
                    score["first_pub"] = min(score["first_pub"], pub["publishedYear"])
                score["pub_count"] += 1
    score["q"] = score["q_paper"] + np.minimum(score["q_book"], 3.0)
    score["qn"] = score["q_n_paper"] + np.minimum(score["q_book"], 3.0)
    # h-index
    cit_list.sort(reverse=True)
    while score["h"] < len(cit_list) and score["h"] < cit_list[score["h"]]:
        score["h"] += 1
        
    return score




def get_h_score(mtid,auth_year,cit_year):
    cit_list = []
    if os.path.exists(f'big_ins/{mtid}.json'):
        with open(f'big_ins/{mtid}.json') as f:
            try:
                js = json.load(f)
                for pub in js:
                    if "publishedYear" in pub:
                        if  pub["publishedYear"] >=auth_year:
                            independentCitation=0
                            if "pubStats" in pub:
                                for a in pub["pubStats"]["years"]:
                                    if a["year"]>=cit_year:
                                        independentCitation+=a["independentCitationCount"]
                            cit_list.append(pub[independentCitation])
            except:
                pass
    cit_list.sort(reverse=True)
    hindex = 0
    while hindex < len(cit_list) and hindex < cit_list[hindex]:
        hindex += 1
    return hindex

In [None]:
dta = {"MTMT": [], "pubCount": [], "qScore": [], "qnScore": [], "ifCount": [], "ifScore": [], "relifScore": [], "ifNormScore": [], "relifNormScore": [], "citations": [], "hIndex": [], "firstPub": [], "lastPub": [], "auth_year": [], "cit_year": []}
for auth_year in tqdm(range(1990,2024)):
    for cit_year in range(auth_year,2024):
        staff_flt["H index"] = [0 if np.isnan(mtid) else get_h_score(int(mtid),auth_year,cit_year) for mtid in staff_flt["MTMT"]]
        for idx, prs in staff_flt.iterrows():
            scr = score_person(prs["MTMT"],auth_year,cit_year)
            dta["MTMT"].append(scr["mtid"])
            dta["pubCount"].append(scr["pub_count"])
            dta["qScore"].append(scr["q"])
            dta["qnScore"].append(scr["qn"])
            dta["ifCount"].append(scr["ifcnt"])
            dta["ifScore"].append(scr["if"])
            dta["relifScore"].append(scr["relif"])
            dta["ifNormScore"].append(scr["if_norm"])
            dta["relifNormScore"].append(scr["relif_norm"])
            dta["citations"].append(scr["i"])
            dta["hIndex"].append(scr["h"])
            dta["firstPub"].append(scr["first_pub"])
            dta["lastPub"].append(scr["last_pub"])
            dta["auth_year"].append(auth_year)
            dta["cit_year"].append(cit_year)
node_person = pd.DataFrame(dta)
node_person.to_csv("big_aff.csv")

In [None]:
node_person.to_csv("big_aff.csv")

# pub no aff

In [None]:
def load_pub_for_author(mtid):
    tries = 1
    page = 1
    while True:
        os.system(f'wget -O tmp.json "https://m2.mtmt.hu/api/publication?cond=authors.mtid;eq;{mtid}&page={page}&fields=citations,pubStats&format=json" >wget.out 2>wget.err')
        with open("tmp.json", "r") as f:
            try:
                dta = json.load(f)
                if not os.path.exists(f"journals3/{mtid}{page}.json"):
                    json.dump(dta, open(f"journals3/{mtid}{page}.json", "w", encoding='utf8'), indent=4, ensure_ascii=False)
                if not dta["paging"]["last"]:
                    page += 1
                else:
                    break
            except:
                tries += 1
                print("Error, retrying")
                if tries > 30:
                    return []
staff_flt=pd.read_csv('people_flt.csv')
for mtid in tqdm(staff_flt["MTMT"]):
    load_pub_for_author(mtid)


In [None]:
df=pd.DataFrame(columns =['cit_year','publishedYear','name','authors','independentCitingPubCount','types'])
for files in tqdm(os.listdir('journals3/')):
    dta = json.load(open(f'journals3/{files}'))
    for x in dta["content"]:
        for cit_year in range(x["publishedYear"],2024):
            auth=""
            for y in x["authorships"]:
                if "author" in y:
                    auth=auth+str(y["author"]["mtid"])+","
            independentCitation=0
            if "pubStats" in x:
                for a in x["pubStats"]["years"]:
                    if a["year"]>=cit_year:
                        independentCitation+=a["independentCitationCount"]
            a=pd.Series({'cit_year':cit_year,'publishedYear':x["publishedYear"],'name':x["title"],'authors':auth,'independentCitingPubCount':independentCitation,'types':x["otype"]})
            df=pd.concat([df, a.to_frame().T], ignore_index=True)
df=df.drop_duplicates(subset=['name','cit_year'])
df = df[ (df['types'] != 'Thesis') & (df['types'] != 'Achievement') ]
df

In [None]:
piub1=df

In [None]:
x=piub1['authors'].str.split(',',expand=True)
x=x.drop([32], axis=1)
x=piub1.join(x)
staff_flt["MTMT"] = staff_flt["MTMT"].apply(str)
for i in range(32):
    mask = x[i].isin(staff_flt["MTMT"])
    x[i] = x[i].where(mask, None)
for i in range(32):
    y=staff_flt
    pubs=y.rename(columns={"MTMT": i})
    x=pd.merge(x, pubs[[i , "Név"]], how='left',  on=[i])
    x=x.drop([i], axis=1)
    x=x.rename(columns={"Név": i})
x=x.drop(["authors"], axis=1)
x['authors'] = x[x.columns[5:]].apply(
    lambda x: ','.join(x.dropna().astype(str)),
    axis=1
)
x=x[['cit_year','publishedYear','name','authors','independentCitingPubCount','types']]
x

In [None]:
x.to_csv("pub_no_aff.csv")

In [None]:
def load_pub_for_author(mtid):
    tries = 1
    page = 1
    while True:
        os.system(f'wget -O tmp.json "https://m2.mtmt.hu/api/publication?cond=authors.mtid;eq;{mtid}&cond=institutes;inia;10856&page={page}&fields=citations,pubStats&format=json" >wget.out 2>wget.err')
        with open("tmp.json", "r") as f:
            try:
                dta = json.load(f)
                if not os.path.exists(f"journals2/{mtid}{page}.json"):
                    json.dump(dta, open(f"journals2/{mtid}{page}.json", "w", encoding='utf8'), indent=4, ensure_ascii=False)
                if not dta["paging"]["last"]:
                    page += 1
                else:
                    break
            except:
                tries += 1
                print("Error, retrying")
                if tries > 30:
                    return []
staff_flt=pd.read_csv('people_flt.csv')
for mtid in tqdm(staff_flt["MTMT"]):
    load_pub_for_author(mtid)
df=pd.DataFrame(columns =['cit_year','publishedYear','name','authors','independentCitingPubCount','types'])
for files in tqdm(os.listdir('journals2/')):
    dta = json.load(open(f'journals2/{files}'))
    for x in dta["content"]:
        for cit_year in range(x["publishedYear"],2024):
            auth=""
            for y in x["authorships"]:
                if "author" in y:
                    auth=auth+str(y["author"]["mtid"])+","
            independentCitation=0
            if "pubStats" in x:
                for a in x["pubStats"]["years"]:
                    if a["year"]>=cit_year:
                        independentCitation+=a["independentCitationCount"]
            a=pd.Series({'cit_year':cit_year,'publishedYear':x["publishedYear"],'name':x["title"],'authors':auth,'independentCitingPubCount':independentCitation,'types':x["otype"]})
            df=pd.concat([df, a.to_frame().T], ignore_index=True)
df=df.drop_duplicates(subset=['name','cit_year'])
df = df[ (df['types'] != 'Thesis') & (df['types'] != 'Achievement') ]
df

In [None]:
x=df['authors'].str.split(',',expand=True)
x=x.drop([32], axis=1)
x=df.join(x)
staff_flt["MTMT"] = staff_flt["MTMT"].apply(str)
for i in range(32):
    mask = x[i].isin(staff_flt["MTMT"])
    x[i] = x[i].where(mask, None)
for i in range(32):
    y=staff_flt
    pubs=y.rename(columns={"MTMT": i})
    x=pd.merge(x, pubs[[i , "Név"]], how='left',  on=[i])
    x=x.drop([i], axis=1)
    x=x.rename(columns={"Név": i})
x=x.drop(["authors"], axis=1)
x['authors'] = x[x.columns[5:]].apply(
    lambda x: ','.join(x.dropna().astype(str)),
    axis=1
)
x=x[['cit_year','publishedYear','name','authors','independentCitingPubCount','types']]
x.to_csv("pub_aff.csv")