In [11]:
import pandas as pd

# 1. MacRoman 인코딩으로 파일 읽기
collabo_df = pd.read_csv('Paper_Dataset.csv', encoding='MacRoman', low_memory=False)

# 2. UTF-8로 다시 저장
collabo_df.to_csv('Paper_Dataset_UTF8.csv', index=False, encoding='utf-8')

print("파일을 UTF-8 형식으로 변환하여 저장했습니다.")


파일을 UTF-8 형식으로 변환하여 저장했습니다.


## Elsevier + Semantic 합쳐주기

In [None]:
import pandas as pd
import re

# Function to normalize titles
def normalize_title(title):
    # Convert to lowercase
    title = title.lower()
    # Remove special characters and multiple spaces
    title = re.sub(r'\s+', ' ', re.sub(r'[^a-zA-Z0-9\s]', '', title))
    return title.strip()

# Try different encodings in case of decoding errors
try:
    # Load the datasets using 'utf-8' encoding first
    elsevier_df = pd.read_csv('Collabo_Paper_Elsevier.csv', encoding='utf-8')
    sem_updated_df = pd.read_csv('Collabo_Paper_Sem_Updated.csv', encoding='utf-8')
except UnicodeDecodeError:
    # If 'utf-8' fails, try 'latin1' encoding
    elsevier_df = pd.read_csv('Collabo_Paper_Elsevier.csv', encoding='utf-8')
    sem_updated_df = pd.read_csv('Collabo_Paper_Sem_Updated.csv', encoding='utf-8')

# Normalize titles in both dataframes
elsevier_df['Normalized_Title'] = elsevier_df['Title'].apply(normalize_title)
sem_updated_df['Normalized_Title'] = sem_updated_df['Title'].apply(normalize_title)

# Drop duplicate titles in sem_updated_df, keeping only the first occurrence
sem_updated_df = sem_updated_df.drop_duplicates(subset=['Normalized_Title'], keep='first')

# Merge the datasets on the normalized title
merged_df = pd.merge(elsevier_df, sem_updated_df[['Normalized_Title', 'Abstract', 'Authors', 'Citation Count', 'Influential Citation Count']],
                     on='Normalized_Title', how='left')

# Drop the 'Normalized_Title' column, as it's no longer needed
merged_df = merged_df.drop(columns=['Normalized_Title'])

# Save the resulting dataframe to a new CSV file
merged_df.to_csv('Collabo_Paper_Dataset.csv', index=False, encoding='utf-8')

print("Merged dataset saved as 'Collabo_Paper_Dataset.csv'.")


## Dataset <- Industry Number (6T)

In [20]:
import pandas as pd

# 데이터 불러오기
collabo_df = pd.read_csv('Paper_Dataset_Test.csv', encoding='utf-8')
processed_df = pd.read_csv('processed_data_v7_231120.csv', encoding='utf-8')

# 각 데이터프레임의 열 이름 출력
print("Collabo_Paper_Dataset 열 이름:")
print(collabo_df.columns)

print("\nProcessed_data 열 이름:")
print(processed_df.columns)

  collabo_df = pd.read_csv('Paper_Dataset_Test.csv', encoding='utf-8')


Collabo_Paper_Dataset 열 이름:
Index(['과제고유번호', '과제수행년도', '성과발생년도', '성과발생부처명', '성과사업ID', '성과사업명', '성과고유번호',
       '논문_SCI구분_최종', 'Title', 'Journal', '논문번호', '논문_권', '논문_호', 'DOI',
       '논문_기여율_확정', 'Affiliation', 'Authors', 'Citation Count',
       'Influential Citation Count', 'Abstract'],
      dtype='object')

Processed_data 열 이름:
Index(['사업ID', '부처명', '총연구비', '정부연구비', '총연구기간_시작', '총연구기간_종료', '연구기간',
       '참여인원_공동위탁', '과학기술표준분류', '육T관련기술', '연구개발단계', '경제사회목적', '과제고유번호',
       '과제수행년도', '협업_매트릭스', '협업_type_new', '총_참여조직_수', '대기업_수', '중견_수', '중소_수',
       '대학_수', '연구기관_수', '기타_수', 'blau_index', '논문_개수', 'SCI_개수', '비SCI개수',
       '특허_개수', '출원_개수', '등록_개수', '국내_특허_개수', '국내_출원_개수', '국내_등록_개수',
       '해외_특허_개수', '해외_출원_개수', '해외_등록_개수', '성과_개수', '연도별_밀도', 'HHI', 'rv_HHI',
       'GDC', 'num_external_tie', 'avg_num_external_tie'],
      dtype='object')


In [22]:
import chardet

# 파일의 인코딩을 감지하기
with open('Paper_Dataset.csv', 'rb') as f:
    result = chardet.detect(f.read())
    print("Paper_Dataset.csv 인코딩:", result['encoding'])

with open('processed_data_v7_231120.csv', 'rb') as f:
    result = chardet.detect(f.read())
    print("processed_data_v7_231120.csv 인코딩:", result['encoding'])

Paper_Dataset.csv 인코딩: UTF-8-SIG
processed_data_v7_231120.csv 인코딩: UTF-8-SIG


In [24]:
import pandas as pd

# CSV 파일 불러오기 (BOM을 제거하기 위해 'utf-8-sig' 사용)
collabo_df = pd.read_csv('Paper_Dataset.csv', encoding='utf-8-sig')
processed_df = pd.read_csv('processed_data_v7_231120.csv', encoding='utf-8-sig')

# '과제고유번호'를 기준으로 '육T관련기술' 병합
merged_df = pd.merge(collabo_df, processed_df[['과제고유번호', '육T관련기술']],
                     on='과제고유번호', how='left')

# CSV 파일로 저장 (UTF-8로)
merged_df.to_csv('Paper_Dataset_2.csv', index=False, encoding='utf-8-sig')

print("병합 및 저장 완료!")


  collabo_df = pd.read_csv('Paper_Dataset.csv', encoding='utf-8-sig')


병합 및 저장 완료!


## Abstract 한줄로 만들기

In [26]:
import pandas as pd

# 1. CSV 파일 불러오기
collabo_df = pd.read_csv('Paper_Dataset.csv', encoding='utf-8-sig')  # BOM 제거를 위한 인코딩

# 2. 'Abstract' 열의 엔터를 공백으로 대체하여 한 줄로 만들기
collabo_df['Abstract'] = collabo_df['Abstract'].str.replace('\n', ' ', regex=True)

# 3. CSV 파일로 저장
collabo_df.to_csv('Paper_Dataset_Processed.csv', index=False, encoding='utf-8-sig')

print("Abstract 열의 줄바꿈 제거 및 저장 완료!")


  collabo_df = pd.read_csv('Paper_Dataset.csv', encoding='utf-8-sig')  # BOM 제거를 위한 인코딩


Abstract 열의 줄바꿈 제거 및 저장 완료!


## Affiliation 깨진 글씨 수정

In [None]:
import pandas as pd

# Step 1: Read the CSV file
file_path = 'Paper_Dataset.csv'
df = pd.read_csv(file_path, encoding='utf-8-sig')

# Step 2: Check for rows where 'Affiliation' contains Korean characters or '?'
# A regex pattern to check for Korean characters
pattern = r'[\uac00-\ud7af?]'  # Korean characters range and '?'
filtered_rows = df[df['Affiliation'].str.contains(pattern, na=False)]

# Step 3: Output results and count
print(filtered_rows)
print("Number of rows containing Korean characters or '?':", len(filtered_rows))


In [43]:
import pandas as pd
import re

# Step 1: Read the CSV file with utf-8-sig encoding
df = pd.read_csv('Paper_Dataset.csv', encoding='utf-8-sig')

# Step 2: Initialize a list to store problematic organizations
problematic_orgs = []

# Step 3: Define a function to extract organizations containing Korean characters or "?"
def extract_problematic_affiliations(affiliation_str):
    # Check if the affiliation_str is a valid string
    if isinstance(affiliation_str, str):
        affiliations = affiliation_str.split('; ')
        for org in affiliations:
            if re.search(r'[\uac00-\ud7af]|[\?]', org):  # Check for Korean characters or "?"
                problematic_orgs.append(org)

# Step 4: Apply the function to the 'Affiliation' column
df['Affiliation'].apply(extract_problematic_affiliations)

# Step 5: Create the replacement dictionary
replacement_dict = {org: "" for org in set(problematic_orgs)}

# Step 6: Print the replacement dictionary in the desired format
print("replacement_dict = {")
for key in replacement_dict:
    print(f'    "{key}": "",')
print("}")


replacement_dict = {
    "Universit채tsklinikum Freiburg": "",
    "T챕l챕com Paris": "",
    "Universit채t Wien": "",
    "Universit횪 degli Studi della Campania Luigi Vanvitelli": "",
    "Max Planck In짯sti짯tute for Mar짯ine Mi짯cro짯bi짯o짯logy": "",
    "Universit챕 de Franche-Comt챕": "",
    "Universit챕 du Qu챕bec 횪 Chicoutimi": "",
    "Bundesanstalt f체r Geowissenschaften und Rohstoffe": "",
    "Technische Universit채t Bergakademie Freiberg": "",
    "Institut de Recherche en Informatique et Syst챔mes Al챕atoires": "",
    "H첩gskolen i Bergen": "",
    "Goethe-Universit채t Frankfurt am Main": "",
    "Universidad Aut처noma de Sinaloa": "",
    "횋cole Normale Sup챕rieure": "",
    "Tecnol처gico de Monterrey": "",
    "The Catholic University of Korea Eunpyeong St. Mary?셲 Hospital": "",
    "Katolicki Uniwersytet Lubelski Jana Paw흢a II": "",
    "Y캇ld캇z Teknik 횥niversitesi": "",
    "Instiutet f철r rymdfysik": "",
    "Institut f체r Chemische Technologie von Materialien": "",
    "Universit횪 di Pisa":

  df = pd.read_csv('Paper_Dataset.csv', encoding='utf-8-sig')


## Affiliation Dictionary 만들기

In [31]:
import pandas as pd

# Step 1: Read the CSV file
file_path = 'Paper_Dataset.csv'
df = pd.read_csv(file_path, encoding='utf-8-sig',low_memory=False)

df['Affiliation'] = df['Affiliation'].str.replace("Universit채tsklinikum Freiburg", "University ofsklinikum Freiburg")
df['Affiliation'] = df['Affiliation'].str.replace("T챕l챕com Paris", "Telecom Paris")
df['Affiliation'] = df['Affiliation'].str.replace("Universit챕t Wien", "University of Wien")
df['Affiliation'] = df['Affiliation'].str.replace("Universit횪 degli Studi della Campania Luigi Vanvitelli", "Universita degli Studi della Campania Luigi Vanvitelli")
df['Affiliation'] = df['Affiliation'].str.replace("Max Planck In짯sti짯tute for Mar짯ine Mi짯cro짯bi짯o짯logy", "Max Planck Institute for Marine Microbiology")
df['Affiliation'] = df['Affiliation'].str.replace("Universit챕 de Franche-Comt챕", "Universite de Franche-Comte")
df['Affiliation'] = df['Affiliation'].str.replace("Universit챕 du Qu챕bec 횪 Chicoutimi", "Universite du Quebec a Chicoutimi")
df['Affiliation'] = df['Affiliation'].str.replace("Bundesanstalt f체r Geowissenschaften und Rohstoffe", "Bundesanstalt fur Geowissenschaften und Rohstoffe (BGR)")
df['Affiliation'] = df['Affiliation'].str.replace("Technische Universit채t Bergakademie Freiberg", "Technische University of Bergakademie Freiberg")
df['Affiliation'] = df['Affiliation'].str.replace("Institut de Recherche en Informatique et Syst챔mes Al챕atoires", "Institut de Recherche en Informatique et Systemes Aleatoires (IRISA)")
df['Affiliation'] = df['Affiliation'].str.replace("H첩gskolen i Bergen", "Høgskolen i Bergen")
df['Affiliation'] = df['Affiliation'].str.replace("Goethe-Universit채t Frankfurt am Main", "Goethe-University of Frankfurt am Main")
df['Affiliation'] = df['Affiliation'].str.replace("Universidad Aut처noma de Sinaloa", "Universidad Autonoma de Sinaloa")
df['Affiliation'] = df['Affiliation'].str.replace("횋cole Normale Sup챕rieure", "Ecole Normale Superieure")
df['Affiliation'] = df['Affiliation'].str.replace("Tecnol처gico de Monterrey", "Tecnologico de Monterrey")
df['Affiliation'] = df['Affiliation'].str.replace("The Catholic University of Korea Eunpyeong St. Mary?셲 Hospital", "The Catholic University of Korea Eunpyeong St. Mary's Hospital")
df['Affiliation'] = df['Affiliation'].str.replace("Katolicki Uniwersytet Lubelski Jana Paw흢a II", "Katolicki Uniwersytet Lubelski Jana Pawla II")
df['Affiliation'] = df['Affiliation'].str.replace("Y캇ld캇z Teknik 횥niversitesi", "Yıldız Teknik Universitesi")
df['Affiliation'] = df['Affiliation'].str.replace("Instiutet f철r rymdfysik", "Institutet for rymdfysik")
df['Affiliation'] = df['Affiliation'].str.replace("Institut f체r Chemische Technologie von Materialien", "Institut fur Chemische Technologie von Materialien")
df['Affiliation'] = df['Affiliation'].str.replace("Universit횪 di Pisa", "University of Pisa")
df['Affiliation'] = df['Affiliation'].str.replace("Technische Universit채t Braunschweig", "Technical University of Braunschweig")
df['Affiliation'] = df['Affiliation'].str.replace("National Research University ?쏮oscow Power Engineering Institute?? Voronezhskiy Gosudarstvenniy Universitet", "National Research University Moscow Power Engineering Institute Voronezh State University")
df['Affiliation'] = df['Affiliation'].str.replace("Leibniz-Institut f체r Kristallz체chtung", "Leibniz-Institut fur Kristallzuchtung")
df['Affiliation'] = df['Affiliation'].str.replace("University of Hawai軻i at M훮noa", "University of Hawai‘i at Manoa")
df['Affiliation'] = df['Affiliation'].str.replace("Universit챕 de Versailles Saint-Quentin-en-Yvelines", "Universite de Versailles Saint-Quentin-en-Yvelines")
df['Affiliation'] = df['Affiliation'].str.replace("Universit챕 de Lausanne (UNIL)", "Universite de Lausanne (UNIL)")
df['Affiliation'] = df['Affiliation'].str.replace("Universit챕 d'Artois", "Universite d'Artois")
df['Affiliation'] = df['Affiliation'].str.replace("H척pital Universitaire San Jo찾o", "Hopital Universitaire San Joao")
df['Affiliation'] = df['Affiliation'].str.replace("Institut de Chimie et Proc챕d챕s pour l'Energie, l'Environnement et la Sant챕 (ICPEES)", "Institut de Chimie et Procedes pour l'Energie, l'Environnement et la Sante (ICPEES)")
df['Affiliation'] = df['Affiliation'].str.replace("Lule책 University of Technology", "Lulea University of Technology")
df['Affiliation'] = df['Affiliation'].str.replace("University of M체nster", "University of Munster")
df['Affiliation'] = df['Affiliation'].str.replace("Max-Planck-Institut f체r Biochemie", "Max-Planck-Institut fur Biochemie")
df['Affiliation'] = df['Affiliation'].str.replace("Universit챕 du Littoral C척te d?쁎pale", "University of the Littoral Opal Coast")
df['Affiliation'] = df['Affiliation'].str.replace("Institut de Neurosciences Cognitives et Int챕gratives d?섲quitaine", "Institut de Neurosciences Cognitives et Integratives d'Aquitaine")
df['Affiliation'] = df['Affiliation'].str.replace("Direction Sant챕 Confort", "Direction Sante Confort")
df['Affiliation'] = df['Affiliation'].str.replace("Leibniz-Institut f체r Naturstoff-Forschung und Infektionsbiologie e. V. ??Hans-Kn철ll-Institut", "Leibniz-Institut fur Naturstoff-Forschung und Infektionsbiologie e. V. Hans-Knoll-Institut")
df['Affiliation'] = df['Affiliation'].str.replace("Friedrich-Schiller-Universit채t Jena", "Friedrich-Schiller-University of Jena")
df['Affiliation'] = df['Affiliation'].str.replace("Universidade de Bras챠lia", "Universidade de Brasilia")
df['Affiliation'] = df['Affiliation'].str.replace("Dipartimento di Fisica e Astronomia dell'Universit횪", "Dipartimento di Fisica e Astronomia dell'Universita")
df['Affiliation'] = df['Affiliation'].str.replace("Instytut Metali Niezelaznych Oddzia흢 w Poznaniu Centralne Laboratorium Akumulator처w i Ogniw", "Instytut Metali Niezelaznych Oddzial w Poznaniu Centralne Laboratorium Akumulatorow i Ogniw")
df['Affiliation'] = df['Affiliation'].str.replace("Uniwersytet Gda흦ski", "University of Gdansk")
df['Affiliation'] = df['Affiliation'].str.replace("Rheinland-Pf채lzische Technische Universit채t Kaiserslautern-Landau", "Rheinland-Pfalzische Technische University of Kaiserslautern-Landau")
df['Affiliation'] = df['Affiliation'].str.replace("National Research University ?쏮oscow Power Engineering Institute", "National Research University Moscow Power Engineering Institute")
df['Affiliation'] = df['Affiliation'].str.replace("Universit챕 Savoie Mont Blanc", "Universite Savoie Mont Blanc")
df['Affiliation'] = df['Affiliation'].str.replace("횋cole Polytechnique F챕d챕rale de Lausanne", "Ecole Polytechnique Federale de Lausanne")
df['Affiliation'] = df['Affiliation'].str.replace("Universit챕 Libre de Bruxelles", "Universite Libre de Bruxelles")
df['Affiliation'] = df['Affiliation'].str.replace("Firat 횥niversitesi", "Fırat Universitesi")
df['Affiliation'] = df['Affiliation'].str.replace("Institut Ru휃er Bo큄kovi훶", "Institut Ruder BoSkovic")
df['Affiliation'] = df['Affiliation'].str.replace("Universit챕 de Poitiers", "Universite de Poitiers")
df['Affiliation'] = df['Affiliation'].str.replace("Technischen Universit채t Ilmenau", "Technische University of Ilmenau")
df['Affiliation'] = df['Affiliation'].str.replace("Universit챕 de Tunis El Manar", "Universite de Tunis El Manar")
df['Affiliation'] = df['Affiliation'].str.replace("University of Polit챔cnica de Val챔ncia", "University of Politecnica de Valencia")
df['Affiliation'] = df['Affiliation'].str.replace("Bauhaus-Universit채t Weimar", "Bauhaus-University of Weimar")
df['Affiliation'] = df['Affiliation'].str.replace("Universit챕 de Bordeaux", "Universite de Bordeaux")
df['Affiliation'] = df['Affiliation'].str.replace("C찼tedras CONACYT", "Catedras CONACYT")
df['Affiliation'] = df['Affiliation'].str.replace("Universit횪 degli Studi di Sassari", "Universita degli Studi di Sassari")
df['Affiliation'] = df['Affiliation'].str.replace("Helmholtz Zentrum f체r Umweltforschung", "Helmholtz Zentrum fur Umweltforschung")
df['Affiliation'] = df['Affiliation'].str.replace("Universit채t Regensburg", "University of Regensburg")
df['Affiliation'] = df['Affiliation'].str.replace("Universidad T챕cnica de Babahoyo", "Technical University of Babahoyo")
df['Affiliation'] = df['Affiliation'].str.replace("CEA LETI혻", "CEA LETI")
df['Affiliation'] = df['Affiliation'].str.replace("?cole Polytechnique F?d?rale de Lausanne", "Ecole Polytechnique Federale de Lausanne")
df['Affiliation'] = df['Affiliation'].str.replace("Universit횪 degli Studi di Siena", "Universita degli Studi di Siena")
df['Affiliation'] = df['Affiliation'].str.replace("Universit횪 del Salento", "Universita del Salento")
df['Affiliation'] = df['Affiliation'].str.replace("National Institute of Horticultural 竊?Herbal Science", "National Institute of Horticultural & Herbal Science")
df['Affiliation'] = df['Affiliation'].str.replace("Universit횪 Telematica Internazionale UNINETTUNO", "Universita Telematica Internazionale UNINETTUNO")
df['Affiliation'] = df['Affiliation'].str.replace("Universidad P첬blica de Navarra", "Public University of Navarre")
df['Affiliation'] = df['Affiliation'].str.replace("Sveu훾ili큄te u Zagrebu, Prirodoslovno - Matemati훾ki Fakultet", "Faculty of Science, University of Zagreb")
df['Affiliation'] = df['Affiliation'].str.replace("Universidade de S찾o Paulo", "Universidade de São Paulo")
df['Affiliation'] = df['Affiliation'].str.replace("Fraunhofer-Institut f체r Angewandte Polymerforschung", "Fraunhofer-Institut fur Angewandte Polymerforschung")
df['Affiliation'] = df['Affiliation'].str.replace("Universit횪 degli Studi di Messina", "Universita degli Studi di Messina")
df['Affiliation'] = df['Affiliation'].str.replace("Universidad Polit챕cnica de Madrid", "Universidad Politecnica de Madrid")
df['Affiliation'] = df['Affiliation'].str.replace("Universit횪 degli Studi di Palermo", "Universita degli Studi di Palermo")
df['Affiliation'] = df['Affiliation'].str.replace("Eski힊ehir Osmangazi 횥niversitesi", "Eskisehir Osmangazi University, Meselik Campus")
df['Affiliation'] = df['Affiliation'].str.replace("Gottfried Wilhelm Leibniz Universit채t Hannover", "Gottfried Wilhelm Leibniz University of Hannover")
df['Affiliation'] = df['Affiliation'].str.replace("Universit횪 degli Studi di Catania", "Universita degli Studi di Catania")
df['Affiliation'] = df['Affiliation'].str.replace("Institut Sup챕rieur d'Electronique de Paris (ISEP)", "Institut Superieur d'Electronique de Paris (ISEP)")
df['Affiliation'] = df['Affiliation'].str.replace("Instituto Federal de Educa챌찾o, Ci챗ncia e Tecnologia do Cear찼, Fortaleza", "Instituto Federal de Educacao, Ciencia e Tecnologia do Ceara, Fortaleza")
df['Affiliation'] = df['Affiliation'].str.replace("Martin-Luther-Universit채t Halle-Wittenberg", "Martin-Luther-University of Halle-Wittenberg")
df['Affiliation'] = df['Affiliation'].str.replace("EMBL?셲 European Bioinformatics Institute", "EMBL's European Bioinformatics Institute")
df['Affiliation'] = df['Affiliation'].str.replace("Karlsruher Institut f체r Technologie, Campus Nord", "Karlsruhe Institute of Technology Campus North")
df['Affiliation'] = df['Affiliation'].str.replace("Universit횪 degli Studi di Genova", "University of Genoa")
df['Affiliation'] = df['Affiliation'].str.replace("Universit채tsmedizin Greifswald", "Greifswald Medical School")
df['Affiliation'] = df['Affiliation'].str.replace("IRSN Institut de Radioprotection et de Surete Nucleaire", "IRSN Institut de Radioprotection et de Surete Nucleaire")
df['Affiliation'] = df['Affiliation'].str.replace("Alma Mater Studiorum Universit횪 di Bologna", "Alma Mater Studiorum Universita di Bologna")
df['Affiliation'] = df['Affiliation'].str.replace("Universit챕 Paris Cit챕", "Universite Paris Cite")
df['Affiliation'] = df['Affiliation'].str.replace("Universit챕 de Tours", "Universite de Tours")
df['Affiliation'] = df['Affiliation'].str.replace("Universit챕 Abou Bekr Belkaid Tlemcen", "Universite Abou Bekr Belkaid Tlemcen")
df['Affiliation'] = df['Affiliation'].str.replace("Laboratoire de l'Accelerateur Lineaire", "Laboratoire de l'Accelerateur Lineaire")
df['Affiliation'] = df['Affiliation'].str.replace("Centre Inria Sophia Antipolis - Mediterranee", "Centre Inria Sophia Antipolis - Mediterranee")
df['Affiliation'] = df['Affiliation'].str.replace("Medizinische Universit채t Graz", "Medizinische University of Graz")
df['Affiliation'] = df['Affiliation'].str.replace("Bilkent 횥niversitesi", "Bilkent Universitesi")
df['Affiliation'] = df['Affiliation'].str.replace("Technology Arts Sciences TH Koln", "Technology Arts Sciences TH Koln")
df['Affiliation'] = df['Affiliation'].str.replace("Transportøkonomisk institutt", "Institute of Transport Economics")
df['Affiliation'] = df['Affiliation'].str.replace("Centro de Investigaciones Energeticas, Medioambientales y Tecnologicas", "Centre for Energy, Environmental and Technological Research (CIEMAT)")
df['Affiliation'] = df['Affiliation'].str.replace("Universit횪 di Trento", "Universita di Trento")
df['Affiliation'] = df['Affiliation'].str.replace("Universit챕 Saint-Joseph de Beyrouth", "Universite Saint-Joseph de Beyrouth")
df['Affiliation'] = df['Affiliation'].str.replace("Universit챕 Laval", "Universite Laval")
df['Affiliation'] = df['Affiliation'].str.replace("Universit챕 de Technologie de Troyes", "Universite de Technologie de Troyes")
df['Affiliation'] = df['Affiliation'].str.replace("Creation and Love Women’s Hospital", "Creation and Love Women’s Hospital")
df['Affiliation'] = df['Affiliation'].str.replace("Laboratorio de Instrumentação e Fisica Experimental de Particulas", "Laboratory of Instrumentation and Experimental Particles Physics")
df['Affiliation'] = df['Affiliation'].str.replace("Universit챕 de Picardie Jules Verne", "Universite de Picardie Jules Verne")
df['Affiliation'] = df['Affiliation'].str.replace("Universit챕 Claude Bernard Lyon 1", "Universite Claude Bernard Lyon 1")
df['Affiliation'] = df['Affiliation'].str.replace("Instituto Superior Tecnico", "Instituto Superior Tecnico")
df['Affiliation'] = df['Affiliation'].str.replace("Organizacion Sanitaria Integrada Goierri - Alto Urola", "Organizacion Sanitaria Integrada Goierri - Alto Urola")
df['Affiliation'] = df['Affiliation'].str.replace("University of des Saarlandes", "University of des Saarlandes")
df['Affiliation'] = df['Affiliation'].str.replace("Politechnika Wroclawska", "Politechnika Wroclawska")
df['Affiliation'] = df['Affiliation'].str.replace("Universit챕 de Mons", "Universite de Mons")
df['Affiliation'] = df['Affiliation'].str.replace("Nantes Universit챕", "Nantes Universite")
df['Affiliation'] = df['Affiliation'].str.replace("Kyung-In Women?셲 University", "Kyung-In Women’s University")
df['Affiliation'] = df['Affiliation'].str.replace("Instituto de Investigaci처n Sanitaria de la Fundaci처n Jim챕nez D챠az", "Instituto de Investigacion Sanitaria de la Fundacion Jimenez Diaz")
df['Affiliation'] = df['Affiliation'].str.replace("Universit횪 degli Studi di Cagliari", "Universita degli Studi di Cagliari")
df['Affiliation'] = df['Affiliation'].str.replace("Universit횪 Cattolica del Sacro Cuore, Campus di Brescia", "Universita Cattolica del Sacro Cuore, Campus di Brescia")
df['Affiliation'] = df['Affiliation'].str.replace("Universit챕 Paris-Saclay", "Universite Paris-Saclay")
df['Affiliation'] = df['Affiliation'].str.replace("Centro de Investigaci처n Biom챕dica en Red de C찼ncer", "Centro de Investigacion Biomedica en Red de Cancer")
df['Affiliation'] = df['Affiliation'].str.replace("Universit채tsklinikum Heidelberg", "University Hospital Heidelberg")
df['Affiliation'] = df['Affiliation'].str.replace("Institut Universitaire de Radiophysique Appliqu챕e", "Institut Universitaire de Radiophysique Appliquee")
df['Affiliation'] = df['Affiliation'].str.replace("Instituto de Bioingenier챠a de Catalu챰a", "Institute for Bioengineering of Catalonia")
df['Affiliation'] = df['Affiliation'].str.replace("Universit횪 della Calabria", "Universita della Calabria")
df['Affiliation'] = df['Affiliation'].str.replace("Universit횪 degli Studi di Napoli Federico II", "Universita degli Studi di Napoli Federico II")
df['Affiliation'] = df['Affiliation'].str.replace("Link철pings Universitet", "Linkopings Universitet")
df['Affiliation'] = df['Affiliation'].str.replace("Institut Mines T챕l챕com", "Institut Mines Telecom")
df['Affiliation'] = df['Affiliation'].str.replace("Laboratorio Nacional de Fusi처n", "Laboratorio Nacional de Fusion")
df['Affiliation'] = df['Affiliation'].str.replace("Universit횪 degli Studi di Padova", "Universita degli Studi di Padova")
df['Affiliation'] = df['Affiliation'].str.replace("Organisation Europ챕enne pour la Recherche Nucl챕aire", "Organisation Europeenne pour la Recherche Nucleaire")
df['Affiliation'] = df['Affiliation'].str.replace("Universit챕 de Lyon", "Universite de Lyon")
df['Affiliation'] = df['Affiliation'].str.replace("University of de Val챔ncia", "University of de Valencia")
df['Affiliation'] = df['Affiliation'].str.replace("Bundesamt f체r Seeschiffahrt und Hydrographie", "Bundesamt fur Seeschifffahrt und Hydrographie")
df['Affiliation'] = df['Affiliation'].str.replace("Institut F챕d챕ratif de Recherche 49", "Institut Federatif de Recherche 49")
df['Affiliation'] = df['Affiliation'].str.replace("Instituto Nacional de Neurolog챠a y Neurocirug챠a", "National Institute of Neurology and Neurosurgery")
df['Affiliation'] = df['Affiliation'].str.replace("Institut de la Micro챕lectronique, Electromagn챕tisme et Photonique - Laboratoire d'Hyperfr챕quences et de Caract챕risation", "Institut de la Microelectronique, Electromagnetisme et Photonique - Laboratoire d'Hyperfrequences et de Caracterisation")
df['Affiliation'] = df['Affiliation'].str.replace("Rutgers University?밡ew Brunswick", "Rutgers University-New Brunswick")
df['Affiliation'] = df['Affiliation'].str.replace("Zentrum f체r Regenerative Therapien Dresden", "Zentrum fur Regenerative Therapien Dresden")
df['Affiliation'] = df['Affiliation'].str.replace("Max-Planck-Institut f체r Kolloid- und Grenzfl채chenforschung", " Max Planck Institute of Colloids and Interfaces")
df['Affiliation'] = df['Affiliation'].str.replace("Helmholtz?밵entrum Geesthacht", "Helmholtz-Zentrum Geesthacht")
df['Affiliation'] = df['Affiliation'].str.replace("횜rebro Universitet", "Orebro Universitet")
df['Affiliation'] = df['Affiliation'].str.replace("Laboratoire des Sciences des Proc챕d챕s et des Mat챕riaux", "Laboratoire des Sciences des Procedes et des Materiaux")
df['Affiliation'] = df['Affiliation'].str.replace("Centro Brasileiro de Pesquisas F챠sicas", "Centro Brasileiro de Pesquisas Fisicas")
df['Affiliation'] = df['Affiliation'].str.replace("Universit챕 Clermont Auvergne", "Universite Clermont Auvergne")
df['Affiliation'] = df['Affiliation'].str.replace("Bergische Universit채t Wuppertal", "Bergische University of Wuppertal")
df['Affiliation'] = df['Affiliation'].str.replace("Universidad de Alcal찼", "Universidad de Alcala")
df['Affiliation'] = df['Affiliation'].str.replace("Universidad de Le처n", "Universidad de Leon")
df['Affiliation'] = df['Affiliation'].str.replace("Universit채t Z체rich", "University of Zurich")
df['Affiliation'] = df['Affiliation'].str.replace("Fundaci처n Agencia Aragonesa para la Investigaci처n y el Desarrollo (ARAID)", "Fundacion Agencia Aragonesa para la Investigacion y el Desarrollo (ARAID)")
df['Affiliation'] = df['Affiliation'].str.replace("Facult챕 des Sciences Semlalia", "Faculte des Sciences Semlalia")
df['Affiliation'] = df['Affiliation'].str.replace("횋cole Nationale Sup챕rieure d'Ing챕nieurs de Tunis", "Ecole Nationale Superieure d'Ingenieurs de Tunis")
df['Affiliation'] = df['Affiliation'].str.replace("Queen?셲 University", "Queen’s University")
df['Affiliation'] = df['Affiliation'].str.replace("Ludwig-Maximilians-Universit채t M체nchen", "Ludwig-Maximilians-University of Munchen")
df['Affiliation'] = df['Affiliation'].str.replace("H척pital Saint-Antoine", "Hospital Saint-Antoine Ap-Hp")
df['Affiliation'] = df['Affiliation'].str.replace("G챕olocalisation (AME-GEOLOC)", "Geolocalisation (AME-GEOLOC)")
df['Affiliation'] = df['Affiliation'].str.replace("Universidade Federal do Piau챠", "Universidade Federal do Piaui")
df['Affiliation'] = df['Affiliation'].str.replace("D梳죍 h沼뛠 Nguyen Tat Thanh", "Nguyen Tat Thanh University")
df['Affiliation'] = df['Affiliation'].str.replace("Klinikum der Universit채t Regensburg und Medizinische Fakult채t", "Klinikum der University of Regensburg und Medizinische Fakultat")
df['Affiliation'] = df['Affiliation'].str.replace("Universit횪 degli Studi dell'Aquila", "Universita degli Studi dell'Aquila")
df['Affiliation'] = df['Affiliation'].str.replace("Rheinisch-Westf채lische Technische Hochschule Aachen", "Rheinisch-Westfalische Technische Hochschule Aachen")
df['Affiliation'] = df['Affiliation'].str.replace("Karlsruher Institut f체r Technologie", "Karlsruher Institut fur Technologie")
df['Affiliation'] = df['Affiliation'].str.replace("Universit횪 degli Studi di Trieste", "Universita degli Studi di Trieste")
df['Affiliation'] = df['Affiliation'].str.replace("Centro de Investigacion y de Estudios Avanzados del Instituto Polit챕cnico Nacional", "Centro de Investigacion y de Estudios Avanzados del Instituto Politecnico Nacional")
df['Affiliation'] = df['Affiliation'].str.replace("Universit챕 de Strasbourg", "Universite de Strasbourg")
df['Affiliation'] = df['Affiliation'].str.replace("University of Rzesz처w", "University of Rzeszow")
df['Affiliation'] = df['Affiliation'].str.replace("Hochschule F횄쩌r Technik Stuttgart", "Hochschule fur Technik Stuttgart")
df['Affiliation'] = df['Affiliation'].str.replace("Univerzita Palack?ho v Olomouci", "Univerzita Palackeho v Olomouci")
df['Affiliation'] = df['Affiliation'].str.replace("Universit챕 Toulouse III - Paul Sabatier", "Universite Toulouse III - Paul Sabatier")
df['Affiliation'] = df['Affiliation'].str.replace("Redeemer?쁲 University", "Redeemer's University")
df['Affiliation'] = df['Affiliation'].str.replace("Tr튼沼쓓g 휂梳죍 h沼뛠 S튼 ph梳죑 H횪 N沼셢", "Hanoi National University of Education")
df['Affiliation'] = df['Affiliation'].str.replace("Universit횪 degli Studi di Enna", "University of Enna")
df['Affiliation'] = df['Affiliation'].str.replace("Universit챕 de Perpignan Via Domitia", "Universite de Perpignan Via Domitia")
df['Affiliation'] = df['Affiliation'].str.replace("Interfaces Traitements Organisation et DYnamique des Syst챔mes ??ITODYS", "Interfaces Traitements Organisation et DYnamique des Systemes (ITODYS)")
df['Affiliation'] = df['Affiliation'].str.replace("Bo휓azi챌i 횥niversitesi", "Boğaziçi Universitesi")
df['Affiliation'] = df['Affiliation'].str.replace("Universidad Aut처noma de Chiapas", "Universidad Autonoma de Chiapas")
df['Affiliation'] = df['Affiliation'].str.replace("Centro de Investigaci?n y de Estudios Avanzados del Instituto Polit?cnico Nacional", "Centro de Investigacion y de Estudios Avanzados del Instituto Politecnico Nacional")
df['Affiliation'] = df['Affiliation'].str.replace("Brandenburgische Technische Universit채t Cottbus", "Brandenburgische Technische University of Cottbus")
df['Affiliation'] = df['Affiliation'].str.replace("Tartu 횥likooli Genoomika Instituut", "Tartu Ulikooli Genoomika Instituut")
df['Affiliation'] = df['Affiliation'].str.replace("Alfred-Wegener-Institut Helmholtz-Zentrum f체r Polar- und Meeresforschung", "Alfred-Wegener-Institut Helmholtz-Zentrum fur Polar- und Meeresforschung")
df['Affiliation'] = df['Affiliation'].str.replace("Christian-Albrechts-Universit채t zu Kiel", "Christian-Albrechts-University of zu Kiel")
df['Affiliation'] = df['Affiliation'].str.replace("Universidad Nacional de C처rdoba", "Universidad Nacional de Cordoba")
df['Affiliation'] = df['Affiliation'].str.replace("Sveu훾ili큄te u Zagrebu, Geodetski fakultet", "SveuciliSte u Zagrebu, Geodetski fakultet")
df['Affiliation'] = df['Affiliation'].str.replace("Interactions h척tes-agents pathog챔nes - (IHAP)", "Interactions hôtes-agents pathogenes - (IHAP)")
df['Affiliation'] = df['Affiliation'].str.replace("Universidad Mayor de San Andr챕s", "Universidad Mayor de San Andres")
df['Affiliation'] = df['Affiliation'].str.replace("Universit횪 degli Studi di Perugia", "Universita degli Studi di Perugia")
df['Affiliation'] = df['Affiliation'].str.replace("Freie Universit채t Berlin", "Freie University of Berlin")
df['Affiliation'] = df['Affiliation'].str.replace("D챕partement d'Informatique de l'ENS", "Departement d'Informatique de l'ENS")
df['Affiliation'] = df['Affiliation'].str.replace("Gangnam St. Mary?셲 One Eye Clinic", "Gangnam St. Mary's One Eye Clinic")
df['Affiliation'] = df['Affiliation'].str.replace("Universit채tsklinikum Hamburg-Eppendorf", "University Medical Center Hamburg-Eppendorf ")
df['Affiliation'] = df['Affiliation'].str.replace("Muse챕 Canadien de la Nature", "Musee Canadien de la Nature")
df['Affiliation'] = df['Affiliation'].str.replace("Norges Milj첩- og Biovitenskapelige Universitet", "Norwegian University of Life Sciences")
df['Affiliation'] = df['Affiliation'].str.replace("Global Core Research Center for Ship and Offshore Plants (GCRE?륲OP)", "Global Core Research Center for Ship and Offshore Plants (GCRC-SOP)")
df['Affiliation'] = df['Affiliation'].str.replace("Centre de Recherche et des Technologies de L'Energie (CRTEn) BorjCedria B.P N째952050-Hammam Lif.", "Centre de Recherche et des Technologies de l'Energie (CRTEn) BorjCedria B.P N°952050-Hammam Lif.")
df['Affiliation'] = df['Affiliation'].str.replace("Technische Universit채t Berlin", "Technische University of Berlin")
df['Affiliation'] = df['Affiliation'].str.replace("Consejo Superior de Investigaciones Cient챠ficas", "Consejo Superior de Investigaciones Cientificas")
df['Affiliation'] = df['Affiliation'].str.replace("Universit챕 de Toulouse", "Universite de Toulouse")
df['Affiliation'] = df['Affiliation'].str.replace("Wroc흢aw University of Environmental and Life Sciences", "Wroclaw University of Environmental and Life Sciences")
df['Affiliation'] = df['Affiliation'].str.replace("G철teborgs Universitet", "Goteborgs Universitet")
df['Affiliation'] = df['Affiliation'].str.replace("Universit챕 de Lorraine", "Universite de Lorraine")
df['Affiliation'] = df['Affiliation'].str.replace("CHU de Qu챕bec-Universit챕 Laval", "CHU de Quebec-Universite Laval")
df['Affiliation'] = df['Affiliation'].str.replace("T횥B캅TAK Ulusal Metroloji Enstit체s체", "TUBİTAK Ulusal Metroloji Enstitusu")
df['Affiliation'] = df['Affiliation'].str.replace("Univerzita Jana Evangelisty Purkyne v 횣st챠 nad Labem", "University of Jan Evangelista in Usti nad Labem (UJEP)")
df['Affiliation'] = df['Affiliation'].str.replace("CSIC-UNIOVI-Principado de Asturias - Centro de Investigaci처n en Nanomateriales y Nanotecnolog챠a (CINN)", "CSIC-UNIOVI-Principado de Asturias - Centro de Investigacion en Nanomateriales y Nanotecnologia (CINN)")
df['Affiliation'] = df['Affiliation'].str.replace("Technische Universit채t Chemnitz", "Technische University of Chemnitz")
df['Affiliation'] = df['Affiliation'].str.replace("Bundesanstalt f체r Materialforschung und -Pr체fung", "Bundesanstalt fur Materialforschung und -Prufung")
df['Affiliation'] = df['Affiliation'].str.replace("Universit횪 degli Studi dell'Insubria", "Universita degli Studi dell'Insubria")
df['Affiliation'] = df['Affiliation'].str.replace("Philipps-Universit채t Marburg", "Philipps-University of Marburg")
df['Affiliation'] = df['Affiliation'].str.replace("Technick찼 Univerzita v Ko큄iciach", "Technicka Univerzita v KoSiciach")
df['Affiliation'] = df['Affiliation'].str.replace("Universidad de Ja챕n", "Universidad de Jaen")
df['Affiliation'] = df['Affiliation'].str.replace("Universit채t Greifswald", "University of Greifswald")
df['Affiliation'] = df['Affiliation'].str.replace("CSIC - Instituto de 횙ptica Daza de Vald챕s (IO)", "CSIC - Instituto de Optica Daza de Valdes (IO)")
df['Affiliation'] = df['Affiliation'].str.replace("Universidad de Sevilla, Escuela T챕cnica Superior de Ingenier챠a", "Universidad de Sevilla, Escuela Tecnica Superior de Ingenieria")
df['Affiliation'] = df['Affiliation'].str.replace("Universit챕 Gustave Eiffel", "Universite Gustave Eiffel")
df['Affiliation'] = df['Affiliation'].str.replace("Universit챕 Catholique de Louvain", "Universite Catholique de Louvain")
df['Affiliation'] = df['Affiliation'].str.replace("M챕t챕o France", "Meteo France")
df['Affiliation'] = df['Affiliation'].str.replace("Uniwersytet Jagiello흦ski", "Uniwersytet Jagiellonski")
df['Affiliation'] = df['Affiliation'].str.replace("Laboratoire d'Oc챕anographie et du Climat : Exp챕rimentations et Approches Num챕riques", "Laboratoire d'Oceanographie et du Climat : Experimentations et Approches Numeriques")
df['Affiliation'] = df['Affiliation'].str.replace("H철gskolan i Bor책s", "Hogskolan i Boras")
df['Affiliation'] = df['Affiliation'].str.replace("Universit채t Trier", "University of Trier")
df['Affiliation'] = df['Affiliation'].str.replace("CICECO ??Instituto de Materiais de Aveiro", "CICECO – Instituto de Materiais de Aveiro")
df['Affiliation'] = df['Affiliation'].str.replace("Aix Marseille Universit챕", "Aix Marseille Universite")
df['Affiliation'] = df['Affiliation'].str.replace("Forschungszentrum J체lich GmbH", "Forschungszentrum Julich GmbH")
df['Affiliation'] = df['Affiliation'].str.replace("Universidade Federal do Esp챠rito Santo", "Universidade Federal do Espirito Santo")
df['Affiliation'] = df['Affiliation'].str.replace("Laboratoire Aim챕 Cotton", "Laboratoire Aime Cotton")
df['Affiliation'] = df['Affiliation'].str.replace("CRISMAT - Laboratoire de Crystallographie et Sciences des Mat챕riaux", "CRISMAT - Laboratoire de Crystallographie et Sciences des Materiaux")
df['Affiliation'] = df['Affiliation'].str.replace("Instituto Polit챕cnico Nacional", "Instituto Politecnico Nacional")
df['Affiliation'] = df['Affiliation'].str.replace("Cheil General Hospital and Women?셲 Healthcare Center", "Cheil General Hospital and Women’s Healthcare Center")
df['Affiliation'] = df['Affiliation'].str.replace("Universit채t Siegen", "University of Siegen")
df['Affiliation'] = df['Affiliation'].str.replace("K첩benhavns Universitet", "Kobenhavns Universitet")
df['Affiliation'] = df['Affiliation'].str.replace("횋cole Normale Sup챕rieure Paris-Saclay", "Ecole Normale Superieure Paris-Saclay")
df['Affiliation'] = df['Affiliation'].str.replace("Eberhard Karls Universit채t T체bingen", "Eberhard Karls University of Tubingen")
df['Affiliation'] = df['Affiliation'].str.replace("Universit챕 C척te d'Azur", "Universite Cote d'Azur")
df['Affiliation'] = df['Affiliation'].str.replace("Institut 'Jo탑ef Stefan'", "Institut Jozef Stefan")
df['Affiliation'] = df['Affiliation'].str.replace("Universit채t Stuttgart", "University of Stuttgart")
df['Affiliation'] = df['Affiliation'].str.replace("Z체hlke Group", "Zuhlke Group")
df['Affiliation'] = df['Affiliation'].str.replace("National Technical University of Ukraine ?쏧gor Sikorsky Kyiv Polytechnic Institute??Temperature dependence of the excitonic energy band gap in In(Ga)As nanostructures", "National Technical University of Ukraine – Igor Sikorsky Kyiv Polytechnic Institute")
df['Affiliation'] = df['Affiliation'].str.replace("Leibniz-Institut f체r Festk철rper- und Werkstoffforschung Dresden", "Leibniz-Institut fur Festkorper- und Werkstoffforschung Dresden")
df['Affiliation'] = df['Affiliation'].str.replace("Tr튼沼쓓g 휂梳죍 h沼뛠 C척ng nghi沼뇈 th횪nh ph沼?H沼?Ch챠 Minh", "HCMC University of Technology (HUTECH) – Sai Gon Campus")
df['Affiliation'] = df['Affiliation'].str.replace("Erciyes 횥niversitesi", "Erciyes Universitesi")
df['Affiliation'] = df['Affiliation'].str.replace("Centro de Investiga??o em Sistemas Confi?veis e de Tempo Real", "Centro de Investigacao em Sistemas Confiaveis e de Tempo Real")
df['Affiliation'] = df['Affiliation'].str.replace("Hanyang Women?셲 University", "Hanyang Women's University")
df['Affiliation'] = df['Affiliation'].str.replace("Tartu 횥likool", "Tartu Ulikool")
df['Affiliation'] = df['Affiliation'].str.replace("Universit횪 degli studi di Bari Aldo Moro", "Universita degli Studi di Bari Aldo Moro")
df['Affiliation'] = df['Affiliation'].str.replace("Universit횪 Ca' Foscari Venezia", "Universita Ca' Foscari Venezia")
df['Affiliation'] = df['Affiliation'].str.replace("횋lectroniques, Syst챔mes de Communication et Microsyst챔mes (ESYCOM)", "Electroniques, Systemes de Communication et Microsystemes (ESYCOM)")
df['Affiliation'] = df['Affiliation'].str.replace("Yak캇n Do휓u 횥niversitesi", "Yakin Dogu Universitesi")
df['Affiliation'] = df['Affiliation'].str.replace("Sveu훾ili큄te u Zagrebu, Fakultet Strojarstva i Brodogradnje", "SveuciliSte u Zagrebu, Fakultet Strojarstva i Brodogradnje")
df['Affiliation'] = df['Affiliation'].str.replace("Fraunhofer-Institut f체r Nachrichtentechnik Heinrich-Hertz-Institut", "Fraunhofer-Institut fur Nachrichtentechnik Heinrich-Hertz-Institut")
df['Affiliation'] = df['Affiliation'].str.replace("Gebze Teknik 횥niversitesi", "Gebze Teknik Universitesi")
df['Affiliation'] = df['Affiliation'].str.replace("University of Jyv채skyl채", "University of Jyvaskyla")
df['Affiliation'] = df['Affiliation'].str.replace("Benem챕rita Universidad Aut처noma de Puebla", "Benemerita Universidad Autonoma de Puebla")
df['Affiliation'] = df['Affiliation'].str.replace("Universit챕 Grenoble Alpes", "Universite Grenoble Alpes")
df['Affiliation'] = df['Affiliation'].str.replace("Medizinische Universit채t Wien", "Medizinische University of Wien")
df['Affiliation'] = df['Affiliation'].str.replace("Leibniz-Institut f?r Festk?rper- und Werkstoffforschung Dresden", "Leibniz-Institut fur Festkorper- und Werkstoffforschung Dresden")
df['Affiliation'] = df['Affiliation'].str.replace("Sapienza Universit횪 di Roma", "Sapienza Universita di Roma")
df['Affiliation'] = df['Affiliation'].str.replace("휂梳죍 h沼뛠 M沼?Th횪nh ph沼?H沼?Ch챠 Minh", "HCMC Open University - Campus 1")
df['Affiliation'] = df['Affiliation'].str.replace("Universit채t Basel", "University of Basel")
df['Affiliation'] = df['Affiliation'].str.replace("Max-Planck-Institut f체r Eisenforschung GmbH", "Max-Planck-Institut fur Eisenforschung GmbH")
df['Affiliation'] = df['Affiliation'].str.replace("Laboratoire Plasma et Conversion d?섷nergie", "Laboratoire Plasma et Conversion d'Energie")
df['Affiliation'] = df['Affiliation'].str.replace("Soomyung Women?셲 University", "Soomyung Women's University")
df['Affiliation'] = df['Affiliation'].str.replace("University of Nebraska?밚incoln", "University of Nebraska–Lincoln")
df['Affiliation'] = df['Affiliation'].str.replace("Universit채t Bonn", "University of Bonn")
df['Affiliation'] = df['Affiliation'].str.replace("Technische Universit채t Dresden", "Technische University of Dresden")
df['Affiliation'] = df['Affiliation'].str.replace("Institut Lumi챔re Mati챔re", "Institut Lumiere Matiere")
df['Affiliation'] = df['Affiliation'].str.replace("Universit챕 Ahmed Draia - Adrar", "Universite Ahmed Draia - Adrar")
df['Affiliation'] = df['Affiliation'].str.replace("Universidad Aut처noma de Madrid", "Universidad Autonoma de Madrid")
df['Affiliation'] = df['Affiliation'].str.replace("J체lich Aachen Research Alliance (JARA)-Fundamentals of Future Information Technology", "Julich Aachen Research Alliance (JARA)-Fundamentals of Future Information Technology")
df['Affiliation'] = df['Affiliation'].str.replace("Sorbonne Universit챕", "Sorbonne Universite")
df['Affiliation'] = df['Affiliation'].str.replace("Universit챕 de Montpellier", "Universite de Montpellier")
df['Affiliation'] = df['Affiliation'].str.replace("Institut de Recerca contra la Leuc챔mia Josep Carreras (IJC)", "Institut de Recerca contra la Leucemia Josep Carreras (IJC)")
df['Affiliation'] = df['Affiliation'].str.replace("A?셎harqiyah University", "A’Sharqiyah University")
df['Affiliation'] = df['Affiliation'].str.replace("Universit챕 de Gen챔ve", "Universite de Geneve")
df['Affiliation'] = df['Affiliation'].str.replace("Universidade Tecnol처gica Federal do Paran찼", "Universidade Tecnologica Federal do Parana")
df['Affiliation'] = df['Affiliation'].str.replace("Gesellschaft f체r Anlagen- und Reaktorsicherheit mbH", "Gesellschaft fur Anlagen- und Reaktorsicherheit mbH")
df['Affiliation'] = df['Affiliation'].str.replace("Universit챕 Badji Mokhtar - Annaba", "Universite Badji Mokhtar - Annaba")
df['Affiliation'] = df['Affiliation'].str.replace("Abdullah G체l 횥niversitesi", "Abdullah Gul Universitesi")
df['Affiliation'] = df['Affiliation'].str.replace("Sj챈llands Universitetshospital", "Roskilde Sygehus")
df['Affiliation'] = df['Affiliation'].str.replace("Institut des Mat챕riaux Poreux de Paris IMAP", "Institut des Materiaux Poreux de Paris IMAP")
df['Affiliation'] = df['Affiliation'].str.replace("University of Polit챕cnica de Catalunya", "University of Politecnica de Catalunya")
df['Affiliation'] = df['Affiliation'].str.replace("Universit채t Augsburg", "University of Augsburg")
df['Affiliation'] = df['Affiliation'].str.replace("Deutsches Zentrum f체r Luft- und Raumfahrt (DLR)", "Deutsches Zentrum fur Luft- und Raumfahrt (DLR)")
df['Affiliation'] = df['Affiliation'].str.replace("Universit횪 degli Studi di Milano-Bicocca", "Universita degli Studi di Milano-Bicocca")
df['Affiliation'] = df['Affiliation'].str.replace("Universit챕 de Jijel", "Universite de Jijel")
df['Affiliation'] = df['Affiliation'].str.replace("Institut Parisien de Chimie Mol챕culaire", "Institut Parisien de Chimie Moleculaire")
df['Affiliation'] = df['Affiliation'].str.replace("Universit챕 McGill", "Universite McGill")
df['Affiliation'] = df['Affiliation'].str.replace("횋cole de Technologie Sup챕rieure", "Ecole de Technologie Superieure")
df['Affiliation'] = df['Affiliation'].str.replace("Universit채tsklinikum des Saarlandes Medizinische Fakult채t der Universit채t des Saarlandes", " Saarland University Hospital")
df['Affiliation'] = df['Affiliation'].str.replace("Universidad Aut처noma del Estado de Morelos", "Universidad Autonoma del Estado de Morelos")
df['Affiliation'] = df['Affiliation'].str.replace("Sardar Bahadur Khan Women?셲 University (SBKWU)", "Sardar Bahadur Khan Women’s University (SBKWU)")
df['Affiliation'] = df['Affiliation'].str.replace("Dumlupinar 횥niversitesi", "Dumlupinar Universitesi")
df['Affiliation'] = df['Affiliation'].str.replace("Charit챕 ??Universit채tsmedizin Berlin", "Charite - Universitatsmedizin Berlin")
df['Affiliation'] = df['Affiliation'].str.replace("Georg-August-Universit채t G철ttingen", "Georg-August-University of Gottingen")
df['Affiliation'] = df['Affiliation'].str.replace("Universit채tsmedizin G철ttingen", "University Medicine Gottingen")
df['Affiliation'] = df['Affiliation'].str.replace("Zentrum f체r Technologietransfer und Telekommunikation (ZTT)", "Zentrum fur Technologietransfer und Telekommunikation (ZTT)")
df['Affiliation'] = df['Affiliation'].str.replace("Universidade Federal de Uberl창ndia", "Universidade Federal de Uberlandia")
df['Affiliation'] = df['Affiliation'].str.replace("Ministerio de Salud P첬blica", "Ministerio de Salud Publica")
df['Affiliation'] = df['Affiliation'].str.replace("Universidad Aut처noma del Estado de M챕xico", "Universidad Autonoma del Estado de Mexico")
df['Affiliation'] = df['Affiliation'].str.replace("ARMTEC Tecnologia em Rob처tica", "ARMTEC Tecnologia em Robotica")
df['Affiliation'] = df['Affiliation'].str.replace("Universit채t Innsbruck", "University of Innsbruck")
df['Affiliation'] = df['Affiliation'].str.replace("It채-Suomen yliopisto", "Ita-Suomen yliopisto")
df['Affiliation'] = df['Affiliation'].str.replace("Helmholtz-Zentrum Berlin f체r Materialien und Energie (HZB)", "Helmholtz-Zentrum Berlin fur Materialien und Energie (HZB)")
df['Affiliation'] = df['Affiliation'].str.replace("Medizinisches Laserzentrum L체beck", "Medizinisches Laserzentrum Lubeck")
df['Affiliation'] = df['Affiliation'].str.replace("E철tv철s Lor찼nd Tudom찼nyegyetem", "Eotvos Lorand Unievrsity")
df['Affiliation'] = df['Affiliation'].str.replace("Necmettin Erbakan 횥niversitesi", "Necmettin Erbakan Universitesi")
df['Affiliation'] = df['Affiliation'].str.replace("Universit챕 d'Orl챕ans", "Universite d'Orleans")
df['Affiliation'] = df['Affiliation'].str.replace("Universit챕 de Tunis El Manar, Ecole Nationale d'Ing챕nieurs de Tunis", "Universite de Tunis El Manar, Ecole Nationale d'Ingenieurs de Tunis")
df['Affiliation'] = df['Affiliation'].str.replace("Laboratoire de Physique et d'횋tude des Mat챕riaux", "Laboratoire de Physique et d'Etude des Materiaux")
df['Affiliation'] = df['Affiliation'].str.replace("Universit채t Hamburg", "University of Hamburg")
df['Affiliation'] = df['Affiliation'].str.replace("Institut d'Investigaci처 Biomedica de Bellvitge", "Institut d'Investigacio Biomedica de Bellvitge")
df['Affiliation'] = df['Affiliation'].str.replace("Universit횪 degli Studi di Torino", "Universita degli Studi di Torino")
df['Affiliation'] = df['Affiliation'].str.replace("Institut des Mol챕cules et Mat챕riaux du Mans", "Institut des Molecules et Materiaux du Mans")
df['Affiliation'] = df['Affiliation'].str.replace("Universit횪 Cattolica del Sacro Cuore, Campus di Roma", "Universita Cattolica del Sacro Cuore, Campus di Roma")
df['Affiliation'] = df['Affiliation'].str.replace("Friedrich-Alexander-Universit채t Erlangen-N체rnberg", "Friedrich-Alexander-University of Erlangen-Nurnberg")
df['Affiliation'] = df['Affiliation'].str.replace("Instituto de Telecomunica챌천es", "Instituto de Telecomunicacoes")
df['Affiliation'] = df['Affiliation'].str.replace("Ume책 Universitet", "Umea Universitet")
df['Affiliation'] = df['Affiliation'].str.replace("Fundaci처 Institut Universitari per a la recerca a l'Atenci처 Prim횪ria de Salut Jordi Gol i Gurina (IDIAPJGol)", "Fundacio Institut Universitari per a la recerca a l'Atencio Primaria de Salut Jordi Gol i Gurina (IDIAPJGol)")
df['Affiliation'] = df['Affiliation'].str.replace("Dongduck Women?셲 University", "Dongduk Women's University")
df['Affiliation'] = df['Affiliation'].str.replace("Narodowe Centrum Bada흦 J훳drowych, Otwock", "National Centre for Nuclear Research")
df['Affiliation'] = df['Affiliation'].str.replace("Fraunhofer-Institut F체r Organische Elektronik, Elektronenstrahl- Und Plasmatechnik", "Fraunhofer-Institut fur Organische Elektronik, Elektronenstrahl- und Plasmatechnik")
df['Affiliation'] = df['Affiliation'].str.replace("Tr튼沼쓓g 휂梳죍 h沼뛠 Giao th척ng v梳춏 t梳즜", "University of Transport and Communications")
df['Affiliation'] = df['Affiliation'].str.replace("Universit채t Freiburg", "University of Freiburg")
df['Affiliation'] = df['Affiliation'].str.replace("Universit채t Bremen", "University of Bremen")
df['Affiliation'] = df['Affiliation'].str.replace("Universit챕 du Qu챕bec 횪 Montr챕al", "Universite du Quebec a Montreal")
df['Affiliation'] = df['Affiliation'].str.replace("Universit채t Leipzig", "University of Leipzig")
df['Affiliation'] = df['Affiliation'].str.replace("Universit챕 d'Evry Val d'Essonne", "Universite d'Evry Val d'Essonne")
df['Affiliation'] = df['Affiliation'].str.replace("D챕partement de Mod챕lisation des Syst챔mes et Structures", "Departement de Modelisation des Systemes et Structures")
df['Affiliation'] = df['Affiliation'].str.replace("O?셄eal Comprehensive Cancer Center", "Oregon Comprehensive Cancer Center")
df['Affiliation'] = df['Affiliation'].str.replace("Kyungin Women?셲 University", "Kyungin Women's University")
df['Affiliation'] = df['Affiliation'].str.replace("Sookmyung Women?셲 University", "Sookmyung Women's University")
df['Affiliation'] = df['Affiliation'].str.replace("Klinikum der Universit채t M체nchen", "Klinikum der University of Munchen")
df['Affiliation'] = df['Affiliation'].str.replace("횇ngstr철mlaboratoriet", "Angstrom Laboratory")
df['Affiliation'] = df['Affiliation'].str.replace("CISE-Centro de Investiga챌찾o em Sistemas Electromecatr처nicos", "CISE-Centro de Investigacao em Sistemas Eletromecatronicos")
df['Affiliation'] = df['Affiliation'].str.replace("Universitat Aut챵noma de Barcelona", "University of Autonoma de Barcelona")
df['Affiliation'] = df['Affiliation'].str.replace("Laboratoire National des Champs Magn챕tiques Intenses (LNCMI)", "Laboratoire National des Champs Magnetiques Intenses (LNCMI)")
df['Affiliation'] = df['Affiliation'].str.replace("Universit채t Duisburg-Essen", "University of Duisburg-Essen")
df['Affiliation'] = df['Affiliation'].str.replace("Ecole Nationale Sup챕rieure de Chimie de Rennes", "Ecole Nationale Superieure de Chimie de Rennes")
df['Affiliation'] = df['Affiliation'].str.replace("Universit채t zu K철ln", "University of Cologne")
df['Affiliation'] = df['Affiliation'].str.replace("Uniwersytet 힃l훳ski w Katowicach", " University of Silesia in Katowice")
df['Affiliation'] = df['Affiliation'].str.replace("CSIC-INTA - Centro de Astrobiolog챠a (CAB)", "CSIC-INTA - Centro de Astrobiologia (CAB)")
df['Affiliation'] = df['Affiliation'].str.replace("Helmholtz-Institute Erlangen-N체renberg for Renewable Energy (HI ERN)", "Helmholtz-Institut Erlangen-Nurnberg fur Erneuerbare Energien (HI ERN)")
df['Affiliation'] = df['Affiliation'].str.replace("La Rochelle Universit챕", "La Rochelle Universite")
df['Affiliation'] = df['Affiliation'].str.replace("ETH Z체rich", "ETH Zurich")
df['Affiliation'] = df['Affiliation'].str.replace("Julius-Maximilians-Universit채t W체rzburg", "Julius-Maximilians-University of Wurzburg")
df['Affiliation'] = df['Affiliation'].str.replace("Centrale M챕diterran챕e", "Centrale Mediterranee")
df['Affiliation'] = df['Affiliation'].str.replace("National Technical University of Ukraine ?쏧gor Sikorsky Kyiv Polytechnic Institute??", "National Technical University of Ukraine Igor Sikorsky Kyiv Polytechnic Institute")
df['Affiliation'] = df['Affiliation'].str.replace("Centre de Recherches en Canc챕rologie de Toulouse", "Centre de Recherches en Cancerologie de Toulouse")
df['Affiliation'] = df['Affiliation'].str.replace("Centre 횋nergie Mat챕riaux T챕l챕communications", "Centre Energie Materiaux Telecommunications")
df['Affiliation'] = df['Affiliation'].str.replace("Uniwersytet Miko흢aja Kopernika w Toruniu", "Nicolaus Copernicus University")
df['Affiliation'] = df['Affiliation'].str.replace("University of Wroc흢aw", "University of Wroclaw")
df['Affiliation'] = df['Affiliation'].str.replace("T챕l챕com SudParis", "Telecom SudParis")
df['Affiliation'] = df['Affiliation'].str.replace("Universit채tsklinikum M체nster", "Munster University Hospital")
df['Affiliation'] = df['Affiliation'].str.replace("Centre de recherche du CHU de Qu챕bec-Universit챕 Laval", "Centre de recherche du CHU de Quebec-Universite Laval")
df['Affiliation'] = df['Affiliation'].str.replace("Univerzita Komensk챕ho v Bratislave", "Comenius University Bratislava")
df['Affiliation'] = df['Affiliation'].str.replace("Humboldt-Universit채t zu Berlin", "Humboldt-University of zu Berlin")
df['Affiliation'] = df['Affiliation'].str.replace("Medizinische Universit채t Wien, Zentrum f체r Medizinische Physik und Biomedizinische Technik", "Medizinische University of Wien, Zentrum fur Medizinische Physik und Biomedizinische Technik")
df['Affiliation'] = df['Affiliation'].str.replace("Universit챕 des Sciences et de la Technologie Houari Boumediene", "Universite des Sciences et de la Technologie Houari Boumediene")
df['Affiliation'] = df['Affiliation'].str.replace("Medizinische Fakult채t, RWTH Aachen University", "Medizinische Fakultat, RWTH Aachen University")
df['Affiliation'] = df['Affiliation'].str.replace("Universit횪 degli Studi di Foggia", "Universita degli Studi di Foggia")
df['Affiliation'] = df['Affiliation'].str.replace("B체hler AG", "Buhler AG")
df['Affiliation'] = df['Affiliation'].str.replace("Justus-Liebig-Universit채t Gie횩en", "Justus-Liebig-University of Giessen")
df['Affiliation'] = df['Affiliation'].str.replace("Szegedi Tudom찼nyegyetem (SZTE)", "Szegedi Tudomanyegyetem (SZTE)")
df['Affiliation'] = df['Affiliation'].str.replace("Universit횪 degli Studi di Brescia", "Universita degli Studi di Brescia")
df['Affiliation'] = df['Affiliation'].str.replace("Universit채t Potsdam", "University of Potsdam")
df['Affiliation'] = df['Affiliation'].str.replace("IN2P3 - Institut National de Physique Nucl챕aire et de Physique Des Particules", "IN2P3 - Institut National de Physique Nucleaire et de Physique des Particules")
df['Affiliation'] = df['Affiliation'].str.replace("Centre de D챕veloppement des Technologies Avanc챕es", "Advanced Technology Development Centre")
df['Affiliation'] = df['Affiliation'].str.replace("Universit채t Heidelberg", "University of Heidelberg")
df['Affiliation'] = df['Affiliation'].str.replace("Erzincan Binali Y캇ld캇r캇m 횥niversitesi", "Erzincan Binali Yıldırım Universitesi")
df['Affiliation'] = df['Affiliation'].str.replace("Pavol Jozef 힋af찼rik University in Ko큄ice", "Pavol Jozef Safarik University in Kosice")
df['Affiliation'] = df['Affiliation'].str.replace("Pusan ?뗢딳ational University Dental Hospital", "Pusan National University Dental Hospital")
df['Affiliation'] = df['Affiliation'].str.replace("Universit횪 degli Studi di Roma", "Universita degli Studi di Roma")
df['Affiliation'] = df['Affiliation'].str.replace("ECPM 횋cole Europ챕enne de Chimie, Polym챔res et Mat챕riaux de Strasbourg", "ECPM Ecole Europeenne de Chimie, Polymeres et Materiaux de Strasbourg")
df['Affiliation'] = df['Affiliation'].str.replace("Universit챕 de Neuch창tel", "Universite de Neuchatel")
df['Affiliation'] = df['Affiliation'].str.replace("Universidad Nacional Aut처noma de M챕xico", "Universidad Nacional Autonoma de Mexico")
df['Affiliation'] = df['Affiliation'].str.replace("Heinrich-Heine-Universit채t D체sseldorf", "Heinrich-Heine-University Dusseldorf")
df['Affiliation'] = df['Affiliation'].str.replace("Universit채t Bayreuth", "University of Bayreuth")
df['Affiliation'] = df['Affiliation'].str.replace("University of Jyv?skyl?", "University of Jyvaskyla")
df['Affiliation'] = df['Affiliation'].str.replace("Universit챕 de Lille", "Universite de Lille")
df['Affiliation'] = df['Affiliation'].str.replace("Institut de Recherches Cliniques de Montr챕al", "Institut de Recherches Cliniques de Montreal")
df['Affiliation'] = df['Affiliation'].str.replace("Universit횪 degli Studi di Salerno", "University of Salerno")
df['Affiliation'] = df['Affiliation'].str.replace("Instituci처 Catalana de Recerca i Estudis Avan챌ats", "Institucio Catalana de Recerca i Estudis Avancats, ICREA")

# Step 4: Save the updated DataFrame to a new CSV file
df.to_csv('Paper_Dataset_Dict.csv', index=False, encoding='utf-8-sig')

# Step 3: Retrieve the 13249th affiliation (index starts from 0, so we use 13248)
affiliation_13249 = df.iloc[13247]['Affiliation']  # iloc counts all rows including blanks
print(f"Affiliation at row 13249 (with blanks included): {affiliation_13249}")


Affiliation at row 13249 (with blanks included): Institucio Catalana de Recerca i Estudis Avancats, ICREA; ETH Zurich; University of Autonoma de Barcelona


## Missing Paper용 API

In [33]:
import pandas as pd

# Step 1: Read the CSV file
file_path = 'Paper_Dataset.csv'
df = pd.read_csv(file_path, encoding='utf-8-sig', low_memory=False)

# Step 2: Count missing values based on the conditions
affiliation_only_missing_count = df['Affiliation'].isna().sum() - df['Authors'].isna().sum()
authors_only_missing_count = df['Authors'].isna().sum() - df['Affiliation'].isna().sum()
both_missing_count = df[(df['Affiliation'].isna()) & (df['Authors'].isna())].shape[0]

# Step 3: Print the results
print(f"Affiliation만 비어있는 개수: {df['Affiliation'].isna().sum() - both_missing_count}")
print(f"Authors만 비어있는 개수: {df['Authors'].isna().sum() - both_missing_count}")
print(f"둘다 비어있는 개수: {both_missing_count}")


Affiliation만 비어있는 개수: 2123
Authors만 비어있는 개수: 4327
둘다 비어있는 개수: 1744


In [20]:
import pandas as pd

# Step 1: Read the CSV file
file_path = 'Paper_Dataset.csv'
df = pd.read_csv(file_path, encoding='utf-8-sig', low_memory=False)

# Step 2: Function to count missing values for Affiliation and Authors for each 육T관련기술 group
def count_missing_by_group(group):
    both_missing_count = group[(group['Affiliation'].isna()) & (group['Authors'].isna())].shape[0]
    affiliation_only_missing_count = group['Affiliation'].isna().sum() - both_missing_count
    authors_only_missing_count = group['Authors'].isna().sum() - both_missing_count
    
    return affiliation_only_missing_count, authors_only_missing_count, both_missing_count

# Step 3: Loop over each 육T관련기술 value (1 to 7) and count missing values
for value in range(1, 8):
    group = df[df['육T관련기술'] == value]
    
    affiliation_only_missing, authors_only_missing, both_missing = count_missing_by_group(group)
    
    print(f"육T관련기술 = {value}")
    print(f"Affiliation만 비어있는 개수: {affiliation_only_missing}")
    print(f"Authors만 비어있는 개수: {authors_only_missing}")
    print(f"둘다 비어있는 개수: {both_missing}")
    print("-" * 30)



육T관련기술 = 1
Affiliation만 비어있는 개수: 670
Authors만 비어있는 개수: 1022
둘다 비어있는 개수: 611
------------------------------
육T관련기술 = 2
Affiliation만 비어있는 개수: 199
Authors만 비어있는 개수: 638
둘다 비어있는 개수: 291
------------------------------
육T관련기술 = 3
Affiliation만 비어있는 개수: 196
Authors만 비어있는 개수: 329
둘다 비어있는 개수: 140
------------------------------
육T관련기술 = 4
Affiliation만 비어있는 개수: 13
Authors만 비어있는 개수: 40
둘다 비어있는 개수: 9
------------------------------
육T관련기술 = 5
Affiliation만 비어있는 개수: 643
Authors만 비어있는 개수: 1327
둘다 비어있는 개수: 473
------------------------------
육T관련기술 = 6
Affiliation만 비어있는 개수: 19
Authors만 비어있는 개수: 41
둘다 비어있는 개수: 9
------------------------------
육T관련기술 = 7
Affiliation만 비어있는 개수: 383
Authors만 비어있는 개수: 929
둘다 비어있는 개수: 209
------------------------------


In [34]:
# Step 1: Filter rows where either 'Affiliation' or 'Authors' is empty
filtered_df = df[df['Affiliation'].isnull() | df['Authors'].isnull()]

# Step 2: Group by 'Journal' and count the occurrences
journal_counts = filtered_df['Journal'].value_counts()

# Step 3: Display the result
print(journal_counts)

Journal
JOURNAL OF HIGH ENERGY PHYSICS               222
JOURNAL OF NANOSCIENCE AND NANOTECHNOLOGY    203
PHYSICS LETTERS B                            162
JOURNAL OF ALLOYS AND COMPOUNDS              120
ACS APPLIED MATERIALS & INTERFACES           105
                                            ... 
Antioxidants                                   1
JOURNAL OF CRANIOFACIAL SURGERY                1
Micro & Nano Letters                           1
GLOBAL CHALLENGES                              1
ANNALS OF THORACIC SURGERY                     1
Name: count, Length: 1720, dtype: int64


## SJR Category

In [4]:
## Unique jounral 추출

import pandas as pd

# Step 1: Read the CSV file
file_path = 'Paper_Dataset.csv'
df = pd.read_csv(file_path, encoding='utf-8-sig', low_memory=False)

# Step 1: Extract unique journal names
unique_journals = df['Journal'].drop_duplicates().reset_index(drop=True)

# Display the first 10 unique journal names and the total count of unique journals
unique_journals_list = unique_journals.tolist()
unique_journals_count = len(unique_journals_list)

print(unique_journals_list[:10])  # Print first 10 unique journal names
print(f"Total unique journals: {unique_journals_count}")


['JOURNAL OF ENVIRONMENTAL RADIOACTIVITY', 'JOURNAL OF THE AMERICAN SOCIETY FOR MASS SPECTROMETRY', 'JOURNAL OF HAZARDOUS MATERIALS', 'ATW-INTERNATIONAL JOURNAL FOR NUCLEAR POWER', 'ANNALS OF NUCLEAR ENERGY', 'SCIENCE AND TECHNOLOGY OF NUCLEAR INSTALLATIONS', 'INTERNATIONAL JOURNAL OF PRESSURE VESSELS AND PIPING', 'SWISS JOURNAL OF GEOSCIENCES', 'NUCLEAR ENGINEERING AND TECHNOLOGY', 'NUCLEAR INSTRUMENTS & METHODS IN PHYSICS RESEARCH SECTION A-ACCELERATORS SPECTROMETERS DETECTORS AND ASSOCIATED EQUIPMENT']
Total unique journals: 3922


In [31]:
# import requests
# from bs4 import BeautifulSoup

# def get_journal_link(journal_name):
#     # Prepare the search URL
#     search_url = f"https://www.scimagojr.com/journalsearch.php?q={journal_name.replace(' ', '+')}"
#     print(f"Search URL: {search_url}")  # Print search URL

#     headers = {
#         "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36"
#     }
    
#     # Fetch the search results page
#     response = requests.get(search_url, headers=headers)
    
#     # Check if the request was successful
#     if response.status_code != 200:
#         print("Failed to retrieve the webpage.")
#         return None
    
#     # Parse the HTML content
#     soup = BeautifulSoup(response.text, 'html.parser')
    
#     # Find the first journal link
#     first_journal = soup.select_one('a[href*="journalsearch.php?q="]')
    
#     # Check if the link was found
#     if first_journal:
#         # Construct the full URL
#         journal_url = f"https://www.scimagojr.com/{first_journal['href']}"
#         return journal_url
#     else:
#         print("Journal not found.")
#         return None

# # Example usage
# journal_name = "Journal of Environmental Radioactivity"
# link = get_journal_link(journal_name)
# if link:
#     print("Journal Link:", link)


Search URL: https://www.scimagojr.com/journalsearch.php?q=Journal+of+Environmental+Radioactivity
Journal Link: https://www.scimagojr.com/journalsearch.php?q=23388&tip=sid&clean=0


#### Step 1 (get journal link)

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Load the dataset
file_path = 'Paper_Dataset.csv'
df = pd.read_csv(file_path, encoding='utf-8-sig', low_memory=False)

# Get unique journal names from the entire dataset
unique_journals = df['Journal'].drop_duplicates().reset_index(drop=True)

def get_journal_link(journal_name):
    # Prepare the search URL
    search_url = f"https://www.scimagojr.com/journalsearch.php?q={journal_name.replace(' ', '+')}"

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36"
    }
    
    # Fetch the search results page
    response = requests.get(search_url, headers=headers)
    
    # Check if the request was successful
    if response.status_code != 200:
        return None
    
    # Parse the HTML content
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Find the first journal link
    first_journal = soup.select_one('a[href*="journalsearch.php?q="]')
    
    # Check if the link was found
    if first_journal:
        # Construct the full URL
        journal_url = f"https://www.scimagojr.com/{first_journal['href']}"
        return journal_url
    else:
        return None  # Return None if no link was found

# Create a DataFrame to store the journal links for all unique journals
journal_links = []
total_count = len(unique_journals)
not_found_count = 0  # Initialize counter for not found links

# Loop through each unique journal and get the journal link
for index, journal in enumerate(unique_journals):
    link = get_journal_link(journal)
    journal_links.append(link)
    
    if link is None:
        not_found_count += 1  # Increment the count if the link is not found

    # Notify after processing every 100 journals
    if (index + 1) % 100 == 0 or (index + 1) == total_count:
        print(f"Processed {index + 1}/{total_count} journals.")

# Create a DataFrame for the unique journals and their links
links_df = pd.DataFrame({
    'Journal': unique_journals,
    'Journal Link': journal_links
})

# Merge the links back into the original DataFrame
df_subset = df.merge(links_df, on='Journal', how='left')

# Save the updated DataFrame to a new CSV file
df_subset.to_csv('Journal_Link.csv', index=False, encoding='utf-8-sig')

# Print the count of journals with no links
print(f"Number of journals with no links: {not_found_count}")


Processed 100/3922 journals.
Processed 200/3922 journals.
Processed 300/3922 journals.
Processed 400/3922 journals.
Processed 500/3922 journals.
Processed 600/3922 journals.
Processed 700/3922 journals.
Processed 800/3922 journals.
Processed 900/3922 journals.
Processed 1000/3922 journals.
Processed 1100/3922 journals.
Processed 1200/3922 journals.
Processed 1300/3922 journals.
Processed 1400/3922 journals.
Processed 1500/3922 journals.
Processed 1600/3922 journals.
Processed 1700/3922 journals.
Processed 1800/3922 journals.
Processed 1900/3922 journals.
Processed 2000/3922 journals.
Processed 2100/3922 journals.
Processed 2200/3922 journals.
Processed 2300/3922 journals.
Processed 2400/3922 journals.
Processed 2500/3922 journals.
Processed 2600/3922 journals.
Processed 2700/3922 journals.
Processed 2800/3922 journals.
Processed 2900/3922 journals.
Processed 3000/3922 journals.
Processed 3100/3922 journals.
Processed 3200/3922 journals.
Processed 3300/3922 journals.
Processed 3400/3922

In [8]:
# 다중 검색되는 저널 -> 첫 번째로 링크 수집
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Load the dataset
file_path = 'Paper_Dataset.csv'
df = pd.read_csv(file_path, encoding='utf-8-sig', low_memory=False)

# Filter journals where 'Journal_Link' is not NaN and 'Scope_All' is NaN
filtered_df = df[(~df['Journal_Link'].isna()) & (df['Scope_All'].isna())]

# Get unique journal names from the filtered dataset
unique_journals = filtered_df['Journal'].drop_duplicates().reset_index(drop=True)

def get_first_journal_link(journal_name):
    # Prepare the search URL
    search_url = f"https://www.scimagojr.com/journalsearch.php?q={journal_name.replace(' ', '+')}"
    
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36"
    }

    # Fetch the search results page
    response = requests.get(search_url, headers=headers)
    
    # Check if the request was successful
    if response.status_code != 200:
        return None
    
    # Parse the HTML content
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Find the first journal link (first search result using XPath-like method)
    first_journal = soup.select_one('div.search_results a')
    
    # Check if the link was found
    if first_journal:
        # Construct the full URL for the journal
        journal_url = f"https://www.scimagojr.com/{first_journal['href']}"
        return journal_url
    else:
        return None  # Return None if no link was found

# Create a DataFrame to store the journal links for all unique journals
journal_links = []
total_count = len(unique_journals)
not_found_count = 0  # Initialize counter for not found links

# Loop through each unique journal and get the journal link
for index, journal in enumerate(unique_journals):
    link = get_first_journal_link(journal)
    journal_links.append(link)
    
    if link is None:
        not_found_count += 1  # Increment the count if the link is not found

    # Notify after processing every 100 journals
    if (index + 1) % 100 == 0 or (index + 1) == total_count:
        print(f"Processed {index + 1}/{total_count} journals.")

# Create a DataFrame for the unique journals and their links
links_df = pd.DataFrame({
    'Journal': unique_journals,
    'Journal Link': journal_links
})

# Merge the links back into the filtered DataFrame
filtered_df = filtered_df.merge(links_df, on='Journal', how='left')

# Save the updated DataFrame to a new CSV file
filtered_df.to_csv('Journal_Link_Updated.csv', index=False, encoding='utf-8-sig')

# Print the count of journals with no links
print(f"Number of journals with no links: {not_found_count}")


Processed 100/235 journals.
Processed 200/235 journals.
Processed 235/235 journals.
Number of journals with no links: 0


In [11]:
import pandas as pd

# Load the original dataset and the updated journal links dataset
file_path = 'Paper_Dataset.csv'
df_paper = pd.read_csv(file_path, encoding='utf-8-sig', low_memory=False)

file_path_updated = 'Journal_Link_Updated.csv'
df_updated_links = pd.read_csv(file_path_updated, encoding='utf-8-sig', low_memory=False)

# Drop duplicates in the updated links DataFrame based on 'Journal'
df_updated_links_unique = df_updated_links.drop_duplicates(subset='Journal')

# Filter rows in Paper_Dataset.csv where 'Scope_All' is empty and 'Journal_Link' is not NaN
mask = (df_paper['Scope_All'].isna()) & (~df_paper['Journal_Link'].isna())

# Merge the datasets based on 'Journal' to update 'Journal_Link' where 'Scope_All' is empty
df_paper.loc[mask, 'Journal_Link'] = df_paper.loc[mask, 'Journal'].map(df_updated_links_unique.set_index('Journal')['Journal Link'])

# Save the modified dataset to a new CSV file
df_paper.to_csv('Paper_Dataset_Updated.csv', index=False, encoding='utf-8-sig')

print("Dataset updated and saved to 'Paper_Dataset_Updated.csv'.")


Dataset updated and saved to 'Paper_Dataset_Updated.csv'.


#### Step 2 (get All / Tree category)

In [None]:
# # 10개에 30초 걸림
# import pandas as pd
# import requests
# from bs4 import BeautifulSoup

# def get_journal_scope_all(journal_url):
#     headers = {
#         "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36"
#     }
    
#     response = requests.get(journal_url, headers=headers)
    
#     if response.status_code == 200:
#         soup = BeautifulSoup(response.text, 'html.parser')
#         subject_area_header = soup.find('h2', string="Subject Area and Category")
        
#         if subject_area_header:
#             top_level_subjects = []
#             for ul in subject_area_header.find_all_next('ul'):
#                 for li in ul.find_all('li', recursive=False):
#                     a_tag = li.find('a')
#                     if a_tag:
#                         top_level_subjects.append(a_tag.text.strip())
#             return "; ".join(top_level_subjects)
        
#         return None
#     else:
#         return None

# def get_journal_scope_tree(journal_url):
#     headers = {
#         "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36"
#     }
    
#     response = requests.get(journal_url, headers=headers)
    
#     if response.status_code == 200:
#         soup = BeautifulSoup(response.text, 'html.parser')
#         subject_area_header = soup.find('h2', string="Subject Area and Category")
        
#         if subject_area_header:
#             top_level_subjects = []
#             for sibling in subject_area_header.find_all_next():
#                 if sibling.name == 'h2':
#                     break
#                 if sibling.name == 'ul':
#                     for li in sibling.find_all('li', recursive=False):
#                         a_tag = li.find('a')
#                         if a_tag and not li.find('ul', class_='treecategory'):
#                             top_level_subjects.append(a_tag.text.strip())
#             return "; ".join(top_level_subjects)
        
#         return None
#     else:
#         return None

# # Load the dataset
# file_path = 'Paper_Dataset.csv'
# df = pd.read_csv(file_path, encoding='utf-8-sig', low_memory=False)

# # Trim whitespace from column names
# df.columns = df.columns.str.strip()

# # Get unique journal links from the dataset
# unique_journals = df['Journal_Link'].drop_duplicates().reset_index(drop=True)

# # Create a DataFrame to store the results
# results = pd.DataFrame(unique_journals, columns=['Journal_Link'])

# # Initialize lists to store the scopes
# scope_all_list = []
# scope_tree_list = []

# # Iterate over each unique journal link
# for journal_link in results['Journal_Link']:
#     # Check if journal_link is valid
#     if pd.isna(journal_link):
#         scope_all_list.append(None)
#         scope_tree_list.append(None)
#         continue  # Skip to the next link
    
#     scope_all = get_journal_scope_all(journal_link)
#     scope_tree = get_journal_scope_tree(journal_link)
    
#     scope_all_list.append(scope_all)
#     scope_tree_list.append(scope_tree)

#     # Notify after processing every 100 journals
#     if (index + 1) % 100 == 0:
#         print(f"Processed {index + 1}/{len(results)} journals.")


# # Add the scopes to the results DataFrame
# results['Scope_All'] = scope_all_list
# results['Scope_Tree'] = scope_tree_list

# # Merge results back into the original DataFrame based on the journal links
# df = df.merge(results, on='Journal_Link', how='left')

# # Save the updated DataFrame to a new CSV file
# df.to_csv('Updated_Paper_Dataset.csv', index=False, encoding='utf-8-sig')

# print("Scope_All and Scope_Tree have been added to the dataset, and the updated dataset has been saved.")


In [1]:
# 10개에 18초 걸림
import pandas as pd
import requests
from bs4 import BeautifulSoup

def get_journal_scopes(journal_url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36"
    }
    
    response = requests.get(journal_url, headers=headers)
    
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        subject_area_header = soup.find('h2', string="Subject Area and Category")
        
        if subject_area_header:
            top_level_subjects = []
            scope_tree_subjects = []
            
            # Traverse through the elements after the subject area header
            for sibling in subject_area_header.find_all_next():
                if sibling.name == 'h2':
                    break  # Stop if we encounter another header
                if sibling.name == 'ul':
                    for li in sibling.find_all('li', recursive=False):
                        a_tag = li.find('a')
                        if a_tag:
                            # Add to Scope_All
                            top_level_subjects.append(a_tag.text.strip())
                            # Only add to Scope_Tree if not a treecategory
                            if not li.find('ul', class_='treecategory'):
                                scope_tree_subjects.append(a_tag.text.strip())
            
            # Join the subject areas into a single string
            scope_all = "; ".join(top_level_subjects)
            scope_tree = "; ".join(scope_tree_subjects)
            return scope_all, scope_tree
        
        return None, None
    else:
        return None, None

# Load the dataset
file_path = 'Paper_Dataset.csv'
df = pd.read_csv(file_path, encoding='utf-8-sig', low_memory=False)

# Trim whitespace from column names
df.columns = df.columns.str.strip()

# Get unique journal links from the dataset
unique_journals = df['Journal_Link'].drop_duplicates().reset_index(drop=True)

# Create a DataFrame to store the results
results = pd.DataFrame(unique_journals, columns=['Journal_Link'])

# Initialize lists to store the scopes
scope_all_list = []
scope_tree_list = []

# Iterate over each unique journal link
for index, journal_link in enumerate(results['Journal_Link']):
    # Check if journal_link is valid
    if pd.isna(journal_link):
        scope_all_list.append(None)
        scope_tree_list.append(None)
        continue  # Skip to the next link
    
    scope_all, scope_tree = get_journal_scopes(journal_link)
    
    scope_all_list.append(scope_all)
    scope_tree_list.append(scope_tree)
    
    # Notify after processing every 100 journals
    if (index + 1) % 100 == 0:
        print(f"Processed {index + 1}/{len(results)} journals.")

# Add the scopes to the results DataFrame
results['Scope_All'] = scope_all_list
results['Scope_Tree'] = scope_tree_list

# Merge results back into the original DataFrame based on the journal links
df = df.merge(results, on='Journal_Link', how='left')

# Save the updated DataFrame to a new CSV file
df.to_csv('Paper_Scope.csv', index=False, encoding='utf-8-sig')

print("Scope_All and Scope_Tree have been added to the dataset, and the updated dataset has been saved.")


Processed 100/2893 journals.
Processed 200/2893 journals.
Processed 300/2893 journals.
Processed 400/2893 journals.
Processed 500/2893 journals.
Processed 600/2893 journals.
Processed 700/2893 journals.
Processed 800/2893 journals.
Processed 900/2893 journals.
Processed 1000/2893 journals.
Processed 1100/2893 journals.
Processed 1200/2893 journals.
Processed 1300/2893 journals.
Processed 1400/2893 journals.
Processed 1500/2893 journals.
Processed 1600/2893 journals.
Processed 1700/2893 journals.
Processed 1800/2893 journals.
Processed 1900/2893 journals.
Processed 2000/2893 journals.
Processed 2100/2893 journals.
Processed 2200/2893 journals.
Processed 2300/2893 journals.
Processed 2400/2893 journals.
Processed 2500/2893 journals.
Processed 2600/2893 journals.
Processed 2700/2893 journals.
Processed 2800/2893 journals.
Scope_All and Scope_Tree have been added to the dataset, and the updated dataset has been saved.


In [29]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

def get_journal_scopes(journal_url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36"
    }
    
    response = requests.get(journal_url, headers=headers)
    
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        subject_area_header = soup.find('h2', string="Subject Area and Category")
        
        if subject_area_header:
            top_level_subjects = []
            scope_tree_subjects = []
            
            # Traverse through the elements after the subject area header
            for sibling in subject_area_header.find_all_next():
                if sibling.name == 'h2':
                    break  # Stop if we encounter another header
                if sibling.name == 'ul':
                    for li in sibling.find_all('li', recursive=False):
                        a_tag = li.find('a')
                        if a_tag:
                            # Add to Scope_All
                            top_level_subjects.append(a_tag.text.strip())
                            # Only add to Scope_Tree if not a treecategory
                            if not li.find('ul', class_='treecategory'):
                                scope_tree_subjects.append(a_tag.text.strip())
            
            # Join the subject areas into a single string
            scope_all = "; ".join(top_level_subjects)
            scope_tree = "; ".join(scope_tree_subjects)
            return scope_all, scope_tree
        
        return None, None
    else:
        return None, None

# Load the dataset
file_path = 'Paper_Dataset.csv'
df = pd.read_csv(file_path, encoding='utf-8-sig', low_memory=False)

# Trim whitespace from column names
df.columns = df.columns.str.strip()

# Get the rows where 'Scope_All' is missing (NaN)
missing_scope_df = df[df['Scope_All'].isna() & df['Journal_Link'].notna()]

# Create a DataFrame to store the results for missing data
results = pd.DataFrame(missing_scope_df['Journal_Link'].drop_duplicates().reset_index(drop=True), columns=['Journal_Link'])

# Initialize lists to store the scopes
scope_all_list = []
scope_tree_list = []

# Iterate over each unique journal link for missing scope data
for index, journal_link in enumerate(results['Journal_Link']):
    # Collect the scope data for missing Scope_All
    scope_all, scope_tree = get_journal_scopes(journal_link)
    
    scope_all_list.append(scope_all)
    scope_tree_list.append(scope_tree)
    
    # Notify after processing every 100 journals
    if (index + 1) % 100 == 0:
        print(f"Processed {index + 1}/{len(results)} missing scope journals.")

# Add the scopes to the results DataFrame
results['Scope_All'] = scope_all_list
results['Scope_Tree'] = scope_tree_list

# Merge only the updated scopes back into the original DataFrame
df_updated = df.merge(results, on='Journal_Link', how='left', suffixes=('', '_new'))

# Update only rows where Scope_All was missing
df_updated['Scope_All'] = df_updated['Scope_All'].combine_first(df_updated['Scope_All_new'])
df_updated['Scope_Tree'] = df_updated['Scope_Tree'].combine_first(df_updated['Scope_Tree_new'])

# Drop the temporary columns
df_updated.drop(columns=['Scope_All_new', 'Scope_Tree_new'], inplace=True)

# Save the updated DataFrame to a new CSV file
df_updated.to_csv('Paper_Scope_Updated.csv', index=False, encoding='utf-8-sig')

print("Missing Scope_All and Scope_Tree have been collected and added to the dataset, and the updated dataset has been saved.")


Processed 100/109 missing scope journals.
Missing Scope_All and Scope_Tree have been collected and added to the dataset, and the updated dataset has been saved.


#### Step 3 All - Tree = Top

In [30]:
import pandas as pd

# Load the dataset that already has Scope_All and Scope_Tree
file_path = 'Paper_Dataset.csv'
df = pd.read_csv(file_path, encoding='utf-8-sig', low_memory=False)

# Function to calculate Scope_Top
def calculate_scope_top(scope_all, scope_tree):
    if pd.isna(scope_all) or pd.isna(scope_tree):
        return None
    
    # Split the subjects and remove any leading/trailing spaces
    list1 = [subject.strip() for subject in scope_all.split(';')]
    list2 = [subject.strip() for subject in scope_tree.split(';')]
    
    # Convert lists to sets for easy comparison
    set1 = set(list1)
    set2 = set(list2)
    
    # Find subjects that are in scope_all but not in scope_tree
    unique_subjects = set1 - set2
    
    # Join the unique subjects back into a single string
    return "; ".join(unique_subjects)

# Apply the function to calculate Scope_Top for each row
df['Scope_Top'] = df.apply(lambda row: calculate_scope_top(row['Scope_All'], row['Scope_Tree']), axis=1)

# Save the updated DataFrame to a new CSV file
df.to_csv('Paper_Scope_Top.csv', index=False, encoding='utf-8-sig')


In [33]:
# import requests
# from bs4 import BeautifulSoup

# def get_journal_subject_areas(journal_url):
#     # Step 2: Set headers to mimic a web browser
#     headers = {
#         "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36"
#     }
    
#     # Send a GET request to the journal URL with headers
#     response = requests.get(journal_url, headers=headers)
    
#     if response.status_code == 200:
#         # Parse the HTML content
#         soup = BeautifulSoup(response.text, 'html.parser')

#         # Find the Subject Area and Category section
#         subject_area_header = soup.find('h2', string="Subject Area and Category")
#         if subject_area_header:
#             # Find the <ul> that follows the header
#             subject_area_list = subject_area_header.find_next('ul')
#             if subject_area_list:
#                 # Extract all the <a> tags in the <ul>
#                 subject_areas = [a.text for a in subject_area_list.find_all('a')]
#                 # Join the subject areas into a single string
#                 search_scope = "; ".join(subject_areas)
#                 return f'Search Scope: "{search_scope}"'
#         return "Subject area not found."
#     else:
#         return f"Failed to retrieve journal detail page: {response.status_code}"

# # Example usage
# journal_url = "https://www.scimagojr.com/journalsearch.php?q=23388&tip=sid&clean=0"
# subject_areas = get_journal_subject_areas(journal_url)
# print(subject_areas)


Search Scope: "Environmental Science; Environmental Chemistry; Health, Toxicology and Mutagenesis; Pollution; Waste Management and Disposal"


In [35]:
# import requests
# from bs4 import BeautifulSoup

# def get_journal_subject_areas(journal_url):
#     # Step 2: Set headers to mimic a web browser
#     headers = {
#         "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36"
#     }
    
#     # Send a GET request to the journal URL with headers
#     response = requests.get(journal_url, headers=headers)
    
#     if response.status_code == 200:
#         # Parse the HTML content
#         soup = BeautifulSoup(response.text, 'html.parser')

#         # Find the Subject Area and Category section
#         subject_area_header = soup.find('h2', string="Subject Area and Category")
#         if subject_area_header:
#             # Find the first <ul> that follows the header
#             subject_area_list = subject_area_header.find_next('ul')
#             if subject_area_list:
#                 # Extract top-level <li> items that contain <a> tags
#                 top_level_subjects = []
#                 for li in subject_area_list.find_all('li', recursive=False):
#                     # Check if it has an <a> tag and capture its text
#                     a_tag = li.find('a')
#                     if a_tag:
#                         top_level_subjects.append(a_tag.text.strip())
#                 # Join the subject areas into a single string
#                 search_scope = "; ".join(top_level_subjects)
#                 return f'Search Scope: "{search_scope}"'
#         return "Subject area not found."
#     else:
#         return f"Failed to retrieve journal detail page: {response.status_code}"

# # Example usage
# journal_url = "https://www.scimagojr.com/journalsearch.php?q=23388&tip=sid&clean=0"
# subject_areas = get_journal_subject_areas(journal_url)
# print(subject_areas)


Search Scope: "Environmental Science"


#### ALL Category (Top + Tree)

In [45]:
# import requests
# from bs4 import BeautifulSoup

# def get_journal_subject_areas(journal_url):
#     # Step 2: Set headers to mimic a web browser
#     headers = {
#         "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36"
#     }
    
#     # Send a GET request to the journal URL with headers
#     response = requests.get(journal_url, headers=headers)
    
#     if response.status_code == 200:
#         # Parse the HTML content
#         soup = BeautifulSoup(response.text, 'html.parser')

#         # Find the Subject Area and Category section
#         subject_area_header = soup.find('h2', string="Subject Area and Category")
#         if subject_area_header:
#             # Initialize a list to collect subject areas
#             top_level_subjects = []
            
#             # Find all <ul> elements that follow the header
#             for ul in subject_area_header.find_all_next('ul'):
#                 # Extract the top-level <li> items that contain <a> tags
#                 for li in ul.find_all('li', recursive=False):
#                     a_tag = li.find('a')
#                     if a_tag:
#                         top_level_subjects.append(a_tag.text.strip())
            
#             # Join the subject areas into a single string
#             search_scope = "; ".join(top_level_subjects)
#             return f'Search Scope: "{search_scope}"'
#         return "Subject area not found."
#     else:
#         return f"Failed to retrieve journal detail page: {response.status_code}"

# # Example usage
# journal_url = "https://www.scimagojr.com/journalsearch.php?q=29514&tip=sid&clean=0"
# subject_areas = get_journal_subject_areas(journal_url)
# print(subject_areas)


Search Scope: "Health Professions; Radiological and Ultrasound Technology; Medicine; Medicine (miscellaneous); Public Health, Environmental and Occupational Health; Radiology, Nuclear Medicine and Imaging; Physics and Astronomy; Radiation"


#### Tree category 만 (세부)

In [44]:
# import requests
# from bs4 import BeautifulSoup

# def get_journal_subject_areas(journal_url):
#     # Step 2: Set headers to mimic a web browser
#     headers = {
#         "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36"
#     }
    
#     # Send a GET request to the journal URL with headers
#     response = requests.get(journal_url, headers=headers)
    
#     if response.status_code == 200:
#         # Parse the HTML content
#         soup = BeautifulSoup(response.text, 'html.parser')

#         # Find the Subject Area and Category section
#         subject_area_header = soup.find('h2', string="Subject Area and Category")
#         if subject_area_header:
#             # Initialize a list to collect top-level subject areas
#             top_level_subjects = []
            
#             # Find all <ul> elements that follow the header until the next header
#             for sibling in subject_area_header.find_all_next():
#                 # Stop if we encounter another header (h2)
#                 if sibling.name == 'h2':
#                     break
#                 # Look for <ul> elements only and extract top-level <li> items
#                 if sibling.name == 'ul':
#                     for li in sibling.find_all('li', recursive=False):
#                         a_tag = li.find('a')
#                         if a_tag:
#                             # Only add if not a treecategory
#                             if not li.find('ul', class_='treecategory'):
#                                 top_level_subjects.append(a_tag.text.strip())

#             # Join the subject areas into a single string
#             search_scope = "; ".join(top_level_subjects)
#             return f'Search Scope: "{search_scope}"'
#         return "Subject area not found."
#     else:
#         return f"Failed to retrieve journal detail page: {response.status_code}"

# # Example usage
# journal_url = "https://www.scimagojr.com/journalsearch.php?q=29514&tip=sid&clean=0"
# subject_areas = get_journal_subject_areas(journal_url)
# print(subject_areas)


Search Scope: "Radiological and Ultrasound Technology; Medicine (miscellaneous); Public Health, Environmental and Occupational Health; Radiology, Nuclear Medicine and Imaging; Radiation"


#### ALL - Tree

In [46]:
def find_unique_subjects(scope1, scope2):
    # Split the search scopes into lists and strip whitespace
    list1 = [subject.strip() for subject in scope1.split(';')]
    list2 = [subject.strip() for subject in scope2.split(';')]
    
    # Convert lists to sets for easy comparison
    set1 = set(list1)
    set2 = set(list2)
    
    # Find subjects that are in scope1 but not in scope2
    unique_subjects = set1 - set2
    
    # Join the unique subjects into a single string
    return f'Search Scope: "{"; ".join(unique_subjects)}"' if unique_subjects else 'No unique subjects found.'

# Example inputs
scope1 = "Health Professions; Radiological and Ultrasound Technology; Medicine; Medicine (miscellaneous); Public Health, Environmental and Occupational Health; Radiology, Nuclear Medicine and Imaging; Physics and Astronomy; Radiation"
scope2 = "Radiological and Ultrasound Technology; Medicine (miscellaneous); Public Health, Environmental and Occupational Health; Radiology, Nuclear Medicine and Imaging; Radiation"

# Get unique subjects
output = find_unique_subjects(scope1, scope2)
print(output)

Search Scope: "Medicine; Health Professions; Physics and Astronomy"


#### 자잘구리 데이터 처리

In [21]:
import pandas as pd

# Step 1: Load the dataset
file_path = 'Paper_Dataset.csv'
df = pd.read_csv(file_path, encoding='utf-8-sig', low_memory=False)

# Step 2: Trim whitespace from column names
df.columns = df.columns.str.strip()

# Step 3: Define the journal criteria and new values
journal_name = "applied sciences-basel"
journal_link = "https://www.scimagojr.com/journalsearch.php?q=21100829268&tip=sid&clean=0"
scope_all = "Chemical Engineering; Fluid Flow and Transfer Processes; Process Chemistry and Technology; Computer Science; Computer Science Applications; Engineering; Engineering (miscellaneous); Materials Science; Materials Science (miscellaneous); Physics and Astronomy; Instrumentation"
scope_top = "Chemical Engineering; Computer Science; Engineering; Materials Science; Physics and Astronomy"
scope_tree = "Fluid Flow and Transfer Processes; Process Chemistry and Technology; Computer Science Applications; Engineering (miscellaneous); Materials Science (miscellaneous); Instrumentation"

# Step 4: Update the DataFrame
mask = df['Journal'].str.lower() == journal_name.lower()  # Case insensitive comparison
df.loc[mask, 'Journal_Link'] = journal_link
df.loc[mask, 'Scope_All'] = scope_all
df.loc[mask, 'Scope_Top'] = scope_top
df.loc[mask, 'Scope_Tree'] = scope_tree

# Step 5: Save the updated DataFrame to a new CSV file
df.to_csv('Updated_Paper_Dataset.csv', index=False, encoding='utf-8-sig')

print("The specified fields have been filled in for the 'applied sciences-basel' journal, and the updated dataset has been saved.")


The specified fields have been filled in for the 'applied sciences-basel' journal, and the updated dataset has been saved.


In [None]:
# Journal -> Journal_Link 수동 입력
# Input data as a multiline string
input_data = """
Journal: Space Weather an AGU journal
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=4000151603&tip=sid&clean=0

Journal: SPACE WEATHER-THE INTERNATIONAL JOURNAL OF RESEARCH AND APPLICATIONS
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=4000151603&tip=sid&clean=0

Journal: ACTA CRYSTALLOGRAPHICA SECTION F-STRUCTURAL BIOLOGY AND CRYSTALLIZATION COMMUNICATIONS
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=21100326880&tip=sid&clean=0

Journal: ACTA CRYSTALLOGRAPHICA SECTION D-BIOLOGICAL CRYSTALLOGRAPHY
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=21100778657&tip=sid&clean=0

Journal: JOURNAL OF SURFACE INVESTIGATION-X-RAY SYNCHROTRON AND NEUTRON TECHNIQUES
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=63269&tip=sid&clean=0

Journal: MATERIALS SCIENCE AND ENGINEERING A-STRUCTURAL MATERIALS PROPERTIES MICROSTRUCTURE AND PROCESSING
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=17811&tip=sid&clean=0

Journal: VIRUSES-BASEL
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=19700188364&tip=sid&clean=0

Journal: ANNUAL REVIEW OF CHEMICAL AND BIOMOLECULAR ENGINEERING, VOL 3
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=19700188418&tip=sid&clean=0

Journal: CLUSTER COMPUTING-THE JOURNAL OF NETWORKS SOFTWARE TOOLS AND APPLICATIONS
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=24596&tip=sid&clean=0

Journal: JOURNAL OF AGRICULTURAL METEOROLOGY
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=30706&tip=sid&clean=0

Journal: SYMMETRY-BASEL
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=21100201542&tip=sid&clean=0 

Journal: KOREAN JOURNAL OF HORTICULTURAL SCIENCE & TECHNOLOGY
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=19900192027&tip=sid&clean=0

Journal: Agronomy-Basel
Journal_Link: http://scimagojr.com/journalsearch.php?q=15639&tip=sid&clean=0 

Journal: Plants-Basel
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=21100788294&tip=sid&clean=0

Journal: JOURNAL OF SIGNAL PROCESSING SYSTEMS FOR SIGNAL IMAGE AND VIDEO TECHNOLOGY
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=11400153333&tip=sid&clean=0

Journal: FUTURE GENERATION COMPUTER SYSTEMS-THE INTERNATIONAL JOURNAL OF GRID COMPUTING AND ESCIENCE
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=12264&tip=sid&clean=0

Journal: INTERNATIONAL JOURNAL OF REFRIGERATION-REVUE INTERNATIONALE DU FROID
Journal_Link: http://scimagojr.com/journalsearch.php?q=16113&tip=sid&clean=0

Journal: JOURNAL OF THE KOREAN SURGICAL SOCIETY
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=21100326082&tip=sid&clean=0 

Journal: JOURNAL OF ENGINEERING FOR GAS TURBINES AND POWER-TRANSACTIONS OF THE ASME
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=20962&tip=sid&clean=0

Journal: MICROSYSTEM TECHNOLOGIES-MICRO-AND NANOSYSTEMS-INFORMATION STORAGE AND PROCESSING SYSTEMS
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=26738&tip=sid&clean=0 

Journal: ARCHIVES OF OTOLARYNGOLOGY-HEAD & NECK SURGERY
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=21100200823&tip=sid&clean=0

Journal: JOURNAL OF CHEMICAL AND ENGINEERING DATA
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=24158&tip=sid&clean=0

Journal: JOURNAL OF ENGINEERING MATERIALS AND TECHNOLOGY-TRANSACTIONS OF THE ASME
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=21155&tip=sid&clean=0

Journal: HYDROLOGICAL SCIENCES JOURNAL-JOURNAL DES SCIENCES HYDROLOGIQUES
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=29470&tip=sid&clean=0

Journal: CANADIAN JOURNAL OF OPHTHALMOLOGY-JOURNAL CANADIEN D OPHTALMOLOGIE
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=13766&tip=sid&clean=0

Journal: MATERIALS SCIENCE AND ENGINEERING B-ADVANCED FUNCTIONAL SOLID-STATE MATERIALS
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=17812&tip=sid&clean=0

Journal: JOURNAL OF MANUFACTURING SCIENCE AND ENGINEERING-TRANSACTIONS OF THE ASME
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=20966&tip=sid&clean=0 

Journal: ANNALS OF OCCUPATIONAL HYGIENE
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=21100821119&tip=sid&clean=0

Journal: The Korean Journal of Physiology & Pharmacology
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=23173&tip=sid&clean=0

Journal: JOURNAL OF MOLECULAR CATALYSIS A-CHEMICAL
Journal_Link: http://scimagojr.com/journalsearch.php?q=17619&tip=sid&clean=0

Journal: EARTHQUAKES AND STRUCTURES
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=19700188258&tip=sid

Journal: TEHNICKI VJESNIK-TECHNICAL GAZETTE
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=14569&tip=sid&clean=0

Journal: ADSORPTION-JOURNAL OF THE INTERNATIONAL ADSORPTION SOCIETY
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=16302&tip=sid&clean=0 

Journal: JOURNAL OF INFLAMMATION-LONDON
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=40695&tip=sid&clean=0

Journal: Transactions of Nonferrous Metals Society of China (English Edition)
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=27854&tip=sid&clean=0

Journal: JOURNAL OF COATINGS TECHNOLOGY AND RESEARCH
Journal_Link: http://scimagojr.com/journalsearch.php?q=12725&tip=sid&clean=0

Journal: Nanoscale reseach Letters
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=21101151625&tip=sid&clean=0

Journal: ACSPhotonics
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=21100368207&tip=sid&clean=0

Journal: ABSTRACTS OF PAPERS OF THE AMERICAN CHEMICAL SOCIETY
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=22680&tip=sid&clean=0

Journal: ACM SIGCOMM COMPUTER COMMUNICATION REVIEW
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=13683&tip=sid&clean=0

Journal: ACS Applied Materials and Interfaces
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=19700171101&tip=sid&clean=0

Journal: ACTA VETERINARIA-BEOGRAD
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=17060&tip=sid&clean=0

Journal: Agriculture-Basel
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=21100781511&tip=sid&clean=0

Journal: AGRONOMY-BASEL
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=21100447811&tip=sid&clean=0

Journal: ALGAL RESEARCH-BIOMASS BIOFUELS AND BIOPRODUCTS
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=21100201089&tip=sid&clean=0

Journal: ALTEX-ALTERNATIVES TO ANIMAL EXPERIMENTATION
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=26767&tip=sid&clean=0

Journal: Alzheimers & Dementia
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=3600148102&tip=sid&clean=0

Journal: Alzheimers Research & Therapy
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=19700174935&tip=sid&clean=0

Journal: ANNALS OF ANATOMY-ANATOMISCHER ANZEIGER
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=27472&tip=sid&clean=0

Journal: Antibiotics-Basel
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=21100469670&tip=sid&clean=0

Journal: ANTONIE VAN LEEUWENHOEK INTERNATIONAL JOURNAL OF GENERAL AND MOLECULAR MICROBIOLOGY
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=14944&tip=sid&clean=0

Journal: APPLIED SCIENCES BASEL
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=21100829268&tip=sid&clean=0

Journal: ARCHIVES OF OTOLARYNGOLOGY-HEAD & NECK SURGERY
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=21100200823&tip=sid&clean=0

Journal: ATW-INTERNATIONAL JOURNAL FOR NUCLEAR POWER
Journal_Link: http://scimagojr.com/journalsearch.php?q=29351&tip=sid&clean=0

Journal: Biosensors-Basel
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=21100293900&tip=sid&clean=0

Journal: CENTRAL EUROPEAN JOURNAL OF GEOSCIENCES
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=4400151402&tip=sid&clean=0

Journal: CLUSTER COMPUTING-THE JOURNAL OF NETWORKS SOFTWARE TOOLS AND APPLICATIONS
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=24596&tip=sid&clean=0

Journal: CMC-Computers Materials & Continua
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=24364&tip=sid&clean=0

Journal: CURRENT MEDICAL IMAGING REVIEWS
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=4700152432&tip=sid&clean=0

Journal: Diabetes Metabolic Syndrome and Obesity-Targets and Therapy
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=19700174905&tip=sid&clean=0

Journal: Diversity-Basel
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=6000187990&tip=sid&clean=0

Journal: EKSPLOATACJA I NIEZAWODNOSC-MAINTENANCE AND RELIABILITY
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=19700182638&tip=sid&clean=0

Journal: ENGINEERING SCIENCE AND TECHNOLOGY-AN INTERNATIONAL JOURNAL-JESTECH
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=21100806003&tip=sid&clean=0

Journal: FERMENTATION-BASEL
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=21100900055&tip=sid&clean=0

Journal: FUTURE GENERATION COMPUTER SYSTEMS-THE INTERNATIONAL JOURNAL OF ESCIENCE
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=12264&tip=sid&clean=0

Journal: GRAEFES ARCHIVE FOR CLINICAL AND EXPERIMENTAL OPHTHALMOLOGY
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=13904&tip=sid&clean=0

Journal: HEPATOBILIARY SURGERY AND NUTRITION
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=21101019769&tip=sid&clean=0

Journal: HYDROLOGICAL SCIENCES JOURNAL-JOURNAL DES SCIENCES HYDROLOGIQUES
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=29470&tip=sid&clean=0

Journal: IEEE transactions on applied superconductivity : a publication of the IEEE Superconductivity Committee
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=26026&tip=sid&clean=0

Journal: IEEE TRANSACTIONS ON INFORMATION TECHNOLOGY IN BIOMEDICINE
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=21100256982&tip=sid&clean=0

Journal: IIE TRANSACTIONS
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=21100794597&tip=sid&clean=0

Journal: INJURY-INTERNATIONAL JOURNAL OF THE CARE OF THE INJURED
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=29801&tip=sid&clean=0

Journal: INORGANIC CHEMISTRY COMMUNICATIONS
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=25267&tip=sid&clean=0

Journal: J. Alloys Compd.
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=12325&tip=sid&clean=0

Journal: J. of Ceramic Precessing Research
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=22026&tip=sid&clean=0

Journal: JNCI-JOURNAL OF THE NATIONAL CANCER INSTITUTE
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=12459&tip=sid&clean=0

Journal: Jouranl of Ceramic Processing Research
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=22026&tip=sid&clean=0

Journal: JOURNAL OF AGRICULTURAL METEOROLOGY
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=30706&tip=sid&clean=0

Journal: JOURNAL OF ALZHEIMERS DISEASE
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=16246&tip=sid&clean=0

Journal: JOURNAL OF BONE AND JOINT SURGERY-AMERICAN VOLUME
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=12198&tip=sid&clean=0

Journal: JOURNAL OF CLOUD COMPUTING-ADVANCES SYSTEMS AND APPLICATIONS
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=21100383744&tip=sid&clean=0

Journal: JOURNAL OF CRANIO-MAXILLOFACIAL SURGERY
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=21658&tip=sid&clean=0

Journal: JOURNAL OF ENVIRONMENTAL SCIENCES-CHINA
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=23393&tip=sid&clean=0

Journal: JOURNAL OF FOOD SCIENCE AND TECHNOLOGY-MYSORE
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=20617&tip=sid&clean=0

Journal: JOURNAL OF MACROMOLECULAR SCIENCE PART A-PURE AND APPLIED CHEMISTRY
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=25891&tip=sid&clean=0

Journal: JOURNAL OF MACROMOLECULAR SCIENCE PART B-PHYSICS
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=28522&tip=sid&clean=0

Journal: JOURNAL OF MANUFACTURING SCIENCE AND ENGINEERING-TRANSACTIONS OF THE ASME
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=20966&tip=sid&clean=0

Journal: JOURNAL OF MATERIALS RESEARCH AND TECHNOLOGY-JMR&T
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=21100383742&tip=sid&clean=0

Journal: JOURNAL OF MECHANISMS AND ROBOTICS-TRANSACTIONS OF THE ASME
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=19700186816&tip=sid&clean=0

Journal: JOURNAL OF OFFSHORE MECHANICS AND ARCTIC ENGINEERING-TRANSACTIONS OF THE ASME
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=20985&tip=sid&clean=0

Journal: JOURNAL OF PHYSIOLOGY-LONDON
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=23478&tip=sid&clean=0

Journal: JOURNAL OF PROSTHODONTICS-IMPLANT ESTHETIC AND RECONSTRUCTIVE DENTISTRY
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=26177&tip=sid&clean=0

Journal: JOURNAL OF SIGNAL PROCESSING SYSTEMS FOR SIGNAL IMAGE AND VIDEO TECHNOLOGY
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=11400153333&tip=sid&clean=0

Journal: JOURNAL OF SURFACE INVESTIGATION-X-RAY SYNCHROTRON AND NEUTRON TECHNIQUES
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=63269&tip=sid&clean=0

Journal: JOURNAL OF THE FRANKLIN INSTITUTE-ENGINEERING AND APPLIED MATHEMATICS
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=27959&tip=sid&clean=0

Journal: JOURNAL OF TRIBOLOGY-TRANSACTIONS OF THE ASME
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=17014&tip=sid&clean=0

Journal: Jove-Journal of Visualized Experiments
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=19900191993&tip=sid&clean=0

Journal: KOREAN JOURNAL OF HORTICULTURAL SCIENCE & TECHNOLOGY
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=19900192027&tip=sid&clean=0

Journal: LWT-FOOD SCIENCE AND TECHNOLOGY
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=22475&tip=sid&clean=0

Journal: MARINE ECOLOGY-AN EVOLUTIONARY PERSPECTIVE
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=12169&tip=sid&clean=0

Journal: MATERIALS SCIENCE-MEDZIAGOTYRA
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=19300157032&tip=sid

Journal: MEMBRANE AND WATER TREATMENT
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=21100265344&tip=sid&clean=0

Journal: MICROCHIMICA ACTA
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=24072&tip=sid

Journal: Natural Hazards and Earth System Sciences Discussions
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=51166&tip=sid&clean=0

Journal: PHYSICA STATUS SOLIDI B-BASIC SOLID STATE PHYSICS
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=29140&tip=sid&clean=0

Journal: PLANT GENETIC RESOURCES-CHARACTERIZATION AND UTILIZATION
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=98120&tip=sid&clean=0

Journal: POLIMEROS-CIENCIA E TECNOLOGIA
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=5000156907&tip=sid&clean=0

Journal: PRECISION ENGINEERING-JOURNAL OF THE INTERNATIONAL SOCIETIES FOR PRECISION ENGINEERING AND NANOTECHNOLOGY
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=13790&tip=sid&clean=0

Journal: PUBLICATIONS OF THE ASTRONOMICAL SOCIETY OF JAPAN
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=27778&tip=sid&clean=0

Journal: RETINA-THE JOURNAL OF RETINAL AND VITREOUS DISEASES
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=15094&tip=sid

Journal: REVISTA BRASILEIRA DE FARMACOGNOSIA-BRAZILIAN JOURNAL OF PHARMACOGNOSY
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=5400152628&tip=sid&exact=no

Journal: Sensors (Switzerland)
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=130124&tip=sid

Journal: SIMULATION-TRANSACTIONS OF THE SOCIETY FOR MODELING AND SIMULATION INTERNATIONAL
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=14452&tip=sid&clean=0

Journal: Spanish journal of agricultural research = Revista de investigaci oacute;n agraria : SJAR
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=5800179591&tip=sid

Journal: STRUCTURAL HEALTH MONITORING-AN INTERNATIONAL JOURNAL
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=5800179591&tip=sid

Journal: Sustainability (Switzerland)
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=21100240100&tip=sid

Journal: SYNTHESIS-STUTTGART
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=26508&tip=sid&clean=0

Journal: The American Journal of Chinese Medicine
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=18134&tip=sid&clean=0

Journal: Turkish Journal of Biochemistry-Turk Biyokimya Dergisi
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=17600155132&tip=sid&clean=0

Journal: WORLD JOURNAL OF MENS HEALTH
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=21100943924&tip=sid&clean=0

Journal: WORLD WIDE WEB-INTERNET AND WEB INFORMATION SYSTEMS
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=14965&tip=sid&clean=0

Journal: ZEITSCHRIFT FUR PHYSIKALISCHE CHEMIE-INTERNATIONAL JOURNAL OF RESEARCH IN PHYSICAL CHEMISTRY & CHEMICAL PHYSICS
Journal_Link: https://www.scimagojr.com/journalsearch.php?q=23775&tip=sid

"""

# 줄 단위로 데이터를 나눠 리스트로 변환
lines = input_data.strip().split("\n")

# 딕셔너리로 변환
journal_dict = {}
for i in range(0, len(lines), 3):  # 3줄씩 처리
    journal_name = lines[i].split("Journal: ")[1].strip()
    journal_link = lines[i+1].split("Journal_Link: ")[1].strip()
    journal_dict[journal_name] = journal_link

# 원하는 형식으로 출력
print("{")
for journal, link in journal_dict.items():
    print(f"    '{journal}': '{link}',")
print("}")


In [28]:
import pandas as pd

# Step 1: Load the dataset
file_path = 'Paper_Dataset.csv'
df = pd.read_csv(file_path, encoding='utf-8-sig', low_memory=False)

# Step 2: Define the dictionary of journal names and their corresponding links
manual_journal_links = {
    'Space Weather an AGU journal': 'https://www.scimagojr.com/journalsearch.php?q=4000151603&tip=sid&clean=0',
    'SPACE WEATHER-THE INTERNATIONAL JOURNAL OF RESEARCH AND APPLICATIONS': 'https://www.scimagojr.com/journalsearch.php?q=4000151603&tip=sid&clean=0',
    'ACTA CRYSTALLOGRAPHICA SECTION F-STRUCTURAL BIOLOGY AND CRYSTALLIZATION COMMUNICATIONS': 'https://www.scimagojr.com/journalsearch.php?q=21100326880&tip=sid&clean=0',
    'ACTA CRYSTALLOGRAPHICA SECTION D-BIOLOGICAL CRYSTALLOGRAPHY': 'https://www.scimagojr.com/journalsearch.php?q=21100778657&tip=sid&clean=0',
    'JOURNAL OF SURFACE INVESTIGATION-X-RAY SYNCHROTRON AND NEUTRON TECHNIQUES': 'https://www.scimagojr.com/journalsearch.php?q=63269&tip=sid&clean=0',
    'MATERIALS SCIENCE AND ENGINEERING A-STRUCTURAL MATERIALS PROPERTIES MICROSTRUCTURE AND PROCESSING': 'https://www.scimagojr.com/journalsearch.php?q=17811&tip=sid&clean=0',
    'VIRUSES-BASEL': 'https://www.scimagojr.com/journalsearch.php?q=19700188364&tip=sid&clean=0',
    'ANNUAL REVIEW OF CHEMICAL AND BIOMOLECULAR ENGINEERING, VOL 3': 'https://www.scimagojr.com/journalsearch.php?q=19700188418&tip=sid&clean=0',
    'CLUSTER COMPUTING-THE JOURNAL OF NETWORKS SOFTWARE TOOLS AND APPLICATIONS': 'https://www.scimagojr.com/journalsearch.php?q=24596&tip=sid&clean=0',
    'JOURNAL OF AGRICULTURAL METEOROLOGY': 'https://www.scimagojr.com/journalsearch.php?q=30706&tip=sid&clean=0',
    'SYMMETRY-BASEL': 'https://www.scimagojr.com/journalsearch.php?q=21100201542&tip=sid&clean=0',
    'KOREAN JOURNAL OF HORTICULTURAL SCIENCE & TECHNOLOGY': 'https://www.scimagojr.com/journalsearch.php?q=19900192027&tip=sid&clean=0',
    'Agronomy-Basel': 'http://scimagojr.com/journalsearch.php?q=15639&tip=sid&clean=0',
    'Plants-Basel': 'https://www.scimagojr.com/journalsearch.php?q=21100788294&tip=sid&clean=0',
    'JOURNAL OF SIGNAL PROCESSING SYSTEMS FOR SIGNAL IMAGE AND VIDEO TECHNOLOGY': 'https://www.scimagojr.com/journalsearch.php?q=11400153333&tip=sid&clean=0',
    'FUTURE GENERATION COMPUTER SYSTEMS-THE INTERNATIONAL JOURNAL OF GRID COMPUTING AND ESCIENCE': 'https://www.scimagojr.com/journalsearch.php?q=12264&tip=sid&clean=0',
    'INTERNATIONAL JOURNAL OF REFRIGERATION-REVUE INTERNATIONALE DU FROID': 'http://scimagojr.com/journalsearch.php?q=16113&tip=sid&clean=0',
    'JOURNAL OF THE KOREAN SURGICAL SOCIETY': 'https://www.scimagojr.com/journalsearch.php?q=21100326082&tip=sid&clean=0',
    'JOURNAL OF ENGINEERING FOR GAS TURBINES AND POWER-TRANSACTIONS OF THE ASME': 'https://www.scimagojr.com/journalsearch.php?q=20962&tip=sid&clean=0',
    'MICROSYSTEM TECHNOLOGIES-MICRO-AND NANOSYSTEMS-INFORMATION STORAGE AND PROCESSING SYSTEMS': 'https://www.scimagojr.com/journalsearch.php?q=26738&tip=sid&clean=0',
    'ARCHIVES OF OTOLARYNGOLOGY-HEAD & NECK SURGERY': 'https://www.scimagojr.com/journalsearch.php?q=21100200823&tip=sid&clean=0',
    'JOURNAL OF CHEMICAL AND ENGINEERING DATA': 'https://www.scimagojr.com/journalsearch.php?q=24158&tip=sid&clean=0',
    'JOURNAL OF ENGINEERING MATERIALS AND TECHNOLOGY-TRANSACTIONS OF THE ASME': 'https://www.scimagojr.com/journalsearch.php?q=21155&tip=sid&clean=0',
    'HYDROLOGICAL SCIENCES JOURNAL-JOURNAL DES SCIENCES HYDROLOGIQUES': 'https://www.scimagojr.com/journalsearch.php?q=29470&tip=sid&clean=0',
    'CANADIAN JOURNAL OF OPHTHALMOLOGY-JOURNAL CANADIEN D OPHTALMOLOGIE': 'https://www.scimagojr.com/journalsearch.php?q=13766&tip=sid&clean=0',
    'MATERIALS SCIENCE AND ENGINEERING B-ADVANCED FUNCTIONAL SOLID-STATE MATERIALS': 'https://www.scimagojr.com/journalsearch.php?q=17812&tip=sid&clean=0',
    'JOURNAL OF MANUFACTURING SCIENCE AND ENGINEERING-TRANSACTIONS OF THE ASME': 'https://www.scimagojr.com/journalsearch.php?q=20966&tip=sid&clean=0',
    'ANNALS OF OCCUPATIONAL HYGIENE': 'https://www.scimagojr.com/journalsearch.php?q=21100821119&tip=sid&clean=0',
    'The Korean Journal of Physiology & Pharmacology': 'https://www.scimagojr.com/journalsearch.php?q=23173&tip=sid&clean=0',
    'JOURNAL OF MOLECULAR CATALYSIS A-CHEMICAL': 'http://scimagojr.com/journalsearch.php?q=17619&tip=sid&clean=0',
    'EARTHQUAKES AND STRUCTURES': 'https://www.scimagojr.com/journalsearch.php?q=19700188258&tip=sid',
    'TEHNICKI VJESNIK-TECHNICAL GAZETTE': 'https://www.scimagojr.com/journalsearch.php?q=14569&tip=sid&clean=0',
    'ADSORPTION-JOURNAL OF THE INTERNATIONAL ADSORPTION SOCIETY': 'https://www.scimagojr.com/journalsearch.php?q=16302&tip=sid&clean=0',
    'JOURNAL OF INFLAMMATION-LONDON': 'https://www.scimagojr.com/journalsearch.php?q=40695&tip=sid&clean=0',
    'Transactions of Nonferrous Metals Society of China (English Edition)': 'https://www.scimagojr.com/journalsearch.php?q=27854&tip=sid&clean=0',
    'JOURNAL OF COATINGS TECHNOLOGY AND RESEARCH': 'http://scimagojr.com/journalsearch.php?q=12725&tip=sid&clean=0',
    'Nanoscale reseach Letters': 'https://www.scimagojr.com/journalsearch.php?q=21101151625&tip=sid&clean=0',
    'ACSPhotonics': 'https://www.scimagojr.com/journalsearch.php?q=21100368207&tip=sid&clean=0',
    'ABSTRACTS OF PAPERS OF THE AMERICAN CHEMICAL SOCIETY': 'https://www.scimagojr.com/journalsearch.php?q=22680&tip=sid&clean=0',
    'ACM SIGCOMM COMPUTER COMMUNICATION REVIEW': 'https://www.scimagojr.com/journalsearch.php?q=13683&tip=sid&clean=0',
    'ACS Applied Materials and Interfaces': 'https://www.scimagojr.com/journalsearch.php?q=19700171101&tip=sid&clean=0',
    'ACTA VETERINARIA-BEOGRAD': 'https://www.scimagojr.com/journalsearch.php?q=17060&tip=sid&clean=0',
    'Agriculture-Basel': 'https://www.scimagojr.com/journalsearch.php?q=21100781511&tip=sid&clean=0',
    'AGRONOMY-BASEL': 'https://www.scimagojr.com/journalsearch.php?q=21100447811&tip=sid&clean=0',
    'ALGAL RESEARCH-BIOMASS BIOFUELS AND BIOPRODUCTS': 'https://www.scimagojr.com/journalsearch.php?q=21100201089&tip=sid&clean=0',
    'ALTEX-ALTERNATIVES TO ANIMAL EXPERIMENTATION': 'https://www.scimagojr.com/journalsearch.php?q=26767&tip=sid&clean=0',
    'Alzheimers & Dementia': 'https://www.scimagojr.com/journalsearch.php?q=3600148102&tip=sid&clean=0',
    'Alzheimers Research & Therapy': 'https://www.scimagojr.com/journalsearch.php?q=19700174935&tip=sid&clean=0',
    'ANNALS OF ANATOMY-ANATOMISCHER ANZEIGER': 'https://www.scimagojr.com/journalsearch.php?q=27472&tip=sid&clean=0',
    'Antibiotics-Basel': 'https://www.scimagojr.com/journalsearch.php?q=21100469670&tip=sid&clean=0',
    'ANTONIE VAN LEEUWENHOEK INTERNATIONAL JOURNAL OF GENERAL AND MOLECULAR MICROBIOLOGY': 'https://www.scimagojr.com/journalsearch.php?q=14944&tip=sid&clean=0',
    'APPLIED SCIENCES BASEL': 'https://www.scimagojr.com/journalsearch.php?q=21100829268&tip=sid&clean=0',
    'ATW-INTERNATIONAL JOURNAL FOR NUCLEAR POWER': 'http://scimagojr.com/journalsearch.php?q=29351&tip=sid&clean=0',
    'Biosensors-Basel': 'https://www.scimagojr.com/journalsearch.php?q=21100293900&tip=sid&clean=0',
    'CENTRAL EUROPEAN JOURNAL OF GEOSCIENCES': 'https://www.scimagojr.com/journalsearch.php?q=4400151402&tip=sid&clean=0',
    'CMC-Computers Materials & Continua': 'https://www.scimagojr.com/journalsearch.php?q=24364&tip=sid&clean=0',
    'CURRENT MEDICAL IMAGING REVIEWS': 'https://www.scimagojr.com/journalsearch.php?q=4700152432&tip=sid&clean=0',
    'Diabetes Metabolic Syndrome and Obesity-Targets and Therapy': 'https://www.scimagojr.com/journalsearch.php?q=19700174905&tip=sid&clean=0',
    'Diversity-Basel': 'https://www.scimagojr.com/journalsearch.php?q=6000187990&tip=sid&clean=0',
    'EKSPLOATACJA I NIEZAWODNOSC-MAINTENANCE AND RELIABILITY': 'https://www.scimagojr.com/journalsearch.php?q=19700182638&tip=sid&clean=0',
    'ENGINEERING SCIENCE AND TECHNOLOGY-AN INTERNATIONAL JOURNAL-JESTECH': 'https://www.scimagojr.com/journalsearch.php?q=21100806003&tip=sid&clean=0',
    'FERMENTATION-BASEL': 'https://www.scimagojr.com/journalsearch.php?q=21100900055&tip=sid&clean=0',
    'FUTURE GENERATION COMPUTER SYSTEMS-THE INTERNATIONAL JOURNAL OF ESCIENCE': 'https://www.scimagojr.com/journalsearch.php?q=12264&tip=sid&clean=0',
    'GRAEFES ARCHIVE FOR CLINICAL AND EXPERIMENTAL OPHTHALMOLOGY': 'https://www.scimagojr.com/journalsearch.php?q=13904&tip=sid&clean=0',
    'HEPATOBILIARY SURGERY AND NUTRITION': 'https://www.scimagojr.com/journalsearch.php?q=21101019769&tip=sid&clean=0',
    'IEEE transactions on applied superconductivity : a publication of the IEEE Superconductivity Committee': 'https://www.scimagojr.com/journalsearch.php?q=26026&tip=sid&clean=0',
    'IEEE TRANSACTIONS ON INFORMATION TECHNOLOGY IN BIOMEDICINE': 'https://www.scimagojr.com/journalsearch.php?q=21100256982&tip=sid&clean=0',
    'IIE TRANSACTIONS': 'https://www.scimagojr.com/journalsearch.php?q=21100794597&tip=sid&clean=0',
    'INJURY-INTERNATIONAL JOURNAL OF THE CARE OF THE INJURED': 'https://www.scimagojr.com/journalsearch.php?q=29801&tip=sid&clean=0',
    'INORGANIC CHEMISTRY COMMUNICATIONS': 'https://www.scimagojr.com/journalsearch.php?q=25267&tip=sid&clean=0',
    'J. Alloys Compd.': 'https://www.scimagojr.com/journalsearch.php?q=12325&tip=sid&clean=0',
    'J. of Ceramic Precessing Research': 'https://www.scimagojr.com/journalsearch.php?q=22026&tip=sid&clean=0',
    'JNCI-JOURNAL OF THE NATIONAL CANCER INSTITUTE': 'https://www.scimagojr.com/journalsearch.php?q=12459&tip=sid&clean=0',
    'Jouranl of Ceramic Processing Research': 'https://www.scimagojr.com/journalsearch.php?q=22026&tip=sid&clean=0',
    'JOURNAL OF ALZHEIMERS DISEASE': 'https://www.scimagojr.com/journalsearch.php?q=16246&tip=sid&clean=0',
    'JOURNAL OF BONE AND JOINT SURGERY-AMERICAN VOLUME': 'https://www.scimagojr.com/journalsearch.php?q=12198&tip=sid&clean=0',
    'JOURNAL OF CLOUD COMPUTING-ADVANCES SYSTEMS AND APPLICATIONS': 'https://www.scimagojr.com/journalsearch.php?q=21100383744&tip=sid&clean=0',
    'JOURNAL OF CRANIO-MAXILLOFACIAL SURGERY': 'https://www.scimagojr.com/journalsearch.php?q=21658&tip=sid&clean=0',
    'JOURNAL OF ENVIRONMENTAL SCIENCES-CHINA': 'https://www.scimagojr.com/journalsearch.php?q=23393&tip=sid&clean=0',
    'JOURNAL OF FOOD SCIENCE AND TECHNOLOGY-MYSORE': 'https://www.scimagojr.com/journalsearch.php?q=20617&tip=sid&clean=0',
    'JOURNAL OF MACROMOLECULAR SCIENCE PART A-PURE AND APPLIED CHEMISTRY': 'https://www.scimagojr.com/journalsearch.php?q=25891&tip=sid&clean=0',
    'JOURNAL OF MACROMOLECULAR SCIENCE PART B-PHYSICS': 'https://www.scimagojr.com/journalsearch.php?q=28522&tip=sid&clean=0',
    'JOURNAL OF MATERIALS RESEARCH AND TECHNOLOGY-JMR&T': 'https://www.scimagojr.com/journalsearch.php?q=21100383742&tip=sid&clean=0',
    'JOURNAL OF MECHANISMS AND ROBOTICS-TRANSACTIONS OF THE ASME': 'https://www.scimagojr.com/journalsearch.php?q=19700186816&tip=sid&clean=0',
    'JOURNAL OF OFFSHORE MECHANICS AND ARCTIC ENGINEERING-TRANSACTIONS OF THE ASME': 'https://www.scimagojr.com/journalsearch.php?q=20985&tip=sid&clean=0',
    'JOURNAL OF PHYSIOLOGY-LONDON': 'https://www.scimagojr.com/journalsearch.php?q=23478&tip=sid&clean=0',
    'JOURNAL OF PROSTHODONTICS-IMPLANT ESTHETIC AND RECONSTRUCTIVE DENTISTRY': 'https://www.scimagojr.com/journalsearch.php?q=26177&tip=sid&clean=0',
    'JOURNAL OF THE FRANKLIN INSTITUTE-ENGINEERING AND APPLIED MATHEMATICS': 'https://www.scimagojr.com/journalsearch.php?q=27959&tip=sid&clean=0',
    'JOURNAL OF TRIBOLOGY-TRANSACTIONS OF THE ASME': 'https://www.scimagojr.com/journalsearch.php?q=17014&tip=sid&clean=0',
    'Jove-Journal of Visualized Experiments': 'https://www.scimagojr.com/journalsearch.php?q=19900191993&tip=sid&clean=0',
    'LWT-FOOD SCIENCE AND TECHNOLOGY': 'https://www.scimagojr.com/journalsearch.php?q=22475&tip=sid&clean=0',
    'MARINE ECOLOGY-AN EVOLUTIONARY PERSPECTIVE': 'https://www.scimagojr.com/journalsearch.php?q=12169&tip=sid&clean=0',
    'MATERIALS SCIENCE-MEDZIAGOTYRA': 'https://www.scimagojr.com/journalsearch.php?q=19300157032&tip=sid',
    'MEMBRANE AND WATER TREATMENT': 'https://www.scimagojr.com/journalsearch.php?q=21100265344&tip=sid&clean=0',
    'MICROCHIMICA ACTA': 'https://www.scimagojr.com/journalsearch.php?q=24072&tip=sid',
    'Natural Hazards and Earth System Sciences Discussions': 'https://www.scimagojr.com/journalsearch.php?q=51166&tip=sid&clean=0',
    'PHYSICA STATUS SOLIDI B-BASIC SOLID STATE PHYSICS': 'https://www.scimagojr.com/journalsearch.php?q=29140&tip=sid&clean=0',
    'PLANT GENETIC RESOURCES-CHARACTERIZATION AND UTILIZATION': 'https://www.scimagojr.com/journalsearch.php?q=98120&tip=sid&clean=0',
    'POLIMEROS-CIENCIA E TECNOLOGIA': 'https://www.scimagojr.com/journalsearch.php?q=5000156907&tip=sid&clean=0',
    'PRECISION ENGINEERING-JOURNAL OF THE INTERNATIONAL SOCIETIES FOR PRECISION ENGINEERING AND NANOTECHNOLOGY': 'https://www.scimagojr.com/journalsearch.php?q=13790&tip=sid&clean=0',
    'PUBLICATIONS OF THE ASTRONOMICAL SOCIETY OF JAPAN': 'https://www.scimagojr.com/journalsearch.php?q=27778&tip=sid&clean=0',
    'RETINA-THE JOURNAL OF RETINAL AND VITREOUS DISEASES': 'https://www.scimagojr.com/journalsearch.php?q=15094&tip=sid',
    'REVISTA BRASILEIRA DE FARMACOGNOSIA-BRAZILIAN JOURNAL OF PHARMACOGNOSY': 'https://www.scimagojr.com/journalsearch.php?q=5400152628&tip=sid&exact=no',
    'Sensors (Switzerland)': 'https://www.scimagojr.com/journalsearch.php?q=130124&tip=sid',
    'SIMULATION-TRANSACTIONS OF THE SOCIETY FOR MODELING AND SIMULATION INTERNATIONAL': 'https://www.scimagojr.com/journalsearch.php?q=14452&tip=sid&clean=0',
    'Spanish journal of agricultural research = Revista de investigaci oacute;n agraria : SJAR': 'https://www.scimagojr.com/journalsearch.php?q=5800179591&tip=sid',
    'STRUCTURAL HEALTH MONITORING-AN INTERNATIONAL JOURNAL': 'https://www.scimagojr.com/journalsearch.php?q=5800179591&tip=sid',
    'Sustainability (Switzerland)': 'https://www.scimagojr.com/journalsearch.php?q=21100240100&tip=sid',
    'SYNTHESIS-STUTTGART': 'https://www.scimagojr.com/journalsearch.php?q=26508&tip=sid&clean=0',
    'The American Journal of Chinese Medicine': 'https://www.scimagojr.com/journalsearch.php?q=18134&tip=sid&clean=0',
    'Turkish Journal of Biochemistry-Turk Biyokimya Dergisi': 'https://www.scimagojr.com/journalsearch.php?q=17600155132&tip=sid&clean=0',
    'WORLD JOURNAL OF MENS HEALTH': 'https://www.scimagojr.com/journalsearch.php?q=21100943924&tip=sid&clean=0',
    'WORLD WIDE WEB-INTERNET AND WEB INFORMATION SYSTEMS': 'https://www.scimagojr.com/journalsearch.php?q=14965&tip=sid&clean=0',
    'ZEITSCHRIFT FUR PHYSIKALISCHE CHEMIE-INTERNATIONAL JOURNAL OF RESEARCH IN PHYSICAL CHEMISTRY & CHEMICAL PHYSICS': 'https://www.scimagojr.com/journalsearch.php?q=23775&tip=sid',
    'INFORMATION-AN INTERNATIONAL INTERDISCIPLINARY JOURNAL': 'https://www.scimagojr.com/journalsearch.php?q=21100201065&tip=sid&clean=0'
}


# Step 1: Normalize the Journal names in the DataFrame
def normalize_journal_name(journal_name):
    # Remove spaces and convert to lowercase
    return journal_name.replace("-", "").replace(" ", "").lower()

# Apply normalization to the 'Journal' column
df['Normalized_Journal'] = df['Journal'].apply(normalize_journal_name)

# Step 2: Create a normalized version of the manual_journal_links dictionary
normalized_manual_links = {normalize_journal_name(k): v for k, v in manual_journal_links.items()}

# Step 3: Fill missing 'Journal_Link' values using the normalized dictionary
df['Journal_Link'] = df.apply(
    lambda row: normalized_manual_links.get(normalize_journal_name(row['Journal']), row['Journal_Link']),
    axis=1
)

# Drop the normalized journal column if no longer needed
df.drop(columns=['Normalized_Journal'], inplace=True)

# Step 4: Save the updated DataFrame to a new CSV file
df.to_csv('Updated_Paper_Dataset.csv', index=False, encoding='utf-8-sig')

print("Missing 'Journal_Link' fields have been filled based on your manual entries, and the updated dataset has been saved.")


Missing 'Journal_Link' fields have been filled based on your manual entries, and the updated dataset has been saved.


#### Organization Name Translation

In [None]:
# # Step 0: Import required libraries
# import pandas as pd
# import deepl

# # Step 1: Load the dataset
# file_path = 'Project_Dataset.csv'
# df = pd.read_csv(file_path, encoding='utf-8-sig', low_memory=False)

# # Step 2: Extract all organizations from the entire dataset and remove duplicates
# def extract_unique_organizations(df_column):
#     all_orgs = []
#     for org_string in df_column:
#         org_list = org_string.split(',')
#         all_orgs.extend([org.strip() for org in org_list])  # Add each organization after stripping spaces
#     unique_orgs = list(set(all_orgs))  # Get unique organizations
#     return unique_orgs

# # Get the unique organization list from the '협업_매트릭스' column
# unique_org_list = extract_unique_organizations(df['협업_매트릭스'])

# # Step 3: Translate the unique organization list using DeepL API
# auth_key = '87c57d81-4f86-4fed-b8cc-39dbc3d0de99:fx'  # Replace with your actual DeepL API key
# translator = deepl.Translator(auth_key)

# # Step 3: Translate the unique organization list using DeepL API with progress tracking
# def translate_organizations(org_list, translator):
#     translated_orgs = []
#     total_orgs = len(org_list)  # Get the total number of organizations
#     ten_percent = total_orgs // 10  # Calculate the number of organizations that represent 10%

#     for i, org in enumerate(org_list, 1):  # Enumerate to track the index (starting from 1)
#         result = translator.translate_text(org, target_lang="EN-US")  # Translate to English
#         translated_orgs.append(result.text)  # Append translated text to the list
        
#         # Print progress every 10%
#         if i % ten_percent == 0 or i == total_orgs:  # Check for 10% milestones or the last item
#             progress_percentage = (i / total_orgs) * 100
#             print(f"Progress: {progress_percentage:.0f}% - Processed {i}/{total_orgs} organizations.")

#     return translated_orgs

# # Use the translator to translate the organization list
# translated_org_list = translate_organizations(unique_org_list, translator)

In [3]:
# # Create a dictionary to map Korean to English
# org_translation_dict = dict(zip(unique_org_list, translated_org_list))

# # Step 5: Replace Korean names with English names in '협업_매트릭스'
# def replace_org_names(org_column, translation_dict):
#     updated_orgs = []
#     for org_string in org_column:
#         for korean_org, english_org in translation_dict.items():
#             org_string = org_string.replace(korean_org, english_org)  # Replace Korean org names with English
#         updated_orgs.append(org_string)
#     return updated_orgs

# # Apply the replacement to the '협업_매트릭스' column
# df['Org_List'] = replace_org_names(df['협업_매트릭스'], org_translation_dict)

# # Step 6: Save the updated DataFrame to a new CSV file
# output_file_path = 'Project_Dataset_Translated.csv'
# df.to_csv(output_file_path, index=False, encoding='utf-8-sig')

In [12]:
# import deepl

# auth_key = "87c57d81-4f86-4fed-b8cc-39dbc3d0de99:fx"  # Replace with your key
# translator = deepl.Translator(auth_key)

# result = translator.translate_text("Hello, world!", target_lang="FR")
# print(result.text)  # "Bonjour, le monde !"

Bonjour à tous !


In [8]:
# Step 0: Import required libraries
import pandas as pd
import deepl

# Step 1: Load the dataset
file_path = 'Paper_Dataset_rv.csv'
df = pd.read_csv(file_path, encoding='utf-8-sig', low_memory=False)

# Step 2: Filter the data for '육T관련기술' == 2
df_filtered = df[df['육T관련기술'] == 2]

# Step 3: Extract unique organizations from the 'Affiliation' column
def extract_unique_organizations(df_column):
    all_orgs = set()  # Using a set to avoid duplicates
    for org_string in df_column.dropna():  # Handle any NaN values
        org_list = org_string.split(';')  # Split by ";"
        all_orgs.update([org.strip() for org in org_list])  # Strip spaces and add to set
    return list(all_orgs)  # Convert back to list for further processing

# Create a unique organization list from the 'Affiliation' column
unique_org_list = extract_unique_organizations(df_filtered['Affiliation'])

# Step 4: Translate the unique organization list to Korean using DeepL API
auth_key = '87c57d81-4f86-4fed-b8cc-39dbc3d0de99:fx'  # Replace with your actual DeepL API key
translator = deepl.Translator(auth_key)

def translate_organizations(org_list, translator):
    translated_orgs = []
    for idx, org in enumerate(org_list, start=1):  # start=1 to begin the count at 1
        if org:  # Only translate if the organization name is not empty
            result = translator.translate_text(org, target_lang="KO")  # Translate to Korean
            translated_orgs.append(result.text)
        else:
            translated_orgs.append('')  # Append an empty string if the organization name is empty
        
        # Print progress every 50 translations
        if idx % 50 == 0:
            print(f"Translation progress: {idx} / {len(org_list)}")

    return translated_orgs

# Translate the unique organization list
translated_org_list = translate_organizations(unique_org_list, translator)

# Step 5: Create a dictionary mapping English to Korean organizations
org_translation_dict = dict(zip(unique_org_list, translated_org_list))

# Step 6: Translate 'Affiliation' column in the filtered DataFrame
def translate_affiliations(affiliation_column, translation_dict):
    translated_affiliations = []
    for org_string in affiliation_column.fillna(''):  # Handle NaN as empty strings
        org_list = org_string.split(';')
        translated_orgs = [translation_dict.get(org.strip(), org.strip()) for org in org_list]
        translated_affiliations.append('; '.join(translated_orgs))
    return translated_affiliations

# Apply the translation to the 'Affiliation' column
df_filtered['Affiliation_KR'] = translate_affiliations(df_filtered['Affiliation'], org_translation_dict)

# Step 7: Save the updated DataFrame with translated affiliations to a new CSV file
output_file_path = 'Paper_Dataset_translated.csv'
df_filtered.to_csv(output_file_path, index=False, encoding='utf-8-sig')

# Print confirmation message
print(f"The translated dataset has been saved as '{output_file_path}'")


Translation progress: 50 / 2768
Translation progress: 100 / 2768
Translation progress: 150 / 2768
Translation progress: 200 / 2768
Translation progress: 250 / 2768
Translation progress: 300 / 2768
Translation progress: 350 / 2768
Translation progress: 400 / 2768
Translation progress: 450 / 2768
Translation progress: 500 / 2768
Translation progress: 550 / 2768
Translation progress: 600 / 2768
Translation progress: 650 / 2768
Translation progress: 700 / 2768
Translation progress: 750 / 2768
Translation progress: 800 / 2768
Translation progress: 850 / 2768
Translation progress: 900 / 2768
Translation progress: 950 / 2768
Translation progress: 1000 / 2768
Translation progress: 1050 / 2768
Translation progress: 1100 / 2768
Translation progress: 1150 / 2768
Translation progress: 1200 / 2768
Translation progress: 1250 / 2768
Translation progress: 1300 / 2768
Translation progress: 1350 / 2768
Translation progress: 1400 / 2768
Translation progress: 1450 / 2768
Translation progress: 1500 / 2768


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['Affiliation_KR'] = translate_affiliations(df_filtered['Affiliation'], org_translation_dict)


The translated dataset has been saved as 'Paper_Dataset_translated.csv'


In [16]:
import pandas as pd

# Convert the list to a DataFrame
df = pd.DataFrame(translated_org_list, columns=['Organization_Deepl'])

# Specify the file name
file_name = 'translated_org_list.csv'

# Write to CSV file
df.to_csv(file_name, index=False, encoding='utf-8-sig')

In [11]:
# Step 0: Import required libraries
import pandas as pd

# Step 1: Load the dataset
file_path = 'Paper_Dataset_rv.csv'
df = pd.read_csv(file_path, encoding='utf-8-sig', low_memory=False)

# Step 2: Filter the data for '육T관련기술' == 2
df_filtered = df[df['육T관련기술'] == 2]

# Step 3: Extract unique organizations from the 'Affiliation' column and save to CSV
def extract_unique_organizations(df_column):
    all_orgs = set()  # Using a set to avoid duplicates
    for org_string in df_column.dropna():  # Handle any NaN values
        org_list = org_string.split(',')  # Split by ";"
        all_orgs.update([org.strip() for org in org_list])  # Strip spaces and add to set
    return list(all_orgs)  # Convert back to list for further processing

# Create and save unique organization list from 'Affiliation'
unique_org_list = extract_unique_organizations(df_filtered['협업_매트릭스'])
unique_org_df = pd.DataFrame(unique_org_list, columns=['Organization_KOR'])
unique_org_df.to_csv('project_unique_organizations.csv', index=False, encoding='utf-8-sig')

# Print confirmation message
print("Unique organization list saved as 'unique_organizations.csv'")


Unique organization list saved as 'unique_organizations.csv'


In [17]:
import pandas as pd

# Step 1: Load the datasets
translated_org_df = pd.read_csv('translated_org_list.csv', encoding='utf-8-sig')
matching_dict_df = pd.read_csv('matching_dict_Korean.csv', encoding='utf-8-sig')

# Step 2: Identify unmatched Organization_Deepl entries
# Check which entries in 'Organization_Deepl' are not in 'Organization_KOR'
unmatched_deepl_orgs = translated_org_df[~translated_org_df['Organization_Deepl'].isin(matching_dict_df['Organization_KOR'])]

# Step 3: Output the unmatched entries
unmatched_deepl_orgs.to_csv('unmatched_organizations.csv', index=False, encoding='utf-8-sig')

# Print confirmation message
print("Unmatched organizations saved to 'unmatched_organizations.csv'")


Unmatched organizations saved to 'unmatched_organizations.csv'


In [None]:
# Step 0: Import required libraries
import pandas as pd
import deepl

# Step 1: Load the dataset
file_path = 'Paper_Dataset_rv.csv'
df = pd.read_csv(file_path, encoding='utf-8-sig', low_memory=False)

# Step 2: Filter the data for '육T관련기술' == 2
df_filtered = df[df['육T관련기술'] == 2]

# Step 3: Extract unique organizations from the 'Affiliation' column
def extract_unique_organizations(df_column):
    all_orgs = set()  # Using a set to avoid duplicates
    for org_string in df_column.dropna():  # Handle any NaN values
        org_list = org_string.split(';')  # Split by ";"
        all_orgs.update([org.strip() for org in org_list])  # Strip spaces and add to set
    return list(all_orgs)  # Convert back to list for further processing

# Create a unique organization list from the 'Affiliation' column
unique_org_list = extract_unique_organizations(df_filtered['Affiliation'])

# Step 4: Translate the unique organization list to Korean using DeepL API
auth_key = '87c57d81-4f86-4fed-b8cc-39dbc3d0de99:fx'  # Replace with your actual DeepL API key
translator = deepl.Translator(auth_key)

def translate_organizations(org_list, translator):
    translated_orgs = []
    for idx, org in enumerate(org_list, start=1):  # start=1 to begin the count at 1
        if org:  # Only translate if the organization name is not empty
            result = translator.translate_text(org, target_lang="KO")  # Translate to Korean
            translated_orgs.append(result.text)
        else:
            translated_orgs.append('')  # Append an empty string if the organization name is empty
        
        # Print progress every 50 translations
        if idx % 50 == 0:
            print(f"Translation progress: {idx} / {len(org_list)}")

    return translated_orgs

# Translate the unique organization list
translated_org_list = translate_organizations(unique_org_list, translator)

# Step 5: Create a dictionary mapping English to Korean organizations
org_translation_dict = dict(zip(unique_org_list, translated_org_list))

# Step 6: Translate 'Affiliation' column in the filtered DataFrame
def translate_affiliations(affiliation_column, translation_dict):
    translated_affiliations = []
    for org_string in affiliation_column.fillna(''):  # Handle NaN as empty strings
        org_list = org_string.split(';')
        translated_orgs = [translation_dict.get(org.strip(), org.strip()) for org in org_list]
        translated_affiliations.append('; '.join(translated_orgs))
    return translated_affiliations

# Apply the translation to the 'Affiliation' column
df_filtered['Affiliation_KR'] = translate_affiliations(df_filtered['Affiliation'], org_translation_dict)

# Step 7: Save the updated DataFrame with translated affiliations to a new CSV file
output_file_path = 'Paper_Dataset_translated.csv'
df_filtered.to_csv(output_file_path, index=False, encoding='utf-8-sig')

# Print confirmation message
print(f"The translated dataset has been saved as '{output_file_path}'")


Translation progress: 50 / 2768
Translation progress: 100 / 2768
Translation progress: 150 / 2768
Translation progress: 200 / 2768
Translation progress: 250 / 2768
Translation progress: 300 / 2768
Translation progress: 350 / 2768
Translation progress: 400 / 2768
Translation progress: 450 / 2768
Translation progress: 500 / 2768
Translation progress: 550 / 2768
Translation progress: 600 / 2768
Translation progress: 650 / 2768
Translation progress: 700 / 2768
Translation progress: 750 / 2768
Translation progress: 800 / 2768
Translation progress: 850 / 2768
Translation progress: 900 / 2768
Translation progress: 950 / 2768
Translation progress: 1000 / 2768
Translation progress: 1050 / 2768
Translation progress: 1100 / 2768
Translation progress: 1150 / 2768
Translation progress: 1200 / 2768
Translation progress: 1250 / 2768
Translation progress: 1300 / 2768
Translation progress: 1350 / 2768
Translation progress: 1400 / 2768
Translation progress: 1450 / 2768
Translation progress: 1500 / 2768


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['Affiliation_KR'] = translate_affiliations(df_filtered['Affiliation'], org_translation_dict)


The translated dataset has been saved as 'Paper_Dataset_translated.csv'


In [33]:
# Step 6: Translate 'Affiliation' column in the filtered DataFrame
def translate_affiliations(affiliation_column, translation_dict):
    translated_affiliations = []
    for org_string in affiliation_column.fillna(''):  # Handle NaN as empty strings
        org_list = org_string.split(';')
        translated_orgs = [translation_dict.get(org.strip(), org.strip()) for org in org_list]
        translated_affiliations.append('; '.join(translated_orgs))
    return translated_affiliations

# Apply the translation to the 'Affiliation' column
df_filtered['Affiliation_KR'] = translate_affiliations(df_filtered['Affiliation'], org_translation_dict)

# Step 7: Save the updated DataFrame with translated affiliations to a new CSV file
output_file_path = 'Paper_Dataset_translated.csv'
df_filtered.to_csv(output_file_path, index=False, encoding='utf-8-sig')

# Print confirmation message
print(f"The translated dataset has been saved as '{output_file_path}'")

The translated dataset has been saved as 'Paper_Dataset_translated.csv'


In [19]:
import pandas as pd

# 파일 경로에 'Project_Dataset.csv'를 사용하세요.
dataset_path = 'Project_Dataset.csv'  # 파일이 있는 경로로 변경하세요
project_data = pd.read_csv(dataset_path)

# 조건: '육T관련기술'이 2이고 '논문_개수'가 1 이상인 데이터 필터링
filtered_data = project_data[(project_data['육T관련기술'] == 2) & (project_data['논문_개수'] >= 1)]

# 결과를 새로운 CSV 파일로 저장
output_path = 'Filtered_Project_Dataset.csv'  # 저장할 경로를 원하는 대로 설정하세요
filtered_data.to_csv(output_path, index=False, encoding = 'utf-8-sig')

print(f"필터링된 데이터가 '{output_path}'에 저장되었습니다.")


필터링된 데이터가 'Filtered_Project_Dataset.csv'에 저장되었습니다.


#### Matching-Dict (Deepl - KOR)

In [32]:
import pandas as pd

# Load the main dataset
df_filtered = pd.read_csv('Paper_Dataset_rv.csv')

# Load the updated organization matching dictionary with new translations
translation_df = pd.read_csv('matching_dict_kor.csv')
translation_dict = dict(zip(translation_df['Organization_ENG'], translation_df['Organization_KOR']))

# Define the function to translate affiliations using the updated dictionary
def translate_affiliations(affiliation_column, translation_dict):
    translated_affiliations = []
    for org_string in affiliation_column.fillna(''):  # Handle NaN as empty strings
        # Ensure each organization name is treated as a string before processing
        org_list = str(org_string).split(';')
        translated_orgs = [translation_dict.get(org.strip(), org.strip()) for org in org_list]
        # Filter out any non-string items in translated_orgs before joining
        translated_affiliations.append('; '.join([org for org in translated_orgs if isinstance(org, str)]))
    return translated_affiliations

# Apply the updated translation to the 'Affiliation' column
df_filtered['Affiliation'] = translate_affiliations(df_filtered['Affiliation'], translation_dict)

# Save the updated DataFrame with new translations
output_file_path = 'Paper_Dataset_420.csv'
df_filtered.to_csv(output_file_path, index=False, encoding='utf-8-sig')

# Print confirmation message
print(f"The translated dataset has been saved as '{output_file_path}'")

  df_filtered = pd.read_csv('Paper_Dataset_rv.csv')


The translated dataset has been saved as 'Paper_Dataset_translated.csv'


In [1]:
import pandas as pd

# Load the CSV files
org_a = pd.read_csv('ORG-A(Project).csv')
org_b = pd.read_csv('ORG-B(Paper).csv')

# Create the desired column order for the output
column_order = ['과제고유번호', '협업_매트릭스', 'Affiliation_KR', 'Affiliation', 'Title', 'Authors', 'blau_index']

# Merge based on the '과제고유번호' column using a left join to retain all rows from ORG-A
merged_df = pd.merge(org_a, org_b, on='과제고유번호', how='left')

# Sort values by '과제고유번호' to ensure rows with the same identifier appear together
merged_df = merged_df.sort_values(by='과제고유번호')

# Set the '협업_매트릭스', 'blau_index' columns to display only once per '과제고유번호' and leave others as empty
merged_df['협업_매트릭스'] = merged_df['협업_매트릭스'].where(~merged_df.duplicated('과제고유번호'))
merged_df['blau_index'] = merged_df['blau_index'].where(~merged_df.duplicated('과제고유번호'))

# Rearrange the columns in the specified order
merged_df = merged_df[column_order]

# Export the result to a new CSV file
merged_df.to_csv('Merged_Output.csv', index=False, encoding='utf-8-sig')

print("Merged CSV saved as 'Merged_Output.csv'")


Merged CSV saved as 'Merged_Output.csv'


In [3]:
import pandas as pd

# Load both CSV files
file1 = pd.read_csv('matching_dict_kor.csv')
file2 = pd.read_csv('translated_org_list.csv')

# Find missing rows from file2 in file1 based on 'Organization_ENG'
missing_rows = file2[~file2['Organization_ENG'].isin(file1['Organization_ENG'])]

# Append missing rows to file1
updated_file1 = pd.concat([file1, missing_rows], ignore_index=True)

# Save the result to a new CSV file
updated_file1.to_csv('Updated_matching_dict_kor.csv', index=False, encoding='utf-8-sig')

print("Updated CSV saved as 'Updated_matching_dict_kor.csv'")


Updated CSV saved as 'Updated_matching_dict_kor.csv'


#### CSV to Excel

In [21]:
import pandas as pd

# CSV 파일을 읽어옵니다.
project_df = pd.read_csv('Project_Dataset.csv')
paper_df = pd.read_csv('Paper_Dataset_All.csv')

# '육T관련기술'이 2인 행만 필터링합니다.
filtered_paper_df = paper_df[paper_df['육T관련기술'] == 2]

# '과제고유번호'를 기준으로 merge하여 'blau_index' 값을 가져옵니다.
merged_df = filtered_paper_df.merge(project_df[['과제고유번호', 'blau_index']], on='과제고유번호', how='left')

# 필요한 열만 선택합니다.
final_df = merged_df[['ID', '과제고유번호', 'Scope_Top_Count', 'blau_index']]

# 선택한 DataFrame을 DTA 파일로 내보냅니다.
final_df.to_stata('Blau_Scope_Test.dta')

print("변환이 완료되었습니다. DTA 파일로 저장되었습니다.")


변환이 완료되었습니다. DTA 파일로 저장되었습니다.


  paper_df = pd.read_csv('Paper_Dataset_All.csv')
C:\Users\river\AppData\Local\Temp\ipykernel_10816\855627331.py:17: InvalidColumnName: 
Not all pandas column names were valid Stata variable names.
The following replacements have been made:

    과제고유번호   ->   ______

If this is not what you expect, please make sure you have Stata-compliant
column names in your DataFrame (strings only, max 32 characters, only
alphanumerics and underscores, no Stata reserved words)

  final_df.to_stata('Blau_Scope_Test.dta')


In [41]:
import pandas as pd

# Step 1: Load the CSV files
paper_df = pd.read_csv("Paper_Dataset_Bio.csv")
matching_dict_df = pd.read_csv("dict_kor.csv")

# Step 2: Create a dictionary for English-Korean name mapping
eng_to_kor_dict = dict(zip(matching_dict_df['Organization_ENG'], matching_dict_df['Organization_KOR']))

def translate_affiliation(affiliation):
    # 리스트로 분리
    org_names = affiliation.split(';')  
    korean_names = []

    for name in org_names:
        name = name.strip()  # 공백 제거
        
        if isinstance(name, str):  # 문자열인지 확인
            # 영어 이름이 Org_ENG에 있는지 확인
            if name in eng_to_kor_dict:  # Org_ENG에 존재하면
                korean_translation = eng_to_kor_dict[name]
                if korean_translation:  # 번역이 있을 경우
                    korean_names.append(korean_translation)
            else:
                # Org_ENG에 없으면 영어 그대로 반환
                korean_names.append(name)

    # 번역된 조직명만 반환 (빈 값은 제외)
    return '; '.join([str(korean_name) for korean_name in korean_names if isinstance(korean_name, str)])


# Apply the translation function to create the 'Affiliation_KR' column
paper_df['Affiliation_KR'] = paper_df['Affiliation'].apply(translate_affiliation)

# Step 4: Save the result to a new CSV file if needed
paper_df.to_csv("Translated_Paper_Dataset_Bio.csv", index=False, encoding='utf-8-sig')