In [1]:
import os
import re
import pandas as pd
from collections import Counter

In [2]:
path = './wos_data'
# os.listdir returns every directory entry, including sub-directories and hidden
# files (e.g. ``.ipynb_checkpoints``). Keep only regular ``.txt`` files so that each
# listed name can be opened and read as text. The listing order is whatever
# os.listdir yields; it is not sorted here.
file_name_list = [
    name
    for name in os.listdir(path)
    if name.lower().endswith('.txt') and os.path.isfile(os.path.join(path, name))
]
file_name_list

['savedrecs (1).txt',
 'savedrecs (5).txt',
 'savedrecs.txt',
 'savedrecs (4).txt',
 'savedrecs (3).txt',
 'savedrecs (2).txt']

In [3]:
texts = []
for file_name in file_name_list:
    # Decode explicitly instead of relying on the platform default encoding, which
    # differs between systems. 'utf-8-sig' reads plain UTF-8 and also drops a leading
    # byte-order mark if one is present.
    # NOTE(review): assumes the export files are UTF-8 encoded -- confirm.
    with open(os.path.join(path, file_name), 'r', encoding='utf-8-sig') as f:
        texts.append(f.read())

# One big string holding the content of every file, separated by a single space.
str_texts = ' '.join(texts)

### 常用字段说明  
UT 入藏号，作为文献的唯一标识，务必保留  
AU 作者简称  
AF 作者全称  
TI 文献标题  
SO 出版物名称  
DT 文献类型  
DE 作者关键词  
C1 作者地址  
RP 通讯作者地址  
FU 基金资助机构  
PY 出版年

In [4]:
# Two-letter field tags to extract from every record.
extract_fields = [
    'UT',  # accession number (unique record id)
    'AU',  # authors, short form
    'TI',  # title
    'SO',  # publication name
    'DT',  # document type
    'DE',  # author keywords
    'C1',  # author addresses
    'RP',  # corresponding-author address
    'FU',  # funding agencies
    'PY',  # publication year
]

def fields_extract(fields, data=None):
    """Extract tagged fields from plain-text records into a DataFrame (one row per record).

    Records are the chunks of ``data`` that end with an ``ER`` line followed by a
    blank line. Inside a record, a field starts with ``<tag><space>`` at the
    beginning of a line and runs until the next line that does not start with a
    space (lines starting with a space are continuation lines of the same field).

    Parameters
    ----------
    fields : list of str
        Field tags to extract; they become the DataFrame columns, in this order.
    data : str, optional
        Text to parse. When omitted, the notebook-level ``str_texts`` is used; it is
        looked up at call time, so re-running the loading cell is picked up without
        having to re-define this function.

    Returns
    -------
    pandas.DataFrame
        One row per record and one column per requested tag, with ``pd.NA`` where a
        record has no such field. Rows are sorted by ``PY`` when that column was
        requested, and de-duplicated on ``UT`` (first occurrence kept, index reset)
        when that column was requested.
    """
    if data is None:
        data = str_texts

    # A record ends with a line consisting of "ER" followed by a blank line. The
    # chunk after the last terminator is not a record and is dropped.
    docs = re.compile(r'^ER\n\n', re.M).split(data)[:-1]

    # How continuation lines (newline + leading spaces) are folded back, per field.
    # For C1 the optional trailing period of each line is removed as well; the
    # period is escaped so that no other character can be swallowed.
    line_joiners = {
        'AU': ('\n[ ]*', ';'),       # authors
        'TI': ('\n[ ]*', ' '),       # title
        'DE': ('\n[ ]*', ' '),       # keywords
        'C1': (r'\.?\n[ ]*', ';'),   # author addresses
    }

    columns = {}
    for field in fields:
        # Match the tag at the start of a line (or of the record) and capture lazily
        # up to the next line that does not start with a space, or up to the end of
        # the record. re.S lets "." span the continuation lines.
        pattern = re.compile(
            rf'(?:\A|\n){re.escape(field)} (.*?)(?=\n[^ ]|\s*\Z)', re.S
        )

        # Always match record by record so that a value can never be attributed to
        # the wrong record.
        values = []
        for doc in docs:
            match = pattern.search(doc)
            values.append(match.group(1) if match else None)

        # Text clean-up for multi-line fields.
        if field in line_joiners:
            joiner_pattern, joiner = line_joiners[field]
            values = [
                value if value is None else re.sub(joiner_pattern, joiner, value)
                for value in values
            ]

        # Missing fields are stored as pd.NA directly rather than through a string
        # placeholder that could collide with real content.
        columns[field] = [pd.NA if value is None else value for value in values]

    table = pd.DataFrame(columns)

    # Sort by publication year; a stable sort keeps the input order within a year,
    # which also makes the de-duplication below deterministic.
    if 'PY' in table.columns:
        table = table.sort_values(by=['PY'], kind='mergesort')

    # Drop records whose UT value was already seen.
    if 'UT' in table.columns:
        table = table.drop_duplicates(subset=['UT'], ignore_index=True)

    return table
    
# Build the table for the selected field tags and show it as the cell output.
df = fields_extract(fields=extract_fields)
df

Unnamed: 0,UT,AU,TI,SO,DT,DE,C1,RP,FU,PY
0,WOS:A1994PE63400009,"BERGEMANN, W;TOLLE, H",DOCUMENTATION AND COMPUTED ANALYSIS OF DATA OF...,ZEITSCHRIFT FUR GERONTOLOGIE,Article,"DISCHARGE LETTER, COMPUTER GENERATED; ACUTE CA...",,"BERGEMANN, W (corresponding author), AKAD LEHR...",,1994
1,WOS:A1995RG55500006,"BELAND, F;LEMAY, A",DILEMMAS AND VALUES FOR LONG-TERM-CARE POLICIES,CANADIAN JOURNAL ON AGING-REVUE CANADIENNE DU ...,Article,LONG-TERM CARE; FUNCTIONAL INCAPACITIES; EXPEN...,,"BELAND, F (corresponding author), UNIV MONTREA...",,1995
2,WOS:A1997BJ29H00008,"Guglielmelli, E;Laschi, C;Teti, G;Fontanelli, ...",A modular and distributed supervisory system f...,8TH INTERNATIONAL CONFERENCE ON ADVANCED ROBOT...,Proceedings Paper,modularity; distribution; supervision,,"Guglielmelli, E (corresponding author), SCUOLA...",,1997
3,WOS:A1997BJ42X00201,"Fiorini, P;Ali, K;Seraji, H",Health care robotics: A progress report,1997 IEEE INTERNATIONAL CONFERENCE ON ROBOTICS...,Proceedings Paper,,,"Fiorini, P (corresponding author), CALTECH,JET...",,1997
4,WOS:000077035900460,"Schraft, RD;Schaeffer, C;May, T",Care-O-bot (TM): The concept of a system for a...,IECON '98 - PROCEEDINGS OF THE 24TH ANNUAL CON...,Proceedings Paper,,"Fraunhofer Inst Prod Tech & Automatisierung, I...","Schraft, RD (corresponding author), Fraunhofer...",,1998
...,...,...,...,...,...,...,...,...,...,...
2628,WOS:000732122300001,"Do, H;Welch, KC;Sheng, WH",SoHAM: A Sound-Based Human Activity Monitoring...,IEEE TRANSACTIONS ON AUTOMATION SCIENCE AND EN...,Article; Early Access,Monitoring; Hidden Markov models; Robots; Serv...,"[Do, Ha Manh] Plume Design Inc, Palo Alto, CA ...","Sheng, WH (corresponding author), Oklahoma Sta...",National Science Foundation (NSF)National Scie...,
2629,WOS:000658253400001,"Ghayvat, H;Gope, P",Smart aging monitoring and early dementia reco...,NEURAL COMPUTING & APPLICATIONS,Article; Early Access,Transfer learning; Pre-trained deep learning m...,"[Ghayvat, Hemant] Tech Univ Denmark, Denmark F...","Ghayvat, H (corresponding author), Tech Univ D...",Linnaeus University,
2630,WOS:000662090700001,"Watfa, MK;Akili, A",Factors influencing elders' intention to use a...,INTERNATIONAL JOURNAL OF CONSTRUCTION MANAGEMENT,Article; Early Access,Elder's community; theory of planned behaviour...,"[Watfa, Mohamed K.; Akili, Aya] Univ Wollongon...","Watfa, MK (corresponding author), Univ Wollong...",,
2631,WOS:000677856300002,"Yang, P;Bi, GS;Qi, J;Wang, XL;Yang, Y;Xu, LD",Multimodal Wearable Intelligence for Dementia ...,INFORMATION SYSTEMS FRONTIERS,Article; Early Access,Wearable intelligence; Healthcare; Dementia; I...,"[Yang, Po] Univ Sheffield, Dept Comp Sci, Shef...","Yang, P (corresponding author), Univ Sheffield...",Yunnan University's Research Innovation Fund f...,


In [None]:
# Save the extracted table as an Excel workbook; index=True also writes the row index.
df.to_excel(excel_writer='wos_extract_fields_result.xlsx', index=True)