# In [None]:
import csv
import re

import pandas as pd

class Paper:
    """One bibliographic record; every input column is kept as an attribute.

    The attribute names are the two-character column headers of the
    tab-delimited input file (they look like Web of Science field tags --
    TODO confirm against the export documentation).  The
    ``get_tableN_info`` methods select the values for each output table.
    """

    def __init__(
        self,
        PT, AU, BA, BE, GP, AF, BF, CA, TI, SO, SE, BS, LA, DT, CT, CY, CL,
        SP, HO, DE, ID, AB, C1, C3, RP, EM, RI, OI, FU, FP, FX, CR, NR, TC,
        Z9, U1, U2, PU, PI, PA, SN, EI, BN, J9, JI, PD, PY, VL, IS, PN, SU,
        SI, MA, BP, EP, AR, DI, DL, D2, EA, PG, WC, WE, SC, GA, PM, OA, HC,
        HP, DA, UT,
    ):
        """Store each field under the attribute of the same name."""
        self.PT = PT
        self.AU = AU
        self.BA = BA
        self.BE = BE
        self.GP = GP
        self.AF = AF
        self.BF = BF
        self.CA = CA
        self.TI = TI
        self.SO = SO
        self.SE = SE
        self.BS = BS
        self.LA = LA
        self.DT = DT
        self.CT = CT
        self.CY = CY
        self.CL = CL
        self.SP = SP
        self.HO = HO
        self.DE = DE
        self.ID = ID
        self.AB = AB
        self.C1 = C1
        self.C3 = C3
        self.RP = RP
        self.EM = EM
        self.RI = RI
        self.OI = OI
        self.FU = FU
        self.FP = FP
        self.FX = FX
        self.CR = CR
        self.NR = NR
        self.TC = TC
        self.Z9 = Z9
        self.U1 = U1
        self.U2 = U2
        self.PU = PU
        self.PI = PI
        self.PA = PA
        self.SN = SN
        self.EI = EI
        self.BN = BN
        self.J9 = J9
        self.JI = JI
        self.PD = PD
        self.PY = PY
        self.VL = VL
        self.IS = IS
        self.PN = PN
        self.SU = SU
        self.SI = SI
        self.MA = MA
        self.BP = BP
        self.EP = EP
        self.AR = AR
        self.DI = DI
        self.DL = DL
        self.D2 = D2
        self.EA = EA
        self.PG = PG
        self.WC = WC
        self.WE = WE
        self.SC = SC
        self.GA = GA
        self.PM = PM
        self.OA = OA
        self.HC = HC
        self.HP = HP
        self.DA = DA
        self.UT = UT

    @classmethod
    def generate_papers_from_file(cls, file_path, encoding='utf-8-sig'):
        """Read a tab-delimited file and return one instance per data row.

        Args:
            file_path: Path of the tab-delimited file; its header row must
                name the constructor parameters.
            encoding: Text encoding of the file.  The default ``utf-8-sig``
                reads UTF-8 with or without a byte-order mark, so a BOM
                never ends up glued to the first column name.

        Returns:
            list: The created objects, in file order.

        Raises:
            TypeError: If a row has a column that is not a constructor
                parameter, or the header lacks a required one.
        """
        papers = []
        # newline='' lets the csv module do its own line-ending handling.
        with open(file_path, 'r', encoding=encoding, newline='') as file:
            reader = csv.DictReader(file, delimiter='\t')
            for row_data in reader:
                # DictReader stores surplus fields (e.g. from a trailing
                # delimiter) under the key None, which cannot be passed as
                # a keyword argument.
                row_data.pop(None, None)
                papers.append(cls(**row_data))
        return papers

    def get_table1_info(self):
        """Return ``[UT, PY, SO, SN, DI, IS, VL]`` for table 1."""
        return [self.UT, self.PY, self.SO, self.SN, self.DI, self.IS, self.VL]

    def get_table2_info(self):
        """Return ``[UT, AB]`` for table 2."""
        return [self.UT, self.AB]

    def get_table3_info(self):
        """Return ``[UT, TI]`` for table 3."""
        return [self.UT, self.TI]

    def get_table4_info(self):
        """Return the result of ``split_author_names`` for UT and AF."""
        return split_author_names(self.UT, self.AF)

    def get_table5_info(self):
        """Return the result of ``preprocess_and_merge_data`` for this record.

        The helper takes parallel sequences of UT, C1 and AF values, so this
        record's values are wrapped in one-element lists.
        """
        return preprocess_and_merge_data([self.UT], [self.C1], [self.AF])

    def get_table6_info(self):
        """Return ``[UT, CR]`` for table 6."""
        return [self.UT, self.CR]

# Helper: split a semicolon-separated author string into per-author records
def split_author_names(ut, author_str):
    """Return one dict per author named in ``author_str``.

    Args:
        ut: Record identifier; used as the prefix of the composite key.
        author_str: Author names separated by ``;``, each written as
            ``"Family, Given"``.

    Returns:
        list[dict]: One dict per non-blank author with the keys
        ``UT_author`` (composite key ``"<ut>_<full name>"``), ``UT``,
        ``full_name``, ``family_name``, ``given_name`` and ``author_order``
        (1-based position among the non-blank names).  An empty list is
        returned when either argument is ``None``.
    """
    # Check for missing data first: calling .split() on None would raise.
    if ut is None or author_str is None:
        return []

    # Strip once so every derived field is based on the same clean name,
    # and drop blank entries so they do not consume an order number.
    names = [name.strip() for name in author_str.split(';')]
    names = [name for name in names if name]

    records = []
    for order, name in enumerate(names, start=1):
        parts = name.split(', ')
        records.append({
            'UT_author': ut + '_' + name,   # composite primary key
            'UT': ut,
            'full_name': name,
            'family_name': parts[0],
            'given_name': ' '.join(parts[1:]),
            'author_order': order,
        })
    return records

# Helper: link authors to their affiliations
def preprocess_and_merge_data(UT, c1_string, AF):
    """Link every author of each record to the affiliations given for them.

    Args:
        UT: Record identifiers, one per record (any sequence), or a single
            identifier string.
        c1_string: Address strings parallel to ``UT``.  Each is parsed as
            ``[Author; Author] affiliation [Author] affiliation ...``: a
            bracketed, semicolon-separated author group followed by the
            affiliation text that applies to that group.
        AF: Semicolon-separated author names, parallel to ``UT``.

        A single record may be passed as three plain strings; it is treated
        as one-element sequences.

    Returns:
        pandas.DataFrame: Columns ``UT``, ``Author``, ``AF_Order``,
        ``Author_Affiliation`` and ``Affiliation_Order``.  There is one row
        per (author, affiliation) pair; an author from ``AF`` that is not
        found in any bracketed group keeps missing affiliation values.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(UT, str):
        UT, c1_string, AF = [UT], [c1_string], [AF]

    author_group_pattern = re.compile(r'\[(.*?)\]')
    affiliation_pattern = re.compile(r'\]\s*(.*?)(?=\s*\[|$)')

    def split_and_order(row, column_name):
        """Split a ';'-separated cell into (item, 1-based position) pairs."""
        value = row[column_name]
        if not isinstance(value, str):
            # Missing cell (None / NaN): treat as empty text.
            value = ''
        items = value.replace(';;', '; ').split(';')
        items = [item.strip() for item in items]
        return [(item, index + 1) for index, item in enumerate(items)]

    def extract_author_affiliation_to_dataframe(UT_list, c1_string_list):
        """Parse the address strings into author and affiliation frames."""
        author_affiliation_data = []
        affiliation_order_data = []
        for ut, c1 in zip(UT_list, c1_string_list):
            if not isinstance(c1, str):
                c1 = ''
            # Split each bracketed group on ';' so complete names survive
            # (middle initials, hyphens, multi-word names) and can match
            # the names taken from AF in the final merge.
            author_groups = [
                [name.strip() for name in group.split(';') if name.strip()]
                for group in author_group_pattern.findall(c1)
            ]
            affiliation_list = affiliation_pattern.findall(c1)
            affiliation_order_data.append(
                {'UT': ut, 'Author_Affiliation': '; '.join(affiliation_list)}
            )
            for authors, affiliation in zip(author_groups, affiliation_list):
                for author in authors:
                    author_affiliation_data.append({
                        'UT': ut,
                        'Author': author,
                        'Author_Affiliation': affiliation,
                    })

        # Explicit columns keep the frames usable even when no bracketed
        # author group was found in any record.
        author_df = pd.DataFrame(
            author_affiliation_data,
            columns=['UT', 'Author', 'Author_Affiliation'],
        )
        affiliation_df = pd.DataFrame(
            affiliation_order_data,
            columns=['UT', 'Author_Affiliation'],
        )
        return author_df, affiliation_df

    author_df, affiliation_df = extract_author_affiliation_to_dataframe(UT, c1_string)
    author_df['Author_Affiliation'] = author_df['Author_Affiliation'].str.split('; ')
    author_df_expanded = author_df.explode('Author_Affiliation').reset_index(drop=True)
    # The affiliation regex can leave a trailing ';' on a captured segment.
    author_df_expanded['Author_Affiliation'] = author_df_expanded['Author_Affiliation'].apply(
        lambda x: x.rstrip(';') if isinstance(x, str) else x
    )

    # One row per affiliation, numbered by position within the record.
    affiliation_df['Affiliation_Order'] = affiliation_df.apply(
        lambda row: split_and_order(row, 'Author_Affiliation'), axis=1
    )
    affiliation_df_expanded = affiliation_df.explode('Affiliation_Order')
    affiliation_df_expanded[['Author_Affiliation', 'Affiliation_Order']] = pd.DataFrame(
        affiliation_df_expanded['Affiliation_Order'].tolist(),
        index=affiliation_df_expanded.index,
    )

    # One row per author, numbered by position within the record.
    af_df = pd.DataFrame({'UT': UT, 'AF': AF})
    af_df['AF_Order'] = af_df.apply(lambda row: split_and_order(row, 'AF'), axis=1)
    af_df_expanded = af_df.explode('AF_Order')
    af_df_expanded[['AF', 'AF_Order']] = pd.DataFrame(
        af_df_expanded['AF_Order'].tolist(),
        index=af_df_expanded.index,
    )
    af_df_expanded.rename(columns={'AF': 'Author'}, inplace=True)

    merged_df = pd.merge(
        author_df_expanded, affiliation_df_expanded,
        on=['UT', 'Author_Affiliation'], how='outer',
    )
    final_df = pd.merge(af_df_expanded, merged_df, on=['UT', 'Author'], how='left')
    return final_df
    

# Read the file and build the Paper objects
def generate_papers_from_file(file_path, encoding='utf-8-sig'):
    """Read a tab-delimited file and return one ``Paper`` per data row.

    Args:
        file_path: Path of the tab-delimited file; its header row must name
            the ``Paper`` constructor parameters.
        encoding: Text encoding of the file.  The default ``utf-8-sig``
            reads UTF-8 with or without a byte-order mark, so a BOM never
            ends up glued to the first column name.

    Returns:
        list: The created ``Paper`` objects, in file order.

    Raises:
        TypeError: If a row has a column that ``Paper`` does not accept, or
            the header lacks a required one.
    """
    papers = []
    # newline='' lets the csv module do its own line-ending handling.
    with open(file_path, 'r', encoding=encoding, newline='') as file:
        reader = csv.DictReader(file, delimiter='\t')
        for row_data in reader:
            # DictReader stores surplus fields (e.g. from a trailing
            # delimiter) under the key None, which cannot be passed as a
            # keyword argument.
            row_data.pop(None, None)
            papers.append(Paper(**row_data))
    return papers

# Normalise one get_tableN_info() result into CSV rows
def _rows_for_csv(info):
    """Return the list of CSV rows represented by ``info``.

    A DataFrame yields one row per DataFrame row, a list of dicts yields
    one row per dict (its values, in insertion order), an empty result
    yields no rows, and any other sequence is a single row.
    """
    if isinstance(info, pd.DataFrame):
        return list(info.itertuples(index=False, name=None))
    if len(info) == 0:
        return []
    if all(isinstance(item, dict) for item in info):
        return [list(item.values()) for item in info]
    return [info]


# Write Paper objects to a CSV file
def write_papers_to_csv(papers, file_name, table_number):
    """Write the ``table_number`` view of every paper to ``file_name``.

    Args:
        papers: Objects providing a ``get_table<table_number>_info`` method.
        file_name: Path of the CSV file to create or overwrite.
        table_number: Number selecting both the info method and the header
            entry ``tables['table<table_number>']``.

    Raises:
        AttributeError: If a paper has no matching info method.
    """
    # NOTE(review): ``tables`` is not defined in the part of the file
    # visible here -- confirm it exists at module level before this runs.
    with open(file_name, 'w', encoding='utf-8', newline='') as file:
        writer = csv.writer(file)
        # Header row
        writer.writerow(['UT'] + tables[f'table{table_number}']['columns'][1:])
        # Data rows.  Some info methods return several records (a list of
        # dicts or a DataFrame); passing those straight to writerow() would
        # write dict reprs or column labels instead of the data.
        for paper in papers:
            info_func = getattr(paper, f'get_table{table_number}_info')
            writer.writerows(_rows_for_csv(info_func()))

# Build the list of Paper objects from the exported file
papers = generate_papers_from_file(
    r'C:\Users\W10\Downloads\qje2014_2023_2.txt'
)

# Write one CSV file per table (tables 1 through 6)
for i in range(1, 7):
    write_papers_to_csv(
        papers=papers,
        file_name=f'table{i}.csv',
        table_number=i,
    )