In [1]:
from docx import Document
import pandas as pd
import os,sys
import re
import copy
import traceback
import random

### 待提取的表格名称列表

In [2]:
# 云康免疫
immuneDict = [
    {'title': '受检者信息', 'prefix': '受检者信息', 'isHeader': False},
    {'title': '3.4 免疫治疗药物提示', 'prefix': '免疫治疗药物提示', 'isHeader': True}, 
    {'title': '5.1 肿瘤突变负荷（TMB）', 'prefix': '肿瘤突变负荷（TMB）', 'isHeader': False},
    {'title': '5.2 微卫星不稳定性（MSI）', 'prefix': '微卫星不稳定性（MSI）', 'isHeader': True},
    {'title': '5.3 免疫治疗获益相关基因', 'prefix': '免疫治疗获益相关基因', 'isHeader': True},
    {'title': '5.5 免疫治疗耐药/快速进展相关基因', 'prefix': '免疫治疗耐药/快速进展相关基因', 'isHeader': True},
    {'title': '5.5 HLA分型解析', 'prefix': '5.5 HLA分型解析', 'isHeader': True}, 
]
# 云康靶向
targetDict = [
    {'title': '受检者信息', 'prefix': '受检者信息', 'isHeader': False},
    {'title': '3.3 FDA获批/NCCN指南推荐的靶向药物相关基因变异', 'prefix': 'FDA获批/NCCN指南推荐的靶向药物相关基因变异', 'isHeader': True}, 
    {'title': '4.2 其他与肿瘤发生相关变异', 'prefix': '其他与肿瘤发生相关变异', 'isHeader': True}, 
    {'title': '4.3 基因变异注释', 'prefix': '基因变异注释', 'isHeader': True}, 
    {'title': '4.4 潜在获益药物/潜在耐药药物解析', 'prefix': '潜在获益药物/潜在耐药药物解析', 'isHeader': True}, 
    {'title': '4.5 相关临床研究', 'prefix': '相关临床研究', 'isHeader': True}, 
]
# 云康化疗
chemotherapyDict = [
    {'title': '受检者信息', 'prefix': '受检者信息', 'isHeader': False},
    {'title': '3.5 化疗用药提示', 'prefix': '化疗用药提示', 'isHeader': True}, 
    {'title': '六、化疗药物相关检测结果解析', 'prefix': '化疗药物相关检测结果解析', 'isHeader': True}, 
]
# 云康遗传性肿瘤
inheritanceTumourDict = [
    {'title': '受检者信息', 'prefix': '受检者信息', 'isHeader': False},
    {'title': '7.1 遗传性肿瘤基因变异解析', 'prefix': '7遗传性肿瘤基因变异解析', 'isHeader': True}, 
]
# 云康检出变异
detectionAbnormalDict = [
    {'title': '受检者信息', 'prefix': '受检者信息', 'isHeader': False},
    {'title': '8.1体细胞变异结果汇总', 'prefix': '体细胞变异结果汇总', 'isHeader': True},
    {'title': '8.2 胚系变异结果汇总', 'prefix': '胚系变异结果汇总', 'isHeader': True}, 
]

In [3]:
class Process(object):
    """处理类"""
    
    def __init__(self,path):
        self.document = Document(path)  # 
        self.paragraphs = self.document.paragraphs
        self.allTables = self.document.tables
        
    def get_table(self,specText,isHeader):
        """
        获取表格
        specText: 对应表在docx文件中的标题
        isHeader:True需要将第一行作为表头，False：不需要
        """
        specText = specText.encode('utf-8').decode('utf-8')
        for aPara in self.paragraphs:
            if aPara.text == specText:
                ele = aPara._p.getnext()
                while (ele.tag != '' and ele.tag[-3:] != 'tbl'):
                    ele = ele.getnext()
                if ele.tag != '':
                    for aTable in self.allTables:
                        if aTable._tbl == ele:
                            table_columns = []  # 获取表头(默认第一行为表头)
                            table_content = []  # 获取表内容
                            index = 0
                            for i in range(len(aTable.rows)):  
                                table_row = []  # 获取每一行数据
                                for j in range(len(aTable.columns)):
                                    text = aTable.cell(i, j).text
                                    if i == 0:
                                        if isHeader:
                                            table_columns.append(text)
                                        else:
                                            table_row.append(text)
                                    else:
                                        table_row.append(text)
                                    index += 1
                                if len(table_row) == 0:
                                    continue
                                table_content.append(table_row)
#                             print("table_columns",table_columns)
                            if not table_content:
                                continue
                            if table_columns:
        
                                df = pd.DataFrame(data=table_content,columns=table_columns)
                            else:
                                df = pd.DataFrame(data=table_content,columns=None)
                            return df
        return pd.DataFrame()
    
    def re_stack_base_info_df(self,df):
        """拆分基本信息数据并重新组合为指定格式"""
        columns = ["姓名","性别","年龄","送检医院","送检科室","送检医生","门诊号","样本类型","原病理号","条码号","病理(临床)诊断","检测号",
                  "治疗史","肿瘤家族史","样本采集日期","接收日期","报告日期"]
        # 在df中获取为columns中任意一个符合的数据，并取其同一行下右边列的数据
        re_dict = {}
        columns_ = df.columns.to_list()
        for t_col in columns:
            try:
                cls = df[df.applymap(lambda x:True if x==t_col else False)].dropna(axis=0,how="all").dropna(axis=1,how="all").columns.to_list()
                index = df[df.applymap(lambda x:True if x==t_col else False)].dropna(axis=0,how="all").index.to_list()
                if index and cls:
                    column_index = columns_.index(cls[0])
                    rss = df.iloc[index[0],column_index + 1]
                    next_column_value = [rss]
            except Exception as e:
                next_column_value = [""]
            re_dict[t_col] = next_column_value
        re_df = pd.DataFrame.from_dict(data=re_dict,orient='columns')
        return re_df

    def base_table(self,specText,isHeader):
        """处理基础信息表格"""
        df = self.get_table(specText,isHeader)
        df = df.loc[:,~df.columns.duplicated("last")]   # 字段去重
        df = self.re_stack_base_info_df(df)
        return df
    
    def process_same_columns(self,df):
        """处理字段名称不一致问题"""
        a_columns = df.columns.to_list()
        for col in a_columns:
            rrs = re.findall(r"[基][因]+",col)
            if rrs:
                index = a_columns.index(col)
                a_columns[index] = rrs[0]
            rrs = re.findall(r"[突][变][丰][度]+",col)
            if rrs:
                index = a_columns.index(col)
                a_columns[index] = "突变丰度/拷贝数"
        df.columns = a_columns
        return df
    
    def re_stack_benifi_data(self,df,type_name):
        """
            拆分可能获益的药物数据并重新组合为指定格式
            type_name:为当前的df添加一个类型
        """
        condition_first = r"[ABCD]级+"
        condition_second = r"[ABCD]+"
        first_result_list = []
        master_result_list = []
        set_str = "药物" 
        class_str = "级别"
        type_item = "类型"
        for col in df.columns.to_list():
            index = 0
            first_result = re.findall(condition_first,col)
            if first_result:
                first_result_list.append(col)
            else:
                master_result_list.append(col)

        result_df_list = []
        for first in first_result_list:
            tem_list = master_result_list + [first]
            ndf = copy.deepcopy(df[tem_list])
    #         print("tem_list",tem_list)
            try:
                ndf[set_str] = ndf[first].map(lambda x:str(x).split("\n") if (str(x).split("\n") and "/" not in str(x).split("\n")) else [])
            except Exception as e:
                print("master_result_list",master_result_list)
                print("first",first)
                print("ndf2",ndf)
                print("df",df)
                print("tem_list",tem_list)
                raise ValueError("出错了")
            ndf = ndf.explode(set_str)
            ndf = ndf.dropna()
            first_result = re.findall(condition_first,first)[0]
            second_result = re.findall(condition_second,first_result)[0]
            ndf[class_str] = second_result
            ndf[type_item] = type_name
            result_df_list.append(ndf)

        return pd.concat(result_df_list,join="inner")
    
    def sensitivity(self,title,isHeader):
     # 可能获益的药物类型
        df = self.get_table(title,isHeader)
        table_df = self.process_same_columns(df)
        print(table_df)
        df = self.re_stack_benifi_data(table_df,"敏感")
        return df
            
    def drugfast(self,title,isHeader):
    # 可能耐药的药物类型
        df = self.get_table(title,isHeader)
        table_df = self.process_same_columns(df)
        df = self.re_stack_benifi_data(table_df,"耐药")
        return df


### 获取路径下所有word文档的目录

In [4]:
def get_all_docx(file_path):
        """获取路径下所有word文档的目录"""
        files_list = []
        file_suffix_list = [".docx"]  # 定义要提取的文件后缀
        for path,file_dir,files in os.walk(file_path):
            files_list = files
        re_list = []
        for file in files_list:
            file_name,file_suffix = os.path.splitext(file)
            if file_suffix in file_suffix_list:
                re_list.append(file)
        return re_list

### 获取路径下所有word文档的目录

In [5]:
def get_all_xlsx(file_path):
    """获取路径下所有word文档的目录"""
    files_list = []
    file_suffix_list = [".xlsx"]  # 定义要提取的文件后缀
    for path,file_dir,files in os.walk(file_path):
        files_list = files
    re_list = []
    for file in files_list:
        file_name,file_suffix = os.path.splitext(file)
        if file_suffix in file_suffix_list:
            re_list.append(file)
    return re_list

In [6]:
p = Process(r"E:\other\Word文档解读\data\442092614200-严爱凤.docx")
p.get_table("5.3 微卫星不稳定性（MSI）",True)


Unnamed: 0,检测微卫星数目,微卫星不稳定性分值,参考阈值,结果判定
0,105,0.010,0.4,MSS
1,潜在临床意义\n微卫星是指分布在人类基因组里的简单重复序列，又被称作短串连重复 （Short...,潜在临床意义\n微卫星是指分布在人类基因组里的简单重复序列，又被称作短串连重复 （Short...,潜在临床意义\n微卫星是指分布在人类基因组里的简单重复序列，又被称作短串连重复 （Short...,潜在临床意义\n微卫星是指分布在人类基因组里的简单重复序列，又被称作短串连重复 （Short...


### 云康免疫

In [7]:
class ImmuneProcess(Process):
    """云康免疫"""
    def __init__(self,path):
        super().__init__(path)
    
    def immuneCure(self,specText,isHeader):
        """免疫治疗药物提示处理"""
        df = self.get_table(specText,isHeader)
        return df
    
    def MSI(self,specText,isHeader):
        """微卫星不稳定性（MSI）"""
        df = self.get_table(specText,isHeader)
        if not df.empty:
            ndf = df.iloc[0:1]
            if len(df) >=2 and "检测微卫星数目" in df.columns.to_list():
                ndf.loc[:,"潜在临床意义"] =  df.at[1,"检测微卫星数目"]
                return ndf
        return df
    
    def immuneProfit(self,specText,isHeader):
        """免疫治疗获益相关基因"""
        df = self.get_table(specText,isHeader)
#         print(df)
        return df

    def drugfast(self,specText,isHeader):
        "免疫治疗耐药/快速进展相关基因"
        df = self.get_table(specText,isHeader)
        return df
    
    def TMB(self,specText,isHeader):
        """肿瘤突变负荷（TMB）"""
        df = self.get_table(specText,isHeader)
        if len(df) < 6:
            return df
        ndf = df.iloc[0:3]
        ndf.reset_index([0],inplace=True)
        ndf = ndf.T
        _ndf = ndf.drop(index=["index"],axis=1)
        # 重新定义列
        ncl = _ndf.iloc[0]
        n_ndf = _ndf.iloc[1:]
        n_ndf.columns = ncl
        # 将其它行的数据添加到该df中
        n_ndf.loc[:,"潜在临床意义"] = df.iloc[5,0]
        n_ndf = n_ndf.reset_index(drop=True)
        return n_ndf
        
    def HLA(self,specText,isHeader):
        """HLA分型解析"""
        ndf = pd.DataFrame()
        # 定义要删除的列名
        del_column = ["HLA Class I"]
        df = self.get_table(specText,isHeader)
        if len(df) < 3:
            return df
        # 获取要处理行数
        df1 = df.iloc[0:3,]
        df1 = df1.set_index(["HLA Class I"])
        re_columns = {}
        for re_clo in del_column:
            if re_clo in df1.columns.to_list():
                re_columns[re_clo] = None
        # 将无关的列名设置为None
        df1.rename(columns=re_columns,inplace=True)
        index_list = df1.index.to_list()
        column_list = df1.columns.to_list()
        # 将数据转换为一维表
        for row in index_list:
            for clo in column_list:
#                 print(row,clo,"==",sep=",")
                ndf.loc[0,str(clo) + "(" + str(row) + ")"] = df1.at[row,clo]
        return ndf
    
    def run(self,specText,isHeader,func):
        """开始处理"""
        funtion = getattr(self,func)
        df = funtion(specText,isHeader)
        return df
        

In [8]:
ip = ImmuneProcess(r"E:\other\Word文档解读\data\442092614200-严爱凤.docx")
immuneDict = [
#     {'title': '受检者信息', 'prefix': '受检者信息', 'isHeader': False,"function":"base_table"},
#     {'title': '3.4 免疫治疗药物提示', 'prefix': '免疫治疗药物提示', 'isHeader': True,"function":"immuneCure"},
    {'title': '5.2 肿瘤突变负荷（TMB）', 'prefix': '肿瘤突变负荷（TMB）', 'isHeader': False,"function":"TMB"},
#     {'title': '5.3 微卫星不稳定性（MSI）', 'prefix': '微卫星不稳定性（MSI）', 'isHeader': True,"function":"MSI"},
#     {'title': '5.4 免疫治疗获益相关基因', 'prefix': '免疫治疗获益相关基因', 'isHeader': True,"function":"immuneProfit"},
#     {'title': '5.5 免疫治疗耐药/快速进展相关基因', 'prefix': '免疫治疗耐药/快速进展相关基因', 'isHeader': True,"function":"drugfast"},
#     {'title': '5.6 HLA分型解析', 'prefix': '5.5 HLA分型解析', 'isHeader': True,"function":"HLA"}, 
]
for item in immuneDict:
    df = ip.run(item["title"],item["isHeader"],item["function"])
#     print("title",title)
    display(df)
#     ip.run("5.3 微卫星不稳定性（MSI）",True,"MSI")
    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


Unnamed: 0,"突变负荷（TMB, Non-synonymous Mutations per Mb）",突变负荷在该癌种患者人群中的Percentile Rank,免疫检查点抑制剂疗效评估,潜在临床意义
0,1.54Muts/Mb,13.83%,该患者可能对PD-1/PD-L1免疫检查点抑制剂治疗不敏感,潜在临床意义\n肿瘤突变负荷（Tumor Mutation Burden，TMB）通常定义为...


### 云康靶向

In [9]:
class TargetProcess(Process):
    """云康靶向"""
    
    def __init__(self,path):
        super().__init__(path)
    
    def FDA(self,specText,isHeader):
        """FDA获批/NCCN指南推荐的靶向药物相关基因变异"""
        df = self.get_table(specText,isHeader)
#         display(df)
        return df
    
    def otherAndTumour(self,specText,isHeader):
        """其他与肿瘤发生相关变异"""
        ndf = pd.DataFrame()
        df = self.get_table(specText,isHeader)
        return df
    
    def geneVariation(self,specText,isHeader):
        """基因变异注释"""
        df = self.get_table(specText,isHeader)
        if df.empty:
            return df
        tdf = df.iloc[0:1,]
        if len(df) < 4:
            return df
        random_column = random.choice(df.columns.to_list())
        tdf.loc[:,"基因与肿瘤相关性概述"] = df.at[1,random_column]
        tdf.loc[:,"位点变异信息注释"] = df.at[2,random_column]
        tdf.loc[:,"潜在临床意义"] = df.at[3,random_column]
#         display(tdf)
        return tdf
    def potentialMedicine(self,specText,isHeader):
        """潜在获益药物/潜在耐药药物解析"""
        df = self.get_table(specText,isHeader)
#         display(df)
        return df
    
    def clinicStudy(self,specText,isHeader):
        """临床研究"""
        df = self.get_table(specText,isHeader)
#         display(df)
        return df
    
    def run(self,specText,isHeader,func):
        """开始处理"""
        funtion = getattr(self,func)
        df = funtion(specText,isHeader)
        return df

In [10]:
tp = TargetProcess(r"E:\other\Word文档解读\data\442092614200-严爱凤.docx")
# 云康靶向
targetDict = [
    {'title': '受检者信息', 'prefix': '受检者信息', 'isHeader': False,"function":"base_table"},
    {'title': '3.3 FDA获批/NCCN指南推荐的靶向药物相关基因变异', 'prefix': 'FDA获批/NCCN指南推荐的靶向药物相关基因变异', 'isHeader': True,"function":"FDA"}, 
    {'title': '4.2 其他与肿瘤发生相关变异', 'prefix': '其他与肿瘤发生相关变异', 'isHeader': True,"function":"otherAndTumour"}, 
    {'title': '4.3 基因变异注释', 'prefix': '基因变异注释', 'isHeader': True,"function":"geneVariation"}, 
    {'title': '4.4 潜在获益药物/潜在耐药药物解析', 'prefix': '潜在获益药物/潜在耐药药物解析', 'isHeader': True,"function":"potentialMedicine"}, 
    {'title': '4.5 相关临床研究', 'prefix': '相关临床研究', 'isHeader': True,"function":"clinicStudy"}, 
]
for item in targetDict:
    df = tp.run(item["title"],item["isHeader"],item["function"])
#     display(df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


### 云康化疗

In [29]:
class ChemotherapyProcess(Process):
    """云康化疗"""
    
    def __init__(self,path):
        super().__init__(path)
    
    def pharmacyReminder(self,specText,isHeader):
        """化疗用药提示"""
        df = self.get_table(specText,isHeader)
        target_columns = ["药物名称","用药提示"]
        df_columns = df.columns.to_list()
        if len(df_columns) <= 0:
            return df
        res_list = []
        # 将列长度每每分为两组
        for index in range(len(df_columns) // 2):
            tdf = df.iloc[:,index * 2:(index + 1) * 2]
            if set(tdf.columns.to_list()) == set(target_columns):
                res_list.append(tdf)
        if res_list:
            tdf = pd.concat(res_list)
        if not tdf.empty:
            return tdf
        return df
    
    def analysisResult(self,specText,isHeader):
        """化疗药物相关检测结果解析"""
        df = self.get_table(specText,isHeader)
#         display(df)
        return df
    
    def run(self,specText,isHeader,func):
        """开始处理"""
        funtion = getattr(self,func)
        df = funtion(specText,isHeader)
        return df

In [33]:
cp = ChemotherapyProcess(r"E:\other\Word文档解读\data\510485152600-许帮蓉.docx")
chemotherapyDict = [
    {'title': '受检者信息', 'prefix': '受检者信息', 'isHeader': False,"function":"base_table"},
    {'title': '3.5 化疗用药提示', 'prefix': '化疗用药提示', 'isHeader': True,"function":"pharmacyReminder"}, 
    {'title': '六、化疗药物相关检测结果解析', 'prefix': '化疗药物相关检测结果解析', 'isHeader': True,"function":"analysisResult"}, 
]
df_list = []
for item in chemotherapyDict:
    df = cp.run(item["title"],item["isHeader"],item["function"])
    df_list.append(df)
    display(df)

Unnamed: 0,姓名,性别,年龄,送检医院,送检科室,送检医生,门诊号,样本类型,原病理号,条码号,病理(临床)诊断,检测号,治疗史,肿瘤家族史,样本采集日期,接收日期,报告日期
0,许帮蓉,女,69岁,,,/,,全血,,,,,/,/,/,2020-03-26,2020-04-05


Unnamed: 0,药物名称,用药提示
0,卡铂,常规使用
1,奥沙利铂,常规使用
2,顺铂,常规使用
3,多柔比星,常规使用
4,表柔比星,推荐使用
5,依托泊苷,常规使用
6,他莫昔芬,常规使用
7,卡培他滨,常规使用
8,吉西他滨,常规使用
9,氟尿嘧啶,常规使用


Unnamed: 0,药物,检测位点,检测位点.1,基因型,证据等级,用药提示
0,卡铂,MTHFR,rs1801133,GG,Level 2A,药效减弱
1,奥沙利铂,ERCC1,rs11615,AG,Level 2B,毒副作用增强；药效增强
2,奥沙利铂,GSTP1,rs1695,AG,Level 2A,毒副作用减弱
3,奥沙利铂,XRCC1,rs25487,CT,Level 2B,药效减弱
4,顺铂,ERCC1,rs3212986,AC,Level 2B,毒副作用增强
5,顺铂,ERCC2,rs13181,GT,Level 3,药效增强
6,顺铂,ERCC2,rs1799793,CC,Level 3,药效减弱
7,顺铂,TPMT,rs1142345,TT,Level 3,毒副作用减弱
8,顺铂,TPMT,rs1800460,CC,Level 3,毒副作用减弱
9,顺铂,XPC,rs2228001,GG,Level 1B,毒副作用增强


In [32]:
pd.concat(df_list)

ValueError: Plan shapes are not aligned

### 云康遗传性肿瘤

In [13]:
class InheritanceTumourProcess(Process):
    """云康遗传性肿瘤"""
    
    def __init__(self,path):
        super().__init__(path)
            
    
    def analysisResult(self,specText,isHeader):
        """遗传性肿瘤基因变异解析"""
        df = self.get_table(specText,isHeader)
#         display(df)
        return df
    
    def run(self,specText,isHeader,func):
        """开始处理"""
        funtion = getattr(self,func)
        df = funtion(specText,isHeader)
        return df

In [14]:
itp = InheritanceTumourProcess(r"E:\other\Word文档解读\data\442092614200-严爱凤.docx")
# 云康遗传性肿瘤
inheritanceTumourDict = [
#     {'title': '受检者信息', 'prefix': '受检者信息', 'isHeader': False,"function":"base_table"},
    {'title': '7.1 遗传性肿瘤基因变异解析', 'prefix': '7遗传性肿瘤基因变异解析', 'isHeader': True,"function":"analysisResult"}, 
]
for item in inheritanceTumourDict:
    df = itp.run(item["title"],item["isHeader"],item["function"])
#     display(df)

# 云康检出变异

In [15]:
class DetectionAbnormalProcess(Process):
    """云康检出变异"""
    def __init__(self,path):
        super().__init__(path)
            
    def bodyCellCollect(self,specText,isHeader):
        """遗传性肿瘤基因变异解析"""
        df = self.get_table(specText,isHeader)
#         display(df)
        return df
    
    def embryonalSystemCollect(self,specText,isHeader):
        """胚系变异结果汇总"""
        df = self.get_table(specText,isHeader)
#         display(df)
        return df
    
    def run(self,specText,isHeader,func):
        """开始处理"""
        funtion = getattr(self,func)
        df = funtion(specText,isHeader)
        return df
    

In [16]:
dap = DetectionAbnormalProcess(r"E:\other\Word文档解读\data\442092614200-严爱凤.docx")
# 云康检出变异
detectionAbnormalDict = [
#     {'title': '受检者信息', 'prefix': '受检者信息', 'isHeader': False,"function":"base_table"},
    {'title': '8.1体细胞变异结果汇总', 'prefix': '体细胞变异结果汇总', 'isHeader': True,"function":"bodyCellCollect"},
    {'title': '8.2 胚系变异结果汇总', 'prefix': '胚系变异结果汇总', 'isHeader': True,"function":"embryonalSystemCollect"}, 
]
for item in detectionAbnormalDict:
    df = dap.run(item["title"],item["isHeader"],item["function"])
#     display(df)

### 处理

In [21]:
def Run(Process,dataDict):
    """开始处理"""
    baseDict = [
                {'title': '受检者信息', 'prefix': '受检者信息', 'isHeader': False,"function":"base_table"},
    ]
    root_path = r"./data"  # 起始目录
    file_list = get_all_docx(root_path)  # 获取起始目录下的所有word文件
    for file in file_list:
        try:
            file_path = os.path.join(root_path,file)
            ip = Process(file_path)
            bdf = pd.DataFrame()
            for base_item in baseDict:
                bdf = ip.run(base_item["title"],base_item["isHeader"],base_item["function"])
            if bdf.empty:
                raise ValueError("基础信息出错")
            for item in dataDict:
                tmp_df = []
                tmp_df.append(bdf)
                df = ip.run(item["title"],item["isHeader"],item["function"])
#                 display(df)
                tmp_df.append(df)
                ndf = pd.concat(tmp_df,ignore_index=True)
                ndf = ndf.fillna(method="ffill")
                ndf = ndf.fillna(method="bfill")
                # 将不规则的列名统一处理
                wait_columns = {"药物":"药物","药物名称":"药物","药物\n类别":"类别"}
                columns = ndf.columns.to_list()
#                 print("columns",columns)
                for index,c_item in enumerate(columns):
                    if c_item in list(wait_columns.keys()):
                        columns[index] = wait_columns.get(c_item)
                ndf.columns = columns
                item["result"].append(ndf)
        except Exception as e:
            print("file",file)
            traceback.print_exc()
    return dataDict

### 保存结果

In [22]:
def saveResult(save_path,df_dict):
    try:
        excel_writer = pd.ExcelWriter(save_path, engine='xlsxwriter')
        for item in df_dict:
            sheetname = item["prefix"]
            rels = item["result"]
            df = pd.concat(rels,ignore_index=True)
            df.to_excel(excel_writer, sheet_name=sheetname, index=None)
        excel_writer.close()
    except Exception as e:
#         display(rels)
        traceback.print_exc()

In [28]:
# 云康免疫
immuneDict = [
                {'title': '3.4 免疫治疗药物提示', 'prefix': '检测结果概述', 'isHeader': True,"function":"immuneCure","result":[]},
                {'title': '5.2 肿瘤突变负荷（TMB）', 'prefix': '肿瘤突变负荷（TMB）', 'isHeader': False,"function":"TMB","result":[]},
                {'title': '5.3 微卫星不稳定性（MSI）', 'prefix': '微卫星不稳定性（MSI）', 'isHeader': True,"function":"MSI","result":[]},
                {'title': '5.4 免疫治疗获益相关基因', 'prefix': '免疫治疗获益相关基因', 'isHeader': True,"function":"immuneProfit","result":[]},
                {'title': '5.5 免疫治疗耐药/快速进展相关基因', 'prefix': '免疫治疗耐药快速进展相关基因', 'isHeader': True,"function":"drugfast","result":[]},
                {'title': '5.6 HLA分型解析', 'prefix': 'HLA分型解析', 'isHeader': True,"function":"HLA","result":[]}, 
            ]
# 云康靶向
targetDict = [
                {'title': '3.3 FDA获批/NCCN指南推荐的靶向药物相关基因变异', 'prefix': 'FDAorNCCN靶向药物', 'isHeader': True,"function":"FDA","result":[]}, 
                {'title': '4.2 其他与肿瘤发生相关变异', 'prefix': '其他与肿瘤发生相关变异', 'isHeader': True,"function":"otherAndTumour","result":[]}, 
                {'title': '4.3 基因变异注释', 'prefix': '基因变异注释', 'isHeader': True,"function":"geneVariation","result":[]}, 
                {'title': '4.4 潜在获益药物/潜在耐药药物解析', 'prefix': '药物解析', 'isHeader': True,"function":"potentialMedicine","result":[]}, 
                {'title': '4.5 相关临床研究', 'prefix': '靶向治疗药物相关临床研究', 'isHeader': True,"function":"clinicStudy","result":[]}, 
            ]
# 云康化疗
chemotherapyDict = [
                {'title': '3.5 化疗用药提示', 'prefix': '化疗用药提示', 'isHeader': True,"function":"pharmacyReminder","result":[]}, 
                {'title': '六、化疗药物相关检测结果解析', 'prefix': '化疗药物相关检测结果解析', 'isHeader': True,"function":"analysisResult","result":[]}, 
            ]
# 云康遗传性肿瘤
inheritanceTumourDict = [
                {'title': '7.1 遗传性肿瘤基因变异解析', 'prefix': '7遗传性肿瘤基因变异解析', 'isHeader': True,"function":"analysisResult","result":[]}, 
            ]
# 云康检出变异
detectionAbnormalDict = [
             {'title': '8.1体细胞变异结果汇总', 'prefix': '体细胞变异结果汇总', 'isHeader': True,"function":"bodyCellCollect","result":[]},
             {'title': '8.2 胚系变异结果汇总', 'prefix': '胚系变异结果汇总', 'isHeader': True,"function":"embryonalSystemCollect","result":[]}, 
            ]

version = "v1.0.0"
sequence_items = [immuneDict,targetDict,chemotherapyDict,inheritanceTumourDict,detectionAbnormalDict]
process_items = [ImmuneProcess,TargetProcess,ChemotherapyProcess,InheritanceTumourProcess,DetectionAbnormalProcess]
save_paths = ["./result(云康肿瘤报告需求)/云康免疫_{}.xlsx".format(version),"./result(云康肿瘤报告需求)/云康靶向_{}.xlsx".format(version),
             "./result(云康肿瘤报告需求)/云康化疗_{}.xlsx".format(version),"./result(云康肿瘤报告需求)/云康遗传性肿瘤_{}.xlsx".format(version),
             "./result(云康肿瘤报告需求)/云康检出变异_{}.xlsx".format(version)]
result_list = []


# for dict_data,process,path in zip(sequence_items,process_items,save_paths):
#     dataDict = Run(process,dict_data)
#     result_list.append(dataDict)
#     saveResult(path,dataDict)

# 云康化疗
chemotherapyDict = Run(ChemotherapyProcess,chemotherapyDict)
saveResult(save_paths[2],chemotherapyDict)
print("云康化疗处理完成")
# 云康遗传性肿瘤
inheritanceTumourDict = Run(InheritanceTumourProcess,inheritanceTumourDict)
saveResult(save_paths[3],inheritanceTumourDict)
print("云康遗传性肿瘤处理完成")
# 检出变异
detectionAbnormalDict = Run(DetectionAbnormalProcess,detectionAbnormalDict)
saveResult(save_paths[4],detectionAbnormalDict)
print("检出变异处理完成")


file 510485152600-许帮蓉.docx


Traceback (most recent call last):
  File "C:\Users\LWSDES~1.000\AppData\Local\Temp/ipykernel_72928/3156686809.py", line 23, in Run
    ndf = pd.concat(tmp_df,ignore_index=True)
  File "C:\Users\lws.DESKTOP-LBD2U5F.000\.conda\envs\daTools\lib\site-packages\pandas\core\reshape\concat.py", line 284, in concat
    return op.get_result()
  File "C:\Users\lws.DESKTOP-LBD2U5F.000\.conda\envs\daTools\lib\site-packages\pandas\core\reshape\concat.py", line 497, in get_result
    mgrs_indexers, self.new_axes, concat_axis=self.axis, copy=self.copy
  File "C:\Users\lws.DESKTOP-LBD2U5F.000\.conda\envs\daTools\lib\site-packages\pandas\core\internals\managers.py", line 2006, in concatenate_block_managers
    for placement, join_units in concat_plan:
  File "C:\Users\lws.DESKTOP-LBD2U5F.000\.conda\envs\daTools\lib\site-packages\pandas\core\internals\concat.py", line 470, in combine_concat_plans
    raise ValueError("Plan shapes are not aligned")
ValueError: Plan shapes are not aligned


云康化疗处理完成
云康遗传性肿瘤处理完成
检出变异处理完成
