In [None]:
"""
CAR-T治疗自身免疫疾病专利智能分析系统
基于智慧芽API的CAR-T自身免疫疾病专利深度分析
"""

import requests
import json
import time
import pandas as pd
from datetime import datetime
from typing import Dict, List, Optional, Tuple
from IPython.display import display, HTML
import re
from openai import OpenAI

# ==================== 基础配置 ====================

class PatentAnalysisSystem:
    """Shared configuration and services for the patent analysis workflow.

    Holds the Zhihuiya API endpoint and credentials, a reusable ``requests``
    session, the DeepSeek (OpenAI-compatible) client and a few analysis
    settings. Also provides notebook logging and a thin LLM wrapper.
    """

    def __init__(self, search_topic: str = None):
        """Set up credentials, the HTTP session, the LLM client and defaults.

        Args:
            search_topic: Free-text topic of the analysis; falls back to
                ``"CAR-T autoimmune"`` when omitted or empty.
        """
        import os  # local import keeps this class self-contained

        # Zhihuiya API configuration.
        # SECURITY: secrets should not live in source control. Environment
        # variables take precedence; the literals below are legacy fallbacks
        # kept for backward compatibility and should be rotated and removed.
        self.base_url = "https://connect.zhihuiya.com"
        self.api_key = (
            os.environ.get("ZHIHUIYA_API_KEY")
            or "fh10ixx8marmhm9kbl3cx5676qn8nshcuwtktz0b05ebl7qf"
        )
        self.client_credentials = (
            os.environ.get("ZHIHUIYA_CLIENT_SECRET")
            or "74z26dxne81bnmrbd8vjwt7r8fc6tr6cxxdvapslbz4knycxknv3dnjprap6igjy"
        )
        self.token = None
        self.session = requests.Session()

        # LLM configuration (DeepSeek exposes an OpenAI-compatible API).
        self.llm_client = OpenAI(
            api_key=os.environ.get("DEEPSEEK_API_KEY") or 'sk-9b3ad78d6d51431c90091b575072e62f',
            base_url="https://api.deepseek.com"
        )

        # Analysis configuration.
        self.search_topic = search_topic or "CAR-T autoimmune"
        self.initial_patents = 100
        self.top_patents = 10

    def set_search_topic(self, topic: str):
        """Replace the current search topic and log the change."""
        self.search_topic = topic
        self.log(f"搜索主题设置为: {topic}", "INFO")

    @staticmethod
    def _render_log_html(timestamp: str, level: str, message: str) -> str:
        """Build the coloured HTML snippet for one log line.

        ``level`` and ``message`` are HTML-escaped so that text such as
        ``<Response [401]>`` inside an exception message is shown literally
        instead of being parsed as markup by the notebook front end.
        """
        from html import escape

        color_map = {"INFO": "blue", "SUCCESS": "green", "ERROR": "red", "WARN": "orange"}
        color = color_map.get(level, "blue")
        safe_level = escape(str(level), quote=False)
        safe_message = escape(str(message), quote=False)
        return f'<span style="color:{color};">[{timestamp}] {safe_level}: {safe_message}</span>'

    def log(self, message: str, level: str = "INFO"):
        """Display a timestamped, colour-coded log line in the notebook.

        Args:
            message: Text to show.
            level: One of ``INFO``, ``SUCCESS``, ``ERROR``, ``WARN``; any
                other value is rendered in blue.
        """
        timestamp = datetime.now().strftime("%H:%M:%S")
        display(HTML(self._render_log_html(timestamp, level, message)))

    def llm_call(self, prompt: str) -> str:
        """Send ``prompt`` to the chat model and return the reply text.

        Returns:
            The model's reply, or ``""`` when the call fails or the API
            returns no content (failures are logged, never raised).
        """
        try:
            response = self.llm_client.chat.completions.create(
                model="deepseek-chat",
                messages=[
                    {"role": "system", "content": "You are a professional patent analyst specializing in CAR-T cell therapy and autoimmune diseases."},
                    {"role": "user", "content": prompt}
                ],
                stream=False
            )
            # ``content`` may be None; normalise so callers always get a str.
            return response.choices[0].message.content or ""
        except Exception as e:
            self.log(f"LLM调用失败: {str(e)}", "ERROR")
            return ""

# ==================== Step 1: 智慧芽API接口 ====================

class ZhihuiyaAPI:
    """Thin client for the Zhihuiya REST endpoints used by the analysis.

    All methods are best-effort: failures are logged through
    ``system.log`` and reported to the caller as ``False`` / ``[]`` /
    ``None`` rather than raised.
    """

    # Seconds to wait on a request; without a timeout ``requests`` can block
    # forever on an unresponsive server.
    REQUEST_TIMEOUT = 30
    # Maximum number of description characters returned by get_description.
    MAX_DESCRIPTION_CHARS = 50000

    def __init__(self, system: "PatentAnalysisSystem"):
        """Keep a reference to the shared system (config, session, logger)."""
        self.system = system

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    def _auth_headers(self) -> Dict[str, str]:
        """Return the JSON + bearer-token headers shared by data requests."""
        return {
            "Content-Type": "application/json",
            "authorization": f"Bearer {self.system.token}"
        }

    def _fetch_data(self, endpoint: str, patent_id: str, patent_number: str, **extra_params):
        """GET one ``basic-patent-data`` style endpoint and return its ``data``.

        Authenticates first when no token is cached, so a request is never
        sent with ``Bearer None``.

        Returns:
            The ``data`` member of the response when ``status`` is truthy,
            otherwise ``None`` (also ``None`` when authentication fails).

        Raises:
            Whatever ``requests`` / JSON decoding raises; the public
            callers catch and log these.
        """
        if not self.system.token and not self.authenticate():
            return None

        url = f"{self.system.base_url}{endpoint}"
        params = {
            "patent_id": patent_id,
            "patent_number": patent_number,
            "apikey": self.system.api_key,
        }
        params.update(extra_params)

        response = self.system.session.get(
            url, params=params, headers=self._auth_headers(), timeout=self.REQUEST_TIMEOUT
        )
        response.raise_for_status()
        result = response.json()
        return result.get("data") if result.get("status") else None

    @staticmethod
    def _format_claims(claims_data) -> Optional[str]:
        """Render the first claim set of ``claims_data`` as numbered text.

        Returns ``None`` when ``claims_data`` is not a non-empty list.
        """
        if not isinstance(claims_data, list) or not claims_data:
            return None
        claims = claims_data[0].get("claims") or []
        return "\n\n".join(
            f"Claim {c.get('claim_num', '')}: {c.get('claim_text', '')}"
            for c in claims
        )

    @classmethod
    def _extract_description(cls, desc_data, max_chars: Optional[int] = None) -> Optional[str]:
        """Pull the first description text out of ``desc_data``, truncated.

        Args:
            desc_data: The ``data`` payload of the description endpoint.
            max_chars: Truncation limit; defaults to ``MAX_DESCRIPTION_CHARS``.

        Returns:
            The (possibly truncated) text, ``""`` when the record holds no
            description section, or ``None`` when ``desc_data`` is not a
            non-empty list.
        """
        if not isinstance(desc_data, list) or not desc_data:
            return None
        limit = cls.MAX_DESCRIPTION_CHARS if max_chars is None else max_chars
        sections = desc_data[0].get("description", [{}]) or []
        # An empty section list previously raised IndexError; treat it as
        # "no text" instead.
        text = (sections[0].get("text") or "") if sections else ""
        if len(text) > limit:
            text = text[:limit] + "\n...[内容已截断]"
        return text

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def authenticate(self) -> bool:
        """Obtain an access token and store it on ``system.token``.

        Returns:
            ``True`` when a token was received, ``False`` otherwise.
        """
        try:
            url = f"{self.system.base_url}/oauth/token"
            headers = {"content-type": "application/x-www-form-urlencoded"}
            # A dict lets requests URL-encode the values, so credentials
            # containing '&', '=' or '+' are transmitted correctly.
            data = {
                "grant_type": "client_credentials",
                "client_id": self.system.api_key,
                "client_secret": self.system.client_credentials,
            }

            response = self.system.session.post(
                url, data=data, headers=headers, timeout=self.REQUEST_TIMEOUT
            )
            response.raise_for_status()

            result = response.json()
            if result.get("status") and "data" in result:
                self.system.token = result["data"]["token"]
                self.system.log("✅ Token获取成功", "SUCCESS")
                return True
            self.system.log("认证失败: 响应中未包含token", "ERROR")
            return False
        except Exception as e:
            self.system.log(f"认证失败: {str(e)}", "ERROR")
            return False

    def search_patents(self, query: str, limit: int = 100) -> List[Dict]:
        """P002 - run a relevance-sorted patent search.

        Args:
            query: Query text in the Zhihuiya search syntax.
            limit: Maximum number of results requested.

        Returns:
            The list under ``data.results``; ``[]`` on any failure.
        """
        if not self.system.token and not self.authenticate():
            return []

        try:
            url = f"{self.system.base_url}/search/patent/query-search-patent/v2"
            params = {"apikey": self.system.api_key}
            payload = {
                "sort": [{"field": "SCORE", "order": "DESC"}],
                "limit": limit,
                "offset": 0,
                "query_text": query,
                "collapse_by": "PBD",
                "collapse_type": "ALL"
            }

            self.system.log(f"🔍 检索专利: {query} (限制{limit}件)")
            response = self.system.session.post(
                url,
                params=params,
                json=payload,
                headers=self._auth_headers(),
                timeout=self.REQUEST_TIMEOUT,
            )
            response.raise_for_status()

            result = response.json()
            if result.get("status") and "data" in result:
                patents = result["data"].get("results", [])
                self.system.log(f"✅ 找到 {len(patents)} 件专利", "SUCCESS")
                return patents
            return []
        except Exception as e:
            self.system.log(f"检索失败: {str(e)}", "ERROR")
            return []

    def get_simple_bibliography(self, patent_id: str, patent_number: str) -> Optional[Dict]:
        """P011 - fetch the simple bibliography record (includes abstracts).

        Returns:
            The first record when the API returns a list, the payload itself
            otherwise, or ``None`` when nothing usable came back.
        """
        try:
            data = self._fetch_data(
                "/basic-patent-data/simple-bibliography", patent_id, patent_number
            )
            if not data:
                return None
            return data[0] if isinstance(data, list) else data
        except Exception as e:
            self.system.log(f"P011获取失败 {patent_number}: {str(e)}", "ERROR")
            return None

    def get_legal_status(self, patent_id: str, patent_number: str) -> Optional[Dict]:
        """Fetch the legal-status payload; ``None`` on failure."""
        try:
            return self._fetch_data(
                "/basic-patent-data/legal-status", patent_id, patent_number
            )
        except Exception as e:
            self.system.log(f"法律状态获取失败: {str(e)}", "ERROR")
            return None

    def get_claims(self, patent_id: str, patent_number: str) -> Optional[str]:
        """Fetch the claims and return them as ``"Claim N: text"`` paragraphs.

        Returns:
            The joined claims text, or ``None`` when unavailable.
        """
        try:
            data = self._fetch_data(
                "/basic-patent-data/claim-data",
                patent_id,
                patent_number,
                replace_by_related="0",
            )
            return self._format_claims(data) if data else None
        except Exception as e:
            self.system.log(f"权利要求获取失败: {str(e)}", "ERROR")
            return None

    def get_description(self, patent_id: str, patent_number: str) -> Optional[str]:
        """Fetch the description text, truncated to ``MAX_DESCRIPTION_CHARS``.

        Returns:
            The description text, or ``None`` when unavailable.
        """
        try:
            data = self._fetch_data(
                "/basic-patent-data/description-data",
                patent_id,
                patent_number,
                replace_by_related="0",
            )
            return self._extract_description(data) if data else None
        except Exception as e:
            self.system.log(f"说明书获取失败: {str(e)}", "ERROR")
            return None

# ==================== Step 2: CAR-T自身免疫专利筛选与分析 ====================

class CARTAutoImmuneScreener:
    """Screen, summarise and rank patents on CAR-T therapy for autoimmune disease.

    Relevance is decided by keyword matching on title/abstract text.
    Matching is case-insensitive and token-aware (see ``_has_term``) so that
    short acronyms do not fire inside unrelated words ("MIT" in "LIMITED",
    "sle" in "sleep", "cart" in "cartilage", "cd4" in "cd40").
    """

    # Terms of at most this many characters must match as a whole token
    # (an optional plural "s" is tolerated); longer terms only need to start
    # at a token boundary, which keeps prefix matches such as
    # "FRED HUTCH" -> "FRED HUTCHINSON" and "b cell" -> "b cells".
    SHORT_TERM_MAX_LEN = 5

    # Leading pharma / cell-therapy companies (20 points).
    TOP_COMPANIES = (
        "NOVARTIS", "KITE", "JUNO", "CELGENE", "BRISTOL", "BMS",
        "GILEAD", "JANSSEN", "JOHNSON", "PFIZER", "ROCHE",
        "SANGAMO", "BLUEBIRD", "CRISPR", "EDITAS", "INTELLIA",
        "CABALETTA", "CARTESIAN", "KYVERNA", "SONOMA", "TREGS",
    )

    # Leading research institutions (15 points).
    TOP_INSTITUTIONS = (
        "UNIVERSITY", "PENN", "UPENN", "STANFORD", "MIT", "HARVARD",
        "YALE", "UCLA", "UCSF", "JOHNS HOPKINS", "MEMORIAL SLOAN",
        "FRED HUTCH", "DANA FARBER", "MD ANDERSON", "NIH", "NCI",
    )

    # (label, terms): a patent may count towards several diseases.
    _DISEASE_RULES = (
        ("Lupus/SLE", ("lupus", "sle", "systemic lupus")),
        ("Rheumatoid Arthritis", ("rheumatoid", "ra arthritis")),
        ("Multiple Sclerosis", ("multiple sclerosis", "ms disease")),
        ("Type 1 Diabetes", ("type 1 diabetes", "t1d", "iddm")),
        ("IBD/Crohn's/UC", ("inflammatory bowel", "ibd", "crohn", "ulcerative colitis")),
        ("Psoriasis", ("psoriasis", "psoriatic")),
    )

    # (label, terms) in priority order: only the first matching target counts.
    _TARGET_RULES = (
        ("CD19", ("cd19",)),
        ("CD20", ("cd20",)),
        ("BCMA", ("bcma",)),
        ("CD5", ("cd5",)),
        ("Other B cell", ("b cell", "bcell", "b-cell")),
        ("T cell targets", ("cd4", "cd8", "cd3", "treg")),
    )

    # (minimum publication year, points), checked from newest to oldest.
    _FRESHNESS_POINTS = ((2024, 10), (2023, 8), (2022, 6), (2020, 4))

    def __init__(self, system: "PatentAnalysisSystem"):
        """Store the shared system and define the keyword vocabularies."""
        self.system = system

        # CAR-T related terms
        self.cart_keywords = [
            "car-t", "cart", "car t", "chimeric antigen receptor",
            "car cell", "car therapy", "engineered t cell", "engineered tcell",
            "t cell therapy", "tcell therapy", "adoptive cell", "cellular immunotherapy"
        ]

        # Autoimmune disease terms
        self.autoimmune_keywords = [
            # general terms
            "autoimmune", "auto-immune", "autoimmunity", "self-reactive",
            "tolerance", "immune tolerance", "autoreactive",

            # specific diseases
            "lupus", "sle", "systemic lupus erythematosus",
            "rheumatoid arthritis", "ra arthritis",
            "multiple sclerosis", "ms disease",
            "type 1 diabetes", "t1d", "iddm",
            "inflammatory bowel", "ibd", "crohn", "ulcerative colitis",
            "psoriasis", "psoriatic",
            "sjogren", "sjögren",
            "scleroderma", "systemic sclerosis",
            "myasthenia gravis",
            "hashimoto", "thyroiditis",
            "pemphigus", "pemphigoid",
            "vasculitis", "anca",
            "dermatomyositis", "polymyositis",
            "antiphospholipid", "aps syndrome"
        ]

        # Related targets
        self.target_keywords = [
            "cd19", "cd20", "bcma", "cd5", "baff", "april",
            "cd38", "cd138", "plasmablast", "plasma cell",
            "b cell", "bcell", "b-cell", "b lymphocyte",
            "memory b", "autoreactive b",
            "treg", "regulatory t", "t regulatory",
            "cd4", "cd8", "cd3", "cd25", "foxp3",
            "il-17", "il17", "th17", "il-23", "il23"
        ]

    # ------------------------------------------------------------------
    # Matching helpers
    # ------------------------------------------------------------------

    @classmethod
    def _has_term(cls, text: str, term: str) -> bool:
        """Return True when ``term`` occurs in ``text`` at a token boundary.

        The match is case-insensitive and must not be preceded by an ASCII
        letter or digit. Short terms (``len <= SHORT_TERM_MAX_LEN``) must
        also end at a boundary, optionally after a plural "s".
        """
        pattern = r"(?<![A-Za-z0-9])" + re.escape(term)
        if len(term) <= cls.SHORT_TERM_MAX_LEN:
            pattern += r"s?(?![A-Za-z0-9])"
        # ``re`` caches compiled patterns, so rebuilding the string is cheap.
        return re.search(pattern, text, re.IGNORECASE) is not None

    @classmethod
    def _has_any(cls, text: str, terms) -> bool:
        """Return True when at least one of ``terms`` matches ``text``."""
        return any(cls._has_term(text, term) for term in terms)

    @classmethod
    def _count_terms(cls, text: str, terms) -> int:
        """Return how many of ``terms`` match ``text``."""
        return sum(1 for term in terms if cls._has_term(text, term))

    def _relevance_scores(self, text: str) -> Tuple[int, int]:
        """Return ``(cart_score, autoimmune_score)`` for ``text``.

        CAR-T keywords and autoimmune keywords weigh 2 points each; target
        keywords add 1 point each to the CAR-T score.
        """
        cart_score = 2 * self._count_terms(text, self.cart_keywords)
        cart_score += self._count_terms(text, self.target_keywords)
        autoimmune_score = 2 * self._count_terms(text, self.autoimmune_keywords)
        return cart_score, autoimmune_score

    @staticmethod
    def _as_text(value) -> str:
        """Stringify ``value``, mapping ``None`` to ``""`` instead of ``"None"``."""
        return "" if value is None else str(value)

    def _extract_title(self, patent: Dict) -> str:
        """Return the patent title, preferring English over Chinese.

        ``title`` may be a plain string or a ``{"en": ..., "zh": ...}`` dict;
        a missing or null title yields ``""``.
        """
        title = patent.get("title") or ""
        if isinstance(title, dict):
            title = title.get("en") or title.get("zh") or ""
        return str(title)

    # ------------------------------------------------------------------
    # Pipeline steps
    # ------------------------------------------------------------------

    def process_initial_patents(self, patents: List[Dict]) -> pd.DataFrame:
        """Convert raw search hits into the working DataFrame.

        Relevance flags and scores are seeded from the title alone, so rows
        whose abstract cannot be fetched later still carry consistent
        values. No API call is made here, hence no rate-limit sleep.

        Args:
            patents: Result dicts as returned by ``ZhihuiyaAPI.search_patents``.

        Returns:
            One row per patent with bibliographic fields, empty
            ``abstract``/``legal_status`` and title-based relevance columns.
        """
        rows = []
        total = len(patents)

        for position, patent in enumerate(patents, 1):
            if position % 20 == 0:
                self.system.log(f"处理进度: {position}/{total}")

            title = self._extract_title(patent)
            cart_score, autoimmune_score = self._relevance_scores(title)

            rows.append({
                "patent_id": patent.get("patent_id"),
                "patent_number": patent.get("pn"),
                "title": title,
                "assignee": patent.get("current_assignee") or "",
                "application_date": self._as_text(patent.get("apdt", "")),
                "publication_date": self._as_text(patent.get("pbdt", "")),
                "abstract": "",
                "legal_status": "",
                "score": patent.get("score", 0),
                "is_cart_related": cart_score > 0,
                "is_autoimmune_related": autoimmune_score > 0,
                "cart_score": cart_score,
                "autoimmune_score": autoimmune_score
            })

        return pd.DataFrame(rows)

    def enrich_with_abstracts(self, df: pd.DataFrame, api: "ZhihuiyaAPI") -> pd.DataFrame:
        """Add abstract and legal status, then re-score with the abstract.

        ``df`` is updated in place and also returned. Rows for which no
        abstract is available keep their existing relevance values.

        Args:
            df: Frame produced by ``process_initial_patents``.
            api: Client used for the bibliography and legal-status lookups.
        """
        self.system.log("📄 获取摘要和法律状态...")
        total = len(df)

        for position, (idx, row) in enumerate(df.iterrows()):
            if position % 10 == 0:
                self.system.log(f"进度: {position}/{total}")

            # Abstract (first entry, capped at 1000 characters)
            biblio = api.get_simple_bibliography(row["patent_id"], row["patent_number"])
            if biblio:
                abstracts = (biblio.get("bibliographic_data") or {}).get("abstracts") or []
                if abstracts:
                    # ``text`` may be null in the payload; never slice None.
                    abstract_text = (abstracts[0].get("text") or "")[:1000]
                    df.at[idx, "abstract"] = abstract_text

                    cart_score, autoimmune_score = self._relevance_scores(
                        f"{row['title']} {abstract_text}"
                    )
                    df.at[idx, "cart_score"] = cart_score
                    df.at[idx, "is_cart_related"] = cart_score > 0
                    df.at[idx, "autoimmune_score"] = autoimmune_score
                    df.at[idx, "is_autoimmune_related"] = autoimmune_score > 0

            # Legal status
            legal = api.get_legal_status(row["patent_id"], row["patent_number"])
            if isinstance(legal, list) and legal:
                legal_info = legal[0].get("patent_legal") or {}
                status = legal_info.get("simple_legal_status") or []
                if isinstance(status, str):
                    # Joining a bare string would split it into characters.
                    status = [status]
                df.at[idx, "legal_status"] = (
                    ", ".join(str(item) for item in status) if status else "Unknown"
                )

            time.sleep(0.2)  # throttle: two API calls were made for this row

        return df

    def filter_cart_autoimmune_patents(self, df: pd.DataFrame) -> pd.DataFrame:
        """Keep patents that are both CAR-T and autoimmune related.

        When no row satisfies both flags the filter is relaxed to rows with
        a positive CAR-T *or* autoimmune score.

        Returns:
            An independent copy of the selected rows, so later column
            assignments do not write into a slice of ``df``.
        """
        strict = df["is_cart_related"].astype(bool) & df["is_autoimmune_related"].astype(bool)
        filtered = df[strict]

        if filtered.empty:
            self.system.log("未找到严格符合CAR-T+自身免疫的专利，尝试放宽条件...", "WARN")
            filtered = df[(df["cart_score"] > 0) | (df["autoimmune_score"] > 0)]

        self.system.log(f"筛选出 {len(filtered)} 件相关专利", "SUCCESS")
        return filtered.copy()

    def analyze_patent_statistics(self, df: pd.DataFrame) -> Dict:
        """Compute headline counts and distributions for ``df``.

        Returns:
            Dict with ``total_patents``, ``cart_autoimmune_patents``,
            ``cart_only``, ``autoimmune_only``, the assignee / year /
            legal-status distributions, plus ``disease_distribution`` and
            ``target_distribution``. All counts are plain ``int`` so the
            result can be serialised with ``json``.
        """
        is_cart = df["is_cart_related"].astype(bool)
        is_autoimmune = df["is_autoimmune_related"].astype(bool)

        stats = {
            "total_patents": len(df),
            "cart_autoimmune_patents": int((is_cart & is_autoimmune).sum()),
            "cart_only": int((is_cart & ~is_autoimmune).sum()),
            "autoimmune_only": int((~is_cart & is_autoimmune).sum()),
            "assignee_distribution": df["assignee"].value_counts().to_dict(),
            "year_distribution": df["application_date"].astype(str).str[:4].value_counts().to_dict(),
            "legal_status_distribution": df["legal_status"].value_counts().to_dict()
        }

        disease_types = {label: 0 for label, _ in self._DISEASE_RULES}
        disease_types["Other Autoimmune"] = 0
        disease_types["Not Specified"] = 0

        target_distribution = {label: 0 for label, _ in self._TARGET_RULES}
        target_distribution["Other/Unknown"] = 0

        for _, row in df.iterrows():
            text = f"{row['title']} {row['abstract']}"

            # Diseases: every matching category is counted.
            matched_disease = False
            for label, terms in self._DISEASE_RULES:
                if self._has_any(text, terms):
                    disease_types[label] += 1
                    matched_disease = True
            if not matched_disease:
                fallback = "Other Autoimmune" if row["is_autoimmune_related"] else "Not Specified"
                disease_types[fallback] += 1

            # Targets: only the highest-priority match is counted.
            for label, terms in self._TARGET_RULES:
                if self._has_any(text, terms):
                    target_distribution[label] += 1
                    break
            else:
                target_distribution["Other/Unknown"] += 1

        stats["disease_distribution"] = disease_types
        stats["target_distribution"] = target_distribution

        return stats

    def _score_patent(self, row) -> float:
        """Score one patent row on a 0-100 scale.

        Components: CAR-T relevance (0-30), autoimmune relevance (0-30),
        assignee weight (0-20), recency of publication (0-10) and legal
        status (0-10).

        Args:
            row: Mapping with ``cart_score``, ``autoimmune_score``,
                ``assignee``, ``publication_date`` and ``legal_status``.
        """
        score = 0

        # 1-2. Relevance, 3 points per keyword point, capped at 30 each.
        score += min(row["cart_score"] * 3, 30)
        score += min(row["autoimmune_score"] * 3, 30)

        # 3. Assignee weight; null / NaN count as "no assignee".
        raw_assignee = row["assignee"]
        is_missing = raw_assignee is None or (
            isinstance(raw_assignee, float) and raw_assignee != raw_assignee
        )
        assignee = "" if is_missing else str(raw_assignee).strip()
        if self._has_any(assignee, self.TOP_COMPANIES):
            score += 20
        elif self._has_any(assignee, self.TOP_INSTITUTIONS):
            score += 15
        elif assignee:
            score += 5

        # 4. Recency. The year is parsed numerically: plain string
        # comparison ranked a missing date stored as "None" above "2024...".
        year_match = re.match(r"\s*(\d{4})", str(row["publication_date"]))
        year = int(year_match.group(1)) if year_match else 0
        for min_year, points in self._FRESHNESS_POINTS:
            if year >= min_year:
                score += points
                break

        # 5. Legal status.
        legal = str(row["legal_status"]).lower()
        if "grant" in legal or "授权" in legal:
            score += 10
        elif "pending" in legal or "审查" in legal:
            score += 5

        return float(score)

    def score_and_rank_patents(self, df: pd.DataFrame) -> pd.DataFrame:
        """Add ``final_score`` and return the patents sorted best-first.

        The input frame is left untouched. Sorting is stable, so patents
        with equal scores keep their incoming order. An empty frame yields
        an empty frame that still has the ``final_score`` column.
        """
        self.system.log("⚖️ 专利评分中...")

        scored = df.copy()
        scored["final_score"] = pd.Series(
            [self._score_patent(row) for _, row in scored.iterrows()],
            index=scored.index,
            dtype=float,
        )

        return scored.sort_values("final_score", ascending=False, kind="stable")

# ==================== Step 3: 深度分析Prompts ====================

class CARTAutoImmuneAnalysisPrompts:
    """Prompt templates for LLM analysis of CAR-T autoimmune patents."""

    def description_analysis_prompt(self, description_text: str, patent_info: Dict) -> str:
        """Build the prompt that asks for a technical review of a description.

        Args:
            description_text: Patent description text to embed in the prompt.
            patent_info: Must provide ``patent_number``, ``assignee`` and
                ``application_date``.

        Returns:
            The complete prompt text.
        """
        number = patent_info['patent_number']
        applicant = patent_info['assignee']
        filed_on = patent_info['application_date']

        prompt = f"""
作为CAR-T细胞治疗和自身免疫疾病领域的专利技术专家，请深度分析以下专利的说明书，并以连贯的段落形式输出分析结果。

专利号：{number}
申请人：{applicant}
申请日：{filed_on}

说明书内容：
{description_text}

请按以下结构分析（每部分用2-3个完整段落表述）：

## 1. 技术概述（2段）
第一段：描述这是什么类型的CAR-T技术，针对什么自身免疫疾病，要解决什么具体问题。
第二段：说明核心创新点，特别是相比传统CAR-T肿瘤治疗的适应性改进。

## 2. CAR结构与设计（3段）
第一段：详细描述CAR的结构设计，包括抗原识别域(scFv)、铰链区、跨膜域、信号转导域的具体选择。
第二段：分析靶点选择的科学依据，为什么选择该靶点治疗自身免疫疾病。
第三段：安全性设计，如自杀开关、可调控系统、避免过度免疫抑制的策略。

## 3. 制备工艺与质控（2段）
第一段：T细胞来源、转导方法（慢病毒/逆转录病毒/电穿孔）、扩增培养条件。
第二段：质量控制标准，包括CAR表达率、细胞纯度、功能检测等。

## 4. 实验验证（3段）
第一段：体外实验设计，包括细胞毒性、细胞因子释放、靶细胞清除等。
第二段：动物模型实验，使用什么自身免疫疾病模型，疗效评估指标。
第三段：临床前安全性评估，特别是针对自身免疫治疗的特殊安全性考虑。

## 5. 临床转化潜力（2段）
第一段：目标适应症的市场规模，与现有疗法（生物制剂、小分子药物）的比较优势。
第二段：临床开发策略，预期的临床试验设计，剂量选择，疗效终点。

## 6. 关键技术参数提取
- CAR结构：具体的scFv、信号域组合
- 靶点：具体的抗原靶点
- 适应症：目标自身免疫疾病
- 制备参数：转导效率、扩增倍数
- 疗效数据：关键的体内外实验数据
- 安全性特征：特殊的安全性设计

输出要求：
- 使用完整流畅的段落，避免碎片化列表
- 突出CAR-T治疗自身免疫疾病的特殊性
- 保持专业但易读的文风
- 总字数控制在1000-1500字
"""
        return prompt

    def claims_analysis_prompt(self, claims_text: str, patent_info: Dict) -> str:
        """Build the prompt that asks for a legal review of the claims.

        Args:
            claims_text: Claims text to embed in the prompt.
            patent_info: Must provide ``patent_number`` and ``assignee``.

        Returns:
            The complete prompt text.
        """
        number = patent_info['patent_number']
        applicant = patent_info['assignee']

        prompt = f"""
作为专利法律专家，请分析以下CAR-T治疗自身免疫疾病专利的权利要求书，并以适合专业报告的段落形式输出。

专利号：{number}
申请人：{applicant}

权利要求书：
{claims_text}

请按以下结构分析（每部分用2-3个完整段落表述）：

## 1. 权利要求架构概述（2段）
第一段：描述权利要求的整体结构，产品权利要求与方法权利要求的分布。
第二段：分析CAR-T相关权利要求的层次设计策略。

## 2. 核心保护范围分析（3段）
第一段：分析CAR结构相关的权利要求保护范围。
第二段：分析治疗方法相关的权利要求，特别是自身免疫适应症的限定。
第三段：评估制备方法权利要求的保护价值。

## 3. 技术特征递进策略（2段）
第一段：分析从属权利要求如何逐步限定CAR结构、靶点、疾病类型。
第二段：评价关键从属权利要求对商业化的影响。

## 4. 法律稳定性与侵权分析（2段）
第一段：评估权利要求相对于现有CAR-T技术的创造性。
第二段：分析潜在的设计规避路径和防御策略。

## 5. 与其他CAR-T专利的关系（1段）
分析该专利与Novartis、Kite等主要CAR-T专利的区别和潜在冲突。

输出要求：
- 使用连贯的专业段落
- 突出自身免疫领域的特殊性
- 总字数控制在800-1200字
"""
        return prompt

    def final_report_prompt(self, statistics: Dict, detailed_analyses: List[Dict]) -> str:
        """Build the prompt for the final landscape report.

        Both arguments are embedded as indented JSON (non-ASCII preserved),
        so they must be JSON-serialisable.

        Args:
            statistics: Aggregate statistics of the screened patents.
            detailed_analyses: Per-patent analysis records.

        Returns:
            The complete prompt text.
        """
        stats_json = json.dumps(statistics, ensure_ascii=False, indent=2)
        analyses_json = json.dumps(detailed_analyses, ensure_ascii=False, indent=2)

        prompt = f"""
你是专业的专利分析师，请基于以下数据撰写一份详细的CAR-T治疗自身免疫疾病专利技术综述报告。

【专利统计数据】
{stats_json}

【核心专利详细分析】
{analyses_json}

请生成一份专业的专利技术综述报告，格式如下：

# CAR-T细胞疗法治疗自身免疫疾病全球专利态势分析

## 执行摘要
简要概述CAR-T在自身免疫领域的专利现状和主要发现（300字）。

## 一、技术背景与市场机遇

### CAR-T从肿瘤到自身免疫的转化（400字）
- CAR-T在血液肿瘤的成功经验
- 自身免疫疾病的未满足需求
- CAR-T治疗自身免疫的科学基础

### 专利申请趋势分析（300字）
基于统计数据，分析：
- 年度申请量变化
- 技术成熟度评估
- 与CAR-T肿瘤专利的对比

## 二、主要专利权利人竞争格局

### 领先企业分析（各300字）
基于核心专利分析，详述主要申请人的：
- 技术路线特点
- 专利布局策略
- 临床开发进展

### 学术机构贡献（300字）
分析大学和研究机构的专利特点。

## 三、关键技术创新分析

### 靶点选择策略（400字）
- CD19 B细胞清除策略
- 其他B细胞靶点（CD20、BCMA等）
- T细胞靶点探索
- 双靶点CAR设计

### CAR结构优化（400字）
- 针对自身免疫的特殊设计
- 安全性改进（自杀开关、可调控系统）
- 持久性与记忆性优化

### 适应症覆盖（400字）
基于专利分析的疾病分布：
- 狼疮等B细胞介导疾病
- 类风湿关节炎
- 其他自身免疫疾病

## 四、专利保护策略分析

### 权利要求设计特点（300字）
- 产品vs方法权利要求
- 保护范围的平衡
- 与肿瘤CAR-T专利的区分

### 潜在的专利纠纷（300字）
- 基础CAR-T专利的影响
- 交叉许可可能性

## 五、临床转化与商业化前景

### 临床试验现状（300字）
基于专利中的临床设计信息。

### 市场预测（300字）
- 目标患者群体
- 定价策略考虑
- 与现有疗法的竞争

## 六、技术发展趋势与投资机会

### 未来技术方向（400字）
- 通用型CAR-T
- 基因编辑增强
- 联合治疗策略

### 投资建议（300字）
- 最具潜力的技术路线
- 关注的企业和机构
- 合作与许可机会

## 七、结论
总结CAR-T治疗自身免疫疾病的专利现状、机遇与挑战（300字）。

【输出要求】
1. 基于实际数据，不编造信息
2. 突出CAR-T治疗自身免疫的特殊性
3. 包含具体专利号和申请人信息
4. 保持客观专业的分析视角
5. 总字数3500-4500字
"""
        return prompt

# ==================== Step 4: 主流程执行 ====================

class CARTAutoImmuneAnalysisPipeline:
    """End-to-end workflow: search, screen, rank, analyse and report."""

    # Candidate search queries, most specific first.
    SEARCH_QUERIES = (
        '("CAR-T" OR "CAR T" OR "chimeric antigen receptor") AND ("autoimmune" OR "autoimmunity" OR "lupus" OR "rheumatoid" OR "multiple sclerosis")',
        'CAR-T autoimmune disease',
        'chimeric antigen receptor autoimmune',
        'CAR T cell therapy lupus SLE',
        'CAR-T rheumatoid arthritis',
        'engineered T cell autoimmune',
    )
    # Only the first few queries are executed to limit API usage.
    ACTIVE_QUERY_COUNT = 3
    # Maximum number of hits requested per query.
    PER_QUERY_LIMIT = 50

    def __init__(self):
        """Create the shared system, API client, screener and prompt set."""
        self.system = PatentAnalysisSystem()
        self.api = ZhihuiyaAPI(self.system)
        self.screener = CARTAutoImmuneScreener(self.system)
        self.prompts = CARTAutoImmuneAnalysisPrompts()

    # ------------------------------------------------------------------
    # Helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _deduplicate(patents: List[Dict]) -> List[Dict]:
        """Return ``patents`` without repeated ``patent_id`` values.

        The first occurrence of every id is kept and order is preserved.
        """
        unique: List[Dict] = []
        seen_ids = set()
        for patent in patents:
            patent_id = patent.get("patent_id")
            if patent_id not in seen_ids:
                seen_ids.add(patent_id)
                unique.append(patent)
        return unique

    def _search_unique_patents(self) -> List[Dict]:
        """Run the active search queries and merge their unique hits."""
        collected: List[Dict] = []
        for query in self.SEARCH_QUERIES[:self.ACTIVE_QUERY_COUNT]:
            self.system.log(f"执行搜索: {query}")
            collected.extend(self.api.search_patents(query, limit=self.PER_QUERY_LIMIT))
            time.sleep(2)  # stay below the API rate limit
        return self._deduplicate(collected)

    @staticmethod
    def _print_statistics(statistics: Dict) -> None:
        """Print headline counts and the non-zero disease/target buckets."""
        print("\nCAR-T自身免疫专利统计:")
        print(f"  总专利数: {statistics['total_patents']}")
        print(f"  CAR-T+自身免疫: {statistics['cart_autoimmune_patents']}")

        print("\n疾病类型分布:")
        for disease, count in statistics["disease_distribution"].items():
            if count > 0:
                print(f"  {disease}: {count}件")

        print("\n靶点分布:")
        for target, count in statistics["target_distribution"].items():
            if count > 0:
                print(f"  {target}: {count}件")

    def _analyze_top_patents(self, top_patents: pd.DataFrame) -> List[Dict]:
        """Run the LLM description and claims analysis on each top patent.

        Patents whose description or claims cannot be fetched are skipped
        with a warning.

        Returns:
            One record per successfully analysed patent.
        """
        analyses: List[Dict] = []
        total = len(top_patents)

        for position, (_, patent) in enumerate(top_patents.iterrows(), 1):
            number = patent["patent_number"]
            self.system.log(f"分析专利 {position}/{total}: {number}")

            description = self.api.get_description(patent["patent_id"], number)
            claims = self.api.get_claims(patent["patent_id"], number)

            if description and claims:
                info = patent.to_dict()
                desc_analysis = self.system.llm_call(
                    self.prompts.description_analysis_prompt(description, info)
                )
                claims_analysis = self.system.llm_call(
                    self.prompts.claims_analysis_prompt(claims, info)
                )

                analyses.append({
                    "patent_number": number,
                    "assignee": patent["assignee"],
                    "application_date": patent["application_date"],
                    "title": patent["title"],
                    "cart_score": patent["cart_score"],
                    "autoimmune_score": patent["autoimmune_score"],
                    "technical_analysis": desc_analysis,
                    "legal_analysis": claims_analysis
                })

                self.system.log(f"✅ 完成分析: {number}", "SUCCESS")
            else:
                self.system.log(f"⚠️ 无法获取完整内容: {number}", "WARN")

            time.sleep(2)  # throttle API / LLM usage

        return analyses

    def _save_results(self, df_filtered: pd.DataFrame, statistics: Dict,
                      detailed_analyses: List[Dict], final_report: str) -> None:
        """Write the CSV list, the JSON analysis and the Markdown report.

        Files are created in the current working directory with a shared
        timestamp suffix.
        """
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

        csv_path = f"cart_autoimmune_patents_{timestamp}.csv"
        df_filtered.to_csv(csv_path, index=False, encoding="utf-8-sig")
        self.system.log(f"✅ 专利列表已保存至: {csv_path}", "SUCCESS")

        with open(f"cart_autoimmune_detailed_analysis_{timestamp}.json", "w", encoding="utf-8") as f:
            json.dump({
                "statistics": statistics,
                "detailed_analyses": detailed_analyses
            }, f, ensure_ascii=False, indent=2)

        report_path = f"cart_autoimmune_report_{timestamp}.md"
        with open(report_path, "w", encoding="utf-8") as f:
            f.write(final_report)

        if final_report:
            self.system.log("✅ CAR-T自身免疫专利分析完成！", "SUCCESS")
            self.system.log(f"✅ 报告已保存至: {report_path}", "SUCCESS")
        else:
            # Do not report success when the LLM produced nothing.
            self.system.log(f"⚠️ LLM未返回报告内容，报告文件为空: {report_path}", "WARN")

    # ------------------------------------------------------------------
    # Entry point
    # ------------------------------------------------------------------

    def run_complete_analysis(self) -> Dict:
        """Run the whole workflow and persist its outputs.

        Steps: search, build and enrich the patent table, filter, compute
        statistics, rank, analyse the top patents with the LLM, generate
        the final report and save everything to disk.

        Returns:
            Dict with ``statistics``, ``detailed_analyses``,
            ``final_report`` and ``patents_df``; an empty dict when no
            patents are found or none pass the filter.
        """
        log = self.system.log
        divider = "=" * 50

        # ========== Step 1: search ==========
        log(divider)
        log("🚀 Step 1: 搜索CAR-T自身免疫疾病相关专利", "INFO")

        all_patents = self._search_unique_patents()
        if not all_patents:
            log("未找到相关专利", "ERROR")
            return {}

        log(f"✅ 共找到 {len(all_patents)} 件唯一专利", "SUCCESS")

        # ========== Step 2: process and screen ==========
        log(divider)
        log("🔍 Step 2: 处理专利数据并筛选相关专利", "INFO")

        df_patents = self.screener.process_initial_patents(all_patents)
        df_patents = self.screener.enrich_with_abstracts(df_patents, self.api)
        df_filtered = self.screener.filter_cart_autoimmune_patents(df_patents)

        if len(df_filtered) == 0:
            log("未找到符合条件的专利", "ERROR")
            return {}

        statistics = self.screener.analyze_patent_statistics(df_filtered)
        log("📊 专利统计分析完成", "SUCCESS")
        self._print_statistics(statistics)

        df_filtered = self.screener.score_and_rank_patents(df_filtered)

        # ========== Step 3: pick the top patents ==========
        log(divider)
        log("🎯 Step 3: 选择Top专利进行深度分析", "INFO")

        # Honour the configured number of top patents (10 by default).
        num_top = min(self.system.top_patents, len(df_filtered))
        top_patents = df_filtered.head(num_top)

        print(f"\nTop {num_top} CAR-T自身免疫专利:")
        for rank, (_, row) in enumerate(top_patents.iterrows(), 1):
            # str() guards against a non-string assignee (e.g. None).
            assignee = str(row['assignee'])[:40]
            print(f"{rank}. {row['patent_number']} - {assignee} (Score: {row['final_score']})")

        # ========== Step 4: deep analysis ==========
        log(divider)
        log("🔬 Step 4: 深度分析核心专利", "INFO")

        detailed_analyses = self._analyze_top_patents(top_patents)

        # ========== Step 5: final report ==========
        log(divider)
        log("📝 Step 5: 生成综合报告", "INFO")

        statistics["top_patents"] = top_patents[["patent_number", "assignee", "final_score"]].to_dict("records")

        final_prompt = self.prompts.final_report_prompt(statistics, detailed_analyses)
        # Normalise a missing reply to "" so the report can always be written.
        final_report = self.system.llm_call(final_prompt) or ""

        # ========== Save ==========
        self._save_results(df_filtered, statistics, detailed_analyses, final_report)

        return {
            "statistics": statistics,
            "detailed_analyses": detailed_analyses,
            "final_report": final_report,
            "patents_df": df_filtered
        }

# ==================== 运行分析 ====================

# Create the pipeline and run the complete analysis
pipeline = CARTAutoImmuneAnalysisPipeline()
results = pipeline.run_complete_analysis()

# Show a preview of the report. run_complete_analysis() returns an empty dict
# when it stops early, so the guard below skips the preview in that case.
if results and "final_report" in results:
    print("\n" + "=" * 50)
    print("📄 CAR-T自身免疫专利分析报告预览（前1500字）:")
    print("=" * 50)
    print(results["final_report"][:1500] + "...")

In [None]:

"""
通用基因专利智能分析系统 - Universal Gene Patent Analysis System
基于智慧芽API的任意基因专利深度分析
"""

import requests
import json
import time
import pandas as pd
from datetime import datetime
from typing import Dict, List, Optional, Tuple
from IPython.display import display, HTML
import re
from openai import OpenAI

# ==================== 基础配置 ====================

class PatentAnalysisSystem:
    """Shared configuration, logging and LLM access for the gene patent workflow.

    Attributes:
        base_url: Root URL of the Zhihuiya API.
        api_key: Zhihuiya API key.
        client_credentials: Zhihuiya client secret used when requesting a token.
        token: Bearer token; ``None`` until one has been stored.
        session: ``requests.Session`` reused for HTTP calls.
        llm_client: OpenAI-compatible client pointed at the DeepSeek endpoint.
        target_gene: Gene name the analysis is about (defaults to ``"GENE"``).
        initial_patents: Configured size of the initial patent set (100).
        top_patents: Configured number of patents for deep analysis (10).
    """

    def __init__(self, target_gene: str = None):
        """Create the HTTP session and LLM client and store the analysis settings.

        Args:
            target_gene: Gene name to analyse; ``"GENE"`` is used when omitted.
        """
        import os  # local import: keeps this block independent of file-level imports

        # Zhihuiya API configuration.
        # NOTE(review): the literal values are kept only as fallbacks so existing
        # notebooks keep working; prefer the environment variables and rotate
        # any key that has been committed to source control.
        self.base_url = "https://connect.zhihuiya.com"
        self.api_key = os.environ.get(
            "ZHIHUIYA_API_KEY",
            "fh10ixx8marmhm9kbl3cx5676qn8nshcuwtktz0b05ebl7qf"
        )
        self.client_credentials = os.environ.get(
            "ZHIHUIYA_CLIENT_SECRET",
            "74z26dxne81bnmrbd8vjwt7r8fc6tr6cxxdvapslbz4knycxknv3dnjprap6igjy"
        )
        self.token = None
        self.session = requests.Session()

        # LLM configuration (DeepSeek through the OpenAI-compatible client).
        self.llm_client = OpenAI(
            api_key=os.environ.get("DEEPSEEK_API_KEY", 'sk-9b3ad78d6d51431c90091b575072e62f'),
            base_url="https://api.deepseek.com"
        )

        # Analysis configuration.
        self.target_gene = target_gene or "GENE"  # default gene name
        self.initial_patents = 100
        self.top_patents = 10

    def set_target_gene(self, gene_name: str):
        """Replace the target gene and log the change.

        Args:
            gene_name: New gene name.
        """
        self.target_gene = gene_name
        self.log(f"目标基因设置为: {gene_name}", "INFO")

    def log(self, message: str, level: str = "INFO"):
        """Render a timestamped, colour-coded log line in the notebook output.

        Args:
            message: Text to show.
            level: One of ``INFO``, ``SUCCESS``, ``ERROR``, ``WARN``; any other
                value is rendered in blue.
        """
        from html import escape

        timestamp = datetime.now().strftime("%H:%M:%S")
        color_map = {"INFO": "blue", "SUCCESS": "green", "ERROR": "red", "WARN": "orange"}
        color = color_map.get(level, "blue")
        # Messages often embed exception text such as "<Response [401]>";
        # escape it so it is displayed instead of being parsed as markup.
        safe_message = escape(str(message), quote=False)
        display(HTML(f'<span style="color:{color};">[{timestamp}] {level}: {safe_message}</span>'))

    def llm_call(self, prompt: str) -> str:
        """Send ``prompt`` to the chat model and return the reply text.

        Args:
            prompt: User message for the model.

        Returns:
            The reply text, or ``""`` when the call fails or the reply carries
            no text content.
        """
        try:
            response = self.llm_client.chat.completions.create(
                model="deepseek-chat",
                messages=[
                    {"role": "system", "content": "You are a professional patent analyst specializing in biotechnology and pharmaceutical patents."},
                    {"role": "user", "content": prompt}
                ],
                stream=False
            )
            # ``content`` may be None; callers write the result to a file and
            # rely on the declared ``str`` return type.
            return response.choices[0].message.content or ""
        except Exception as e:
            self.log(f"LLM调用失败: {str(e)}", "ERROR")
            return ""

# ==================== Step 1: 智慧芽API接口 ====================

class ZhihuiyaAPI:
    """Client for the Zhihuiya patent REST endpoints used by the pipeline.

    Every call uses the session, credentials and token stored on the shared
    system object. Failures are logged through ``system.log`` and reported to
    the caller as an empty list / ``None`` / ``False`` rather than raised.
    """

    # Per-request timeout in seconds. Without one a stalled connection would
    # block the notebook indefinitely.
    REQUEST_TIMEOUT = 30

    def __init__(self, system: "PatentAnalysisSystem"):
        """Bind the client to the shared system object.

        Args:
            system: Object providing ``base_url``, ``api_key``,
                ``client_credentials``, ``token``, ``session`` and ``log``.
        """
        self.system = system

    def _auth_headers(self) -> Dict[str, str]:
        """Return the JSON content-type and bearer-token headers."""
        return {
            "Content-Type": "application/json",
            "authorization": f"Bearer {self.system.token}"
        }

    def _fetch_patent_data(self, endpoint: str, patent_id: str, patent_number: str,
                           extra_params: Optional[Dict] = None):
        """GET one per-patent endpoint and return the ``data`` field of the reply.

        Args:
            endpoint: Path below ``base_url`` (starting with ``/``).
            patent_id: Patent id sent as ``patent_id``.
            patent_number: Publication number sent as ``patent_number``.
            extra_params: Additional query parameters, if any.

        Returns:
            ``result["data"]`` when the reply has a truthy ``status``, else
            ``None`` (also when no token could be obtained).

        Raises:
            Exception: Transport, HTTP-status or JSON errors propagate so each
                public method can log them with its own message.
        """
        # Obtain a token first instead of sending "Bearer None".
        if not self.system.token and not self.authenticate():
            return None

        params = {
            "patent_id": patent_id,
            "patent_number": patent_number,
            "apikey": self.system.api_key
        }
        if extra_params:
            params.update(extra_params)

        response = self.system.session.get(
            f"{self.system.base_url}{endpoint}",
            params=params,
            headers=self._auth_headers(),
            timeout=self.REQUEST_TIMEOUT
        )
        response.raise_for_status()
        result = response.json()
        return result.get("data") if result.get("status") else None

    def authenticate(self) -> bool:
        """Request an access token and store it on the system object.

        Returns:
            ``True`` when a token was stored, ``False`` otherwise.
        """
        try:
            url = f"{self.system.base_url}/oauth/token"
            headers = {"content-type": "application/x-www-form-urlencoded"}
            # Passing a mapping lets requests URL-encode the values, so
            # credentials containing '&', '=' or '+' are transmitted intact.
            form = {
                "grant_type": "client_credentials",
                "client_id": self.system.api_key,
                "client_secret": self.system.client_credentials
            }

            response = self.system.session.post(
                url, data=form, headers=headers, timeout=self.REQUEST_TIMEOUT
            )
            response.raise_for_status()

            result = response.json()
            if result.get("status") and "data" in result:
                self.system.token = result["data"]["token"]
                self.system.log("✅ Token获取成功", "SUCCESS")
                return True
            return False
        except Exception as e:
            self.system.log(f"认证失败: {str(e)}", "ERROR")
            return False

    def search_patents(self, query: str, limit: int = 100) -> List[Dict]:
        """P002 - run a patent search sorted by descending relevance score.

        Args:
            query: Query text sent as ``query_text``.
            limit: Maximum number of results requested.

        Returns:
            The ``results`` list of the reply, or ``[]`` on any failure.
        """
        if not self.system.token and not self.authenticate():
            return []

        try:
            url = f"{self.system.base_url}/search/patent/query-search-patent/v2"
            params = {"apikey": self.system.api_key}

            payload = {
                "sort": [{"field": "SCORE", "order": "DESC"}],
                "limit": limit,
                "offset": 0,
                "query_text": query,
                "collapse_by": "PBD",
                "collapse_type": "ALL"
            }

            self.system.log(f"🔍 检索专利: {query} (限制{limit}件)")
            response = self.system.session.post(
                url,
                params=params,
                json=payload,
                headers=self._auth_headers(),
                timeout=self.REQUEST_TIMEOUT
            )
            response.raise_for_status()

            result = response.json()
            if result.get("status") and "data" in result:
                patents = result["data"].get("results", [])
                self.system.log(f"✅ 找到 {len(patents)} 件专利", "SUCCESS")
                return patents
            return []
        except Exception as e:
            self.system.log(f"检索失败: {str(e)}", "ERROR")
            return []

    def get_simple_bibliography(self, patent_id: str, patent_number: str) -> Optional[Dict]:
        """P011 - fetch the simple bibliography record (which carries the abstract).

        Returns:
            The first record when the reply data is a list, the data itself
            otherwise, or ``None`` when nothing usable came back.
        """
        try:
            data = self._fetch_patent_data(
                "/basic-patent-data/simple-bibliography", patent_id, patent_number
            )
            if not data:
                return None
            return data[0] if isinstance(data, list) else data
        except Exception as e:
            self.system.log(f"P011获取失败 {patent_number}: {str(e)}", "ERROR")
            return None

    def get_legal_status(self, patent_id: str, patent_number: str) -> Optional[Dict]:
        """Fetch the legal-status data for one patent.

        Returns:
            The reply's ``data`` field, or ``None`` on failure.
        """
        try:
            return self._fetch_patent_data(
                "/basic-patent-data/legal-status", patent_id, patent_number
            )
        except Exception as e:
            self.system.log(f"法律状态获取失败: {str(e)}", "ERROR")
            return None

    def get_claims(self, patent_id: str, patent_number: str) -> Optional[str]:
        """Fetch the claims and join them into one text block.

        Returns:
            ``"Claim <num>: <text>"`` entries separated by blank lines, or
            ``None`` when no claim data came back.
        """
        try:
            claims_data = self._fetch_patent_data(
                "/basic-patent-data/claim-data", patent_id, patent_number,
                {"replace_by_related": "0"}
            )
            if isinstance(claims_data, list) and claims_data:
                claims = claims_data[0].get("claims", [])
                return "\n\n".join(
                    f"Claim {c.get('claim_num', '')}: {c.get('claim_text', '')}"
                    for c in claims
                )
            return None
        except Exception as e:
            self.system.log(f"权利要求获取失败: {str(e)}", "ERROR")
            return None

    def get_description(self, patent_id: str, patent_number: str) -> Optional[str]:
        """Fetch the description text, truncated to 50,000 characters.

        Returns:
            The text of the first description entry (with a truncation marker
            appended when it was cut), or ``None`` when no data came back.
        """
        try:
            desc_data = self._fetch_patent_data(
                "/basic-patent-data/description-data", patent_id, patent_number,
                {"replace_by_related": "0"}
            )
            if isinstance(desc_data, list) and desc_data:
                # An empty or missing description list yields "" instead of
                # relying on an IndexError being caught below.
                sections = desc_data[0].get("description") or [{}]
                desc_text = sections[0].get("text", "") or ""
                # Cap the length so the LLM prompt stays within bounds.
                if len(desc_text) > 50000:
                    desc_text = desc_text[:50000] + "\n...[内容已截断]"
                return desc_text
            return None
        except Exception as e:
            self.system.log(f"说明书获取失败: {str(e)}", "ERROR")
            return None

# ==================== Step 2: 专利初步分析与筛选 ====================

class PatentScreener:
    """Turns raw search hits into an enriched, scored and ranked DataFrame."""

    # Ordered (label, keywords) pairs: the first label with a matching keyword
    # wins, so the order is part of the classification behaviour.
    _TECH_KEYWORDS = (
        ("RNAi/siRNA", ("rnai", "sirna", "interference", "oligonucleotide", "antisense")),
        ("Antibody/mAb", ("antibody", "mab", "immunoglobulin", "monoclonal")),
        ("Small Molecule", ("compound", "inhibitor", "small molecule", "chemical")),
        ("CRISPR/Gene Editing", ("crispr", "cas9", "gene editing", "genome editing")),
        ("Cell Therapy", ("car-t", "cell therapy", "tcr", "nk cell")),
        ("Protein/Peptide", ("protein", "peptide", "fusion protein", "recombinant")),
        ("Gene Therapy", ("gene therapy", "aav", "viral vector", "lentivirus")),
    )

    # Generic therapy/disease keywords; each hit is worth 2 points (max 15).
    _CONTEXT_KEYWORDS = (
        "therapeutic", "treatment", "inhibitor", "agonist", "antagonist",
        "disease", "disorder", "cancer", "tumor", "diabetes", "obesity",
        "inflammation", "metabolic", "cardiovascular", "neurological",
    )

    # Assignee name fragments that earn the full assignee weight.
    _TOP_PHARMA = (
        "ROCHE", "NOVARTIS", "PFIZER", "MERCK", "JOHNSON", "SANOFI",
        "GLAXOSMITHKLINE", "GSK", "ASTRAZENECA", "ABBVIE", "BRISTOL",
        "LILLY", "AMGEN", "GILEAD", "REGENERON", "VERTEX", "BIOGEN",
        "ARROWHEAD", "ALNYLAM", "MODERNA", "BIONTECH", "WAVE",
    )

    # (minimum publication year, points), checked from newest to oldest.
    _FRESHNESS_TIERS = ((2024, 15), (2023, 12), (2022, 8), (2020, 5))

    # (exclusive lower bound of the search score, points).
    _RELEVANCE_TIERS = ((80, 20), (60, 15), (40, 10), (20, 5))

    def __init__(self, system: "PatentAnalysisSystem"):
        """Keep the shared system object (used for logging and the target gene)."""
        self.system = system

    def process_initial_patents(self, patents: List[Dict]) -> pd.DataFrame:
        """Convert raw search results into a DataFrame of basic patent fields.

        Args:
            patents: Result dictionaries returned by the patent search.

        Returns:
            One row per patent with id, number, title, assignee, dates, empty
            ``abstract`` / ``legal_status`` placeholders and the search score.
        """
        processed = []

        for i, patent in enumerate(patents, 1):
            if i % 20 == 0:
                self.system.log(f"处理进度: {i}/{len(patents)}")

            # No request is made here, so no rate-limit pause is needed.
            processed.append({
                "patent_id": patent.get("patent_id"),
                "patent_number": patent.get("pn"),
                "title": self._extract_title(patent),
                "assignee": patent.get("current_assignee", ""),
                "application_date": str(patent.get("apdt", "")),
                "publication_date": str(patent.get("pbdt", "")),
                "abstract": "",
                "legal_status": "",
                "score": patent.get("score", 0)
            })

        return pd.DataFrame(processed)

    def _extract_title(self, patent: Dict) -> str:
        """Return the title as text, preferring ``en`` over ``zh`` for dict titles."""
        title = patent.get("title", "")
        if isinstance(title, dict):
            title = title.get("en") or title.get("zh", "")
        return str(title)

    def enrich_with_abstracts(self, df: pd.DataFrame, api: "ZhihuiyaAPI") -> pd.DataFrame:
        """Fill ``abstract`` and ``legal_status`` for every row via the API.

        Args:
            df: Frame produced by ``process_initial_patents``; modified in place.
            api: Client used to fetch bibliography and legal-status data.

        Returns:
            The same DataFrame object.
        """
        self.system.log("📄 获取摘要和法律状态...")

        for idx, row in df.iterrows():
            if idx % 10 == 0:
                self.system.log(f"进度: {idx}/{len(df)}")

            # Abstract: first entry of the bibliography, capped at 500 chars.
            biblio = api.get_simple_bibliography(row["patent_id"], row["patent_number"])
            if biblio:
                abstracts = biblio.get("bibliographic_data", {}).get("abstracts", [])
                if abstracts:
                    df.at[idx, "abstract"] = (abstracts[0].get("text") or "")[:500]

            # Legal status: simple status list of the first record.
            legal = api.get_legal_status(row["patent_id"], row["patent_number"])
            if legal and isinstance(legal, list):
                legal_info = legal[0].get("patent_legal", {})
                status = legal_info.get("simple_legal_status", [])
                df.at[idx, "legal_status"] = ", ".join(status) if status else "Unknown"

            time.sleep(0.2)  # pause between patents to stay under the API rate limit

        return df

    def _classify_technology(self, text: str) -> str:
        """Return the first technology label whose keyword occurs in ``text``."""
        for label, keywords in self._TECH_KEYWORDS:
            if any(kw in text for kw in keywords):
                return label
        return "Other"

    def analyze_patent_statistics(self, df: pd.DataFrame) -> Dict:
        """Compute count distributions over the patent set.

        Args:
            df: Frame with ``title``, ``abstract``, ``assignee``,
                ``application_date`` (string) and ``legal_status`` columns.

        Returns:
            Dict with ``total_patents`` and assignee / application-year /
            legal-status / technology distributions.
        """
        stats = {
            "total_patents": len(df),
            "assignee_distribution": df["assignee"].value_counts().to_dict(),
            "year_distribution": df["application_date"].str[:4].value_counts().to_dict(),
            "legal_status_distribution": df["legal_status"].value_counts().to_dict()
        }

        # Keyword-based technology classification of title + abstract.
        tech_types = {label: 0 for label, _ in self._TECH_KEYWORDS}
        tech_types["Other"] = 0

        for _, row in df.iterrows():
            text = (str(row["title"]) + " " + str(row["abstract"])).lower()
            tech_types[self._classify_technology(text)] += 1

        stats["technology_distribution"] = tech_types

        return stats

    def _freshness_score(self, pub_date) -> int:
        """Score the publication year (0-15 points).

        The year is read from the first four digits of the value, so both
        ``20240501`` and ``2024-05-01`` work, and a missing date (for example
        the string ``"None"``) scores 0 instead of comparing as "newest".
        """
        digits = "".join(ch for ch in str(pub_date) if ch in "0123456789")
        if len(digits) < 4:
            return 0
        year = int(digits[:4])
        for min_year, points in self._FRESHNESS_TIERS:
            if year >= min_year:
                return points
        return 0

    def _relevance_score(self, raw_score) -> int:
        """Score the search engine's relevance value (0-20 points)."""
        try:
            value = float(raw_score)
        except (TypeError, ValueError):
            # Missing or non-numeric score: no relevance points.
            return 0
        for threshold, points in self._RELEVANCE_TIERS:
            if value > threshold:
                return points
        return 0

    def _score_patent(self, row, gene_lower: str) -> int:
        """Return the combined 0-100 score for one patent row."""
        score = 0

        # 1. Title/abstract relevance (0-35): gene mentions, then keywords.
        text = (str(row["title"]) + " " + str(row["abstract"])).lower()
        score += min(text.count(gene_lower) * 5, 20)
        score += min(sum(2 for kw in self._CONTEXT_KEYWORDS if kw in text), 15)

        # 2. Assignee weight (0-20). A missing assignee earns nothing.
        raw_assignee = row["assignee"]
        is_missing = raw_assignee is None or (
            isinstance(raw_assignee, float) and raw_assignee != raw_assignee
        )
        assignee = "" if is_missing else str(raw_assignee).upper()
        if any(comp in assignee for comp in self._TOP_PHARMA):
            score += 20
        elif assignee and "UNIVERSITY" in assignee:
            score += 10
        elif assignee:
            score += 5

        # 3. Recency (0-15).
        score += self._freshness_score(row["publication_date"])

        # 4. Legal status (0-10).
        legal = str(row["legal_status"]).lower()
        if "grant" in legal or "授权" in legal:
            score += 10
        elif "pending" in legal or "审查" in legal:
            score += 5

        # 5. Original search relevance (0-20).
        score += self._relevance_score(row["score"])

        return score

    def score_and_rank_patents(self, df: pd.DataFrame) -> pd.DataFrame:
        """Score every patent and return the rows sorted by descending score.

        Args:
            df: Enriched patent frame; a ``final_score`` column is added to it.

        Returns:
            A copy of ``df`` sorted by ``final_score`` (highest first). The
            original index labels are preserved.
        """
        self.system.log("⚖️ 专利评分中...")

        gene_lower = self.system.target_gene.lower()
        # Assign the whole column at once so it exists even for an empty frame.
        df["final_score"] = [
            float(self._score_patent(row, gene_lower)) for _, row in df.iterrows()
        ]

        return df.sort_values("final_score", ascending=False)

# ==================== Step 3: 深度分析Prompts ====================

class PatentAnalysisPrompts:
    """Prompt templates for the gene-agnostic patent analysis.

    Every method returns a Chinese-language prompt string in which the target
    gene name and the supplied patent data are interpolated.
    """
    
    def __init__(self, target_gene: str) -> None:
        """Remember the gene name that is interpolated into every prompt."""
        self.target_gene = target_gene
    
    def description_analysis_prompt(self, description_text: str, patent_info: Dict) -> str:
        """Build the prompt for the technical analysis of a patent description.

        Args:
            description_text: Description text inserted verbatim into the prompt.
            patent_info: Mapping that must contain ``patent_number``,
                ``assignee`` and ``application_date``.

        Returns:
            The complete prompt text.

        Raises:
            KeyError: If one of the required ``patent_info`` keys is missing.
        """
        return f"""
作为专利技术专家，请深度分析以下{self.target_gene}基因相关专利的说明书，并以连贯的段落形式输出分析结果。

专利号：{patent_info['patent_number']}
申请人：{patent_info['assignee']}
申请日：{patent_info['application_date']}

说明书内容：
{description_text}

请按以下结构分析（每部分用2-3个完整段落表述）：

## 1. 技术概述（2段）
第一段：简要描述这是什么类型的技术（RNAi/抗体/小分子/基因编辑/细胞治疗等），针对{self.target_gene}靶点要解决什么具体问题。
第二段：说明核心创新点是什么，与现有技术相比的主要改进在哪里。

## 2. 技术方案分析（3段）
第一段：详细描述具体的技术方案。根据技术类型分析关键要素（序列设计、化合物结构、载体构建等）。
第二段：分析优化或改进策略（化学修饰、结构优化、递送系统等）。
第三段：与同领域其他专利技术的对比，突出本专利的独特性。

## 3. 实验验证（3段）
第一段：概述实验设计的整体思路，包括体外、体内实验的层次安排。
第二段：详细描述最关键的实验结果，包括具体数据（IC50、EC50、抑制率、持续时间等）。
第三段：安全性评估和临床转化考虑。如果有临床试验设计，说明主要终点和给药方案。

## 4. 商业价值评估（2段）
第一段：评估{self.target_gene}相关疾病的市场规模和竞争格局。该技术的目标适应症是什么？市场潜力如何？
第二段：分析专利技术的可实施性和商业化前景。生产工艺是否成熟？成本是否可控？临床开发路径是否清晰？

## 5. 关键技术参数提取
请特别提取以下关键信息（如果存在）：
- 核心序列/化合物：具体序列号或化学结构
- 靶向机制：{self.target_gene}的作用位点或机制
- 实验数据：关键的量化指标
- 技术特征：独特的技术特点
- 临床方案：剂量、给药途径、频率（如有）

输出要求：
- 使用完整流畅的段落，避免碎片化列表
- 数据自然融入叙述中
- 保持专业但易读的文风
- 总字数控制在1000-1500字
"""
    
    def claims_analysis_prompt(self, claims_text: str, patent_info: Dict) -> str:
        """Build the prompt for the legal analysis of a patent's claims.

        Args:
            claims_text: Claims text inserted verbatim into the prompt.
            patent_info: Mapping that must contain ``patent_number`` and
                ``assignee``.

        Returns:
            The complete prompt text.

        Raises:
            KeyError: If one of the required ``patent_info`` keys is missing.
        """
        return f"""
作为专利法律专家，请分析以下{self.target_gene}基因相关专利的权利要求书，并以适合专业报告的段落形式输出。

专利号：{patent_info['patent_number']}
申请人：{patent_info['assignee']}

权利要求书：
{claims_text}

请按以下结构分析（每部分用2-3个完整段落表述）：

## 1. 权利要求架构概述（2段）
第一段：描述权利要求的整体结构，包括权利要求数量、独立权利要求的类型分布。
第二段：分析权利要求之间的逻辑关系和保护策略。

## 2. 核心保护范围分析（3段）
第一段：深入分析独立权利要求的保护范围，特别是与{self.target_gene}相关的必要技术特征。
第二段：分析关键限定条件对保护范围的影响。
第三段：评估其他独立权利要求的补充作用。

## 3. 技术特征递进策略（2段）
第一段：分析从属权利要求的递进逻辑和层次结构。
第二段：评价关键从属权利要求的价值和商业意义。

## 4. 法律稳定性与侵权分析（2段）
第一段：评估权利要求的法律稳定性（清楚性、支持性、创造性）。
第二段：分析侵权判定的关键要素和潜在规避路径。

## 5. 与其他{self.target_gene}专利的关系（1段）
分析该专利权利要求与其他主要申请人{self.target_gene}专利的潜在冲突或互补关系。

输出要求：
- 使用连贯的专业段落
- 法律分析结合商业考虑
- 总字数控制在800-1200字
"""
    
    def final_report_prompt(self, statistics: Dict, detailed_analyses: List[Dict]) -> str:
        """Build the prompt for the final landscape report.

        Both arguments are embedded as pretty-printed JSON, so they must be
        JSON-serialisable. The template text refers to 100 patents and 10 core
        patents regardless of how many entries are actually passed in.

        Args:
            statistics: Statistics dictionary for the whole patent set.
            detailed_analyses: Per-patent analysis dictionaries.

        Returns:
            The complete prompt text.
        """
        return f"""
你是专业的专利分析师，请基于以下数据撰写一份详细的{self.target_gene}基因相关专利技术综述报告。

【100篇专利统计数据】
{json.dumps(statistics, ensure_ascii=False, indent=2)}

【10篇核心专利详细分析】
{json.dumps(detailed_analyses, ensure_ascii=False, indent=2)}

请生成一份专业的专利技术综述报告，格式如下：

# {self.target_gene}基因相关全球专利竞争格局分析

## 一、专利数量、类型与地域分布

### 全球专利公开数量与类型（400字）
基于分析的100篇{self.target_gene}相关专利，详细说明：
- 专利总数和时间分布趋势
- 技术类型分布（各类技术占比）
- 主要申请人分布
- 法律状态统计

### 地域分布（300字）
分析专利的地域布局特点。

## 二、核心专利权利人及布局策略

基于10篇核心专利的深度分析，详细描述各主要玩家的技术策略。
[根据实际申请人情况动态生成各公司分析]

## 三、技术发展趋势与关键创新

### 技术路线对比（500字）
详细对比不同公司针对{self.target_gene}的技术方案差异。

### 关键技术参数汇总
整理所有核心专利的关键参数。

## 四、专利保护范围与法律风险

### 权利要求保护范围对比（400字）
对比不同专利的保护策略。

### 潜在冲突分析（300字）
识别可能的专利冲突点。

## 五、商业机会与投资建议

### 技术空白与机会（300字）
基于专利分析识别的{self.target_gene}领域机会。

### 投资与研发建议（300字）
- 最有前景的技术路线
- 需要规避的专利壁垒
- 潜在的合作机会

## 六、结论与展望

总结{self.target_gene}专利领域的发展现状和未来趋势（300字）。

【输出要求】
1. 必须基于提供的数据，不要编造信息
2. 包含具体的专利号、申请人、技术细节
3. 数据和分析要相互印证
4. 保持客观专业的语气
5. 总字数3000-4000字
"""
# ==================== Step 4: 主流程执行 ====================

class PatentAnalysisPipeline:
    """End-to-end workflow: search, enrich, score, deep-analyse and report.

    Attributes:
        target_gene: Gene name under analysis (may be set at run time).
        system: Shared configuration / logging / LLM object.
        api: Zhihuiya API client.
        screener: Scoring and statistics helper.
        prompts: Prompt templates; created when the analysis starts.
    """

    def __init__(self, target_gene: str = None):
        """Create the collaborating objects.

        Args:
            target_gene: Optional gene name; it can also be supplied to
                ``run_complete_analysis``.
        """
        self.target_gene = target_gene
        self.system = PatentAnalysisSystem(target_gene)
        self.api = ZhihuiyaAPI(self.system)
        self.screener = PatentScreener(self.system)
        self.prompts = None  # initialised in run_complete_analysis

    @staticmethod
    def _safe_filename(name: str) -> str:
        """Replace characters that are invalid in file names with ``_``.

        Gene names such as ``"TNF/alpha"`` would otherwise be interpreted as a
        directory path when the output files are opened.
        """
        return re.sub(r'[\\/:*?"<>|]', "_", str(name))

    def _analyze_top_patents(self, top_patents: pd.DataFrame) -> List[Dict]:
        """Run the LLM description and claims analysis for each selected patent.

        Patents whose description or claims cannot be retrieved are skipped
        with a warning.
        """
        detailed_analyses = []
        total = len(top_patents)

        # Number by position: after sorting, the index labels are the original
        # search positions, not the rank.
        for rank, (_, patent) in enumerate(top_patents.iterrows(), 1):
            self.system.log(f"分析专利 {rank}/{total}: {patent['patent_number']}")

            description = self.api.get_description(patent["patent_id"], patent["patent_number"])
            claims = self.api.get_claims(patent["patent_id"], patent["patent_number"])

            if description and claims:
                patent_info = patent.to_dict()

                desc_prompt = self.prompts.description_analysis_prompt(description, patent_info)
                desc_analysis = self.system.llm_call(desc_prompt)

                claims_prompt = self.prompts.claims_analysis_prompt(claims, patent_info)
                claims_analysis = self.system.llm_call(claims_prompt)

                detailed_analyses.append({
                    "patent_number": patent["patent_number"],
                    "assignee": patent["assignee"],
                    "application_date": patent["application_date"],
                    "title": patent["title"],
                    "technical_analysis": desc_analysis,
                    "legal_analysis": claims_analysis
                })

                self.system.log(f"✅ 完成分析: {patent['patent_number']}", "SUCCESS")
            else:
                self.system.log(f"⚠️ 无法获取完整内容: {patent['patent_number']}", "WARN")

            time.sleep(2)  # pause between patents to stay under the API rate limit

        return detailed_analyses

    def _save_results(self, statistics: Dict, detailed_analyses: List[Dict], final_report: str) -> None:
        """Write the detailed-analysis JSON and the Markdown report to disk."""
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        gene_tag = self._safe_filename(self.target_gene)

        with open(f"patent_detailed_analysis_{gene_tag}_{timestamp}.json", "w", encoding="utf-8") as f:
            json.dump({
                "target_gene": self.target_gene,
                "statistics": statistics,
                "detailed_analyses": detailed_analyses
            }, f, ensure_ascii=False, indent=2)

        with open(f"patent_report_{gene_tag}_{timestamp}.md", "w", encoding="utf-8") as f:
            f.write(final_report)

    def run_complete_analysis(self, target_gene: str = None) -> Dict:
        """Run the complete analysis workflow.

        Args:
            target_gene: Gene name (for example "PCSK9", "PD-1", "EGFR").
                Optional when one was given to the constructor.

        Returns:
            Dict with ``target_gene``, ``statistics``, ``detailed_analyses``
            and ``final_report``; an empty dict when the search found nothing.

        Raises:
            ValueError: If no gene name is available.
        """
        if target_gene:
            self.target_gene = target_gene
            self.system.set_target_gene(target_gene)
        elif not self.target_gene:
            raise ValueError("请提供目标基因名称")

        self.prompts = PatentAnalysisPrompts(self.target_gene)

        # ========== Step 1: fetch patent data ==========
        self.system.log("=" * 50)
        self.system.log(f"🚀 Step 1: 获取{self.target_gene}相关专利数据", "INFO")

        # Result-set size comes from the system configuration (default 100).
        search_results = self.api.search_patents(self.target_gene, limit=self.system.initial_patents)
        if not search_results:
            self.system.log(f"未找到{self.target_gene}相关专利", "ERROR")
            return {}

        df_patents = self.screener.process_initial_patents(search_results)
        self.system.log(f"✅ 处理了 {len(df_patents)} 篇专利", "SUCCESS")

        # ========== Step 2: abstracts and statistics ==========
        self.system.log("=" * 50)
        self.system.log("🔍 Step 2: 获取摘要并进行统计分析", "INFO")

        df_patents = self.screener.enrich_with_abstracts(df_patents, self.api)

        statistics = self.screener.analyze_patent_statistics(df_patents)
        statistics["target_gene"] = self.target_gene
        self.system.log("📊 专利统计分析完成", "SUCCESS")

        print(f"\n{self.target_gene}相关技术类型分布:")
        for tech, count in statistics["technology_distribution"].items():
            print(f"  {tech}: {count}件")

        print(f"\n{self.target_gene}专利主要申请人（前5）:")
        assignee_dist = dict(list(statistics["assignee_distribution"].items())[:5])
        for assignee, count in assignee_dist.items():
            print(f"  {assignee}: {count}件")

        df_patents = self.screener.score_and_rank_patents(df_patents)

        # ========== Step 3: select the top patents ==========
        top_n = self.system.top_patents  # default 10
        self.system.log("=" * 50)
        self.system.log(f"🎯 Step 3: 选择Top {top_n}专利进行深度分析", "INFO")

        top10_patents = df_patents.head(top_n)

        print(f"\n{self.target_gene}相关Top {top_n}专利:")
        for rank, (_, row) in enumerate(top10_patents.iterrows(), 1):
            # str() guards against a missing (non-string) assignee.
            print(f"{rank}. {row['patent_number']} - {str(row['assignee'])[:30]} (Score: {row['final_score']})")

        # ========== Step 4: deep analysis ==========
        self.system.log("=" * 50)
        self.system.log("🔬 Step 4: 深度分析核心专利", "INFO")

        detailed_analyses = self._analyze_top_patents(top10_patents)

        # ========== Step 5: final report ==========
        self.system.log("=" * 50)
        self.system.log("📝 Step 5: 生成综合报告", "INFO")

        statistics["top_patents"] = top10_patents[["patent_number", "assignee", "final_score"]].to_dict("records")

        final_prompt = self.prompts.final_report_prompt(statistics, detailed_analyses)
        final_report = self.system.llm_call(final_prompt)

        # ========== Save results ==========
        self._save_results(statistics, detailed_analyses, final_report)

        self.system.log(f"✅ {self.target_gene}专利分析完成！报告已保存", "SUCCESS")

        return {
            "target_gene": self.target_gene,
            "statistics": statistics,
            "detailed_analyses": detailed_analyses,
            "final_report": final_report
        }

# ==================== 使用示例 ====================

# 示例1：分析PCSK9基因
def analyze_gene_patents(gene_name: str):
    """Run the full pipeline for one gene and print a preview of its report.

    Args:
        gene_name: Gene to analyse.

    Returns:
        Whatever ``PatentAnalysisPipeline.run_complete_analysis`` returned.
    """
    results = PatentAnalysisPipeline().run_complete_analysis(gene_name)

    # Nothing to preview when the run produced no report.
    if not (results and "final_report" in results):
        return results

    divider = "=" * 50
    preview = results["final_report"][:1000]
    print("\n" + divider)
    print(f"📄 {gene_name}专利报告预览（前1000字）:")
    print(divider)
    print(preview + "...")

    return results

# Run the analysis - any gene name can be substituted.
# Example genes:
# - "PCSK9" (lipid-lowering target)
# - "PD-1" or "PD-L1" (immune checkpoint)
# - "EGFR" (oncology target)
# - "TNF" or "TNF-alpha" (inflammation target)
# - "HER2" (breast cancer target)
# - "VEGF" (angiogenesis)
# - "CD19" (CAR-T target in hematologic malignancies)
# - "BCMA" (multiple myeloma)
# - "GLP-1" (diabetes/obesity)
# - "INHBE" (emerging metabolic disease target)

# Run the analysis
gene_to_analyze = "PCSK9"  # change this value to analyse a different gene
results = analyze_gene_patents(gene_to_analyze)
# Batch analysis of several genes (optional)
def batch_analyze_genes(gene_list: List[str], delay_seconds: float = 30):
    """Run the full analysis for several genes, one after another.

    A gene whose analysis raises is reported on stdout and skipped; the
    remaining genes are still processed.

    Args:
        gene_list: Gene names to analyse.
        delay_seconds: Pause after each successful analysis when more genes
            remain, to stay under the API rate limit. Defaults to 30.

    Returns:
        Dict mapping each successfully analysed gene to its result.
    """
    all_results = {}
    genes = list(gene_list)  # also accepts any iterable of names
    banner = "=" * 60

    for position, gene in enumerate(genes, 1):
        print(f"\n{banner}")
        print(f"开始分析基因: {gene}")
        print(f"{banner}")

        try:
            pipeline = PatentAnalysisPipeline()
            all_results[gene] = pipeline.run_complete_analysis(gene)
        except Exception as e:
            print(f"分析{gene}时出错: {str(e)}")
            continue

        # No pause after the last gene: nothing follows it.
        if position < len(genes) and delay_seconds > 0:
            time.sleep(delay_seconds)

    return all_results

# 批量分析示例（取消注释以使用）
# genes_to_analyze = ["PCSK9", "PD-1", "EGFR"]
# batch_results = batch_analyze_genes(genes_to_analyze)

In [9]:
"""
CAR-T治疗自身免疫疾病专利智能分析系统
基于智慧芽API的CAR-T自身免疫疾病专利深度分析
"""

import requests
import json
import time
import pandas as pd
from datetime import datetime
from typing import Dict, List, Optional, Tuple
from IPython.display import display, HTML
import re
from openai import OpenAI

# ==================== 基础配置 ====================

class PatentAnalysisSystem:
    """Shared configuration, logging and LLM access for the CAR-T patent workflow.

    Attributes:
        base_url: Root URL of the Zhihuiya API.
        api_key: Zhihuiya API key.
        client_credentials: Zhihuiya client secret used when requesting a token.
        token: Bearer token; ``None`` until one has been stored.
        session: ``requests.Session`` reused for HTTP calls.
        llm_client: OpenAI-compatible client pointed at the DeepSeek endpoint.
        search_topic: Search topic (defaults to ``"CAR-T autoimmune"``).
        initial_patents: Configured size of the initial patent set (100).
        top_patents: Configured number of patents for deep analysis (10).
    """

    def __init__(self, search_topic: str = None):
        """Create the HTTP session and LLM client and store the analysis settings.

        Args:
            search_topic: Topic to search for; ``"CAR-T autoimmune"`` is used
                when omitted.
        """
        import os  # local import: keeps this block independent of file-level imports

        # Zhihuiya API configuration.
        # NOTE(review): the literal values are kept only as fallbacks so existing
        # notebooks keep working; prefer the environment variables and rotate
        # any key that has been committed to source control.
        self.base_url = "https://connect.zhihuiya.com"
        self.api_key = os.environ.get(
            "ZHIHUIYA_API_KEY",
            "fh10ixx8marmhm9kbl3cx5676qn8nshcuwtktz0b05ebl7qf"
        )
        self.client_credentials = os.environ.get(
            "ZHIHUIYA_CLIENT_SECRET",
            "74z26dxne81bnmrbd8vjwt7r8fc6tr6cxxdvapslbz4knycxknv3dnjprap6igjy"
        )
        self.token = None
        self.session = requests.Session()

        # LLM configuration (DeepSeek through the OpenAI-compatible client).
        self.llm_client = OpenAI(
            api_key=os.environ.get("DEEPSEEK_API_KEY", 'sk-9b3ad78d6d51431c90091b575072e62f'),
            base_url="https://api.deepseek.com"
        )

        # Analysis configuration.
        self.search_topic = search_topic or "CAR-T autoimmune"
        self.initial_patents = 100
        self.top_patents = 10

    def set_search_topic(self, topic: str):
        """Replace the search topic and log the change.

        Args:
            topic: New search topic.
        """
        self.search_topic = topic
        self.log(f"搜索主题设置为: {topic}", "INFO")

    def log(self, message: str, level: str = "INFO"):
        """Render a timestamped, colour-coded log line in the notebook output.

        Args:
            message: Text to show.
            level: One of ``INFO``, ``SUCCESS``, ``ERROR``, ``WARN``; any other
                value is rendered in blue.
        """
        from html import escape

        timestamp = datetime.now().strftime("%H:%M:%S")
        color_map = {"INFO": "blue", "SUCCESS": "green", "ERROR": "red", "WARN": "orange"}
        color = color_map.get(level, "blue")
        # Messages often embed exception text such as "<Response [401]>";
        # escape it so it is displayed instead of being parsed as markup.
        safe_message = escape(str(message), quote=False)
        display(HTML(f'<span style="color:{color};">[{timestamp}] {level}: {safe_message}</span>'))

    def llm_call(self, prompt: str) -> str:
        """Send ``prompt`` to the chat model and return the reply text.

        Args:
            prompt: User message for the model.

        Returns:
            The reply text, or ``""`` when the call fails or the reply carries
            no text content.
        """
        try:
            response = self.llm_client.chat.completions.create(
                model="deepseek-chat",
                messages=[
                    {"role": "system", "content": "You are a professional patent analyst specializing in CAR-T cell therapy and autoimmune diseases."},
                    {"role": "user", "content": prompt}
                ],
                stream=False
            )
            # ``content`` may be None; callers write the result to a file and
            # rely on the declared ``str`` return type.
            return response.choices[0].message.content or ""
        except Exception as e:
            self.log(f"LLM调用失败: {str(e)}", "ERROR")
            return ""

# ==================== Step 1: 智慧芽API接口 ====================

class ZhihuiyaAPI:
    """Client for the Zhihuiya Connect endpoints used by the analysis workflow.

    Every public method is best-effort: failures are logged through
    ``system.log`` and reported as ``False`` / ``[]`` / ``None`` rather than
    raised, so one bad patent does not stop a batch.
    """

    # Upper bound in seconds for each HTTP call, so a stalled connection
    # cannot hang the whole run.
    REQUEST_TIMEOUT = 30
    # Descriptions longer than this many characters are truncated.
    MAX_DESCRIPTION_CHARS = 50000

    def __init__(self, system: "PatentAnalysisSystem"):
        """Keep a reference to the shared system object.

        The methods of this class read ``base_url``, ``api_key``,
        ``client_credentials``, ``token``, ``session`` and ``log`` from it.
        """
        self.system = system

    def authenticate(self) -> bool:
        """Fetch an access token and store it on ``system.token``.

        Returns:
            True when a token was obtained, False otherwise (the reason is logged).
        """
        try:
            url = f"{self.system.base_url}/oauth/token"
            headers = {"content-type": "application/x-www-form-urlencoded"}
            # A dict lets the HTTP library URL-encode the values; a hand-built
            # string breaks as soon as a credential contains '&', '=' or '+'.
            form = {
                "grant_type": "client_credentials",
                "client_id": self.system.api_key,
                "client_secret": self.system.client_credentials,
            }

            response = self.system.session.post(
                url, data=form, headers=headers, timeout=self.REQUEST_TIMEOUT
            )
            response.raise_for_status()

            result = response.json()
            if result.get("status") and "data" in result:
                self.system.token = result["data"]["token"]
                self.system.log("✅ Token获取成功", "SUCCESS")
                return True
            self.system.log("认证失败: 响应中未包含token", "ERROR")
            return False
        except Exception as e:
            self.system.log(f"认证失败: {str(e)}", "ERROR")
            return False

    def _ensure_token(self) -> bool:
        """Return True when a token is already stored or could be fetched now."""
        return bool(self.system.token) or self.authenticate()

    def _auth_headers(self) -> Dict[str, str]:
        """Headers shared by every authenticated JSON request."""
        return {
            "Content-Type": "application/json",
            "authorization": f"Bearer {self.system.token}"
        }

    def _get_patent_data(self, endpoint: str, patent_id: str, patent_number: str,
                         extra_params: Optional[Dict] = None):
        """GET ``/basic-patent-data/<endpoint>`` and return its ``data`` field.

        Returns None when no token is available or the response's ``status``
        is falsy. Transport and decoding errors propagate to the caller, which
        logs them with its own endpoint-specific message.
        """
        if not self._ensure_token():
            return None

        params = {
            "patent_id": patent_id,
            "patent_number": patent_number,
            "apikey": self.system.api_key
        }
        if extra_params:
            params.update(extra_params)

        response = self.system.session.get(
            f"{self.system.base_url}/basic-patent-data/{endpoint}",
            params=params,
            headers=self._auth_headers(),
            timeout=self.REQUEST_TIMEOUT,
        )
        response.raise_for_status()
        result = response.json()
        return result.get("data") if result.get("status") else None

    def search_patents(self, query: str, limit: int = 100) -> List[Dict]:
        """P002 - run a relevance-sorted patent search.

        Args:
            query: Query text passed to the search endpoint as ``query_text``.
            limit: Maximum number of hits requested.

        Returns:
            The ``results`` list of the response, or ``[]`` on any failure.
        """
        if not self._ensure_token():
            return []

        try:
            url = f"{self.system.base_url}/search/patent/query-search-patent/v2"
            params = {"apikey": self.system.api_key}
            payload = {
                "sort": [{"field": "SCORE", "order": "DESC"}],
                "limit": limit,
                "offset": 0,
                "query_text": query,
                "collapse_by": "PBD",
                "collapse_type": "ALL"
            }

            self.system.log(f"🔍 检索专利: {query} (限制{limit}件)")
            response = self.system.session.post(
                url, params=params, json=payload,
                headers=self._auth_headers(), timeout=self.REQUEST_TIMEOUT,
            )
            response.raise_for_status()

            result = response.json()
            if result.get("status") and "data" in result:
                patents = result["data"].get("results", [])
                self.system.log(f"✅ 找到 {len(patents)} 件专利", "SUCCESS")
                return patents
            return []
        except Exception as e:
            self.system.log(f"检索失败: {str(e)}", "ERROR")
            return []

    def get_simple_bibliography(self, patent_id: str, patent_number: str) -> Optional[Dict]:
        """P011 - fetch the simple bibliography record (includes abstracts).

        Returns:
            The first record when the API answers with a list, the payload
            itself otherwise, or None when nothing usable came back.
        """
        try:
            data = self._get_patent_data("simple-bibliography", patent_id, patent_number)
            if not data:
                return None
            return data[0] if isinstance(data, list) else data
        except Exception as e:
            self.system.log(f"P011获取失败 {patent_number}: {str(e)}", "ERROR")
            return None

    def get_legal_status(self, patent_id: str, patent_number: str) -> Optional[Dict]:
        """Fetch the legal-status payload, or None on failure."""
        try:
            return self._get_patent_data("legal-status", patent_id, patent_number)
        except Exception as e:
            self.system.log(f"法律状态获取失败: {str(e)}", "ERROR")
            return None

    def get_claims(self, patent_id: str, patent_number: str) -> Optional[str]:
        """Fetch the claims and join them into one text block.

        Returns:
            ``"Claim <n>: <text>"`` entries separated by blank lines, or None
            when the API returned no claims record.
        """
        try:
            claims_data = self._get_patent_data(
                "claim-data", patent_id, patent_number, {"replace_by_related": "0"}
            )
            if isinstance(claims_data, list) and claims_data:
                # ``or []`` guards against an explicit null "claims" field.
                claims = claims_data[0].get("claims") or []
                return "\n\n".join(
                    f"Claim {c.get('claim_num', '')}: {c.get('claim_text', '')}"
                    for c in claims
                )
            return None
        except Exception as e:
            self.system.log(f"权利要求获取失败: {str(e)}", "ERROR")
            return None

    def get_description(self, patent_id: str, patent_number: str) -> Optional[str]:
        """Fetch the description text, truncated to ``MAX_DESCRIPTION_CHARS``.

        Returns:
            The text of the first description section (possibly empty), or
            None when the API returned no description record.
        """
        try:
            desc_data = self._get_patent_data(
                "description-data", patent_id, patent_number, {"replace_by_related": "0"}
            )
            if isinstance(desc_data, list) and desc_data:
                # An empty or null section list previously raised IndexError.
                sections = desc_data[0].get("description") or [{}]
                desc_text = sections[0].get("text") or ""
                if len(desc_text) > self.MAX_DESCRIPTION_CHARS:
                    desc_text = desc_text[:self.MAX_DESCRIPTION_CHARS] + "\n...[内容已截断]"
                return desc_text
            return None
        except Exception as e:
            self.system.log(f"说明书获取失败: {str(e)}", "ERROR")
            return None

# ==================== Step 2: CAR-T自身免疫专利筛选与分析 ====================

class CARTAutoImmuneScreener:
    """Keyword-based screening, statistics and ranking of CAR-T autoimmune patents.

    Works on a DataFrame with the columns created by
    ``process_initial_patents``; ``score_and_rank_patents`` adds ``final_score``.
    """

    def __init__(self, system: "PatentAnalysisSystem"):
        """Store the shared system object (used for logging) and the keyword lists."""
        self.system = system

        # Pause in seconds after each patent in enrich_with_abstracts, which
        # issues two API requests per patent.
        self.request_delay = 0.2

        # CAR-T related terms.
        self.cart_keywords = [
            "car-t", "cart", "car t", "chimeric antigen receptor",
            "car cell", "car therapy", "engineered t cell", "engineered tcell",
            "t cell therapy", "tcell therapy", "adoptive cell", "cellular immunotherapy"
        ]

        # Autoimmune disease terms.
        self.autoimmune_keywords = [
            # General terms
            "autoimmune", "auto-immune", "autoimmunity", "self-reactive",
            "tolerance", "immune tolerance", "autoreactive",

            # Specific diseases
            "lupus", "sle", "systemic lupus erythematosus",
            "rheumatoid arthritis", "ra arthritis",
            "multiple sclerosis", "ms disease",
            "type 1 diabetes", "t1d", "iddm",
            "inflammatory bowel", "ibd", "crohn", "ulcerative colitis",
            "psoriasis", "psoriatic",
            "sjogren", "sjögren",
            "scleroderma", "systemic sclerosis",
            "myasthenia gravis",
            "hashimoto", "thyroiditis",
            "pemphigus", "pemphigoid",
            "vasculitis", "anca",
            "dermatomyositis", "polymyositis",
            "antiphospholipid", "aps syndrome"
        ]

        # Related targets.
        self.target_keywords = [
            "cd19", "cd20", "bcma", "cd5", "baff", "april",
            "cd38", "cd138", "plasmablast", "plasma cell",
            "b cell", "bcell", "b-cell", "b lymphocyte",
            "memory b", "autoreactive b",
            "treg", "regulatory t", "t regulatory",
            "cd4", "cd8", "cd3", "cd25", "foxp3",
            "il-17", "il17", "th17", "il-23", "il23"
        ]

    # ------------------------------------------------------------------ helpers

    @staticmethod
    def _clean_text(value) -> str:
        """Return *value* as a string, mapping None and NaN to ``""``."""
        if value is None:
            return ""
        if isinstance(value, float) and value != value:  # NaN never equals itself
            return ""
        return str(value)

    @staticmethod
    def _mentions(text: str, keyword: str) -> bool:
        """Tell whether lower-case *text* mentions lower-case *keyword* as a term.

        Plain substring tests produced false positives in this domain: "cart"
        inside "cartilage", "sle" inside "sleep", "cd3" inside "cd38", "mit"
        inside "limited". The rules are therefore:

        * a keyword must start at a word start (no letter/digit before it);
        * a short keyword (five characters or fewer) must also end at a word
          end, allowing a plural "s" after a letter ("tregs") or one subtype
          letter after a digit ("il-17a");
        * longer keywords keep prefix behaviour, so "plasmablast" still
          matches "plasmablasts" and "fred hutch" matches "fred hutchinson".
        """
        pattern = r"(?<![a-z0-9])" + re.escape(keyword)
        if len(keyword) <= 5:
            suffix = r"[a-z]?" if keyword[-1].isdigit() else r"s?"
            pattern += suffix + r"(?![a-z0-9])"
        return re.search(pattern, text) is not None

    def _count_hits(self, text: str, keywords: List[str]) -> int:
        """Number of distinct *keywords* mentioned in *text*."""
        return sum(1 for kw in keywords if self._mentions(text, kw))

    def _mentions_any(self, text: str, keywords: List[str]) -> bool:
        """True when at least one of *keywords* is mentioned in *text*."""
        return any(self._mentions(text, kw) for kw in keywords)

    # --------------------------------------------------------------- processing

    def process_initial_patents(self, patents: List[Dict]) -> pd.DataFrame:
        """Turn raw search hits into the working DataFrame.

        The relevance flags are a first guess from the title only; scores stay
        at 0 until ``enrich_with_abstracts`` runs.

        Args:
            patents: Hit dicts; the keys ``patent_id``, ``pn``, ``title``,
                ``current_assignee``, ``apdt``, ``pbdt`` and ``score`` are read.

        Returns:
            One row per hit.
        """
        processed = []
        total = len(patents)

        for i, patent in enumerate(patents, 1):
            if i % 20 == 0:
                self.system.log(f"处理进度: {i}/{total}")

            title = self._extract_title(patent)
            title_text = title.lower()

            # No API request happens here, so no rate-limit pause is needed.
            processed.append({
                "patent_id": patent.get("patent_id"),
                "patent_number": patent.get("pn"),
                "title": title,
                "assignee": patent.get("current_assignee") or "",
                # _clean_text keeps a missing date as "" instead of "None".
                "application_date": self._clean_text(patent.get("apdt")),
                "publication_date": self._clean_text(patent.get("pbdt")),
                "abstract": "",
                "legal_status": "",
                "score": patent.get("score", 0),
                "is_cart_related": self._mentions_any(title_text, self.cart_keywords),
                "is_autoimmune_related": self._mentions_any(title_text, self.autoimmune_keywords),
                "cart_score": 0,
                "autoimmune_score": 0
            })

        return pd.DataFrame(processed)

    def _extract_title(self, patent: Dict) -> str:
        """Return the title as text, preferring ``en`` over ``zh`` for dict titles."""
        title = patent.get("title", "")
        if isinstance(title, dict):
            title = title.get("en") or title.get("zh") or ""
        return self._clean_text(title)

    def enrich_with_abstracts(self, df: pd.DataFrame, api: "ZhihuiyaAPI") -> pd.DataFrame:
        """Add abstract and legal status to each row and recompute relevance.

        Every row is rescored from title plus abstract. Rows without an
        abstract are scored from the title alone, so their flags and scores
        stay consistent instead of keeping a title flag with a score of 0.

        Args:
            df: Frame from ``process_initial_patents``; modified in place.
            api: Object providing ``get_simple_bibliography`` and
                ``get_legal_status``.

        Returns:
            The same *df* object.
        """
        self.system.log("📄 获取摘要和法律状态...")
        total = len(df)

        # Progress is counted by position so a non-integer index cannot break it.
        for position, (idx, row) in enumerate(df.iterrows()):
            if position % 10 == 0:
                self.system.log(f"进度: {position}/{total}")

            # Abstract (first entry, capped at 1000 characters).
            abstract_text = ""
            biblio = api.get_simple_bibliography(row["patent_id"], row["patent_number"])
            if biblio:
                abstracts = (biblio.get("bibliographic_data") or {}).get("abstracts") or []
                if abstracts:
                    abstract_text = self._clean_text(abstracts[0].get("text"))[:1000]
            if abstract_text:
                df.at[idx, "abstract"] = abstract_text

            full_text = (self._clean_text(row["title"]) + " " + abstract_text).lower()

            # CAR-T relevance: 2 points per CAR-T term, 1 per target term.
            cart_score = 2 * self._count_hits(full_text, self.cart_keywords)
            cart_score += self._count_hits(full_text, self.target_keywords)
            df.at[idx, "cart_score"] = cart_score
            df.at[idx, "is_cart_related"] = cart_score > 0

            # Autoimmune relevance: 2 points per disease term.
            autoimmune_score = 2 * self._count_hits(full_text, self.autoimmune_keywords)
            df.at[idx, "autoimmune_score"] = autoimmune_score
            df.at[idx, "is_autoimmune_related"] = autoimmune_score > 0

            # Legal status.
            legal = api.get_legal_status(row["patent_id"], row["patent_number"])
            if isinstance(legal, list) and legal:
                legal_info = legal[0].get("patent_legal") or {}
                status = legal_info.get("simple_legal_status") or []
                if isinstance(status, str):
                    # Joining a bare string would split it into characters.
                    status = [status]
                df.at[idx, "legal_status"] = ", ".join(map(str, status)) if status else "Unknown"

            time.sleep(self.request_delay)

        return df

    def filter_cart_autoimmune_patents(self, df: pd.DataFrame) -> pd.DataFrame:
        """Select the most relevant subset, relaxing the criteria step by step.

        Order of preference: rows flagged as both CAR-T and autoimmune, then
        rows with a positive ``cart_score``, then rows with a positive
        ``autoimmune_score``, and finally the whole frame.
        """
        both = df[df["is_cart_related"].eq(True) & df["is_autoimmune_related"].eq(True)]
        if len(both) > 0:
            self.system.log(f"✅ 找到 {len(both)} 件CAR-T+自身免疫专利", "SUCCESS")
            return both

        self.system.log("未找到严格符合CAR-T+自身免疫的专利，检查CAR-T相关专利...", "WARN")
        cart_only = df[df["cart_score"] > 0]
        if len(cart_only) > 0:
            self.system.log(f"找到 {len(cart_only)} 件CAR-T相关专利", "INFO")
            return cart_only

        self.system.log("检查自身免疫相关专利...", "WARN")
        autoimmune_only = df[df["autoimmune_score"] > 0]
        if len(autoimmune_only) > 0:
            self.system.log(f"找到 {len(autoimmune_only)} 件自身免疫相关专利", "INFO")
            return autoimmune_only

        self.system.log(f"返回所有 {len(df)} 件专利进行分析", "WARN")
        return df

    def analyze_patent_statistics(self, df: pd.DataFrame) -> Dict:
        """Compute counts and distributions for the screened patents.

        Returns:
            A dict with the total and relevance counts, the assignee / filing
            year / legal status distributions, ``disease_distribution`` (a
            patent may count for several diseases) and ``target_distribution``
            (each patent counts once, for the first matching target group).
        """
        is_cart = df["is_cart_related"].eq(True)
        is_autoimmune = df["is_autoimmune_related"].eq(True)
        not_cart = df["is_cart_related"].eq(False)
        not_autoimmune = df["is_autoimmune_related"].eq(False)

        stats = {
            "total_patents": len(df),
            "cart_autoimmune_patents": int((is_cart & is_autoimmune).sum()),
            "cart_only": int((is_cart & not_autoimmune).sum()),
            "autoimmune_only": int((not_cart & is_autoimmune).sum()),
            "assignee_distribution": df["assignee"].value_counts().to_dict(),
            # astype(str) keeps the .str accessor usable for numeric dates.
            "year_distribution": df["application_date"].astype(str).str[:4].value_counts().to_dict(),
            "legal_status_distribution": df["legal_status"].value_counts().to_dict()
        }

        disease_rules = [
            ("Lupus/SLE", ["lupus", "sle", "systemic lupus"]),
            ("Rheumatoid Arthritis", ["rheumatoid", "ra arthritis"]),
            ("Multiple Sclerosis", ["multiple sclerosis", "ms disease"]),
            ("Type 1 Diabetes", ["type 1 diabetes", "t1d", "iddm"]),
            ("IBD/Crohn's/UC", ["inflammatory bowel", "ibd", "crohn", "ulcerative colitis"]),
            ("Psoriasis", ["psoriasis", "psoriatic"]),
        ]
        disease_types = {label: 0 for label, _ in disease_rules}
        disease_types["Other Autoimmune"] = 0
        disease_types["Not Specified"] = 0

        # Checked in order; the first matching group wins.
        target_rules = [
            ("CD19", ["cd19"]),
            ("CD20", ["cd20"]),
            ("BCMA", ["bcma"]),
            ("CD5", ["cd5"]),
            ("Other B cell", ["b cell", "bcell", "b-cell"]),
            ("T cell targets", ["cd4", "cd8", "cd3", "treg"]),
        ]
        target_distribution = {label: 0 for label, _ in target_rules}
        target_distribution["Other/Unknown"] = 0

        for _, row in df.iterrows():
            text = (str(row["title"]) + " " + str(row["abstract"])).lower()

            # Disease classification (not mutually exclusive).
            disease_found = False
            for label, keywords in disease_rules:
                if self._mentions_any(text, keywords):
                    disease_types[label] += 1
                    disease_found = True
            if not disease_found:
                if row["is_autoimmune_related"]:
                    disease_types["Other Autoimmune"] += 1
                else:
                    disease_types["Not Specified"] += 1

            # Target classification (mutually exclusive).
            for label, keywords in target_rules:
                if self._mentions_any(text, keywords):
                    target_distribution[label] += 1
                    break
            else:
                target_distribution["Other/Unknown"] += 1

        stats["disease_distribution"] = disease_types
        stats["target_distribution"] = target_distribution

        return stats

    def score_and_rank_patents(self, df: pd.DataFrame) -> pd.DataFrame:
        """Score every patent (0-100) and return the rows sorted best first.

        Components: CAR-T relevance (0-30), autoimmune relevance (0-30),
        assignee weight (0-20), publication recency (0-10), legal status (0-10).

        Returns:
            A sorted copy of *df* with a float ``final_score`` column. The
            input frame is not modified, which avoids writing into a filtered
            slice, and an empty frame yields an empty result instead of failing.
        """
        self.system.log("⚖️ 专利评分中...")

        # Leading pharma and cell-therapy companies.
        top_companies = [
            "NOVARTIS", "KITE", "JUNO", "CELGENE", "BRISTOL", "BMS",
            "GILEAD", "JANSSEN", "JOHNSON", "PFIZER", "ROCHE",
            "SANGAMO", "BLUEBIRD", "CRISPR", "EDITAS", "INTELLIA",
            "CABALETTA", "CARTESIAN", "KYVERNA", "SONOMA", "TREGS"
        ]

        # Leading research institutions.
        top_institutions = [
            "UNIVERSITY", "PENN", "UPENN", "STANFORD", "MIT", "HARVARD",
            "YALE", "UCLA", "UCSF", "JOHNS HOPKINS", "MEMORIAL SLOAN",
            "FRED HUTCH", "DANA FARBER", "MD ANDERSON", "NIH", "NCI"
        ]
        company_terms = [name.lower() for name in top_companies]
        institution_terms = [name.lower() for name in top_institutions]

        scores = []
        for _, row in df.iterrows():
            score = 0

            # 1. CAR-T relevance (0-30).
            score += min(row["cart_score"] * 3, 30)

            # 2. Autoimmune relevance (0-30).
            score += min(row["autoimmune_score"] * 3, 30)

            # 3. Assignee weight (0-20). Term matching instead of substring
            #    matching, so "MIT" no longer fires on every "... LIMITED".
            assignee = self._clean_text(row["assignee"]).strip().lower()
            if self._mentions_any(assignee, company_terms):
                score += 20
            elif self._mentions_any(assignee, institution_terms):
                score += 15
            elif assignee:
                score += 5

            # 4. Recency (0-10), by publication year. Only the digits are
            #    used, so "2024-01-15" and "20240115" score alike and
            #    placeholders such as "None" score nothing.
            digits = re.sub(r"\D", "", self._clean_text(row["publication_date"]))
            if len(digits) >= 4:
                year = int(digits[:4])
                if year >= 2024:
                    score += 10
                elif year >= 2023:
                    score += 8
                elif year >= 2022:
                    score += 6
                elif year >= 2020:
                    score += 4

            # 5. Legal status (0-10).
            legal = str(row["legal_status"]).lower()
            if "grant" in legal or "授权" in legal:
                score += 10
            elif "pending" in legal or "审查" in legal:
                score += 5

            scores.append(float(score))

        ranked = df.copy()
        ranked["final_score"] = scores
        return ranked.sort_values("final_score", ascending=False)

# ==================== Step 3: 深度分析Prompts ====================

class CARTAutoImmuneAnalysisPrompts:
    """Prompt templates (Chinese) for the LLM steps of the CAR-T autoimmune analysis."""

    def description_analysis_prompt(self, description_text: str, patent_info: Dict) -> str:
        """Build the prompt asking for a technical analysis of a patent description.

        Args:
            description_text: Description text to analyse; inserted verbatim.
            patent_info: Must contain ``patent_number``, ``assignee`` and
                ``application_date``.

        Returns:
            The filled-in prompt.
        """
        number = patent_info['patent_number']
        applicant = patent_info['assignee']
        filed = patent_info['application_date']
        return f"""
作为CAR-T细胞治疗和自身免疫疾病领域的专利技术专家，请深度分析以下专利的说明书，并以连贯的段落形式输出分析结果。

专利号：{number}
申请人：{applicant}
申请日：{filed}

说明书内容：
{description_text}

请按以下结构分析（每部分用2-3个完整段落表述）：

## 1. 技术概述（2段）
第一段：描述这是什么类型的CAR-T技术，针对什么自身免疫疾病，要解决什么具体问题。
第二段：说明核心创新点，特别是相比传统CAR-T肿瘤治疗的适应性改进。

## 2. CAR结构与设计（3段）
第一段：详细描述CAR的结构设计，包括抗原识别域(scFv)、铰链区、跨膜域、信号转导域的具体选择。
第二段：分析靶点选择的科学依据，为什么选择该靶点治疗自身免疫疾病。
第三段：安全性设计，如自杀开关、可调控系统、避免过度免疫抑制的策略。

## 3. 制备工艺与质控（2段）
第一段：T细胞来源、转导方法（慢病毒/逆转录病毒/电穿孔）、扩增培养条件。
第二段：质量控制标准，包括CAR表达率、细胞纯度、功能检测等。

## 4. 实验验证（3段）
第一段：体外实验设计，包括细胞毒性、细胞因子释放、靶细胞清除等。
第二段：动物模型实验，使用什么自身免疫疾病模型，疗效评估指标。
第三段：临床前安全性评估，特别是针对自身免疫治疗的特殊安全性考虑。

## 5. 临床转化潜力（2段）
第一段：目标适应症的市场规模，与现有疗法（生物制剂、小分子药物）的比较优势。
第二段：临床开发策略，预期的临床试验设计，剂量选择，疗效终点。

## 6. 关键技术参数提取
- CAR结构：具体的scFv、信号域组合
- 靶点：具体的抗原靶点
- 适应症：目标自身免疫疾病
- 制备参数：转导效率、扩增倍数
- 疗效数据：关键的体内外实验数据
- 安全性特征：特殊的安全性设计

输出要求：
- 使用完整流畅的段落，避免碎片化列表
- 突出CAR-T治疗自身免疫疾病的特殊性
- 保持专业但易读的文风
- 总字数控制在1000-1500字
"""

    def claims_analysis_prompt(self, claims_text: str, patent_info: Dict) -> str:
        """Build the prompt asking for a legal analysis of a patent's claims.

        Args:
            claims_text: Claims text to analyse; inserted verbatim.
            patent_info: Must contain ``patent_number`` and ``assignee``.

        Returns:
            The filled-in prompt.
        """
        number = patent_info['patent_number']
        applicant = patent_info['assignee']
        return f"""
作为专利法律专家，请分析以下CAR-T治疗自身免疫疾病专利的权利要求书，并以适合专业报告的段落形式输出。

专利号：{number}
申请人：{applicant}

权利要求书：
{claims_text}

请按以下结构分析（每部分用2-3个完整段落表述）：

## 1. 权利要求架构概述（2段）
第一段：描述权利要求的整体结构，产品权利要求与方法权利要求的分布。
第二段：分析CAR-T相关权利要求的层次设计策略。

## 2. 核心保护范围分析（3段）
第一段：分析CAR结构相关的权利要求保护范围。
第二段：分析治疗方法相关的权利要求，特别是自身免疫适应症的限定。
第三段：评估制备方法权利要求的保护价值。

## 3. 技术特征递进策略（2段）
第一段：分析从属权利要求如何逐步限定CAR结构、靶点、疾病类型。
第二段：评价关键从属权利要求对商业化的影响。

## 4. 法律稳定性与侵权分析（2段）
第一段：评估权利要求相对于现有CAR-T技术的创造性。
第二段：分析潜在的设计规避路径和防御策略。

## 5. 与其他CAR-T专利的关系（1段）
分析该专利与Novartis、Kite等主要CAR-T专利的区别和潜在冲突。

输出要求：
- 使用连贯的专业段落
- 突出自身免疫领域的特殊性
- 总字数控制在800-1200字
"""

    def final_report_prompt(self, statistics: Dict, detailed_analyses: List[Dict]) -> str:
        """Build the prompt asking for the final landscape report.

        Both arguments are embedded as indented JSON (non-ASCII text kept
        as-is), so they must be JSON-serialisable.

        Args:
            statistics: Aggregate statistics of the screened patents.
            detailed_analyses: Per-patent analysis records.

        Returns:
            The filled-in prompt.
        """
        statistics_json = json.dumps(statistics, ensure_ascii=False, indent=2)
        analyses_json = json.dumps(detailed_analyses, ensure_ascii=False, indent=2)
        return f"""
你是专业的专利分析师，请基于以下数据撰写一份详细的CAR-T治疗自身免疫疾病专利技术综述报告。

【专利统计数据】
{statistics_json}

【核心专利详细分析】
{analyses_json}

请生成一份专业的专利技术综述报告，格式如下：

# CAR-T细胞疗法治疗自身免疫疾病全球专利态势分析

## 执行摘要
简要概述CAR-T在自身免疫领域的专利现状和主要发现（300字）。

## 一、技术背景与市场机遇

### CAR-T从肿瘤到自身免疫的转化（400字）
- CAR-T在血液肿瘤的成功经验
- 自身免疫疾病的未满足需求
- CAR-T治疗自身免疫的科学基础

### 专利申请趋势分析（300字）
基于统计数据，分析：
- 年度申请量变化
- 技术成熟度评估
- 与CAR-T肿瘤专利的对比

## 二、主要专利权利人竞争格局

### 领先企业分析（各300字）
基于核心专利分析，详述主要申请人的：
- 技术路线特点
- 专利布局策略
- 临床开发进展

### 学术机构贡献（300字）
分析大学和研究机构的专利特点。

## 三、关键技术创新分析

### 靶点选择策略（400字）
- CD19 B细胞清除策略
- 其他B细胞靶点（CD20、BCMA等）
- T细胞靶点探索
- 双靶点CAR设计

### CAR结构优化（400字）
- 针对自身免疫的特殊设计
- 安全性改进（自杀开关、可调控系统）
- 持久性与记忆性优化

### 适应症覆盖（400字）
基于专利分析的疾病分布：
- 狼疮等B细胞介导疾病
- 类风湿关节炎
- 其他自身免疫疾病

## 四、专利保护策略分析

### 权利要求设计特点（300字）
- 产品vs方法权利要求
- 保护范围的平衡
- 与肿瘤CAR-T专利的区分

### 潜在的专利纠纷（300字）
- 基础CAR-T专利的影响
- 交叉许可可能性

## 五、临床转化与商业化前景

### 临床试验现状（300字）
基于专利中的临床设计信息。

### 市场预测（300字）
- 目标患者群体
- 定价策略考虑
- 与现有疗法的竞争

## 六、技术发展趋势与投资机会

### 未来技术方向（400字）
- 通用型CAR-T
- 基因编辑增强
- 联合治疗策略

### 投资建议（300字）
- 最具潜力的技术路线
- 关注的企业和机构
- 合作与许可机会

## 七、结论
总结CAR-T治疗自身免疫疾病的专利现状、机遇与挑战（300字）。

【输出要求】
1. 基于实际数据，不编造信息
2. 突出CAR-T治疗自身免疫的特殊性
3. 包含具体专利号和申请人信息
4. 保持客观专业的分析视角
5. 总字数3500-4500字
"""

# ==================== Step 4: 主流程执行 ====================

class CARTAutoImmuneAnalysisPipeline:
    """End-to-end workflow: search, screen, rank, LLM analysis, report, save."""

    # Candidate search queries, most specific first.
    SEARCH_QUERIES = (
        '("CAR-T" OR "CAR T" OR "chimeric antigen receptor") AND ("autoimmune" OR "autoimmunity" OR "lupus" OR "rheumatoid" OR "multiple sclerosis")',
        'CAR-T autoimmune disease',
        'chimeric antigen receptor autoimmune',
        'CAR T cell therapy lupus SLE',
        'CAR-T rheumatoid arthritis',
        'engineered T cell autoimmune',
    )
    # Only the first N queries are executed.
    ACTIVE_QUERY_COUNT = 3
    # Maximum number of hits requested per query.
    SEARCH_LIMIT = 50
    # Pause in seconds after each search and each analysed patent (API rate limiting).
    REQUEST_DELAY = 2

    def __init__(self):
        """Create the shared system object and the components built on it."""
        self.system = PatentAnalysisSystem()
        self.api = ZhihuiyaAPI(self.system)
        self.screener = CARTAutoImmuneScreener(self.system)
        self.prompts = CARTAutoImmuneAnalysisPrompts()

    def run_complete_analysis(self) -> Dict:
        """Run the whole workflow and write the result files.

        Returns:
            ``{}`` when the search or the screening yields nothing. Otherwise
            a dict with ``statistics``, ``detailed_analyses``,
            ``final_report`` (``""`` when the LLM call failed) and
            ``patents_df`` (the ranked frame).
        """
        log = self.system.log

        # ========== Step 1: search ==========
        log("=" * 50)
        log("🚀 Step 1: 搜索CAR-T自身免疫疾病相关专利", "INFO")

        all_patents = self._collect_patents()
        if not all_patents:
            log("未找到相关专利", "ERROR")
            return {}
        log(f"✅ 共找到 {len(all_patents)} 件唯一专利", "SUCCESS")

        # ========== Step 2: process and screen ==========
        log("=" * 50)
        log("🔍 Step 2: 处理专利数据并筛选相关专利", "INFO")

        df_patents = self.screener.process_initial_patents(all_patents)
        df_patents = self.screener.enrich_with_abstracts(df_patents, self.api)
        df_filtered = self.screener.filter_cart_autoimmune_patents(df_patents)

        if len(df_filtered) == 0:
            log("未找到符合条件的专利", "ERROR")
            return {}

        statistics = self.screener.analyze_patent_statistics(df_filtered)
        log("📊 专利统计分析完成", "SUCCESS")
        self._print_statistics(statistics)

        df_filtered = self.screener.score_and_rank_patents(df_filtered)

        # ========== Step 3: pick the top patents ==========
        log("=" * 50)
        log("🎯 Step 3: 选择Top专利进行深度分析", "INFO")

        # Honour the configured number of top patents (10 by default).
        num_top = min(getattr(self.system, "top_patents", 10), len(df_filtered))
        top_patents = df_filtered.head(num_top)

        print(f"\nTop {num_top} CAR-T自身免疫专利:")
        for rank, (_, row) in enumerate(top_patents.iterrows(), 1):
            # str() keeps a missing or non-string assignee from crashing the listing.
            print(f"{rank}. {row['patent_number']} - {str(row['assignee'])[:40]} (Score: {row['final_score']})")

        # ========== Step 4: deep analysis ==========
        log("=" * 50)
        log("🔬 Step 4: 深度分析核心专利", "INFO")

        detailed_analyses = []
        for rank, (_, patent) in enumerate(top_patents.iterrows(), 1):
            log(f"分析专利 {rank}/{num_top}: {patent['patent_number']}")

            analysis = self._analyze_patent(patent)
            if analysis is not None:
                detailed_analyses.append(analysis)
                log(f"✅ 完成分析: {patent['patent_number']}", "SUCCESS")
            else:
                log(f"⚠️ 无法获取完整内容: {patent['patent_number']}", "WARN")

            time.sleep(self.REQUEST_DELAY)

        # ========== Step 5: final report ==========
        log("=" * 50)
        log("📝 Step 5: 生成综合报告", "INFO")

        statistics["top_patents"] = top_patents[["patent_number", "assignee", "final_score"]].to_dict("records")

        final_prompt = self.prompts.final_report_prompt(statistics, detailed_analyses)
        final_report = self.system.llm_call(final_prompt)

        self._save_results(df_filtered, statistics, detailed_analyses, final_report)

        return {
            "statistics": statistics,
            "detailed_analyses": detailed_analyses,
            "final_report": final_report,
            "patents_df": df_filtered
        }

    def _collect_patents(self) -> List[Dict]:
        """Run the active search queries and merge the hits, de-duplicated by ``patent_id``."""
        all_patents = []
        seen_ids = set()

        for query in self.SEARCH_QUERIES[:self.ACTIVE_QUERY_COUNT]:
            self.system.log(f"执行搜索: {query}")
            results = self.api.search_patents(query, limit=self.SEARCH_LIMIT)

            for patent in results:
                patent_id = patent.get("patent_id")
                if patent_id not in seen_ids:
                    all_patents.append(patent)
                    seen_ids.add(patent_id)

            time.sleep(self.REQUEST_DELAY)

        return all_patents

    @staticmethod
    def _print_statistics(statistics: Dict) -> None:
        """Print the headline counts and the non-empty disease/target buckets."""
        print("\nCAR-T自身免疫专利统计:")
        print(f"  总专利数: {statistics['total_patents']}")
        print(f"  CAR-T+自身免疫: {statistics['cart_autoimmune_patents']}")

        print("\n疾病类型分布:")
        for disease, count in statistics["disease_distribution"].items():
            if count > 0:
                print(f"  {disease}: {count}件")

        print("\n靶点分布:")
        for target, count in statistics["target_distribution"].items():
            if count > 0:
                print(f"  {target}: {count}件")

    def _analyze_patent(self, patent) -> Optional[Dict]:
        """Fetch description and claims for one ranked row and run both LLM analyses.

        Returns:
            The analysis record, or None when either text is unavailable.
        """
        description = self.api.get_description(patent["patent_id"], patent["patent_number"])
        claims = self.api.get_claims(patent["patent_id"], patent["patent_number"])
        if not (description and claims):
            return None

        info = patent.to_dict()
        desc_analysis = self.system.llm_call(
            self.prompts.description_analysis_prompt(description, info)
        )
        claims_analysis = self.system.llm_call(
            self.prompts.claims_analysis_prompt(claims, info)
        )

        return {
            "patent_number": patent["patent_number"],
            "assignee": patent["assignee"],
            "application_date": patent["application_date"],
            "title": patent["title"],
            "cart_score": patent["cart_score"],
            "autoimmune_score": patent["autoimmune_score"],
            "technical_analysis": desc_analysis,
            "legal_analysis": claims_analysis
        }

    def _save_results(self, df_filtered, statistics: Dict,
                      detailed_analyses: List[Dict], final_report: str) -> None:
        """Write the patent list (CSV), the analyses (JSON) and the report (Markdown).

        The report file is only written when the LLM returned text, so a
        failed call is reported as a warning instead of leaving an empty file
        behind a success message.
        """
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

        csv_name = f"cart_autoimmune_patents_{timestamp}.csv"
        df_filtered.to_csv(csv_name, index=False, encoding="utf-8-sig")
        self.system.log(f"✅ 专利列表已保存至: {csv_name}", "SUCCESS")

        with open(f"cart_autoimmune_detailed_analysis_{timestamp}.json", "w", encoding="utf-8") as f:
            json.dump({
                "statistics": statistics,
                "detailed_analyses": detailed_analyses
            }, f, ensure_ascii=False, indent=2)

        report_name = f"cart_autoimmune_report_{timestamp}.md"
        if final_report:
            with open(report_name, "w", encoding="utf-8") as f:
                f.write(final_report)

        self.system.log("✅ CAR-T自身免疫专利分析完成！", "SUCCESS")
        if final_report:
            self.system.log(f"✅ 报告已保存至: {report_name}", "SUCCESS")
        else:
            self.system.log("⚠️ 报告内容为空，未保存报告文件", "WARN")

# ==================== 运行分析 ====================

# Build the pipeline and run the complete analysis.
pipeline = CARTAutoImmuneAnalysisPipeline()
results = pipeline.run_complete_analysis()

# Preview the first 1500 characters of the report when the run produced one.
if "final_report" in (results or {}):
    print(
        "\n" + "=" * 50,
        "📄 CAR-T自身免疫专利分析报告预览（前1500字）:",
        "=" * 50,
        results["final_report"][:1500] + "...",
        sep="\n",
    )

In [None]:

"""
通用基因专利智能分析系统 - Universal Gene Patent Analysis System
基于智慧芽API的任意基因专利深度分析
"""

import requests
import json
import time
import pandas as pd
from datetime import datetime
from typing import Dict, List, Optional, Tuple
from IPython.display import display, HTML
import re
from openai import OpenAI

# ==================== 基础配置 ====================

class PatentAnalysisSystem:
    """Shared configuration, logging and LLM access for the gene patent workflow.

    One instance is handed to the other components so that they all use the
    same Zhihuiya credentials/token, HTTP session and DeepSeek chat client.
    """

    def __init__(self, target_gene: Optional[str] = None):
        """Set up API credentials, the HTTP session, the LLM client and defaults.

        Args:
            target_gene: Gene name to analyse; the placeholder ``"GENE"`` is
                used when it is omitted or empty.
        """
        # Imported locally so the class does not rely on a module-level
        # ``import os`` being present in the notebook cell.
        import os

        # SECURITY: secrets are taken from the environment when available.
        # The literals are only a fallback so existing runs keep working; they
        # are exposed in source and should be rotated and then removed.
        self.base_url = "https://connect.zhihuiya.com"
        self.api_key = (
            os.environ.get("ZHIHUIYA_API_KEY")
            or "fh10ixx8marmhm9kbl3cx5676qn8nshcuwtktz0b05ebl7qf"
        )
        self.client_credentials = (
            os.environ.get("ZHIHUIYA_CLIENT_SECRET")
            or "74z26dxne81bnmrbd8vjwt7r8fc6tr6cxxdvapslbz4knycxknv3dnjprap6igjy"
        )
        # Bearer token; filled in by ZhihuiyaAPI.authenticate().
        self.token = None
        self.session = requests.Session()

        # LLM client (DeepSeek exposes an OpenAI-compatible endpoint).
        self.llm_client = OpenAI(
            api_key=os.environ.get("DEEPSEEK_API_KEY") or 'sk-9b3ad78d6d51431c90091b575072e62f',
            base_url="https://api.deepseek.com"
        )

        # Analysis settings.
        self.target_gene = target_gene or "GENE"  # default gene name
        self.initial_patents = 100
        self.top_patents = 10

    def set_target_gene(self, gene_name: str):
        """Replace the current target gene and log the change."""
        self.target_gene = gene_name
        self.log(f"目标基因设置为: {gene_name}", "INFO")

    @staticmethod
    def _render_log_html(message, level: str, timestamp: str) -> str:
        """Build the coloured ``<span>`` for one log line.

        The message and level are HTML-escaped so that text such as
        ``<Response [401]>`` inside an exception message is shown literally
        instead of being parsed as a tag and disappearing from the output.
        """
        from html import escape

        color_map = {"INFO": "blue", "SUCCESS": "green", "ERROR": "red", "WARN": "orange"}
        color = color_map.get(level, "blue")
        safe_level = escape(str(level), quote=False)
        safe_message = escape(str(message), quote=False)
        return f'<span style="color:{color};">[{timestamp}] {safe_level}: {safe_message}</span>'

    def log(self, message: str, level: str = "INFO"):
        """Display a timestamped, colour-coded log line in the notebook output.

        Args:
            message: Text to show.
            level: One of ``INFO``, ``SUCCESS``, ``ERROR``, ``WARN``; any other
                value is rendered in the ``INFO`` colour.
        """
        timestamp = datetime.now().strftime("%H:%M:%S")
        display(HTML(self._render_log_html(message, level, timestamp)))

    def llm_call(self, prompt: str) -> str:
        """Send *prompt* to the chat model and return the reply text.

        Returns:
            The model's reply, or ``""`` when the call fails (the error is
            logged) or the reply carries no content.
        """
        try:
            response = self.llm_client.chat.completions.create(
                model="deepseek-chat",
                messages=[
                    {"role": "system", "content": "You are a professional patent analyst specializing in biotechnology and pharmaceutical patents."},
                    {"role": "user", "content": prompt}
                ],
                stream=False
            )
            # ``content`` may be None; always hand back a string as annotated.
            return response.choices[0].message.content or ""
        except Exception as e:
            # Best-effort boundary: an LLM failure must not abort the whole run.
            self.log(f"LLM调用失败: {str(e)}", "ERROR")
            return ""

# ==================== Step 1: 智慧芽API接口 ====================

class ZhihuiyaAPI:
    """Client for the Zhihuiya REST endpoints used by the analysis.

    All HTTP traffic goes through ``system.session``; the base URL, the
    credentials and the cached bearer token live on the shared system object.
    Public methods never raise: failures are reported through ``system.log``
    and an empty value (``False``, ``[]`` or ``None``) is returned.
    """

    # Seconds to wait on each request; without a timeout ``requests`` can
    # block indefinitely on an unresponsive server.
    REQUEST_TIMEOUT = 30
    # Descriptions longer than this many characters are truncated.
    MAX_DESCRIPTION_CHARS = 50000

    def __init__(self, system: "PatentAnalysisSystem"):
        self.system = system

    def _auth_headers(self) -> Dict[str, str]:
        """Build the JSON/bearer headers shared by every authenticated call."""
        return {
            "Content-Type": "application/json",
            "authorization": f"Bearer {self.system.token}"
        }

    def _ensure_token(self) -> bool:
        """Return True when a token is cached or could be obtained just now."""
        return bool(self.system.token) or self.authenticate()

    def _fetch_patent_data(self, endpoint: str, patent_id: str,
                           patent_number: str, **extra_params):
        """GET one ``/basic-patent-data/<endpoint>`` resource for a patent.

        Returns the ``data`` member of the JSON reply when its ``status`` is
        truthy, otherwise ``None`` (also when no token could be obtained).
        Transport, HTTP and JSON errors propagate to the calling method,
        which logs them.
        """
        if not self._ensure_token():
            return None

        params = {
            "patent_id": patent_id,
            "patent_number": patent_number,
            "apikey": self.system.api_key,
            **extra_params
        }
        response = self.system.session.get(
            f"{self.system.base_url}/basic-patent-data/{endpoint}",
            params=params,
            headers=self._auth_headers(),
            timeout=self.REQUEST_TIMEOUT,
        )
        response.raise_for_status()
        result = response.json()
        return result.get("data") if result.get("status") else None

    def authenticate(self) -> bool:
        """Request an access token and cache it on ``system.token``.

        Returns:
            True when a token was stored, False otherwise. A rejection by the
            API (``status`` false) is logged together with its error message.
        """
        try:
            response = self.system.session.post(
                f"{self.system.base_url}/oauth/token",
                # A mapping lets requests URL-encode the credential values.
                data={
                    "grant_type": "client_credentials",
                    "client_id": self.system.api_key,
                    "client_secret": self.system.client_credentials,
                },
                headers={"content-type": "application/x-www-form-urlencoded"},
                timeout=self.REQUEST_TIMEOUT,
            )
            response.raise_for_status()

            result = response.json()
            if result.get("status") and "data" in result:
                self.system.token = result["data"]["token"]
                self.system.log("✅ Token获取成功", "SUCCESS")
                return True
            self.system.log(f"认证失败: {result.get('error_msg', result)}", "ERROR")
            return False
        except Exception as e:
            self.system.log(f"认证失败: {str(e)}", "ERROR")
            return False

    def search_patents(self, query: str, limit: int = 100) -> List[Dict]:
        """P002 - run a relevance-sorted patent search.

        Args:
            query: Search text sent as ``query_text``.
            limit: Maximum number of results requested.

        Returns:
            The list under ``data.results`` of the reply, or ``[]`` when
            authentication, the request or the search itself fails.
        """
        if not self._ensure_token():
            return []

        try:
            payload = {
                "sort": [{"field": "SCORE", "order": "DESC"}],
                "limit": limit,
                "offset": 0,
                "query_text": query,
                "collapse_by": "PBD",
                "collapse_type": "ALL"
            }

            self.system.log(f"🔍 检索专利: {query} (限制{limit}件)")
            response = self.system.session.post(
                f"{self.system.base_url}/search/patent/query-search-patent/v2",
                params={"apikey": self.system.api_key},
                json=payload,
                headers=self._auth_headers(),
                timeout=self.REQUEST_TIMEOUT,
            )
            response.raise_for_status()

            result = response.json()
            if result.get("status") and "data" in result:
                patents = result["data"].get("results", [])
                self.system.log(f"✅ 找到 {len(patents)} 件专利", "SUCCESS")
                return patents
            # Make an API-level rejection visible instead of looking like
            # "no patents found".
            self.system.log(f"检索失败: {result.get('error_msg', result)}", "ERROR")
            return []
        except Exception as e:
            self.system.log(f"检索失败: {str(e)}", "ERROR")
            return []

    def get_simple_bibliography(self, patent_id: str, patent_number: str) -> Optional[Dict]:
        """P011 - fetch the brief bibliographic record (includes the abstract).

        Returns:
            The first record when the API returns a list, the payload itself
            otherwise, or ``None`` when nothing usable came back.
        """
        try:
            data = self._fetch_patent_data("simple-bibliography", patent_id, patent_number)
            if not data:
                return None
            return data[0] if isinstance(data, list) else data
        except Exception as e:
            self.system.log(f"P011获取失败 {patent_number}: {str(e)}", "ERROR")
            return None

    def get_legal_status(self, patent_id: str, patent_number: str) -> Optional[Dict]:
        """Fetch the legal-status payload for a patent.

        Returns:
            The ``data`` member of the reply unchanged, or ``None`` on failure.
        """
        try:
            return self._fetch_patent_data("legal-status", patent_id, patent_number)
        except Exception as e:
            self.system.log(f"法律状态获取失败: {str(e)}", "ERROR")
            return None

    def get_claims(self, patent_id: str, patent_number: str) -> Optional[str]:
        """Fetch the claims and join them into one text block.

        Returns:
            ``"Claim <num>: <text>"`` entries separated by blank lines, or
            ``None`` when the reply carries no claim record.
        """
        try:
            data = self._fetch_patent_data(
                "claim-data", patent_id, patent_number, replace_by_related="0"
            )
            if not (isinstance(data, list) and data):
                return None
            claims = data[0].get("claims") or []
            return "\n\n".join(
                f"Claim {c.get('claim_num', '')}: {c.get('claim_text', '')}"
                for c in claims
            )
        except Exception as e:
            self.system.log(f"权利要求获取失败: {str(e)}", "ERROR")
            return None

    def get_description(self, patent_id: str, patent_number: str) -> Optional[str]:
        """Fetch the description text, truncated to ``MAX_DESCRIPTION_CHARS``.

        Returns:
            The text of the first description entry (with a truncation marker
            appended when it was cut), or ``None`` when the reply carries no
            description entry.
        """
        try:
            data = self._fetch_patent_data(
                "description-data", patent_id, patent_number, replace_by_related="0"
            )
            if not (isinstance(data, list) and data):
                return None
            sections = data[0].get("description") or []
            if not sections:
                # An empty description is "no content", not an error.
                return None
            desc_text = sections[0].get("text") or ""
            # Cap the length so the text stays usable as LLM prompt input.
            if len(desc_text) > self.MAX_DESCRIPTION_CHARS:
                desc_text = desc_text[:self.MAX_DESCRIPTION_CHARS] + "\n...[内容已截断]"
            return desc_text
        except Exception as e:
            self.system.log(f"说明书获取失败: {str(e)}", "ERROR")
            return None

# ==================== Step 2: 专利初步分析与筛选 ====================

class PatentScreener:
    """Turns raw search hits into an enriched, scored and ranked DataFrame."""

    # Column layout of the working DataFrame. Declared explicitly so an empty
    # search result still yields a frame the later steps can read.
    COLUMNS = (
        "patent_id", "patent_number", "title", "assignee", "application_date",
        "publication_date", "abstract", "legal_status", "score",
    )

    # Technology buckets, checked in this order; the first bucket with a
    # matching keyword wins. Matching is plain substring search on the
    # lower-cased title + abstract.
    TECH_KEYWORDS = (
        ("RNAi/siRNA", ("rnai", "sirna", "interference", "oligonucleotide", "antisense")),
        ("Antibody/mAb", ("antibody", "mab", "immunoglobulin", "monoclonal")),
        ("Small Molecule", ("compound", "inhibitor", "small molecule", "chemical")),
        ("CRISPR/Gene Editing", ("crispr", "cas9", "gene editing", "genome editing")),
        ("Cell Therapy", ("car-t", "cell therapy", "tcr", "nk cell")),
        ("Protein/Peptide", ("protein", "peptide", "fusion protein", "recombinant")),
        ("Gene Therapy", ("gene therapy", "aav", "viral vector", "lentivirus")),
    )

    # Generic therapy/disease words that add relevance points (2 each, max 15).
    CONTEXT_KEYWORDS = (
        "therapeutic", "treatment", "inhibitor", "agonist", "antagonist",
        "disease", "disorder", "cancer", "tumor", "diabetes", "obesity",
        "inflammation", "metabolic", "cardiovascular", "neurological",
    )

    # Substrings identifying large pharma/biotech assignees (upper-case).
    TOP_PHARMA_COMPANIES = (
        "ROCHE", "NOVARTIS", "PFIZER", "MERCK", "JOHNSON", "SANOFI",
        "GLAXOSMITHKLINE", "GSK", "ASTRAZENECA", "ABBVIE", "BRISTOL",
        "LILLY", "AMGEN", "GILEAD", "REGENERON", "VERTEX", "BIOGEN",
        "ARROWHEAD", "ALNYLAM", "MODERNA", "BIONTECH", "WAVE",
    )

    def __init__(self, system: "PatentAnalysisSystem"):
        self.system = system

    @staticmethod
    def _as_text(value) -> str:
        """Return ``value`` as a string, mapping ``None`` and NaN to ``""``."""
        if value is None or (isinstance(value, float) and value != value):
            return ""
        return str(value)

    def process_initial_patents(self, patents: List[Dict]) -> pd.DataFrame:
        """Convert raw search hits into the working DataFrame.

        Args:
            patents: Search result dicts; the keys read are ``patent_id``,
                ``pn``, ``title``, ``current_assignee``, ``apdt``, ``pbdt``
                and ``score``.

        Returns:
            A DataFrame with the columns in ``COLUMNS``. Missing dates become
            ``""`` and a missing score becomes ``0``; ``abstract`` and
            ``legal_status`` start empty.
        """
        processed = []
        total = len(patents)

        for position, patent in enumerate(patents, 1):
            if position % 20 == 0:
                self.system.log(f"处理进度: {position}/{total}")

            # No API call happens here, so no rate-limit pause is needed.
            processed.append({
                "patent_id": patent.get("patent_id"),
                "patent_number": patent.get("pn"),
                "title": self._extract_title(patent),
                "assignee": patent.get("current_assignee") or "",
                "application_date": self._as_text(patent.get("apdt")),
                "publication_date": self._as_text(patent.get("pbdt")),
                "abstract": "",
                "legal_status": "",
                "score": patent.get("score") or 0
            })

        return pd.DataFrame(processed, columns=list(self.COLUMNS))

    def _extract_title(self, patent: Dict) -> str:
        """Return the title; for a language dict prefer ``en`` over ``zh``."""
        title = patent.get("title") or ""
        if isinstance(title, dict):
            title = title.get("en") or title.get("zh") or ""
        return str(title)

    def enrich_with_abstracts(self, df: pd.DataFrame, api: "ZhihuiyaAPI") -> pd.DataFrame:
        """Fill ``abstract`` and ``legal_status`` for every row, in place.

        Args:
            df: Working DataFrame from ``process_initial_patents``.
            api: API client used for the two lookups per patent.

        Returns:
            The same DataFrame object. Abstracts are cut to 500 characters;
            rows whose lookups return nothing keep their empty values.
        """
        self.system.log("📄 获取摘要和法律状态...")
        total = len(df)

        for position, (idx, row) in enumerate(df.iterrows()):
            if position % 10 == 0:
                self.system.log(f"进度: {position}/{total}")

            # Abstract
            biblio = api.get_simple_bibliography(row["patent_id"], row["patent_number"])
            if biblio:
                abstracts = (biblio.get("bibliographic_data") or {}).get("abstracts") or []
                if abstracts:
                    df.at[idx, "abstract"] = (abstracts[0].get("text") or "")[:500]

            # Legal status
            legal = api.get_legal_status(row["patent_id"], row["patent_number"])
            if legal and isinstance(legal, list):
                legal_info = legal[0].get("patent_legal") or {}
                status = legal_info.get("simple_legal_status") or []
                df.at[idx, "legal_status"] = ", ".join(status) if status else "Unknown"

            time.sleep(0.2)  # pause between patents to respect API rate limits

        return df

    def analyze_patent_statistics(self, df: pd.DataFrame) -> Dict:
        """Compute distribution statistics over the patents.

        Returns:
            A dict with ``total_patents`` and the value counts of assignee,
            application year (first four characters of ``application_date``)
            and legal status, plus ``technology_distribution``: each patent
            is counted once, in the first ``TECH_KEYWORDS`` bucket whose
            keyword occurs in its title or abstract, else in ``"Other"``.
        """
        stats = {
            "total_patents": len(df),
            "assignee_distribution": df["assignee"].value_counts().to_dict(),
            "year_distribution": df["application_date"].astype(str).str[:4].value_counts().to_dict(),
            "legal_status_distribution": df["legal_status"].value_counts().to_dict()
        }

        # Static keyword-based technology classification (independent of the
        # target gene).
        tech_types = {label: 0 for label, _ in self.TECH_KEYWORDS}
        tech_types["Other"] = 0

        for _, row in df.iterrows():
            text = (self._as_text(row["title"]) + " " + self._as_text(row["abstract"])).lower()
            bucket = next(
                (label for label, keywords in self.TECH_KEYWORDS
                 if any(kw in text for kw in keywords)),
                "Other",
            )
            tech_types[bucket] += 1

        stats["technology_distribution"] = tech_types

        return stats

    def _score_patent(self, row, gene_lower: str) -> int:
        """Score one patent row on a 0-100 scale (see ``score_and_rank_patents``)."""
        points = 0

        # 1. Title/abstract relevance (0-35): 5 per gene mention (max 20)
        #    plus 2 per context keyword present (max 15).
        text = (self._as_text(row["title"]) + " " + self._as_text(row["abstract"])).lower()
        if gene_lower:  # "" would match at every position
            points += min(text.count(gene_lower) * 5, 20)
        points += min(sum(2 for kw in self.CONTEXT_KEYWORDS if kw in text), 15)

        # 2. Assignee weight (0-20).
        assignee = self._as_text(row["assignee"]).upper()
        if any(company in assignee for company in self.TOP_PHARMA_COMPANIES):
            points += 20
        elif "UNIVERSITY" in assignee:
            points += 10
        elif assignee:
            points += 5

        # 3. Recency (0-15), from the publication year. Only the digits are
        #    used, so missing values ("" / "None") score 0 and dashed dates
        #    such as "2024-01-10" land in the right tier.
        digits = "".join(ch for ch in self._as_text(row["publication_date"]) if ch in "0123456789")
        year = int(digits[:4]) if len(digits) >= 4 else 0
        for first_year, bonus in ((2024, 15), (2023, 12), (2022, 8), (2020, 5)):
            if year >= first_year:
                points += bonus
                break

        # 4. Legal status (0-10).
        legal = self._as_text(row["legal_status"]).lower()
        if "grant" in legal or "授权" in legal:
            points += 10
        elif "pending" in legal or "审查" in legal:
            points += 5

        # 5. Search relevance score reported by the API (0-20); a missing or
        #    non-numeric value counts as 0.
        try:
            relevance = float(row["score"])
        except (TypeError, ValueError):
            relevance = 0.0
        for threshold, bonus in ((80, 20), (60, 15), (40, 10), (20, 5)):
            if relevance > threshold:
                points += bonus
                break

        return points

    def score_and_rank_patents(self, df: pd.DataFrame) -> pd.DataFrame:
        """Score every patent and return the rows sorted best-first.

        The score adds five parts: text relevance to the target gene (0-35),
        assignee weight (0-20), publication recency (0-15), legal status
        (0-10) and the API relevance score (0-20).

        Args:
            df: Working DataFrame; a float ``final_score`` column is added to
                it in place.

        Returns:
            A copy sorted by ``final_score`` descending. The sort is stable,
            so equal scores keep their incoming order. An empty frame is
            returned unchanged apart from the new column.
        """
        self.system.log("⚖️ 专利评分中...")

        gene_lower = self._as_text(self.system.target_gene).lower()
        scores = [self._score_patent(row, gene_lower) for _, row in df.iterrows()]
        # Stored as float, matching the dtype the column had before.
        df["final_score"] = [float(value) for value in scores]

        return df.sort_values("final_score", ascending=False, kind="mergesort")

# ==================== Step 3: 深度分析Prompts ====================

class PatentAnalysisPrompts:
    """LLM prompt templates for the patent analysis, parameterised by gene."""

    def __init__(self, target_gene: str):
        self.target_gene = target_gene

    def description_analysis_prompt(self, description_text: str, patent_info: Dict) -> str:
        """Build the prompt asking for a technical analysis of a description.

        Args:
            description_text: Patent description text inserted verbatim.
            patent_info: Must contain ``patent_number``, ``assignee`` and
                ``application_date`` (``KeyError`` otherwise).

        Returns:
            The complete prompt text.
        """
        return f"""
作为专利技术专家，请深度分析以下{self.target_gene}基因相关专利的说明书，并以连贯的段落形式输出分析结果。

专利号：{patent_info['patent_number']}
申请人：{patent_info['assignee']}
申请日：{patent_info['application_date']}

说明书内容：
{description_text}

请按以下结构分析（每部分用2-3个完整段落表述）：

## 1. 技术概述（2段）
第一段：简要描述这是什么类型的技术（RNAi/抗体/小分子/基因编辑/细胞治疗等），针对{self.target_gene}靶点要解决什么具体问题。
第二段：说明核心创新点是什么，与现有技术相比的主要改进在哪里。

## 2. 技术方案分析（3段）
第一段：详细描述具体的技术方案。根据技术类型分析关键要素（序列设计、化合物结构、载体构建等）。
第二段：分析优化或改进策略（化学修饰、结构优化、递送系统等）。
第三段：与同领域其他专利技术的对比，突出本专利的独特性。

## 3. 实验验证（3段）
第一段：概述实验设计的整体思路，包括体外、体内实验的层次安排。
第二段：详细描述最关键的实验结果，包括具体数据（IC50、EC50、抑制率、持续时间等）。
第三段：安全性评估和临床转化考虑。如果有临床试验设计，说明主要终点和给药方案。

## 4. 商业价值评估（2段）
第一段：评估{self.target_gene}相关疾病的市场规模和竞争格局。该技术的目标适应症是什么？市场潜力如何？
第二段：分析专利技术的可实施性和商业化前景。生产工艺是否成熟？成本是否可控？临床开发路径是否清晰？

## 5. 关键技术参数提取
请特别提取以下关键信息（如果存在）：
- 核心序列/化合物：具体序列号或化学结构
- 靶向机制：{self.target_gene}的作用位点或机制
- 实验数据：关键的量化指标
- 技术特征：独特的技术特点
- 临床方案：剂量、给药途径、频率（如有）

输出要求：
- 使用完整流畅的段落，避免碎片化列表
- 数据自然融入叙述中
- 保持专业但易读的文风
- 总字数控制在1000-1500字
"""

    def claims_analysis_prompt(self, claims_text: str, patent_info: Dict) -> str:
        """Build the prompt asking for a legal analysis of the claims.

        Args:
            claims_text: Claims text inserted verbatim.
            patent_info: Must contain ``patent_number`` and ``assignee``
                (``KeyError`` otherwise).

        Returns:
            The complete prompt text.
        """
        return f"""
作为专利法律专家，请分析以下{self.target_gene}基因相关专利的权利要求书，并以适合专业报告的段落形式输出。

专利号：{patent_info['patent_number']}
申请人：{patent_info['assignee']}

权利要求书：
{claims_text}

请按以下结构分析（每部分用2-3个完整段落表述）：

## 1. 权利要求架构概述（2段）
第一段：描述权利要求的整体结构，包括权利要求数量、独立权利要求的类型分布。
第二段：分析权利要求之间的逻辑关系和保护策略。

## 2. 核心保护范围分析（3段）
第一段：深入分析独立权利要求的保护范围，特别是与{self.target_gene}相关的必要技术特征。
第二段：分析关键限定条件对保护范围的影响。
第三段：评估其他独立权利要求的补充作用。

## 3. 技术特征递进策略（2段）
第一段：分析从属权利要求的递进逻辑和层次结构。
第二段：评价关键从属权利要求的价值和商业意义。

## 4. 法律稳定性与侵权分析（2段）
第一段：评估权利要求的法律稳定性（清楚性、支持性、创造性）。
第二段：分析侵权判定的关键要素和潜在规避路径。

## 5. 与其他{self.target_gene}专利的关系（1段）
分析该专利权利要求与其他主要申请人{self.target_gene}专利的潜在冲突或互补关系。

输出要求：
- 使用连贯的专业段落
- 法律分析结合商业考虑
- 总字数控制在800-1200字
"""

    def final_report_prompt(self, statistics: Dict, detailed_analyses: List[Dict]) -> str:
        """Build the prompt for the final landscape report.

        The patent counts quoted in the prompt are taken from the inputs, so
        the model is not told it has 100 patents and 10 deep analyses when
        the search or the analysis produced fewer.

        Args:
            statistics: JSON-serialisable statistics; ``total_patents`` is
                read for the headline count (100 is assumed when absent).
            detailed_analyses: JSON-serialisable per-patent analyses; their
                number is quoted as the count of core patents.

        Returns:
            The complete prompt text, with both inputs embedded as JSON.
        """
        total_patents = statistics.get("total_patents", 100)
        core_count = len(detailed_analyses)
        return f"""
你是专业的专利分析师，请基于以下数据撰写一份详细的{self.target_gene}基因相关专利技术综述报告。

【{total_patents}篇专利统计数据】
{json.dumps(statistics, ensure_ascii=False, indent=2)}

【{core_count}篇核心专利详细分析】
{json.dumps(detailed_analyses, ensure_ascii=False, indent=2)}

请生成一份专业的专利技术综述报告，格式如下：

# {self.target_gene}基因相关全球专利竞争格局分析

## 一、专利数量、类型与地域分布

### 全球专利公开数量与类型（400字）
基于分析的{total_patents}篇{self.target_gene}相关专利，详细说明：
- 专利总数和时间分布趋势
- 技术类型分布（各类技术占比）
- 主要申请人分布
- 法律状态统计

### 地域分布（300字）
分析专利的地域布局特点。

## 二、核心专利权利人及布局策略

基于{core_count}篇核心专利的深度分析，详细描述各主要玩家的技术策略。
[根据实际申请人情况动态生成各公司分析]

## 三、技术发展趋势与关键创新

### 技术路线对比（500字）
详细对比不同公司针对{self.target_gene}的技术方案差异。

### 关键技术参数汇总
整理所有核心专利的关键参数。

## 四、专利保护范围与法律风险

### 权利要求保护范围对比（400字）
对比不同专利的保护策略。

### 潜在冲突分析（300字）
识别可能的专利冲突点。

## 五、商业机会与投资建议

### 技术空白与机会（300字）
基于专利分析识别的{self.target_gene}领域机会。

### 投资与研发建议（300字）
- 最有前景的技术路线
- 需要规避的专利壁垒
- 潜在的合作机会

## 六、结论与展望

总结{self.target_gene}专利领域的发展现状和未来趋势（300字）。

【输出要求】
1. 必须基于提供的数据，不要编造信息
2. 包含具体的专利号、申请人、技术细节
3. 数据和分析要相互印证
4. 保持客观专业的语气
5. 总字数3000-4000字
"""
# ==================== Step 4: 主流程执行 ====================

class PatentAnalysisPipeline:
    """End-to-end workflow: search, enrich, score, deep-analyse and report."""

    def __init__(self, target_gene: str = None):
        """Wire up the system object, API client and screener.

        Args:
            target_gene: Optional gene name; it can also be supplied later to
                ``run_complete_analysis``.
        """
        self.target_gene = target_gene
        self.system = PatentAnalysisSystem(target_gene)
        self.api = ZhihuiyaAPI(self.system)
        self.screener = PatentScreener(self.system)
        self.prompts = None  # created in run_complete_analysis once the gene is known

    @staticmethod
    def _safe_filename_part(name: str) -> str:
        """Make ``name`` usable inside a file name.

        Runs of characters other than word characters, ``.`` and ``-`` become
        a single ``_`` (so ``"PD-1/PD-L1"`` does not turn into a directory
        path); plain gene names are returned unchanged.
        """
        import re

        cleaned = re.sub(r"[^\w.-]+", "_", str(name)).strip("_")
        return cleaned or "gene"

    def run_complete_analysis(self, target_gene: str = None) -> Dict:
        """Run the complete analysis workflow.

        Steps: search patents, add abstracts and legal status, compute
        statistics, score and rank, analyse the top patents with the LLM,
        generate the final report, then save a JSON file and a Markdown file
        in the current directory.

        Args:
            target_gene: Target gene name (e.g. "PCSK9", "PD-1", "EGFR").
                Overrides the gene given to the constructor.

        Returns:
            A dict with ``target_gene``, ``statistics``, ``detailed_analyses``
            and ``final_report``; an empty dict when the search returns
            nothing.

        Raises:
            ValueError: If no gene name was given here or to the constructor.
        """

        # Resolve the target gene.
        if target_gene:
            self.target_gene = target_gene
            self.system.set_target_gene(target_gene)
        elif not self.target_gene:
            raise ValueError("请提供目标基因名称")

        self.prompts = PatentAnalysisPrompts(self.target_gene)

        # Honour the sizes configured on the system object (100 and 10 by
        # default) rather than repeating the literals here.
        search_limit = self.system.initial_patents
        top_n = self.system.top_patents

        # ========== Step 1: fetch patent data ==========
        self.system.log("=" * 50)
        self.system.log(f"🚀 Step 1: 获取{self.target_gene}相关专利数据", "INFO")

        # 1.1 Search
        search_results = self.api.search_patents(self.target_gene, limit=search_limit)
        if not search_results:
            self.system.log(f"未找到{self.target_gene}相关专利", "ERROR")
            return {}

        # 1.2 Build the working DataFrame
        df_patents = self.screener.process_initial_patents(search_results)
        self.system.log(f"✅ 处理了 {len(df_patents)} 篇专利", "SUCCESS")

        # ========== Step 2: abstracts and statistics ==========
        self.system.log("=" * 50)
        self.system.log("🔍 Step 2: 获取摘要并进行统计分析", "INFO")

        # 2.1 Abstracts and legal status
        df_patents = self.screener.enrich_with_abstracts(df_patents, self.api)

        # 2.2 Statistics
        statistics = self.screener.analyze_patent_statistics(df_patents)
        statistics["target_gene"] = self.target_gene
        self.system.log("📊 专利统计分析完成", "SUCCESS")

        print(f"\n{self.target_gene}相关技术类型分布:")
        for tech, count in statistics["technology_distribution"].items():
            print(f"  {tech}: {count}件")

        print(f"\n{self.target_gene}专利主要申请人（前5）:")
        for assignee, count in list(statistics["assignee_distribution"].items())[:5]:
            print(f"  {assignee}: {count}件")

        # 2.3 Score and rank
        df_patents = self.screener.score_and_rank_patents(df_patents)

        # ========== Step 3: pick the top patents ==========
        self.system.log("=" * 50)
        self.system.log(f"🎯 Step 3: 选择Top {top_n}专利进行深度分析", "INFO")

        top_patents = df_patents.head(top_n)

        # After sorting, the index labels are the original row positions, so
        # the rank shown to the user comes from enumerate instead.
        print(f"\n{self.target_gene}相关Top {top_n}专利:")
        for rank, (_, row) in enumerate(top_patents.iterrows(), 1):
            print(f"{rank}. {row['patent_number']} - {str(row['assignee'])[:30]} (Score: {row['final_score']})")

        # ========== Step 4: deep analysis ==========
        self.system.log("=" * 50)
        self.system.log("🔬 Step 4: 深度分析核心专利", "INFO")

        detailed_analyses = []
        selected = len(top_patents)

        for rank, (_, patent) in enumerate(top_patents.iterrows(), 1):
            self.system.log(f"分析专利 {rank}/{selected}: {patent['patent_number']}")

            # 4.1 / 4.2 Full text: description and claims
            description = self.api.get_description(patent["patent_id"], patent["patent_number"])
            claims = self.api.get_claims(patent["patent_id"], patent["patent_number"])

            if description and claims:
                patent_info = patent.to_dict()

                # 4.3 LLM analysis of the description
                desc_analysis = self.system.llm_call(
                    self.prompts.description_analysis_prompt(description, patent_info)
                )

                # 4.4 LLM analysis of the claims
                claims_analysis = self.system.llm_call(
                    self.prompts.claims_analysis_prompt(claims, patent_info)
                )

                detailed_analyses.append({
                    "patent_number": patent["patent_number"],
                    "assignee": patent["assignee"],
                    "application_date": patent["application_date"],
                    "title": patent["title"],
                    "technical_analysis": desc_analysis,
                    "legal_analysis": claims_analysis
                })

                self.system.log(f"✅ 完成分析: {patent['patent_number']}", "SUCCESS")
            else:
                self.system.log(f"⚠️ 无法获取完整内容: {patent['patent_number']}", "WARN")

            time.sleep(2)  # pause between patents to respect API rate limits

        # ========== Step 5: final report ==========
        self.system.log("=" * 50)
        self.system.log("📝 Step 5: 生成综合报告", "INFO")

        # 5.1 Add the ranking summary to the statistics
        statistics["top_patents"] = top_patents[["patent_number", "assignee", "final_score"]].to_dict("records")

        # 5.2 Generate the report
        final_prompt = self.prompts.final_report_prompt(statistics, detailed_analyses)
        final_report = self.system.llm_call(final_prompt)

        # ========== Save results ==========
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        file_tag = self._safe_filename_part(self.target_gene)

        # Detailed analysis (JSON)
        with open(f"patent_detailed_analysis_{file_tag}_{timestamp}.json", "w", encoding="utf-8") as f:
            json.dump({
                "target_gene": self.target_gene,
                "statistics": statistics,
                "detailed_analyses": detailed_analyses
            }, f, ensure_ascii=False, indent=2)

        # Final report (Markdown)
        with open(f"patent_report_{file_tag}_{timestamp}.md", "w", encoding="utf-8") as f:
            f.write(final_report)

        self.system.log(f"✅ {self.target_gene}专利分析完成！报告已保存", "SUCCESS")

        return {
            "target_gene": self.target_gene,
            "statistics": statistics,
            "detailed_analyses": detailed_analyses,
            "final_report": final_report
        }

# ==================== 使用示例 ====================

# Example: run the full patent analysis for one gene (PCSK9 is used below)
def analyze_gene_patents(gene_name: str):
    """Run the complete analysis for one gene and print a report preview.

    Args:
        gene_name: Gene name passed to the pipeline.

    Returns:
        Whatever ``run_complete_analysis`` returned. The first 1000
        characters of the report are printed only when the result is
        non-empty and contains ``final_report``.
    """
    results = PatentAnalysisPipeline().run_complete_analysis(gene_name)

    report_available = bool(results) and "final_report" in results
    if report_available:
        divider = "=" * 50
        preview = results["final_report"][:1000]
        print("\n" + divider)
        print(f"📄 {gene_name}专利报告预览（前1000字）:")
        print(divider)
        print(preview + "...")

    return results

# 运行分析 - 可以替换为任何基因
# 示例基因列表：
# - "PCSK9" (降脂靶点)
# - "PD-1" 或 "PD-L1" (免疫检查点)
# - "EGFR" (肿瘤靶点)
# - "TNF" 或 "TNF-alpha" (炎症靶点)
# - "HER2" (乳腺癌靶点)
# - "VEGF" (血管生成)
# - "CD19" (血液肿瘤CAR-T靶点)
# - "BCMA" (多发性骨髓瘤)
# - "GLP-1" (糖尿病/肥胖)
# - "INHBE" (代谢疾病新靶点)

# Run the analysis. These statements execute as soon as the cell/module runs
# and leave `gene_to_analyze` and `results` in the global namespace.
gene_to_analyze = "PCSK9"  # change this value to analyse a different gene
results = analyze_gene_patents(gene_to_analyze)
# 批量分析多个基因（可选）
def batch_analyze_genes(gene_list: List[str], delay_seconds: float = 30):
    """Run the complete analysis for several genes, one after another.

    Args:
        gene_list: Gene names to analyse, in order.
        delay_seconds: Pause inserted between two consecutive genes to stay
            within API limits. There is no pause before the first gene or
            after the last one, and the pause also applies after a failed
            gene.

    Returns:
        A dict mapping each successfully analysed gene to its result dict.
        Genes whose analysis raised are reported on stdout and left out.
    """
    all_results = {}
    banner = "=" * 60

    for position, gene in enumerate(gene_list):
        if position > 0 and delay_seconds > 0:
            time.sleep(delay_seconds)

        print(f"\n{banner}")
        print(f"开始分析基因: {gene}")
        print(banner)

        try:
            pipeline = PatentAnalysisPipeline()
            all_results[gene] = pipeline.run_complete_analysis(gene)
        except Exception as e:
            # Best-effort batch: one failing gene must not stop the others.
            print(f"分析{gene}时出错: {str(e)}")

    return all_results

# 批量分析示例（取消注释以使用）
# genes_to_analyze = ["PCSK9", "PD-1", "EGFR"]
# batch_results = batch_analyze_genes(genes_to_analyze)

In [10]:
"""
智慧芽API权限和搜索功能测试
"""

import requests
import json
from datetime import datetime

# API configuration for this standalone test cell.
# NOTE(review): these are credentials committed in plain text, and the same
# values also appear in PatentAnalysisSystem.__init__ — consider loading them
# from the environment and rotating them.
BASE_URL = "https://connect.zhihuiya.com"
API_KEY = "fh10ixx8marmhm9kbl3cx5676qn8nshcuwtktz0b05ebl7qf"
CLIENT_CREDENTIALS = "74z26dxne81bnmrbd8vjwt7r8fc6tr6cxxdvapslbz4knycxknv3dnjprap6igjy"

def test_api():
    """Smoke-test API access: token, a few searches, one bibliography lookup.

    Progress and results are printed to stdout.

    Returns:
        The access token when authentication succeeded, otherwise ``None``
        (the function stops after a failed token request).
    """
    # Seconds to wait on each request so an unresponsive server cannot hang
    # the test.
    request_timeout = 30

    print("=" * 60)
    print("智慧芽API测试")
    print("=" * 60)

    # Step 1: obtain a token
    print("\n1. 测试获取Token...")
    print(f"   API_KEY: {API_KEY[:20]}...")

    token_url = f"{BASE_URL}/oauth/token"
    headers = {"content-type": "application/x-www-form-urlencoded"}
    data = f"grant_type=client_credentials&client_id={API_KEY}&client_secret={CLIENT_CREDENTIALS}"

    try:
        response = requests.post(token_url, data=data, headers=headers, timeout=request_timeout)
        print(f"   状态码: {response.status_code}")

        if response.status_code == 200:
            result = response.json()
            if result.get("status") and "data" in result:
                token = result["data"]["token"]
                print(f"   ✅ Token获取成功!")
                print(f"   Token前20位: {token[:20]}...")
            else:
                print(f"   ❌ Token获取失败: {result}")
                return
        else:
            print(f"   ❌ HTTP错误: {response.status_code}")
            print(f"   响应内容: {response.text}")
            return

    except Exception as e:
        print(f"   ❌ 网络错误: {str(e)}")
        return

    # Step 2: simple searches
    print("\n2. 测试搜索功能...")

    test_queries = [
        "CAR-T",           # simple term
        "PCSK9",           # gene name
        "antibody",        # technology term
        "Novartis",        # company name
        "cancer therapy"   # phrase
    ]

    search_url = f"{BASE_URL}/search/patent/query-search-patent/v2"
    search_headers = {
        "Content-Type": "application/json",
        "authorization": f"Bearer {token}"
    }
    params = {"apikey": API_KEY}

    # Results of the most recent successful search. Initialised here so step 3
    # is skipped cleanly (instead of failing on an unbound name) when every
    # search fails.
    patents = []

    for query in test_queries:
        print(f"\n   测试搜索: '{query}'")

        payload = {
            "sort": [{"field": "SCORE", "order": "DESC"}],
            "limit": 5,  # a handful of results is enough for the test
            "offset": 0,
            "query_text": query,
            "collapse_by": "PBD",
            "collapse_type": "ALL"
        }

        try:
            response = requests.post(search_url, params=params, json=payload,
                                     headers=search_headers, timeout=request_timeout)
            print(f"   状态码: {response.status_code}")

            if response.status_code == 200:
                result = response.json()
                if result.get("status") and "data" in result:
                    patents = result["data"].get("results", [])
                    total = result["data"].get("total_hits", 0)
                    print(f"   ✅ 搜索成功! 找到 {len(patents)} 件专利 (总计: {total})")

                    # Show the first hit
                    if patents:
                        first_patent = patents[0]
                        print(f"      第一个专利号: {first_patent.get('pn', 'N/A')}")
                        title = first_patent.get('title', {})
                        if isinstance(title, dict):
                            title = title.get('en') or title.get('zh', 'N/A')
                        print(f"      标题: {str(title)[:50]}...")
                else:
                    print(f"   ⚠️ 搜索返回异常: {result}")
            else:
                print(f"   ❌ 搜索失败: HTTP {response.status_code}")
                print(f"   响应: {response.text[:200]}")

        except Exception as e:
            print(f"   ❌ 搜索错误: {str(e)}")

    # Step 3: patent details, using the first hit of the most recent
    # successful search
    if patents:
        print("\n3. 测试获取专利详情...")
        test_patent = patents[0]
        patent_id = test_patent.get("patent_id")
        patent_number = test_patent.get("pn")

        print(f"   测试专利: {patent_number}")

        # Bibliography lookup (carries the abstract)
        biblio_url = f"{BASE_URL}/basic-patent-data/simple-bibliography"
        detail_params = {
            "patent_id": patent_id,
            "patent_number": patent_number,
            "apikey": API_KEY
        }

        try:
            response = requests.get(biblio_url, params=detail_params,
                                    headers=search_headers, timeout=request_timeout)
            print(f"   获取摘要状态码: {response.status_code}")

            if response.status_code == 200:
                result = response.json()
                if result.get("status"):
                    print(f"   ✅ 摘要获取成功!")
                else:
                    print(f"   ⚠️ 摘要获取异常: {result}")
            else:
                print(f"   ❌ 摘要获取失败: HTTP {response.status_code}")

        except Exception as e:
            print(f"   ❌ 获取摘要错误: {str(e)}")

    print("\n" + "=" * 60)
    print("测试完成!")
    print("=" * 60)

    # Summary
    print("\n测试总结:")
    print(f"✅ Token获取: {'成功' if token else '失败'}")
    print(f"✅ 搜索功能: 测试了 {len(test_queries)} 个查询")
    print(f"✅ API权限: {'正常' if token else '异常'}")

    return token

# 运行测试
if __name__ == "__main__":
    # Run the basic API test first; it returns the token, or None on failure.
    token = test_api()

    # Extra check: hit counts for CAR-T / autoimmune-disease queries.
    # Skipped entirely when no token was obtained.
    if token:
        print("\n" + "=" * 60)
        print("额外测试: CAR-T和自身免疫疾病搜索")
        print("=" * 60)

        cart_queries = [
            "CAR-T autoimmune",
            "CAR T",
            "chimeric antigen receptor", 
            "CD19 CAR",
            "autoimmune disease",
            "lupus therapy",
            "cell therapy"
        ]

        # Same search endpoint and headers that test_api builds internally.
        search_url = f"{BASE_URL}/search/patent/query-search-patent/v2"
        search_headers = {
            "Content-Type": "application/json",
            "authorization": f"Bearer {token}"
        }
        params = {"apikey": API_KEY}

        for query in cart_queries:
            payload = {
                "sort": [{"field": "SCORE", "order": "DESC"}],
                "limit": 10,
                "offset": 0,
                "query_text": query,
                "collapse_by": "PBD",
                "collapse_type": "ALL"
            }

            # One summary line per query: returned count and total hits, or
            # the failure reason. An error on one query does not stop the rest.
            try:
                response = requests.post(search_url, params=params, json=payload, headers=search_headers)
                if response.status_code == 200:
                    result = response.json()
                    if result.get("status") and "data" in result:
                        total = result["data"].get("total_hits", 0)
                        count = len(result["data"].get("results", []))
                        print(f"'{query}': 找到 {count} 件 (总计: {total})")
                    else:
                        print(f"'{query}': 无结果")
                else:
                    print(f"'{query}': 错误 {response.status_code}")
            except Exception as e:
                print(f"'{query}': 异常 {str(e)}")

智慧芽API测试

1. 测试获取Token...
   API_KEY: fh10ixx8marmhm9kbl3c...
   状态码: 200
   ❌ Token获取失败: {'status': False, 'error_code': 67200003, 'error_msg': 'Access token expired or authentication error!'}
