Q-A model搭建

1. 下载中文QA模型：`roberta-base-chinese-extractive-qa` : https://hf-mirror.com/uer/roberta-base-chinese-extractive-qa
2. 使用数据集对Q-A模型进行训练
3. 部署(flask)

## 获取QA模型
加载模型并测试

In [None]:
from transformers import pipeline
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline

In [None]:
# Build a question-answering pipeline from a local model directory; the same
# directory is used for both the model weights and the tokenizer.
QA_model = pipeline("question-answering", 
                    model="./models_roberta-base-chinese-extractive-qa",
                    tokenizer="./models_roberta-base-chinese-extractive-qa")

In [None]:
# Smoke test: a question plus the passage the answer span is extracted from.
QA_input = {'question': "著名诗歌《假如生活欺骗了你》的作者是",
            'context': "普希金从那里学习人民的语言，吸取了许多有益的养料，这一切对普希金后来的创作产生了很大的影响。这两年里，普希金创作了不少优秀的作品，如《囚徒》、《致大海》、《致凯恩》和《假如生活欺骗了你》等几十首抒情诗，叙事诗《努林伯爵》，历史剧《鲍里斯·戈都诺夫》，以及《叶甫盖尼·奥涅金》前六章。"}
# Run the pipeline on the sample; as the last expression of the cell the
# result is displayed by the notebook.
QA_model(QA_input)

## fine-tuning 模型

可以参考如下 `notebook` 或 `qa_training_v0.py`

```sh

#把training 数据 和 模型放到对应文件夹下

 nohup python qa_training_v0.py &

```

### 加载pre-train 模型

In [None]:
# Load the pre-trained tokenizer and QA model that will be fine-tuned below.
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
model_name = "./models_roberta-base-chinese-extractive-qa/"  # local directory of the roberta-base-chinese-extractive-qa checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

### 数据预处理

1. 将csv --> squad.json
2. 将 squad.json -> 可输入的训练
3. 训练数据

#### 将csv转换为SQuAD 格式的文件

In [None]:
import csv
import json

def convert_to_squad_format(input_file, output_file):
    """Convert a CSV of QA samples into a SQuAD-style JSON file.

    The CSV must have a header row containing the columns ``context``,
    ``question`` and ``answers``. Every row becomes one paragraph holding a
    single question with a single answer; the answer start offset is the
    first occurrence of the answer text inside the context.

    Args:
        input_file: Path of the UTF-8 encoded CSV file to read.
        output_file: Path of the JSON file to write (UTF-8, indented).

    Raises:
        ValueError: If an answer text does not occur in its context.
    """
    # Local import so the block does not depend on a file-level import.
    import hashlib

    paragraphs = []
    with open(input_file, newline='', encoding='utf-8') as csvfile:
        for row_number, row in enumerate(csv.DictReader(csvfile)):
            context = row['context']
            question = row['question']
            answer_text = row['answers']

            # Character offset of the answer inside the context.
            answer_start = context.find(answer_text)
            if answer_start == -1:
                raise ValueError(f"Answer '{answer_text}' not found in context '{context}'")

            # The built-in hash() of a str is salted per interpreter run and
            # collides for repeated questions. Derive the id from the row
            # position and content instead so it is reproducible and unique.
            id_source = f"{row_number}\x00{context}\x00{question}"
            question_id = hashlib.sha256(id_source.encode('utf-8')).hexdigest()[:32]

            paragraphs.append({
                "context": context,
                "qas": [{
                    "question": question,
                    "id": question_id,
                    "answers": [{
                        "text": answer_text,
                        "answer_start": answer_start
                    }],
                    "is_impossible": False
                }]
            })

    # All rows are grouped under a single article entry.
    squad_data = {
        "data": [{
            "title": "custom_dataset",
            "paragraphs": paragraphs
        }]
    }

    with open(output_file, 'w', encoding='utf-8') as outfile:
        json.dump(squad_data, outfile, ensure_ascii=False, indent=4)

# Example usage: convert the sample CSV into a SQuAD-format JSON file.
convert_to_squad_format("QA_test_data/test.csv", "QA_test_data/test_squad_format.json")

#### 加载SQUAD数据

In [None]:
# Load the SQuAD-format JSON dataset produced by the conversion step.
with open("QA_test_data/test_squad_format.json", "r", encoding="utf-8") as f:
    squad_data = json.load(f)

# Parallel lists: index i of each list describes one (context, question, answer) sample.
contexts = []
questions = []
answers_text = []
answers_start = []

# Flatten the nested article -> paragraph -> qa -> answer structure into one
# sample per answer.
for article in squad_data['data']:
    for paragraph in article['paragraphs']:
        context = paragraph['context']
        for qa in paragraph['qas']:
            question = qa['question']
            for answer in qa['answers']:
                contexts.append(context)
                questions.append(question)
                answers_text.append(answer['text'])
                answers_start.append(answer['answer_start'])

# Column-oriented dict for the Dataset object. Each 'answers' entry is a dict
# with a scalar text and a scalar start offset (not the list-valued SQuAD form).
data_dict = {
    'context': contexts,
    'question': questions,
    'answers': [{'text': a, 'answer_start': b} for a, b in zip(answers_text, answers_start)]
}

# NOTE(review): `Dataset` is not imported in any visible cell — presumably
# `from datasets import Dataset` is required before running this; confirm.
dataset = Dataset.from_dict(data_dict)

#### 将questions和context进行tokenize

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of QA samples and attach answer token positions.

    Questions and contexts are encoded as pairs, truncating only the context
    and padding every sample to 512 tokens. For each sample the character
    span of the answer is mapped to the indices of the context tokens that
    cover it.

    Args:
        examples: A batch with the columns ``question`` (list of str),
            ``context`` (list of str) and ``answers`` (list of dicts with the
            keys ``text`` and ``answer_start``).

    Returns:
        The tokenizer output without ``offset_mapping`` and with the added
        entries ``start_positions`` and ``end_positions`` (one int per
        sample). Both are 0 when the answer is not fully contained in the
        encoded context, e.g. because it was truncated away.
    """
    questions = [q.strip() for q in examples["question"]]
    contexts = examples["context"]

    inputs = tokenizer(
        questions,
        contexts,
        truncation="only_second",  # truncate the context, never the question
        max_length=512,
        padding="max_length",
        return_offsets_mapping=True,  # character offsets, needed to locate the answer
        return_tensors="np"  # NumPy output so every row has the same length
    )

    start_positions = []
    end_positions = []

    for i, answer in enumerate(examples["answers"]):
        # Character span of the answer inside the context (end is exclusive).
        start_char = answer["answer_start"]
        end_char = start_char + len(answer["text"])

        # Offsets are relative to the string each token came from, so question
        # tokens carry offsets into the question. Only tokens whose sequence id
        # is 1 (the second sequence, i.e. the context) may be matched against
        # the answer's context offsets.
        sequence_ids = inputs.sequence_ids(i)

        token_start_index = 0
        token_end_index = 0
        for idx, (start, end) in enumerate(inputs["offset_mapping"][i]):
            if sequence_ids[idx] != 1:
                continue
            if start <= start_char < end:
                token_start_index = idx
            if start <= end_char <= end:
                token_end_index = idx
                break

        # Index 0 is never a context token, so 0 means "not found". If either
        # end of the answer is missing, label the sample with (0, 0) instead
        # of an inconsistent span.
        if token_start_index == 0 or token_end_index == 0:
            token_start_index = token_end_index = 0

        start_positions.append(token_start_index)
        end_positions.append(token_end_index)

    # The offsets were only needed to compute the labels.
    inputs.pop("offset_mapping")

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions

    return inputs

In [None]:
# Apply preprocess_function to the dataset batch by batch.
tokenized_dataset = dataset.map(preprocess_function, batched=True)

### 训练模型

使用以上数据集，对模型进行训练

In [None]:
from transformers import TrainingArguments, Trainer

# Hyper-parameters of the fine-tuning run.
# NOTE(review): learning_rate=3e-4 looks high for fine-tuning a pre-trained
# encoder — confirm it is intentional.
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    evaluation_strategy="epoch",     # evaluate once per epoch
    learning_rate=3e-4,              # learning rate
    per_device_train_batch_size=8,  # training batch size per device
    per_device_eval_batch_size=8,   # evaluation batch size per device
    num_train_epochs=3,              # number of training epochs
    weight_decay=0.01,               # weight decay
    logging_dir='./logs',            # log directory
)

# NOTE(review): eval_dataset is the same object as train_dataset, so the
# per-epoch evaluation is measured on the training data itself.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,
    tokenizer=tokenizer,
)

In [None]:
# Start the fine-tuning run.
trainer.train()

#### 模型保存

In [None]:
# Save the fine-tuned model and its tokenizer into the same directory so the
# directory can be loaded as a whole later.
# NOTE(review): the directory name mentions "roberta-base-squad2", while the
# checkpoint loaded earlier is roberta-base-chinese-extractive-qa — confirm the naming.
trainer.save_model("./tmp/finetuned-roberta-base-squad2_wuxi2")
tokenizer.save_pretrained("./tmp/finetuned-roberta-base-squad2_wuxi2")

#### 模型加载与使用

In [None]:
# Load the fine-tuned checkpoint as a question-answering pipeline.
# NOTE: this rebinds `model`, which earlier in the notebook referred to the
# AutoModelForQuestionAnswering instance.
model = pipeline("question-answering", 
                 model="./tmp/finetuned-roberta-base-squad2_wuxi2",
                 tokenizer="./tmp/finetuned-roberta-base-squad2_wuxi2")

In [None]:
# Sample query for the fine-tuned pipeline: the question asks which model
# type is named in the free-text request given as context.
QA_input = {
    'question': '查什么模型',
    'context': '帮忙查下PDX模型中EGFR的表达'
}
res = model(QA_input)
# Last expression of the cell: the notebook displays the result.
res

## 模型部署

1. 本地部署
2. flask云端部署

### 本地部署

将模型进行本地部署，使用脚本调用模型：

```sh
python qa_search_v1.py --question 在我们PDX模型中检索EGFR突变数据
#输出固定的检索结果为 `res.tsv`
```

In [None]:
#! /home_bk/fan_qiangqiang/miniconda3/envs/chatGLM/bin/python
# -*- coding: utf-8 -*-
"""
Created on 2024.10.16
@author: fan_qiangqiang
Description: 在PDX，细胞系，Syngeneic模型中，检索基因的表达、突变、基因融合，拷贝数变异等数据
"""

from transformers import AutoTokenizer, AutoModelForQuestionAnswering,pipeline
import subprocess
import argparse 

def get_dict(filein):
    """Load a tab-separated mapping file into a dict.

    Each non-blank line must contain at least two tab-separated fields; the
    first field becomes the key and the second the value. Further fields are
    ignored. Blank lines are skipped.

    Args:
        filein: Path of the UTF-8 encoded mapping file.

    Returns:
        dict mapping the first column to the second column.

    Raises:
        ValueError: If a non-blank line has no tab-separated second field.
    """
    dict_map = {}
    # Explicit encoding: do not depend on the process locale to decode the file.
    with open(filein, 'r', encoding='utf-8') as f:
        for line_number, line in enumerate(f, start=1):
            line = line.rstrip("\r\n")
            if not line.strip():
                continue  # tolerate blank lines instead of crashing
            records = line.split("\t")
            if len(records) < 2:
                raise ValueError(
                    f"{filein}:{line_number}: expected 'key<TAB>value', got {line!r}")
            dict_map[records[0]] = records[1]
    return dict_map
    

def search(models,genes,datatypes):
    """Run the search that matches the requested data type.

    Args:
        models: Model expression extracted from the query; passed through to
            the search_* function.
        genes: One or more gene symbols separated by commas (ASCII or
            full-width).
        datatypes: Data-type expression extracted from the query; must be a
            key of ``datatype_dict``.

    Raises:
        KeyError: If ``datatypes`` is not a key of ``datatype_dict``.
        ValueError: If ``genes`` contains no non-empty gene symbol.
    """
    dt = datatype_dict[datatypes]

    # Build the grep alternation "GENE1|GENE2". Whitespace and empty tokens
    # are dropped: an input such as "kras," would otherwise give "KRAS|",
    # whose empty alternative no longer restricts the search to the gene.
    symbols = [g.strip().upper() for g in genes.replace("，", ",").split(",")]
    symbols = [g for g in symbols if g]
    if not symbols:
        raise ValueError(f"No valid gene symbol found in {genes!r}")
    genes = "|".join(symbols)

    # Resolved at call time, like the original if-chain.
    handlers = {
        'mutation': search_mutation,
        'expression': search_expression,
        'CNV': search_CNV,
        'fusion': search_fusion,
        'HLA': search_HLA,
    }
    handler = handlers.get(dt)
    if handler is not None:
        handler(models, genes, dt)


def _grep_to_tsv(models, genes, dt, header, out_path, pre_filter="", post_filter=""):
    """Grep the NGS files of one model family for genes and write a TSV.

    The data directory is the attribute named ``dt`` of
    ``tmp_database[model_dict[models]]``. All files whose path starts with
    that directory string are concatenated and filtered by a whole-word,
    case-insensitive extended-regex match on ``genes``.

    Args:
        models: Key of ``model_dict``.
        genes: Extended regular expression of gene symbols, e.g. "KRAS|EGFR".
        dt: Data-type attribute name on the database entry.
        header: Column names written as the first line of the output.
        out_path: Path of the TSV file to write.
        pre_filter: Shell pipeline fragment inserted before the gene grep.
        post_filter: Shell pipeline fragment appended after the gene grep.
    """
    # Local import so the block does not depend on a file-level import.
    import shlex

    ngs_path = getattr(tmp_database[model_dict[models]], dt)
    stdout = ""
    # An empty path means the model family has no data of this type; running
    # the pipeline anyway would expand "cat *" over the working directory.
    if ngs_path:
        # The gene pattern derives from user text, so it is shell-quoted and
        # passed via -e (a leading "-" must not be parsed as an option).
        command = "cat {NGSpath}*{pre}|grep -wiE -e {genes}{post}".format(
            NGSpath=shlex.quote(ngs_path),
            pre=pre_filter,
            genes=shlex.quote(genes),
            post=post_filter)
        completed = subprocess.run(command, shell=True,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE,
                                   text=True)
        stdout = completed.stdout
    with open(out_path, 'w') as f:
        f.write("\t".join(header) + "\n")
        f.write(stdout)


def search_mutation(models,genes,dt):
    """Search mutation data (exonic records only) and write res.tsv."""
    _grep_to_tsv(models, genes, dt,
                 ["Model ID","WGC ID","chroM","Start","End","ref","Alt",
                  "Genotype", "GeneLocation","Gene","Exon Syno",
                  "Exon","dbSNP", "1000G","Cosmic","AF","DP"],
                 'res.tsv',
                 pre_filter="|grep -E exonic")


def search_expression(models,genes,dt):
    """Search expression data and write columns 1, 2 and 5 to res.tsv."""
    _grep_to_tsv(models, genes, dt,
                 ['modelID','Gene','expression'],
                 'res.tsv',
                 post_filter="|cut -f1,2,5")


def search_CNV(models,genes,dt):
    """Search copy-number data and write columns 1, 2 and 5 to res.tsv."""
    _grep_to_tsv(models, genes, dt,
                 ["Model ID","Gene","CNV"],
                 'res.tsv',
                 post_filter="|cut -f1,2,5")


def search_fusion(models,genes,dt):
    """Search gene-fusion data and write the selected columns to res.tsv."""
    _grep_to_tsv(models, genes, dt,
                 ["ID","JunctionReadCount","LeftGene",
                  "LeftBreakpoint","RightGene","RightBreakpoint","FFPM"],
                 'res.tsv',
                 post_filter="|cut -f1,2,7,8,9,10,14")

def search_HLA(models,genes,dt):
    """Placeholder for the HLA search; not implemented, does nothing."""
    pass

def get_inputs(context_query):
    """Extract model, gene and data type from a free-text query.

    The module-level QA pipeline ``model`` is asked three fixed questions
    with the query as context. An answer is kept only when its score is
    above 0.9; otherwise the field is the empty string.

    Args:
        context_query: The user's query text.

    Returns:
        Tuple ``(models, genes, datatypes)`` of strings.
    """
    def ask(question):
        # One pipeline call per field; low-confidence answers are discarded.
        reply = model({'question': question, 'context': context_query})
        if reply['score'] > 0.9:
            return reply['answer']
        return ''

    # Same call order as before: model, data type, gene.
    models = ask('什么模型')
    datatypes = ask('什么数据类型')
    genes = ask('什么基因')

    return models, genes, datatypes

def write_out(log):
    """Append one line to the search log file ``QA_log``.

    Args:
        log: The log record, without trailing newline.
    """
    # Records contain the user's query text, so the encoding is fixed rather
    # than left to the process locale.
    with open('QA_log', 'a', encoding='utf-8') as f:
        f.write(log + "\n")

class innerDatabse:
    """Data locations of one model family.

    Holds a display name plus one path string per data type. The search
    functions read the path through ``getattr(entry, <data type>)``, so the
    attribute names must match the data-type names.
    """

    def __init__(self,name,expression,mutation,CNV,HLA,fusion):
        # Display name of the model family.
        self.name = name
        # One path per data type, stored unchanged.
        self.expression, self.mutation = expression, mutation
        self.CNV, self.HLA, self.fusion = CNV, HLA, fusion

# Registry of NGS result directories, keyed by the model-family name that the
# values of model_dict resolve to (the search_* functions look up
# tmp_database[model_dict[models]]).
tmp_database={}
tmp_database['PDX']=innerDatabse(name='PDX',
                                 expression='/OIU/innerNGSresults/filterRNAseq/',
                                mutation='/OIU/innerNGSresults/filterWXS/',
                                CNV='/OIU/innerNGSresults/CNV_cnvkit/',
                                HLA='/OIU/innerNGSresults/HLA/',
                                fusion='/OIU/innerNGSresults/fusion_starfusion_formatPDX/')
tmp_database['cancer_cell_line']=innerDatabse(name='cancer cell line',
                                 expression='/OIU/innerNGSresults/CellLineDatasets/expression_format_TPM/',
                                mutation='/OIU/innerNGSresults/CellLineDatasets/mutation_format/',
                                CNV='/OIU/innerNGSresults/CellLineDatasets/cnv_format/',
                                HLA='/OIU/innerNGSresults/CellLineDatasets/HLA_HD/',
                                fusion='/OIU/innerNGSresults/CellLineDatasets/fusion_starfusion/')

# The syngeneic entry has empty paths for CNV, HLA and fusion.
tmp_database['syngeneic']=innerDatabse(name='syngeneic models',
                                 expression='/OIU/innerNGSresults/Syngeneic/expression/',
                                mutation='/OIU/innerNGSresults/Syngeneic/mutation/',
                                CNV='',
                                HLA='',
                                fusion='')
                                
# QA pipeline used by get_inputs; it is created at import time.
# NOTE(review): this loads the "_wuxi1" checkpoint, while the training section
# of the notebook saves "_wuxi2" — confirm which checkpoint is intended.
model = pipeline("question-answering", 
                 model="./tmp/finetuned-roberta-base-squad2_wuxi1",
                 tokenizer="./tmp/finetuned-roberta-base-squad2_wuxi1")
# Lookup tables read from tab-separated files in the working directory.
model_dict=get_dict('./model_dict')
datatype_dict=get_dict('./datatype_dict')


def main():
    """Command-line entry point: parse the query, run the search, log it.

    The query is split into model, gene and data type by ``get_inputs``. The
    search runs only when the model is a key of ``model_dict``, a gene was
    found and the data type is a key of ``datatype_dict``. Every query is
    appended to the log together with a 1/0 success flag.
    """
    parser = argparse.ArgumentParser(description="search NGS data based on your input")
    parser.add_argument('--question', type=str, required=True, help="your query question: 在我们PDX模型中检索EGFR突变数据")
    args = parser.parse_args()

    input_contest = args.question
    print("Searching....")
    # Explicit check instead of assert: asserts are stripped under `python -O`.
    if not input_contest:
        parser.error("Your input not avaliable, please check your question")
    models, genes, datatypes = get_inputs(input_contest)

    res = 1 if models in model_dict and genes and datatypes in datatype_dict else 0
    if res:
        search(models, genes, datatypes)
        check_status = "\t".join([input_contest, datatype_dict[datatypes], models, genes, datatypes, str(res)])
        write_out(check_status)
        print("Complete! Please find results in res.tsv")
    else:
        # '-' stands in for the unresolved data type in the log record.
        check_status = "\t".join([input_contest, '-', models, genes, datatypes, str(res)])
        write_out(check_status)
        print("No data feedback!")
    
# Ensure the script runs only when executed directly
if __name__ == "__main__":
    main()

### flask 云端部署

1. 云端部署，只需要API即可访问使用: `qa_search_v2.py`
2. 加上uuid，可以多人同时使用：`qa_search_v3.py`

#### 服务器端脚本

`qa_search_v3.py`

In [None]:
#! /home_bk/fan_qiangqiang/miniconda3/envs/chatGLM/bin/python
# -*- coding: utf-8 -*-
"""
Created on 2024.10.16
@author: fan_qiangqiang
Description: 在PDX，细胞系，Syngeneic模型中，检索基因的表达、突变、基因融合，拷贝数变异等数据
Update: 
1. 更新为API使用
2. 多用户使用时，分别分发结果
"""

from transformers import AutoTokenizer, AutoModelForQuestionAnswering,pipeline
import subprocess
import argparse
from flask import Flask, request, jsonify, send_file
import uuid

def get_dict(filein):
    """Load a tab-separated mapping file into a dict.

    Each non-blank line must contain at least two tab-separated fields; the
    first field becomes the key and the second the value. Further fields are
    ignored. Blank lines are skipped.

    Args:
        filein: Path of the UTF-8 encoded mapping file.

    Returns:
        dict mapping the first column to the second column.

    Raises:
        ValueError: If a non-blank line has no tab-separated second field.
    """
    dict_map = {}
    # Explicit encoding: do not depend on the process locale to decode the file.
    with open(filein, 'r', encoding='utf-8') as f:
        for line_number, line in enumerate(f, start=1):
            line = line.rstrip("\r\n")
            if not line.strip():
                continue  # tolerate blank lines instead of crashing
            records = line.split("\t")
            if len(records) < 2:
                raise ValueError(
                    f"{filein}:{line_number}: expected 'key<TAB>value', got {line!r}")
            dict_map[records[0]] = records[1]
    return dict_map


def search(models,genes,datatypes,outs):
    """Run the search that matches the requested data type.

    Args:
        models: Model expression extracted from the query; passed through to
            the search_* function.
        genes: One or more gene symbols separated by commas (ASCII or
            full-width).
        datatypes: Data-type expression extracted from the query; must be a
            key of ``datatype_dict``.
        outs: Path of the result file, passed through to the search_* function.

    Raises:
        KeyError: If ``datatypes`` is not a key of ``datatype_dict``.
        ValueError: If ``genes`` contains no non-empty gene symbol.
    """
    dt = datatype_dict[datatypes]

    # Build the grep alternation "GENE1|GENE2". Whitespace and empty tokens
    # are dropped: an input such as "kras," would otherwise give "KRAS|",
    # whose empty alternative no longer restricts the search to the gene.
    symbols = [g.strip().upper() for g in genes.replace("，", ",").split(",")]
    symbols = [g for g in symbols if g]
    if not symbols:
        raise ValueError(f"No valid gene symbol found in {genes!r}")
    genes = "|".join(symbols)

    # Resolved at call time, like the original if-chain.
    handlers = {
        'mutation': search_mutation,
        'expression': search_expression,
        'CNV': search_CNV,
        'fusion': search_fusion,
        'HLA': search_HLA,
    }
    handler = handlers.get(dt)
    if handler is not None:
        handler(models, genes, dt, outs)


def _grep_to_tsv(models, genes, dt, header, out_path, pre_filter="", post_filter=""):
    """Grep the NGS files of one model family for genes and write a TSV.

    The data directory is the attribute named ``dt`` of
    ``tmp_database[model_dict[models]]``. All files whose path starts with
    that directory string are concatenated and filtered by a whole-word,
    case-insensitive extended-regex match on ``genes``.

    Args:
        models: Key of ``model_dict``.
        genes: Extended regular expression of gene symbols, e.g. "KRAS|EGFR".
        dt: Data-type attribute name on the database entry.
        header: Column names written as the first line of the output.
        out_path: Path of the TSV file to write.
        pre_filter: Shell pipeline fragment inserted before the gene grep.
        post_filter: Shell pipeline fragment appended after the gene grep.
    """
    # Local import so the block does not depend on a file-level import.
    import shlex

    ngs_path = getattr(tmp_database[model_dict[models]], dt)
    stdout = ""
    # An empty path means the model family has no data of this type; running
    # the pipeline anyway would expand "cat *" over the working directory.
    if ngs_path:
        # The gene pattern derives from the request text, so it is
        # shell-quoted and passed via -e (a leading "-" must not be parsed as
        # an option).
        command = "cat {NGSpath}*{pre}|grep -wiE -e {genes}{post}".format(
            NGSpath=shlex.quote(ngs_path),
            pre=pre_filter,
            genes=shlex.quote(genes),
            post=post_filter)
        completed = subprocess.run(command, shell=True,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE,
                                   text=True)
        stdout = completed.stdout
    with open(out_path, 'w') as f:
        f.write("\t".join(header) + "\n")
        f.write(stdout)


def search_mutation(models,genes,dt,outs):
    """Search mutation data (exonic records only) and write the result to outs."""
    _grep_to_tsv(models, genes, dt,
                 ["Model ID","WGC ID","chroM","Start","End","ref","Alt",
                  "Genotype", "GeneLocation","Gene","Exon Syno",
                  "Exon","dbSNP", "1000G","Cosmic","AF","DP"],
                 outs,
                 pre_filter="|grep -E exonic")


def search_expression(models,genes,dt,outs):
    """Search expression data and write columns 1, 2 and 5 to outs."""
    _grep_to_tsv(models, genes, dt,
                 ['modelID','Gene','expression'],
                 outs,
                 post_filter="|cut -f1,2,5")


def search_CNV(models,genes,dt,outs):
    """Search copy-number data and write columns 1, 2 and 5 to outs."""
    _grep_to_tsv(models, genes, dt,
                 ["Model ID","Gene","CNV"],
                 outs,
                 post_filter="|cut -f1,2,5")


def search_fusion(models,genes,dt,outs):
    """Search gene-fusion data and write the selected columns to outs."""
    _grep_to_tsv(models, genes, dt,
                 ["ID","JunctionReadCount","LeftGene",
                  "LeftBreakpoint","RightGene","RightBreakpoint","FFPM"],
                 outs,
                 post_filter="|cut -f1,2,7,8,9,10,14")

def search_HLA(models,genes,dt,outs=None):
    """Placeholder for the HLA search; not implemented, does nothing.

    ``outs`` is accepted because ``search`` calls every search_* function
    with four arguments; without it an HLA request raised TypeError. It
    defaults to None so three-argument calls keep working. No result file is
    written.
    """
    pass

def get_inputs(context_query):
    """Extract model, gene and data type from a free-text query.

    The module-level QA pipeline ``model`` is asked three fixed questions
    with the query as context. An answer is kept only when its score is
    above 0.9; otherwise the field is the empty string.

    Args:
        context_query: The user's query text.

    Returns:
        Tuple ``(models, genes, datatypes)`` of strings.
    """
    def ask(question):
        # One pipeline call per field; low-confidence answers are discarded.
        reply = model({'question': question, 'context': context_query})
        if reply['score'] > 0.9:
            return reply['answer']
        return ''

    # Same call order as before: model, data type, gene.
    models = ask('什么模型')
    datatypes = ask('什么数据类型')
    genes = ask('什么基因')

    return models, genes, datatypes

def write_out(log):
    """Append one line to the search log file ``QA_log``.

    Args:
        log: The log record, without trailing newline.
    """
    # Records contain the user's query text, so the encoding is fixed rather
    # than left to the process locale.
    with open('QA_log', 'a', encoding='utf-8') as f:
        f.write(log + "\n")

class innerDatabse:
    """Data locations of one model family.

    Holds a display name plus one path string per data type. The search
    functions read the path through ``getattr(entry, <data type>)``, so the
    attribute names must match the data-type names.
    """

    def __init__(self,name,expression,mutation,CNV,HLA,fusion):
        # Display name of the model family.
        self.name = name
        # One path per data type, stored unchanged.
        self.expression, self.mutation = expression, mutation
        self.CNV, self.HLA, self.fusion = CNV, HLA, fusion

# Registry of NGS result directories, keyed by the model-family name that the
# values of model_dict resolve to (the search_* functions look up
# tmp_database[model_dict[models]]).
tmp_database={}
tmp_database['PDX']=innerDatabse(name='PDX',
                                 expression='/OIU/innerNGSresults/filterRNAseq/',
                                mutation='/OIU/innerNGSresults/filterWXS/',
                                CNV='/OIU/innerNGSresults/CNV_cnvkit/',
                                HLA='/OIU/innerNGSresults/HLA/',
                                fusion='/OIU/innerNGSresults/fusion_starfusion_formatPDX/')
tmp_database['cancer_cell_line']=innerDatabse(name='cancer cell line',
                                 expression='/OIU/innerNGSresults/CellLineDatasets/expression_format_TPM/',
                                mutation='/OIU/innerNGSresults/CellLineDatasets/mutation_format/',
                                CNV='/OIU/innerNGSresults/CellLineDatasets/cnv_format/',
                                HLA='/OIU/innerNGSresults/CellLineDatasets/HLA_HD/',
                                fusion='/OIU/innerNGSresults/CellLineDatasets/fusion_starfusion/')

# The syngeneic entry has empty paths for CNV, HLA and fusion.
tmp_database['syngeneic']=innerDatabse(name='syngeneic models',
                                 expression='/OIU/innerNGSresults/Syngeneic/expression/',
                                mutation='/OIU/innerNGSresults/Syngeneic/mutation/',
                                CNV='',
                                HLA='',
                                fusion='')

# QA pipeline used by get_inputs; it is created once at import time and
# shared by all requests.
# NOTE(review): this loads the "_wuxi1" checkpoint, while the training section
# of the notebook saves "_wuxi2" — confirm which checkpoint is intended.
model = pipeline("question-answering",
                 model="./tmp/finetuned-roberta-base-squad2_wuxi1",
                 tokenizer="./tmp/finetuned-roberta-base-squad2_wuxi1")
# Lookup tables read from tab-separated files in the working directory.
model_dict=get_dict('./model_dict')
datatype_dict=get_dict('./datatype_dict')

app = Flask(__name__)


@app.route('/predict', methods=['POST'])
def predict():
    """Handle POST /predict: parse the query, run the search, return the TSV.

    The request body must be JSON of the form ``{"input": "<query text>"}``.
    The query is split into model, gene and data type by ``get_inputs``. When
    all three resolve, the search result is written to a per-request file
    (UUID-based name, so concurrent users do not overwrite each other) and
    returned as an attachment. Every query is appended to the log with a 1/0
    flag.

    Returns:
        The result file as an attachment on success; a plain-text message
        with status 200 when the query could not be resolved or produced no
        result file; a plain-text message with status 400 when the request
        body is not JSON with a non-empty string ``input``.
    """
    # Local import so the block does not depend on a file-level import.
    import os

    # silent=True yields None for a missing or invalid JSON body, which is
    # answered with 400 below instead of an unhandled exception.
    data = request.get_json(silent=True)
    input_contest = data.get('input') if isinstance(data, dict) else None
    if not isinstance(input_contest, str) or not input_contest:
        return 'Your input not avaliable, please check your question', 400

    models, genes, datatypes = get_inputs(input_contest)

    res = 1 if models in model_dict and genes and datatypes in datatype_dict else 0
    if res:
        # Absolute path so the file written by search() and the file sent
        # below are the same, however relative paths are resolved.
        outs = os.path.abspath(str(uuid.uuid4()) + '_result.tsv')
        search(models, genes, datatypes, outs)
        check_status = "\t".join([input_contest, datatype_dict[datatypes], models, genes, datatypes, str(res)])
        write_out(check_status)
        if os.path.isfile(outs):
            # NOTE(review): result files are never deleted and accumulate on
            # disk — consider a cleanup job.
            return send_file(outs, as_attachment=True)
        # The search wrote nothing (e.g. a data type without an implementation).
        return 'Data received, but search failed! \n Thanks for your support', 200
    else:
        # '-' stands in for the unresolved data type in the log record.
        check_status = "\t".join([input_contest, '-', models, genes, datatypes, str(res)])
        write_out(check_status)
        # NOTE(review): status 200 is kept for compatibility; clients have to
        # tell this message apart from a file by the missing attachment header.
        return 'Data received, but search failed! \n Thanks for your support', 200

# Ensure the script runs only when executed directly
if __name__ == "__main__":
    #main()
    app.run(host='0.0.0.0', port=5000)


'''
客户端python脚本

import requests
 
data_to_submit = {
    'input': '哪些PDX模型中有EGFR发生基因融合'
}

# 发送 POST 请求并获取响应
response=requests.post('http://localhost:5000/predict', json=data_to_submit)

# 检查响应状态码
if response.status_code == 200:
    # 将文件保存到本地
    with open('search_result.tsv', 'wb') as f:
        f.write(response.content)
    print('文件已成功下载并保存为 search_result.tsv')
else:
    print('请求失败，状态码:', response.status_code)
'''

#### 客户端脚本

`search_NGS.py`

+ 使用示例

```sh
python search_NGS.py --question 在小鼠模型中找一些kras突变数据
```

In [None]:


# -*- coding: utf-8 -*-
"""
Created on 2024.10
@author: Qiangqiang Fan
Description: 主要用于检索PDX,细胞系,Syngeneic模型的突变、表达等数据
"""

# Import necessary modules
import os
import sys
import argparse  # For handling command-line arguments
import requests


def search(context):
    """Send the query to the search service and save the returned TSV.

    The service answers a successful search with a file attachment. A search
    that could not be resolved is answered with HTTP 200 and a plain-text
    message, so the status code alone does not indicate success; the
    ``Content-Disposition`` header is checked as well.

    Args:
        context: The user's query text.
    """
    data_to_submit = {'input': context}
    try:
        response = requests.post('http://10.111.17.67:5000/predict', json=data_to_submit)
    except requests.exceptions.RequestException as exc:
        # Server unreachable, connection reset, etc.
        print('请求失败:', exc)
        return

    if response.status_code != 200:
        print('请求失败，状态码:', response.status_code)
        return

    # Only an attachment is a result file; anything else is the server's
    # failure message and must not be saved as a TSV.
    if 'attachment' not in response.headers.get('Content-Disposition', ''):
        print('检索失败:', response.text.strip())
        return

    with open('search_result.tsv', 'wb') as f:
        f.write(response.content)
    print('文件已成功下载并保存为 search_result.tsv')


def main():
    """Parse ``--question`` from the command line and submit it to ``search``."""
    cli = argparse.ArgumentParser(description="search NGS data based on your input")
    cli.add_argument(
        '--question',
        type=str,
        required=True,
        help="your query question: 在我们PDX模型中检索EGFR突变数据",
    )
    # The query text is forwarded unchanged.
    search(cli.parse_args().question)

# Ensure the script runs only when executed directly
if __name__ == "__main__":
    main()


'''
一些成功的示例：
python search_NGS.py --question 在pdx模型中找一下kras突变数据
python search_NGS.py --question 哪些细胞系模型中有egfr突变数据呢
python search_NGS.py --question 在小鼠模型中找一些kras突变数据
python search_NGS.py --question 在小鼠模型中找一些kras,egfr突变数据
'''