In [1]:
import copy
import logging
import numpy as np
import pandas as pd
from datetime import datetime,timedelta,date
import time
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, AutoModel, get_linear_schedule_with_warmup, \
    default_data_collator, TrainingArguments, Trainer
from sentence_transformers import SentencesDataset, LoggingHandler, SentenceTransformer
from sklearn.metrics.pairwise import paired_cosine_distances, paired_euclidean_distances, paired_manhattan_distances
from scipy.stats import pearsonr, spearmanr
from transformers import AutoTokenizer
from transformers import BertConfig, BertTokenizer,BertTokenizerFast
from datasets import Dataset
from typing import List,Union
import tqdm
import re
import json
import os,time
from torch.nn.utils.rnn import pad_sequence
from torch.types import Number

# Switch the working directory to the project folder. Later cells write to
# relative paths such as "data/...", which resolve against this directory.
# NOTE(review): presumably this also lets `db_config_taiyi` below be imported
# from the project folder -- confirm. The absolute path is machine-specific.
os.chdir('/home/stops/Work_space/NLP_work/Med_assit_chatglm')
# Set CUDA_VISIBLE_DEVICES to "0" for this process (conventionally limits CUDA
# to the first GPU). It is assigned after `import torch`; NOTE(review): this
# relies on CUDA not having been initialised yet -- confirm.
os.environ['CUDA_VISIBLE_DEVICES']='0'

from db_config_taiyi import DB ## load data from pgsql

# Module-level logger used by the following cells.
logger = logging.getLogger(__name__)
# NOTE(review): the recorded output shows "INFO : Starting" rather than the
# "INFO - Starting" this format string would produce, so basicConfig may have
# been a no-op because a handler was already installed -- confirm.
logging.basicConfig(level=logging.INFO,format='%(asctime)s - %(levelname)s - %(message)s')
logger.info('Starting')



Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
CUDA SETUP: CUDA runtime path found: /usr/local/cuda/lib64/libcudart.so
CUDA SETUP: Highest compute capability among GPUs detected: 7.0
CUDA SETUP: Detected CUDA version 116
CUDA SETUP: Loading binary /home/stops/anaconda3/envs/text_generation_x/lib/python3.8/site-packages/bitsandbytes/libbitsandbytes_cuda116_nocublaslt.so...


  warn(msg)
2023-09-08 11:03:58,305 - INFO : Starting


In [11]:
data_file="/home/stops/BERT-model/train2.json"

# Character budgets for shortening long turns: keep the first HEAD and the last
# TAIL characters and drop the middle (54 + 33 = 87, 400 + 25 = 425).
HUMAN_HEAD_CHARS, HUMAN_TAIL_CHARS = 54, 33
ASSISTANT_HEAD_CHARS, ASSISTANT_TAIL_CHARS = 400, 25


def _truncate_middle(value, head, tail):
    """Shorten ``value`` by cutting out its middle.

    Returns ``value`` unchanged when it has at most ``head + tail`` characters;
    otherwise returns its first ``head`` characters followed by its last
    ``tail`` characters.
    """
    if len(value) <= head + tail:
        return value
    # Slice from an explicit start index so that tail == 0 yields an empty tail
    # (value[-0:] would return the whole string).
    return value[:head] + value[len(value) - tail:]


text_data_list=[]
label_data_list=[]
label_id_data_list=[]
# The file is read as JSON Lines: one JSON object per line.
# Decode explicitly as UTF-8: the records contain Chinese text, and the default
# encoding of open() depends on the platform locale.
with open(data_file,"r",encoding="utf-8") as f:
    # Iterate lazily instead of materializing every line with readlines().
    for sub_line in f:
        # Skip blank lines (e.g. a trailing newline) instead of crashing in json.loads.
        if not sub_line.strip():
            continue
        sub_line=json.loads(sub_line)
        conversations = sub_line['conversations']
        # Assumes turn 0 is the human prompt and turn 1 the assistant reply -- TODO confirm.
        h = _truncate_middle(conversations[0]['value'], HUMAN_HEAD_CHARS, HUMAN_TAIL_CHARS)
        a = _truncate_middle(conversations[1]['value'], ASSISTANT_HEAD_CHARS, ASSISTANT_TAIL_CHARS)
        label=sub_line["label"]
        label_id=sub_line["id"]
        # Resolve every field before appending so the three lists stay aligned
        # even if a record is missing a key.
        text_data_list.append(h + '。' + a)
        label_data_list.append(label)
        label_id_data_list.append(label_id)


logger.info(f"load data file: {data_file}")
logger.info(f"train_data_list nums : {len(text_data_list)}")
logger.info(f"data example: {text_data_list[:2]}")


2023-09-08 11:34:12,946 - INFO : load data file: /home/stops/BERT-model/train2.json
2023-09-08 11:34:12,947 - INFO : train_data_list nums : 1031
2023-09-08 11:34:12,948 - INFO : data example: ['请编写一个SQL查询，找出销售额最高的5个产品，并进行注释说明每条代码的功能。。1. 使用INNER JOIN将“订单”和“订单细节”表连接起来\n2. 使用SUM函数计算每种产品的销售额，按降序排序\n3. 使用LIMIT限制结果返回前5条\n代码实现：\n```sql\nSELECT p.product_name, SUM(od.quantity*od.unit_price) AS total_sales\nFROM orders o\nINNER JOIN order_details od ON o.order_id=od.order_id\nINNER JOIN products p ON od.product_id =p.product_id\nGROUP BY p.product_name\nORDER BY total_sales DESC\nLIMIT 5;\n```\n代码解释：\n1. 第一行：选择产品名称以及计算其销售额。\n2. 第二行：使用INNER JOIN将三个表连接\n5. 第五行：使用LIMIT限制结果返回前5条。', '请解释关于Python中的lambda函数的概念。。在Python中，lambda函数是一种匿名函数，也称为内联函数或函数式函数。它们允许指定单行函数定义，而无需编写完整的函数体。Lambda函数可以传递给其他函数作为参数，或在其它的表达式中使用。Python中的lambda函数完全是语法糖，它使得我们能够更快速和方便地定义和使用简单的函数。\nlambda函数的语法规则如下：\n```\nlambda arguments : expression\n```\n其中，arguments是指函数的输入参数，使用逗号分隔，而expression则是函数内的返回值。通过这种方式，可以轻松地创建简单函数，而无需定义完整的函数体。lambda函

In [12]:
def show_df(df):
    """Print a quick overview of ``df``.

    Writes three things to stdout, one after another: the frame's shape,
    its first two rows, and the per-column count of null values.
    Returns None.
    """
    null_counts = df.isna().sum()
    print(df.shape, df.head(2), null_counts, sep="\n")


# Assemble the three parallel lists into one frame, one column per list.
instruct_clf_columns = {
    "text": text_data_list,
    "label_answer": label_data_list,
    "label_id": label_id_data_list,
}
instruct_clf_df = pd.DataFrame(instruct_clf_columns)

show_df(instruct_clf_df)


(1031, 3)
                                                text label_answer  label_id
0  请编写一个SQL查询，找出销售额最高的5个产品，并进行注释说明每条代码的功能。。1. 使用I...         code  37252458
1  请解释关于Python中的lambda函数的概念。。在Python中，lambda函数是一种...         code  82662470
text            0
label_answer    0
label_id        0
dtype: int64


In [17]:
# Distinct label names, in order of first appearance in the frame.
label_answer_column = instruct_clf_df["label_answer"]
label_text_list = label_answer_column.unique().tolist()

In [18]:
from pprint import pprint

# Map each label name to its position in label_text_list (0-based).
label2id_dict = {
    label_name: label_idx
    for label_idx, label_name in enumerate(label_text_list)
}
pprint(label2id_dict)


{'biography': 12,
 'brainstorming': 1,
 'business': 13,
 'chat': 3,
 'classification': 9,
 'code': 0,
 'culture': 16,
 'economy': 15,
 'expansion': 7,
 'extraction': 8,
 'health': 2,
 'math': 11,
 'other': 17,
 'rewriting': 5,
 'science': 10,
 'society': 14,
 'summarization': 6,
 'translation': 4}


In [19]:
# Number of samples per label, most frequent first.
label_counts = instruct_clf_df["label_answer"].value_counts()
print(label_counts)

brainstorming     119
biography         102
science            80
code               76
classification     67
business           65
summarization      59
math               59
other              55
extraction         45
society            45
chat               44
culture            42
expansion          40
translation        39
health             34
economy            33
rewriting          27
Name: label_answer, dtype: int64


In [20]:
# Toggle: when True, the frame is written out as an Excel snapshot.
save_mode = True
if save_mode:
    # Relative path: resolved against the current working directory.
    save_file = "data/instruct_clf_df_0908.xlsx"
    instruct_clf_df.to_excel(excel_writer=save_file, index=False)
    logger.info(f"save file: {save_file}")

2023-09-08 13:10:49,429 - INFO : save file: data/instruct_clf_df_0908.xlsx


In [22]:
# Persist the label -> id mapping as JSON.
label2id_path = "data/instruct_label2id.json"
with open(label2id_path, "w+") as label2id_fp:
    label2id_fp.write(json.dumps(label2id_dict))
