In [1]:
from datasets import load_dataset
import datasets

### 1. FPB

In [2]:
# Integer class id -> sentiment word, used for the Financial PhraseBank labels below.
dic = {0: "negative", 1: "neutral", 2: "positive"}

In [3]:
# Integer class id -> sentiment word for Financial PhraseBank.
# Kept local to this cell instead of reading the global `dic`: `dic` is rebound
# later in the notebook with a different id order, so re-running this cell after
# that point would silently swap "neutral" and "positive".
fpb_label_names = {0: "negative", 1: "neutral", 2: "positive"}

# Load the "sentences_50agree" configuration and work on its train split as a DataFrame.
fpb_frame = load_dataset("financial_phrasebank", "sentences_50agree")["train"].to_pandas()

# Positional rename of the two columns; the second one holds the integer class id
# (presumably the first one is the sentence text -- confirm against the dataset card).
fpb_frame.columns = ["input", "output"]
fpb_frame["output"] = fpb_frame["output"].apply(lambda label_id: fpb_label_names[label_id])
fpb_frame["instruction"] = "What is the sentiment of this news? Please choose an answer from {negative/neutral/positive}."
fpb_datasets = datasets.Dataset.from_pandas(fpb_frame)

# Only the train part of the seeded split is kept; the held-out part is discarded here.
fpb_datasets = fpb_datasets.train_test_split(seed=42)["train"]
fpb_datasets

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Dataset({
    features: ['input', 'output', 'instruction'],
    num_rows: 3634
})

In [4]:
# Oversample FPB by repeating it 6 times so that each data source contributes
# a similar number of samples to the final training set.
train_dataset = datasets.concatenate_datasets([fpb_datasets for _ in range(6)])
train_dataset

Dataset({
    features: ['input', 'output', 'instruction'],
    num_rows: 21804
})

### 2. FiQA SA

In [5]:
def make_label(x):
    """Bucket a numeric sentiment score into a three-way sentiment word.

    Scores below -0.1 are "negative", scores in [-0.1, 0.1) are "neutral" and
    scores of 0.1 or more are "positive". A value that satisfies none of the
    comparisons (e.g. NaN) yields None.
    """
    if x < -0.1:
        return "negative"
    if -0.1 <= x < 0.1:
        return "neutral"
    if x >= 0.1:
        return "positive"
    # Reached only when every comparison above is False (e.g. NaN).
    return None

In [6]:
def add_instructions(x):
    """Return the instruction prompt matching a record's format tag.

    The tag "post" selects the tweet wording; any other value selects the
    news wording.
    """
    source_kind = "tweet" if x == "post" else "news"
    return (
        f"What is the sentiment of this {source_kind}? "
        "Please choose an answer from {negative/neutral/positive}."
    )

In [7]:
# Original author's note (translated): public dataset used for evaluating the quality of online reviews.
fiqa_splits = load_dataset('pauri32/fiqa-2018')

# Merge the train / validation / test splits into a single table before re-splitting.
fiqa_frame = datasets.concatenate_datasets(
    [fiqa_splits[split_name] for split_name in ("train", "validation", "test")]
).to_pandas()

# Derive the text label from the numeric score and pick the prompt from the format tag.
fiqa_frame["output"] = fiqa_frame["sentiment_score"].apply(make_label)
fiqa_frame["instruction"] = fiqa_frame["format"].apply(add_instructions)

# Keep only the three columns used downstream; 'sentence' becomes 'input'.
fiqa_frame = fiqa_frame[["sentence", "output", "instruction"]]
fiqa_frame.columns = ["input", "output", "instruction"]

dataset = datasets.Dataset.from_pandas(fiqa_frame)
# Hold out 22.6% of the rows and keep only the remaining train part.
dataset = dataset.train_test_split(test_size=0.226, seed=42)["train"]
dataset

Dataset({
    features: ['input', 'output', 'instruction'],
    num_rows: 938
})

In [8]:
# Oversample the FiQA subset by repeating it 21 times, then append it to the
# running training set. Note: this cell extends `train_dataset` in place of its
# previous value, so running it twice appends the FiQA copies twice.
tmp_dataset = datasets.concatenate_datasets([dataset for _ in range(21)])
train_dataset = datasets.concatenate_datasets([train_dataset, tmp_dataset])
print(tmp_dataset.num_rows)
train_dataset

19698


Dataset({
    features: ['input', 'output', 'instruction'],
    num_rows: 41502
})

### 3. TFNS

In [9]:
# Integer class id -> sentiment word for the Twitter financial-news labels.
# The id order differs from the FPB mapping defined earlier: here 1 is positive and 2 is neutral.
dic = {0: "negative", 1: "positive", 2: "neutral"}

In [10]:
# Integer class id -> sentiment word for the Twitter financial-news labels.
# Kept local to this cell instead of reading the global `dic`: the same name is
# also bound to the FPB mapping (different id order) earlier in the notebook, so
# an out-of-order re-run could otherwise swap "positive" and "neutral".
tfns_label_names = {0: "negative", 1: "positive", 2: "neutral"}

# Work on the train split as a DataFrame.
tfns_frame = load_dataset('zeroshot/twitter-financial-news-sentiment')['train'].to_pandas()
tfns_frame['label'] = tfns_frame['label'].apply(lambda label_id: tfns_label_names[label_id])
tfns_frame['instruction'] = 'What is the sentiment of this tweet? Please choose an answer from {negative/neutral/positive}.'

# Positional rename: assumes the frame held exactly two columns (text, label)
# before 'instruction' was appended -- confirm against the dataset card.
tfns_frame.columns = ['input', 'output', 'instruction']
social_media_dataset = datasets.Dataset.from_pandas(tfns_frame)
social_media_dataset

Dataset({
    features: ['input', 'output', 'instruction'],
    num_rows: 9543
})

In [11]:
# Repeat the Twitter dataset twice and append it to the running training set.
# Note: running this cell twice appends the copies twice.
tmp_dataset = datasets.concatenate_datasets([social_media_dataset for _ in range(2)])
train_dataset = datasets.concatenate_datasets([train_dataset, tmp_dataset])
print(tmp_dataset.num_rows)
train_dataset

19086


Dataset({
    features: ['input', 'output', 'instruction'],
    num_rows: 60588
})

### 4. NWGI

In [12]:
# Load the train split and expose 'news' as the input and 'label' as the output.
nwgi_frame = load_dataset('oliverwang15/news_with_gpt_instructions')['train'].to_pandas()
nwgi_frame = nwgi_frame.assign(
    output=nwgi_frame['label'],
    input=nwgi_frame['news'],
    # Seven-way sentiment prompt that also asks for short reasons.
    instruction='What is the sentiment of this news? Please choose an answer from {strong negative/moderately negative/mildly negative/neutral/mildly positive/moderately positive/strong positive}, then provide some short reasons.',
)
# Keep only the three columns shared by every data source.
nwgi_frame = nwgi_frame[['input', 'output', 'instruction']]
finance_dataset = datasets.Dataset.from_pandas(nwgi_frame)
finance_dataset

Dataset({
    features: ['input', 'output', 'instruction'],
    num_rows: 16184
})

In [13]:
# Append the NWGI data (not oversampled) and shuffle the combined set with a fixed seed.
# Note: running this cell twice appends the NWGI rows twice.
train_dataset = datasets.concatenate_datasets(
    [train_dataset, finance_dataset]
)
all_dataset = train_dataset.shuffle(seed=42)
all_dataset.shape

(76772, 3)

In [14]:
# Disabled: interactive Hugging Face Hub login. Uncomment both lines to
# authenticate from the notebook (presumably needed before the push in the next cell).
# from huggingface_hub import notebook_login
# notebook_login()

In [15]:
# Disabled: upload of the combined, shuffled dataset to the Hugging Face Hub.
# all_dataset.push_to_hub("fingpt_chatglm2_sentiment_instruction_lora_ft_dataset")

In [16]:
# Sanity check: per-source row counts (FPB x6, FiQA x21, TFNS x2, NWGI) should add up to the size of all_dataset.
sum([21804, 19698, 19086, 16184])

76772

### Make Dataset

In [17]:
import json
from tqdm.notebook import tqdm

In [18]:
def format_example(example: dict) -> dict:
    """Turn an instruction record into a prompt/answer pair.

    Args:
        example: mapping with "instruction" and "output" keys and an optional
            "input" key.

    Returns:
        A dict with "context" (the prompt, ending in "Answer: ") and "target"
        (the value of example["output"]). The "Input: ..." line is included
        only when example has a truthy "input".
    """
    prompt_parts = [f"Instruction: {example['instruction']}\n"]
    if example.get("input"):
        prompt_parts.append(f"Input: {example['input']}\n")
    prompt_parts.append("Answer: ")
    return {"context": "".join(prompt_parts), "target": example["output"]}

In [19]:
# Materialize the shuffled dataset as a list of plain dicts, one per row,
# carrying only the three fields that format_example reads.
data_list = [
    {"instruction": row.instruction, "input": row.input, "output": row.output}
    for row in all_dataset.to_pandas().itertuples()
]

In [20]:
# Write one JSON object per line: the formatted prompt ("context") and its answer ("target").
with open("dataset_new.jsonl", 'w') as jsonl_file:
    for record in tqdm(data_list, desc="formatting.."):
        serialized = json.dumps(format_example(record))
        jsonl_file.write(serialized + '\n')

formatting..:   0%|          | 0/76772 [00:00<?, ?it/s]

### Tokenize

In [21]:
import json
from tqdm.notebook import tqdm

import datasets
from transformers import AutoTokenizer, AutoConfig

# Settings for the tokenization step.
model_name = "THUDM/chatglm2-6b"   # checkpoint whose tokenizer and config are loaded
jsonl_path = "dataset_new.jsonl"   # prompt/answer file written by the previous section
save_path = "dataset_new"          # directory passed to save_to_disk
max_seq_length = 512               # upper bound on the number of token ids per example
skip_overlength = True             # drop examples longer than max_seq_length instead of truncating them

In [22]:
def preprocess(tokenizer, config, example, max_seq_length):
    """Tokenize one prompt/answer record into a single id sequence.

    Args:
        tokenizer: object exposing ``encode(text, max_length=..., truncation=...,
            add_special_tokens=...)``.
        config: object exposing ``eos_token_id``.
        example: mapping with "context" (prompt) and "target" (answer) strings.
        max_seq_length: ``max_length`` passed to each ``encode`` call; prompt and
            target are truncated separately, so the combined sequence can
            still be longer than this value.

    Returns:
        A dict with "input_ids" (prompt ids, then target ids, then the EOS id)
        and "seq_len" (the number of prompt ids, not the total length).
    """
    prompt_text = example["context"]
    target_text = example["target"]
    encode_options = {"max_length": max_seq_length, "truncation": True}

    prompt_ids = tokenizer.encode(prompt_text, **encode_options)
    # The target is encoded without special tokens so it continues the prompt directly.
    target_ids = tokenizer.encode(target_text, add_special_tokens=False, **encode_options)

    token_ids = prompt_ids + target_ids
    token_ids.append(config.eos_token_id)
    return {"input_ids": token_ids, "seq_len": len(prompt_ids)}

def read_jsonl(path, max_seq_length, skip_overlength=False):
    """Yield tokenized features for every record of a JSONL file.

    The tokenizer and config are loaded from the module-level ``model_name``.
    Each line is parsed as JSON and passed through ``preprocess``.

    Args:
        path: path of the JSONL file to read.
        max_seq_length: maximum number of token ids kept per example.
        skip_overlength: when True, examples whose "input_ids" exceed
            ``max_seq_length`` are dropped; when False they are cut to the
            first ``max_seq_length`` ids instead.

    Yields:
        The dict returned by ``preprocess``, with "input_ids" capped at
        ``max_seq_length`` entries.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    config = AutoConfig.from_pretrained(
        model_name, trust_remote_code=True, device_map='auto')
    with open(path, "r") as jsonl_file:
        # All lines are read up front, which lets tqdm know the total count.
        raw_lines = jsonl_file.readlines()
        for raw_line in tqdm(raw_lines):
            record = json.loads(raw_line)
            feature = preprocess(tokenizer, config, record, max_seq_length)
            token_ids = feature["input_ids"]
            if skip_overlength and len(token_ids) > max_seq_length:
                continue
            feature["input_ids"] = token_ids[:max_seq_length]
            yield feature

In [23]:
def _iter_tokenized_examples():
    """Return the feature generator for the JSONL file, using the settings defined above."""
    return read_jsonl(jsonl_path, max_seq_length, skip_overlength)


# Build the tokenized dataset from the generator and persist it to save_path.
# Note: this rebinds `dataset`, which earlier held the FiQA subset.
dataset = datasets.Dataset.from_generator(_iter_tokenized_examples)
dataset.save_to_disk(save_path)

Generating train split: 0 examples [00:00, ? examples/s]





  0%|          | 0/76772 [00:00<?, ?it/s]

Saving the dataset (0/1 shards):   0%|          | 0/76771 [00:00<?, ? examples/s]