In [None]:
import wandb
import random
import numpy as np
import torch
import os
import warnings
warnings.filterwarnings('ignore')

# Authenticate with Weights & Biases and open the run for this labelling round.
wandb.login()
wandb_project = 'label_round'
wandb_run = wandb.init(project=wandb_project)
# The project name and run id are handed to learner.my_teach_new further down.
wandb_run_id = wandb_run.id

def set_seed(seed):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and CUDA), exports
    ``PYTHONHASHSEED`` and switches cuDNN to deterministic mode.

    Args:
        seed: Integer seed applied to all generators.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    # Host-side generators.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    # CUDA generators: the current device, then every visible device (multi-GPU).
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Disable cuDNN autotuning and request deterministic kernels.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


set_seed(0)

In [None]:
import pandas as pd
from classifier.load_data import LoadData
from torch.utils.data import DataLoader
from classifier.collote_data import collote_fn

# Directories holding the unlabelled pool and the labelled pilot split.
datapool_dir = '../dataset/datapool/'
pilot_dir = '../dataset/pilotData/'

# Raw pool CSV; used later to look up a sample's text from its id.
origin_data_pool = pd.read_csv(datapool_dir + 'data_pool.csv')

# Unlabelled pool. The loader is built over `data_pool.data`, whereas the
# train/validation loaders below wrap the LoadData object itself.
# NOTE(review): the pool and validation loaders are shuffled as well; confirm the
# query strategy's indices do not depend on loader order.
data_pool = LoadData(path=datapool_dir, data_type='data_pool')
data_pool_dataloader = DataLoader(
    data_pool.data,
    batch_size=200,
    shuffle=True,
    collate_fn=collote_fn,
)

# Labelled pilot data used for training.
train_data = LoadData(path=pilot_dir, data_type='train_data')
train_dataloader = DataLoader(
    train_data,
    batch_size=20,
    shuffle=True,
    collate_fn=collote_fn,
)

# Held-out pilot data used for validation.
val_data = LoadData(path=pilot_dir, data_type='test_data')
val_dataloader = DataLoader(
    val_data,
    batch_size=10,
    shuffle=True,
    collate_fn=collote_fn,
)

In [3]:
import torch
from transformers import AdamW, get_scheduler
from classifier.multi_laber_classifier import classifer_model
# Rebuild the classifier, optimizer and LR scheduler, then restore the state
# that was saved before active learning started.
classifier = classifer_model()

# map_location='cpu' lets the checkpoint load on machines without the GPU it was
# saved from; load_state_dict then copies the tensors onto whatever device the
# model parameters live on.
checkpoint = torch.load(
    'learner/before_active_learning/checkpoint/model_weights.bin',
    map_location='cpu',
)

learning_rate = 1e-5  # learning rate
optimizer = AdamW(classifier.parameters(), lr=learning_rate)
classifier.load_state_dict(checkpoint['estimator'])
optimizer.load_state_dict(checkpoint['optimizer'])

# Number of epochs already trained and the optimizer steps that corresponds to.
start_epoch = checkpoint['epoch']
completed_steps = start_epoch * len(train_dataloader)

# NOTE(review): num_training_steps equals the steps already completed, so a
# restored linear schedule is presumably at its end (lr decayed to 0). Confirm
# that learner.my_teach_new builds a fresh schedule before training further.
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=completed_steps,
)
lr_scheduler.load_state_dict(checkpoint['lr_schedule'])

# Baseline metrics handed to the learner.
# NOTE(review): presumably the best values from the pre-active-learning run; verify.
metrics = {'best_error': 3.36, 'best_loss': 0.12, 'best_rank_pre': 0.76, 'best_avg_pre': 0.53}

In [None]:
from learner.active_learner import MyActiveLearner
from learner.query_strategy import min_confidence_sampling
# Bundle the restored model, its optimisation state, the three data loaders and
# the baseline metrics into the active learner used by the cells below.
learner = MyActiveLearner(
    estimator=classifier,
    optimizer=optimizer,
    lr_scheduler=lr_scheduler,
    query_strategy=min_confidence_sampling,
    data_pool=data_pool_dataloader,
    train_dataloader=train_dataloader,
    val_dataloader=val_dataloader,
    metrics=metrics,
)

In [None]:
from sklearn.exceptions import NotFittedError

# Number of instances we want to annotate per iteration
n_instances = 30

# Ask the learner which pool samples to annotate next.
query_idx, query_inst = learner.learner_query(n_instances=n_instances)

try:
    # The flattening below expects: list of batches -> list of rows -> per-label values.
    probabilities = learner.predict(query_idx)
except NotFittedError:
    # For the very first query we do not have any predictions: fall back to a
    # neutral 0.5 for each of the 10 labels. The fallback is wrapped in a single
    # batch so it has the same nesting as real predictions (the previous 2-level
    # list made the flattening step iterate over a float and raise TypeError).
    probabilities = [[[0.5] * 10 for _ in range(n_instances)]]

# Flatten the batches into one list of per-sample float score lists.
predict = [[float(p) for p in row] for batch in probabilities for row in batch]

In [None]:

import pandas as pd
import argilla as rg
import datetime

print(query_idx)

# Connect to Argilla. The URL and key can be overridden through the environment
# so credentials need not live in the notebook; the defaults match the previous
# hard-coded local-server values.
rg.init(
    api_url=os.environ.get("ARGILLA_API_URL", "http://localhost:6900/"),
    api_key=os.environ.get("ARGILLA_API_KEY", "owner.apikey"),
)

label_list = [
    "Achievement", "Self-direction", "Hedonism", "Security", "Power",
    "Stimulation", "Benevolence", "Universalism", "Conformity", "Tradition",
]

# Map each sample id to its raw text once, instead of rebuilding and scanning the
# id list for every queried sample. setdefault keeps the first occurrence of an
# id, which is what list.index() returned before.
pool_text_by_id = {}
for pool_id, pool_text in zip(origin_data_pool['id'].tolist(), origin_data_pool['text'].tolist()):
    pool_text_by_id.setdefault(pool_id, pool_text)

# One input dict per queried sample. The id is prefixed to the text ("<id>:<text>")
# so it can be parsed back out of the annotated record later.
inputs = []
for i in query_idx:
    sample_id = learner.data_pool.dataset[i]['id']
    inputs.append({'text': str(sample_id) + ':' + pool_text_by_id[sample_id]})
input_df = pd.Series(inputs)

# Record ids are "<pool index>_<timestamp>"; one timestamp is shared by the round.
round_stamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
ids = [f'{i}_{round_stamp}' for i in query_idx.tolist()]

# Attach the model's per-label scores as the suggested prediction of each record.
records = [
    rg.TextClassificationRecord(
        id=ids[i],
        inputs=input_df[i],
        prediction=list(zip(label_list, predict[i])),
        multi_label=True,
    )
    for i in range(len(input_df))
]

# NOTE(review): `settings` is built but never passed to Argilla in this cell;
# confirm whether the label schema was meant to be applied to the dataset.
settings = rg.TextClassificationSettings(label_schema=label_list)
rg.log(workspace='admin', records=records, name="saner_test")

In [21]:
from learner.read_recore import get_labeldata_fromrubrix

# Fetch only the records logged in this round and convert them to a DataFrame.
records_df = rg.load("saner_test", ids=ids).to_datasets().to_pandas()

# Every record must be annotated before the model can be taught.
if records_df.annotation.isna().any():
    raise UserWarning("Please annotate first all your samples before teaching the model")

data_df = pd.DataFrame()
data_df['label'] = records_df['annotation']

# Map each pool id to its text once, instead of a linear list.index() search per
# record. setdefault keeps the first occurrence of an id, as list.index() did.
text_by_id = {}
for data_dict in data_pool.data.values():
    text_by_id.setdefault(data_dict['id'], data_dict['text'])

# Each record text is "<id>:<text>"; recover the id and take the text from the
# data pool rather than from the record itself.
id_list = []
text_list = []
for content in records_df['text']:
    current_id = int(content.split(':', 1)[0])
    id_list.append(current_id)
    text_list.append(text_by_id[current_id])

# Convert each annotation into the label representation returned by
# get_labeldata_fromrubrix (presumably a one-hot vector, going by the name).
onehot_label_record = [
    get_labeldata_fromrubrix(content) for content in data_df['label'].tolist()
]

# Newly labelled samples, in the order the records were loaded.
# NOTE(review): confirm my_teach_new does not assume this order matches query_idx.
addsample_train = [
    {'id': sample_id, 'text': sample_text, 'label': sample_label}
    for sample_id, sample_text, sample_label in zip(id_list, text_list, onehot_label_record)
]


In [22]:
# Number of additional epochs to train on top of the restored checkpoint.
epoch_more = 50

# Feed the newly annotated samples to the learner and continue training; the
# wandb project name and run id are passed along with the epoch settings.
learner.my_teach_new(
    query_idx,
    addsample_train,
    start_epoch,
    epoch_more,
    wandb_project,
    wandb_run_id,
)

In [12]:

import os
import re
from getLabelReview import get_data_fromrubrix
import datetime

# Build a timestamped output folder for this labelling round:
#   dataset/label_round/label_epoch/<YYYY_MM_DD_HH_MM_SS>/{rubrix,train}
now_time = datetime.datetime.now()
# Format the stamp with underscores directly (same result as formatting with
# '-', ':' and ' ' and then substituting them).
strs = now_time.strftime('%Y_%m_%d_%H_%M_%S')
# NOTE(review): this path is relative to the working directory, while the data
# cells use '../dataset/...'; confirm both resolve to the intended location.
base_path = 'dataset/label_round/label_epoch/'
temp_path = base_path + strs
rubrix_path = temp_path + '/rubrix'
train_path = temp_path + '/train'

# exist_ok=True keeps the cell re-runnable within the same second.
os.makedirs(rubrix_path, exist_ok=True)
os.makedirs(train_path, exist_ok=True)

currentTrainPathBase = rubrix_path + '/train_rubrix.csv'
currentTrainFilePath = train_path + '/current_train.csv'
print(currentTrainPathBase)

# Export the "saner_test" dataset to the two CSV paths above.
get_data_fromrubrix('saner_test', currentTrainPathBase, currentTrainFilePath, agentName=None)

dataset/label_round/label_epoch/2024_10_10_16_33_27/rubrix/train_rubrix.csv
data stored


In [None]:
from classifier.data_preprocess import data_preprocess_first, data_preprocess_sencond

from dataset.dataConcat import concatCSV

# Output files of the two preprocessing passes, stored next to current_train.csv.
add_train_path1 = f'{train_path}/current_train_after1.csv'
add_train_path2 = f'{train_path}/current_train_after2.csv'

# Pass 1 reads the exported training CSV; pass 2 reads the output of pass 1.
data_preprocess_first(currentTrainFilePath, add_train_path1)
data_preprocess_sencond(add_train_path1, add_train_path2)


In [None]:
# Existing pilot training set and the destination for the extended training set.
train_base = '../dataset/pilotData/train_data.csv'
train_data_path = f'{train_path}/current_train_added.csv'

# Presumably appends the preprocessed new samples to the pilot training data and
# writes the result to train_data_path (verify against dataset.dataConcat).
concatCSV(train_base, add_train_path2, train_data_path)

In [None]:
from test_train import testtrain
# Run testtrain with the learner's current model and its train/validation loaders.
testtrain(
    learner.estimator,
    learner.train_dataloader,
    learner.val_dataloader,
)