# Inference with AutoModelForSequenceClassification and Prompt-Based Classification
### AutoModelForSequenceClassification models: E5, Qwen
### Prompt-based classification model: Mixtral

In [11]:
%%writefile inf_test.py

import os
import time
import numpy as np
import pandas as pd
import csv
import pickle
import torch
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, AutoConfig
from transformers.tokenization_utils_base import BatchEncoding
from peft import PeftModel

from tqdm import tqdm
import re

import copy
import pandas as pd

import torch
from torch.utils.data import Dataset, DataLoader

# First prompt segment: the task instructions followed by the page fields.
# The {Url}/{UrlTitle}/{UrlSnippet}/{FullBody} placeholders are filled in
# later with str.format(); whitespace (including the trailing space after
# the snippet) is spelled out explicitly so it cannot be lost by an editor.
prompt_part1 = (
    "You are a website spam expert. You are given information about a webpage to judge whether or not it is spam. 0 means nonspam and 1 means spam. Give your prediction after the <ANS>: tag.\n"
    "    Url: {Url}\n"
    "    UrlTitle: {UrlTitle}\n"
    "    UrlSnippet: {UrlSnippet} \n"
    "    Site Content: {FullBody}\n"
)

# Second prompt segment: the question that ends right before the token the
# model is expected to generate.
prompt_part2_inference = "\nWhat is your prediction <ANS>: "

IGNORE_INDEX = -100  # The default setting in CrossEntropyLoss
MAX_LENGTH_EVAL = 1024  # token length every evaluation sample is padded/truncated to

class EvalDataset(Dataset):
    """Map-style dataset that renders each DataFrame row into a tokenized prompt.

    Each item is a dict with 'input_ids' and 'attention_mask' tensors, padded
    or truncated to MAX_LENGTH_EVAL tokens.
    """

    def __init__(self, df, tokenizer):
        self.tokenizer = tokenizer
        self.df = df

    def __len__(self):
        return len(self.df)

    @staticmethod
    def _render_prompt(sample):
        """Fill the first prompt segment with the page fields of one row."""
        fields = {name: sample[name] for name in ('Url', 'UrlTitle', 'UrlSnippet', 'FullBody')}
        return prompt_part1.format(**fields)

    def __getitem__(self, idx):
        prompt = self._render_prompt(self.df.iloc[idx])

        # Encode (page prompt, question) as a pair: with 'only_first' only the
        # page prompt is truncated, so the trailing question always survives.
        encoded = self.tokenizer(
            prompt,
            prompt_part2_inference,
            add_special_tokens=False,
            max_length=MAX_LENGTH_EVAL,
            padding='max_length',
            truncation='only_first',
        )
        return {key: torch.tensor(encoded[key]) for key in ('input_ids', 'attention_mask')}

def create_eval_dataloader(df, tokenizer, batch_size=16):
    """Wrap ``df`` in an EvalDataset and return a DataLoader that keeps row order."""
    return DataLoader(EvalDataset(df, tokenizer), batch_size=batch_size, shuffle=False)

if __name__ == '__main__':
    # Distributed batch-inference entry point (launched via torch.distributed.run).
    # Every rank loads the 4-bit quantized causal LM, generates one token for
    # each prompt in its shard of the input, pickles the shard, and rank 0
    # merges all shards into a single tab-separated output file.
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--load_from', type=str, required=True)
    parser.add_argument('--tokenizer_dir', type=str, default='/cosmos/local/IndexQuality/FinetuneLLM/Mixtral-8x7B-Instruct-v0.1/')
    parser.add_argument('--input_file', type=str, required=True)
    parser.add_argument('--output_file', type=str, required=True)
    parser.add_argument('--batch_size', type=int, default=4)
    # NOTE(review): --max_new_tokens is accepted but generation below is fixed
    # to one new token, because the probability reshape assumes exactly one
    # score per sample.
    parser.add_argument('--max_new_tokens', type=int, default=1)

    args = parser.parse_args()

    dist.init_process_group("nccl")
    world_size = dist.get_world_size()
    # dist.get_rank() is the global rank; it is used as the CUDA device index
    # below, which matches the per-node rank only for single-node launches.
    local_rank = dist.get_rank()
    print('local rank:', local_rank, torch.distributed.is_initialized(), world_size)
    if local_rank == 0:
        print(args)

    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_dir,add_bos_token=True,trust_remote_code=True)
    # left padding for batch inference
    tokenizer.padding_side = 'left'
    tokenizer.pad_token = tokenizer.eos_token

    # Read the evaluation set named on the command line.  (Previously the
    # required --input_file argument was ignored in favour of a hard-coded
    # parquet path.)  Parquet is chosen by extension; anything else is read
    # as a tab-separated file.
    feature_columns = ["Url", "UrlTitle", "UrlSnippet", "FullBody"]
    if args.input_file.endswith('.parquet'):
        df = pd.read_parquet(args.input_file)[feature_columns]
    else:
        df = pd.read_csv(args.input_file, sep="\t")[feature_columns]

    if world_size > 1:
        # Copy the shard so the result columns added after inference are
        # written to an independent frame rather than to a slice of df.
        df_rank = np.array_split(df, world_size)[local_rank].copy()
    else:
        df_rank = df

    print(df.shape, df_rank.shape)

    dataloader = create_eval_dataloader(df_rank, tokenizer, batch_size=args.batch_size)

    quantization_config = BitsAndBytesConfig(
        load_in_4bit = True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=True,
    )

    model = AutoModelForCausalLM.from_pretrained(args.load_from,
                                                device_map=f'cuda:{local_rank}',
                                                trust_remote_code=True,
                                                quantization_config=quantization_config,
                                                torch_dtype=torch.bfloat16,)
    if world_size > 1:
        # NOTE(review): the DDP wrapper is created but inference below calls
        # `model` directly; kept as-is to preserve the original start-up behavior.
        ddp_model = DDP(model, device_ids=[local_rank])
    model.eval()

    result_rank = []
    prob_rank = []

    start = time.time()
    for i, x in tqdm(enumerate(dataloader), total=len(dataloader), disable=(local_rank!=0)):
        input_data = {key: value for key, value in x.items() if key in ['input_ids', 'attention_mask']}
        model_inputs = BatchEncoding(input_data).to(f'cuda:{local_rank}')
        with torch.no_grad():
            output = model.generate(**model_inputs, max_new_tokens=1,temperature=0.0,return_dict_in_generate=True, output_scores=True)
            transition_scores = model.compute_transition_scores(output.sequences, output.scores, normalize_logits=True).to("cpu")
            # Prompts are padded to MAX_LENGTH_EVAL, so everything past that
            # offset is newly generated.
            generated_tokens = output.sequences.detach()[:, MAX_LENGTH_EVAL:].cpu()
            score_length = transition_scores.shape[0]
            decoded = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
            result_rank.extend(decoded)
            # Scores are normalized log-probabilities; exp() turns them into probabilities.
            prob_rank.extend(np.exp(transition_scores.reshape(score_length).numpy()))

        if i % 100 == 0:
            t = time.time() - start
            n_samples = (i+1)*dataloader.batch_size
            throughput = n_samples/t if t > 0 else 0
            print(f'rank {local_rank}, total samples：{n_samples} throughput： {throughput:.2f} /s')

        if i == 0 and local_rank == 0:
            # Show the first batch of predictions as a quick sanity check.
            print(decoded)

    print('rank', local_rank, 'inference done', len(result_rank))
    print(result_rank)

    print('rank', local_rank, 'inference done', len(prob_rank))
    print(prob_rank)

    df_rank['Prediction'] = result_rank
    df_rank["Probability"] = prob_rank

    if local_rank <= 0:
        parent_dir = os.path.dirname(args.output_file)
        if parent_dir and not os.path.exists(parent_dir):
            os.makedirs(parent_dir, exist_ok=True)

    # Every rank writes its shard next to the final output file.
    with open(args.output_file + f'_rank_{local_rank}.pkl', 'wb') as f:
        pickle.dump(df_rank, f, protocol=pickle.HIGHEST_PROTOCOL)

    if world_size > 1:
        # Wait until all shards are on disk before rank 0 merges them.
        dist.barrier()
        dfs = []
        if local_rank <= 0:
            print('Run time ',time.time() - start)
            print('Merging results ...')
            for i in range(world_size):
                file_path = args.output_file + f'_rank_{i}.pkl'
                # These pickles were written by this job a moment ago; do not
                # point this at files from an untrusted source.
                with open(file_path, 'rb') as f:
                    dfs.append(pickle.load(f))
                try:
                    os.remove(file_path)
                except OSError:
                    # Best-effort cleanup of the temporary shard file.
                    pass
            df_res = pd.concat(dfs, axis=0)
    else:
        df_res = df_rank

    if local_rank <= 0:
        print('total result:', df_res.shape)
        df_res.to_csv(args.output_file, index=False, sep='\t')
        print('saved to:', args.output_file)



Overwriting inf_test.py


# Evaluate Mixtral

In [21]:
%%writefile inf_multi_mixtral_model_test_5.py
# model_path
# eval_set_path
# prediction_result_path

import os
import time
import numpy as np
import pandas as pd
import csv
import pickle
import torch
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, AutoConfig
from transformers.tokenization_utils_base import BatchEncoding
from peft import PeftModel

from tqdm import tqdm
import re

import copy
import pandas as pd
import gc, ctypes, torch

import torch
from torch.utils.data import Dataset, DataLoader

# First prompt segment: the task instructions followed by the page fields.
# The {Url}/{UrlTitle}/{UrlSnippet}/{FullBody} placeholders are filled in
# later with str.format(); whitespace (including the trailing space after
# the snippet) is spelled out explicitly so it cannot be lost by an editor.
prompt_part1 = (
    "You are a website spam expert. You are given information about a webpage to judge whether or not it is spam. 0 means nonspam and 1 means spam. Give your prediction after the <ANS>: tag.\n"
    "    Url: {Url}\n"
    "    UrlTitle: {UrlTitle}\n"
    "    UrlSnippet: {UrlSnippet} \n"
    "    Site Content: {FullBody}\n"
)

# Second prompt segment: the question that ends right before the token the
# model is expected to generate.
prompt_part2_inference = "\nWhat is your prediction <ANS>: "

IGNORE_INDEX = -100  # The default setting in CrossEntropyLoss
# MAX_LENGTH_EVAL = 1024

class EvalDataset(Dataset):
    """Map-style dataset that renders each DataFrame row into a tokenized prompt.

    Each item is a dict with 'input_ids' and 'attention_mask' tensors, padded
    or truncated to ``max_seq_length`` tokens.
    """

    def __init__(self, df, tokenizer, max_seq_length):
        self.max_seq_length = max_seq_length
        self.tokenizer = tokenizer
        self.df = df

    def __len__(self):
        return len(self.df)

    @staticmethod
    def _render_prompt(sample):
        """Fill the first prompt segment with the page fields of one row."""
        fields = {name: sample[name] for name in ('Url', 'UrlTitle', 'UrlSnippet', 'FullBody')}
        return prompt_part1.format(**fields)

    def __getitem__(self, idx):
        text = self._render_prompt(self.df.iloc[idx])

        # Special tokens are disabled in the tokenizer call, so the BOS token
        # is written into the text by hand.  With 'only_first' only the page
        # prompt is truncated, so the trailing question always survives.
        encoded = self.tokenizer(
            f"{self.tokenizer.bos_token} {text}",
            prompt_part2_inference,
            add_special_tokens=False,
            max_length=self.max_seq_length,
            padding='max_length',
            truncation='only_first',
        )
        return {key: torch.tensor(encoded[key]) for key in ('input_ids', 'attention_mask')}

if __name__ == '__main__':
    # Distributed batch-inference entry point (launched via torch.distributed.run).
    # Every rank loads the 4-bit quantized causal LM, generates one token for
    # each prompt in its shard of the input, pickles the shard, and rank 0
    # merges all shards into "<output_file>_<cur_model_num>.tsv".
    import argparse

    def _parse_bool(value):
        """Convert a command-line string such as 'True'/'false'/'1'/'0' to a bool.

        argparse's ``type=bool`` treats every non-empty string (including
        'False') as True, so the text is interpreted explicitly instead.
        """
        text = str(value).strip().lower()
        if text in ('true', 't', 'yes', 'y', '1'):
            return True
        if text in ('false', 'f', 'no', 'n', '0', ''):
            return False
        raise argparse.ArgumentTypeError(f'expected a boolean value, got {value!r}')

    parser = argparse.ArgumentParser()
    parser.add_argument('--load_from', type=str, required=True)
    parser.add_argument('--tokenizer_dir', type=str, default='/cosmos/local/IndexQuality/FinetuneLLM/Mixtral-8x7B-Instruct-v0.1/')
    parser.add_argument('--input_file', type=str, required=True)
    parser.add_argument('--output_file', type=str, required=True)
    parser.add_argument('--batch_size', type=int, default=4)
    parser.add_argument('--max_seq_length', type=int, default=1024)
    # NOTE(review): --max_new_tokens is accepted but generation below is fixed
    # to one new token, because the probability reshape assumes exactly one
    # score per sample.
    parser.add_argument('--max_new_tokens', type=int, default=1)
    # NOTE(review): --drop_duplicates is parsed but not consulted below;
    # de-duplication is decided per input-file pattern instead.
    parser.add_argument('--drop_duplicates', type=_parse_bool, default=False)
    parser.add_argument('--cur_model_num', type=str, required=True)

    args = parser.parse_args()

    dist.init_process_group("nccl")
    world_size = dist.get_world_size()
    # dist.get_rank() is the global rank; it is used as the CUDA device index
    # below, which matches the per-node rank only for single-node launches.
    local_rank = dist.get_rank()
    print('local rank:', local_rank, torch.distributed.is_initialized(), world_size)
    if local_rank == 0:
        print(args)

    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_dir,add_bos_token=True,trust_remote_code=True)
    # left padding for batch inference
    tokenizer.padding_side = 'left'
    tokenizer.pad_token = tokenizer.eos_token

    # Pick the reader from the input file name.
    feature_columns = ["Url", "UrlTitle", "UrlSnippet", "FullBody"]
    if args.input_file.endswith('.parquet'):
        df = pd.read_parquet(args.input_file)[feature_columns]
    elif args.input_file.endswith('.tsv'):
        if args.input_file.endswith('escape.tsv'):
            # Header-less file: supply the column names explicitly.
            df = pd.read_csv(args.input_file, sep="\t", header=None, names=feature_columns + ["Label"]).drop_duplicates()
        elif args.input_file.endswith('Clean_60k.tsv'):
            df = pd.read_csv(args.input_file, sep="\t", lineterminator="\n").drop_duplicates()
        else:
            df = pd.read_csv(args.input_file, sep='\t')
        df = df[feature_columns]
    elif 'KRsets' in args.input_file:
        # original testset from Tyler
        # (str has no .contains(); the substring test must use `in`.)
        df = pd.read_csv(args.input_file, sep='\t')
        df = df[feature_columns]
    else:
        # e.g. a tab-separated .csv that also carries an Id column
        # (used to predict zifan's training data)
        df = pd.read_csv(args.input_file, sep='\t')
        df = df[["Id"] + feature_columns]

    if world_size > 1:
        # Copy the shard so the result columns added after inference are
        # written to an independent frame rather than to a slice of df.
        df_rank = np.array_split(df, world_size)[local_rank].copy()
    else:
        df_rank = df
    # df_rank = df_rank.head(20) # For quick test
    print(df.shape, df_rank.shape)
    ds = EvalDataset(df_rank, tokenizer, args.max_seq_length)
    dataloader = DataLoader(ds, batch_size=args.batch_size, shuffle=False)

    quantization_config = BitsAndBytesConfig(
        load_in_4bit = True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=True,
    )

    # prediction
    model = AutoModelForCausalLM.from_pretrained(args.load_from,
                                        device_map=f'cuda:{local_rank}',
                                        trust_remote_code=True,
                                        quantization_config=quantization_config,
                                        torch_dtype=torch.bfloat16,)
    if world_size > 1:
        # NOTE(review): the DDP wrapper is created but inference below calls
        # `model` directly; kept as-is to preserve the original start-up behavior.
        ddp_model = DDP(model, device_ids=[local_rank])
    model.eval()

    ids = []
    result_rank = []
    prob_rank = []

    start = time.time()
    for i, x in tqdm(enumerate(dataloader), total=len(dataloader), disable=(local_rank!=0)):
        input_data = {key: value for key, value in x.items() if key in ['input_ids', 'attention_mask']}
        model_inputs = BatchEncoding(input_data).to(f'cuda:{local_rank}')
        with torch.no_grad():
            output = model.generate(**model_inputs, max_new_tokens=1,temperature=0.0,return_dict_in_generate=True, output_scores=True)
            transition_scores = model.compute_transition_scores(output.sequences, output.scores, normalize_logits=True).to("cpu")
            # Prompts are padded to max_seq_length, so everything past that
            # offset is newly generated.
            generated_tokens = output.sequences.detach()[:, args.max_seq_length:].cpu()
            score_length = transition_scores.shape[0]
            result_rank.extend(tokenizer.batch_decode(generated_tokens, skip_special_tokens=True))
            # Scores are normalized log-probabilities; exp() turns them into probabilities.
            prob_rank.extend(np.exp(transition_scores.reshape(score_length).numpy()))

        if i % 100 == 0:
            t = time.time() - start
            n_samples = (i+1)*dataloader.batch_size
            throughput = n_samples/t if t > 0 else 0
            print(f'rank {local_rank}, total samples：{n_samples} throughput： {throughput:.2f} /s')

    df_rank['Prediction'] = result_rank
    df_rank["Probability"] = prob_rank

    if local_rank <= 0:
        parent_dir = os.path.dirname(args.output_file)
        if parent_dir and not os.path.exists(parent_dir):
            os.makedirs(parent_dir, exist_ok=True)

    # Every rank writes its shard next to the final output file.
    with open(args.output_file + f'_rank_{local_rank}.pkl', 'wb') as f:
        pickle.dump(df_rank, f, protocol=pickle.HIGHEST_PROTOCOL)

    if world_size > 1:
        # Wait until all shards are on disk before rank 0 merges them.
        dist.barrier()
        dfs = []
        if local_rank <= 0:
            for i in range(world_size):
                file_path = args.output_file + f'_rank_{i}.pkl'
                # These pickles were written by this job a moment ago; do not
                # point this at files from an untrusted source.
                with open(file_path, 'rb') as f:
                    dfs.append(pickle.load(f))
                try:
                    os.remove(file_path)
                except OSError:
                    # Best-effort cleanup of the temporary shard file.
                    pass
            df_res = pd.concat(dfs, axis=0)
    else:
        df_res = df_rank

    if local_rank <= 0:
        output_file_name = args.output_file+'_'+args.cur_model_num+'.tsv'
        df_res.to_csv(output_file_name, index=False, sep='\t')
        print('saved to:', output_file_name)

    # Release Python, libc (glibc-only malloc_trim) and CUDA caches before exit.
    gc.collect()
    ctypes.CDLL("libc.so.6").malloc_trim(0)
    torch.cuda.empty_cache()



Overwriting inf_multi_mixtral_model_test_5.py


In [12]:
# Each "!" line runs in its own subshell, so a "!export" is lost before the
# launch command starts. %env sets the variable in the kernel environment,
# which the launched processes inherit.
%env CUDA_VISIBLE_DEVICES=0,1,2
!NCCL_DEBUG=WARN python -m torch.distributed.run  \
--nnodes 1 --nproc_per_node 3 inf_multi_mixtral_model_test.py --load_from /cosmos/local/IndexQuality/FinetuneLLM/FullTrainTest/Mixtral_Zifandata_v1/current_best_1800 \
--input_file "/cosmos/local/SpamLLM/Prod/KRsets/scrapekr1.2_UHRSValidation_withPlugin.tsv" \
--output_file "/cosmos/local/IndexQuality/FinetuneLLM/FullTrainTest/Mixtral_Zifandata_v1/current_best_1800/scrapekr1.2_UHRSValidation_withPlugin.tsv" \
--batch_size 12 \
--cur_model_num 1800

*****************************************
Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
*****************************************
[2024-07-11 03:39:54,770] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2024-07-11 03:39:54,807] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2024-07-11 03:39:54,882] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)
local rank: 2 True 3
local rank: 1 True 3
local rank: 0 True 3
Namespace(load_from='/cosmos/local/IndexQuality/FinetuneLLM/FullTrainTest/Mixtral_Zifandata_v1/current_best_1800', tokenizer_dir='/cosmos/local/IndexQuality/FinetuneLLM/Mixtral-8x7B-Instruct-v0.1/', input_file='/cosmos/local/SpamLLM/Prod/KRsets/scrapekr1.2_UHRSValidation_withPlugin.

# Evaluate AutoModelForSequenceClassification

In [5]:
%%writefile inf_v1.py
import argparse
import os
from collections import Counter
import pandas as pd
import torch
import torch.distributed as dist
from torch.utils.data import DataLoader, Dataset, DistributedSampler
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, AutoModelForCausalLM, default_data_collator, get_cosine_schedule_with_warmup, AutoConfig, BitsAndBytesConfig
from torch.nn.parallel import DistributedDataParallel as DDP
import numpy as np
from tqdm import tqdm
from transformers.tokenization_utils_base import BatchEncoding
import torch.nn.functional as F


def get_df_rank(df):
    """Return the slice of ``df`` that the current rank should evaluate.

    Relies on the module-level ``world_size`` and ``local_rank`` set in the
    main block. With more than one process, trailing rows that do not divide
    evenly across ranks are dropped so every rank gets the same number of rows.
    """
    if world_size <= 1:
        return df

    usable_rows = (len(df) // world_size) * world_size
    trimmed = df.iloc[:usable_rows].copy()
    shards = np.array_split(trimmed, world_size)
    return shards[local_rank]

class EvalDataset(Dataset):
    """Map-style dataset producing fixed-length inputs for sequence classification.

    Each row's Url, UrlTitle, UrlSnippet and FullBody are joined with spaces,
    tokenized, terminated with the tokenizer's EOS id and padded to
    ``max_seq_length``.

    Args:
        df: DataFrame with 'Url', 'UrlTitle', 'UrlSnippet', 'FullBody' and
            'Label' columns.
        tokenizer: callable tokenizer that also provides ``eos_token_id`` and
            ``pad``.
        max_seq_length: total length of every returned sequence, EOS included.
    """

    def __init__(self, df, tokenizer, max_seq_length):
        self.df = df
        self.tokenizer = tokenizer
        self.max_seq_length = max_seq_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        sample = self.df.iloc[idx]

        # str() keeps missing/non-string cells from breaking the join.
        text = ' '.join([str(sample['Url']), str(sample['UrlTitle']), str(sample['UrlSnippet']), str(sample['FullBody'])])
        label = int(sample['Label'])

        # Leave room for one token so the EOS id appended below always fits.
        res = self.tokenizer(text, max_length=self.max_seq_length-1, return_attention_mask=False, padding=False, truncation=True)
        res['input_ids'] = res['input_ids'] + [self.tokenizer.eos_token_id]
        res = self.tokenizer.pad(res, max_length=self.max_seq_length, padding='max_length', return_attention_mask=True, return_tensors='pt')

        return {
            'input_ids': res['input_ids'],
            'attention_mask': res['attention_mask'],
            'labels': torch.tensor(label),
            # 'ids': torch.tensor(int(sample['Id']))
        }

if __name__ == '__main__':
    # Distributed evaluation entry point for a fine-tuned sequence classifier
    # (launched via torch.distributed.run). Each rank scores its shard of the
    # evaluation set and writes "<label>\t<P(class 1)>" lines to
    # "<output_dir>_<rank>.tsv".

    def _parse_bool(value):
        """Convert a command-line string such as 'True'/'false'/'1'/'0' to a bool.

        argparse's ``type=bool`` treats every non-empty string (including
        'False') as True, so the text is interpreted explicitly instead.
        """
        text = str(value).strip().lower()
        if text in ('true', 't', 'yes', 'y', '1'):
            return True
        if text in ('false', 'f', 'no', 'n', '0', ''):
            return False
        raise argparse.ArgumentTypeError(f'expected a boolean value, got {value!r}')

    parser = argparse.ArgumentParser()
    # parser.add_argument('--load_from', type=str, default='intfloat/e5-mistral-7b-instruct')
    parser.add_argument('--load_from', type=str, default='Qwen/Qwen2.5-0.5B-Instruct')
    parser.add_argument('--model_path', type=str)
    parser.add_argument('--eval_set', type=str)
    parser.add_argument('--max_seq_length', type=int, default=1024)
    parser.add_argument('--batch_size', type=int, default=128)
    parser.add_argument('--is_output_embedding', type=_parse_bool, default=False)
    # Despite the name this is a file-name prefix: "_<rank>.tsv" is appended.
    parser.add_argument('--output_dir', type=str)
    args = parser.parse_args()

    dist.init_process_group("nccl")
    world_size = dist.get_world_size()
    # dist.get_rank() is the global rank; it is used as the CUDA device index
    # below, which matches the per-node rank only for single-node launches.
    local_rank = dist.get_rank()
    print('local rank:', local_rank, torch.distributed.is_initialized(), world_size)
    if local_rank == 0:
        print(args)

    model = AutoModelForSequenceClassification.from_pretrained(args.load_from, load_in_8bit=False, device_map=f'cuda:{local_rank}', torch_dtype=torch.float16)
    # Load the checkpoint onto the CPU first so every rank does not
    # materialize it on the GPU it was saved from; load_state_dict then copies
    # the values into the model's own device.
    model_state_dict = torch.load(args.model_path, map_location='cpu')
    model.load_state_dict(model_state_dict)
    model.config.pad_token_id = model.config.eos_token_id  # new
    if world_size > 1:
        # NOTE(review): the DDP wrapper is created but inference below calls
        # `model` directly; kept as-is to preserve the original start-up behavior.
        ddp_model = DDP(model, device_ids=[local_rank])
    model.eval()

    # Pick the reader and label mapping from the evaluation-set name.
    if 'scrapekr' in args.eval_set:
        df = pd.read_parquet(args.eval_set)[["Url","UrlTitle","UrlSnippet","FullBody",'UrlExpectedLabel']]
        mapping = {'detrimental spam':1, 'non-detrimental spam':1, 'not spam':0}
        df['UrlExpectedLabel'] = df['UrlExpectedLabel'].map(mapping)
        df.rename(columns={'UrlExpectedLabel':'Label'}, inplace=True)
    elif 'test_dataset_2024_03_05' in args.eval_set:
        df = pd.read_csv(args.eval_set,sep="\t")[["Url","UrlTitle","UrlSnippet","FullBody","Label"]]
        df["Label"] = df["Label"].apply(lambda x:int(x.replace("<ANS>","").replace("</ANS>","")))
    elif 'spamgtx5.0' in args.eval_set:
        df = pd.read_parquet(args.eval_set)
        df["Label"] = df["AuditorJudgment"].apply(lambda x: 0 if x =='not spam' else 1)
        df = df[["Url","UrlTitle","UrlSnippet","FullBody","Label"]]
    elif 'Clean_60k' in args.eval_set:
        df = pd.read_csv(args.eval_set,sep="\t",lineterminator="\n").drop_duplicates()  # columns: ['Unnamed: 0', 'Url', 'UrlTitle', 'UrlSnippet', 'FullBody', 'Label']
    elif '8k_with_flipped_labels' in args.eval_set:
        df = pd.read_csv(args.eval_set,sep="\t")
    elif 'SpamLLM_Output_2.6.0_v4_with_renamed_schema' in args.eval_set:
        df = pd.read_csv(args.eval_set, sep='\t')
        df = df[["Id", "Url","UrlTitle","UrlSnippet","FullBody","Label"]]
        df = df[df['Label'].notna()]
    elif 'auditor' in args.eval_set:
        df = pd.read_csv(args.eval_set, sep="\t",header=None, names=["Url","UrlTitle","UrlSnippet","FullBody", "Label"])
        df = df[df['Label'].notna()]
    else:
        # Previously an unknown name fell through and failed later with a
        # NameError on `df`; fail here with a message that names the cause.
        raise ValueError(f'Unrecognized evaluation set, no loader matches: {args.eval_set}')

    doc_df_rank = get_df_rank(df)
    tokenizer = AutoTokenizer.from_pretrained(args.load_from)
    tokenizer.pad_token = tokenizer.eos_token

    eval_dataset = EvalDataset(doc_df_rank, tokenizer, args.max_seq_length)
    eval_dataloader = DataLoader(eval_dataset, batch_size=args.batch_size, shuffle=False)

    if args.is_output_embedding:
        # The second-to-last named parameter does not change during
        # evaluation, so it is serialized once here instead of once per batch.
        # NOTE(review): embedding_str is built but not written anywhere yet;
        # which tensor this parameter is depends on the model architecture.
        all_params = list(model.named_parameters())
        embedding_name = all_params[-2][0]
        embedding = all_params[-2][1]
        embedding_str = '|'.join(map(str, embedding.tolist()))

    with open(args.output_dir+'_'+str(local_rank)+".tsv", 'w') as f:
        for cur, x in tqdm(enumerate(eval_dataloader), total=len(eval_dataloader), disable=(local_rank!=0)):
            model_input = BatchEncoding(x).to(f'cuda:{local_rank}')
            with torch.no_grad():
                label = model_input["labels"]
                outputs = model(input_ids=model_input["input_ids"], attention_mask=model_input["attention_mask"])
                output_logits = outputs.logits
                # Probability of class index 1 for every sample in the batch.
                output_prob = F.softmax(output_logits, dim=1)
                output_prob = output_prob[:, 1]
                for sample_label, sample_prob in zip(label.tolist(), output_prob.tolist()):
                    f.write("{0}\t{1}\n".format(sample_label, sample_prob))

    print(f'the prediction result is saved here: {args.output_dir}_{str(local_rank)}.tsv')

Writing inf_v1.py


### E5 Inference Command

In [3]:
# !export CUDA_VISIBLE_DEVICES="0,1,2"
# !NCCL_DEBUG=WARN python -m torch.distributed.run  \
# --master_port 29501 --nnodes 1 --nproc_per_node 3 inf_multi_e5_model_for_predict_zifan_data6.py \
# --model_path /cosmos/local/IndexQuality/FinetuneLLM/FullTrainTest/Mixtral_New_ym_e5/current_best_1200/pytorch_model.bin \
# --eval_set /cosmos/local/IndexQuality/FinetuneLLM/TrainingData/SpamLLM_Output_2.6.0_v4_with_renamed_schema.csv \
# --max_seq_length 1024 \
# --batch_size 32 \
# --output_dir /cosmos/local/IndexQuality/FinetuneLLM/FullTrainTest/Mixtral_New_ym_e5/current_best_1200/zifan_data

# Each "!" line runs in its own subshell, so a "!export" is lost before the
# launch command starts. %env sets the variable in the kernel environment,
# which the launched processes inherit.
%env CUDA_VISIBLE_DEVICES=1,2,3
!NCCL_DEBUG=WARN python -m torch.distributed.run  \
--master_port 29501 --nnodes 1 --nproc_per_node 3 inf_qwen_v1.py \
--load_from 'intfloat/e5-mistral-7b-instruct' \
--model_path /cosmos/local/IndexQuality/FinetuneLLM/FullTrainTest/Mistral_New_ym_e5_v1/current_best_1000/pytorch_model.bin \
--eval_set /cosmos/local/users/zifanwang/SpamLLM/data/auditor_ym_escape1.tsv \
--max_seq_length 1024 \
--batch_size 32 \
--output_dir /cosmos/local/IndexQuality/FinetuneLLM/FullTrainTest/Mistral_New_ym_e5_v1/current_best_1000/auditor_ym_escape1.tsv

*****************************************
Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
*****************************************
local rank: 1 True 3
local rank: 0 True 3
Namespace(load_from='intfloat/e5-mistral-7b-instruct', model_path='/cosmos/local/IndexQuality/FinetuneLLM/FullTrainTest/Mistral_New_ym_e5_v1/current_best_1000/pytorch_model.bin', eval_set='/cosmos/local/users/zifanwang/SpamLLM/data/auditor_ym_escape1.tsv', max_seq_length=1024, batch_size=32, output_dir='/cosmos/local/IndexQuality/FinetuneLLM/FullTrainTest/Mistral_New_ym_e5_v1/current_best_1000/auditor_ym_escape1.tsv')
local rank: 2 True 3
Loading checkpoint shards: 100%|██████████████████| 2/2 [00:03<00:00,  1.58s/it]
Some weights of MistralForSequenceClassification were not initialized from the model checkpoint at intfloat/e5-mistral-7b-instruct and are n

### Qwen Inference Command

In [13]:
# Each "!" line runs in its own subshell, so a "!export" is lost before the
# launch command starts. %env sets the variable in the kernel environment,
# which the launched processes inherit.
%env CUDA_VISIBLE_DEVICES=1,2,3
!NCCL_DEBUG=WARN python -m torch.distributed.run  \
--master_port 29501 --nnodes 1 --nproc_per_node 3 inf_qwen_v1.py \
--model_path /cosmos/local/IndexQuality/FinetuneLLM/FullTrainTest/qwen_no_overlap_o1_a3_v1/model_1200/pytorch_model.bin \
--load_from Qwen/Qwen2.5-0.5B-Instruct \
--eval_set /cosmos/local/IndexQuality/FinetuneLLM/EvaluationSets/scrapekr1.2_spamllm2.4.parquet \
--max_seq_length 1024 \
--batch_size 32 \
--is_output_embedding True \
--output_dir /cosmos/local/IndexQuality/FinetuneLLM/FullTrainTest/qwen_no_overlap_o1_a3_v1/model_1200/model_1200

*****************************************
Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
*****************************************
local rank: 1 True 3
local rank: 0 True 3
Namespace(load_from='Qwen/Qwen2.5-0.5B-Instruct', model_path='/cosmos/local/IndexQuality/FinetuneLLM/FullTrainTest/qwen_no_overlap_o1_a3_v1/model_1200/pytorch_model.bin', eval_set='/cosmos/local/IndexQuality/FinetuneLLM/EvaluationSets/scrapekr1.2_spamllm2.4.parquet', max_seq_length=1024, batch_size=32, output_dir='/cosmos/local/IndexQuality/FinetuneLLM/FullTrainTest/qwen_no_overlap_o1_a3_v1/model_1200/model_1200')
local rank: 2 True 3
Some weights of Qwen2ForSequenceClassification were not initialized from the model checkpoint at Qwen/Qwen2.5-0.5B-Instruct and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream tas

# Compute metrics for all E5 models

In [28]:
import numpy as np
from sklearn.metrics import average_precision_score
from sklearn.metrics import precision_recall_curve

# Gather the per-rank prediction files: each line is "<label>\t<score>".
Label = []
Pred = []
for rank in range(3):
    detail_path = f'/cosmos/local/IndexQuality/FinetuneLLM/FullTrainTest/Mixtral_New_ym_e5/model_100/eval_detail_{rank}.tsv'
    with open(detail_path, 'r') as f:
        for line in f:
            fields = line.strip().split('\t')
            Label.append(float(fields[0]))
            Pred.append(float(fields[1]))
print(f'total length: {len(Label)}')

precision, recall, thresholds = precision_recall_curve(Label, Pred)

# F-beta with beta=0.5 weights precision more heavily than recall; the small
# constant keeps the denominator away from zero.
beta=0.5
fbeta = (1 + beta**2) * precision * recall / ((beta**2) * precision + recall + 0.00000000001)
# Operating point with the highest F-beta (NaN entries ignored).
index = np.nanargmax(fbeta)

print("Threshold: ",thresholds[index])
print("Fbeta: ",fbeta[index])
print("auprc: ", average_precision_score(Label, Pred))
print("Precision: ",precision[index])
print("Recall: ",recall[index])

total length: 2661
Threshold:  0.77392578125
Fbeta:  0.6760204081534599
auprc:  0.6982521300535821
Precision:  0.7386759581881533
Recall:  0.5047619047619047
