In [None]:
# Prompt fragments keyed by position: 'A' is the instruction preamble and
# 'B' is the closing request that introduces the expected output format.
# The pieces are written as explicit adjacent literals so that the trailing
# space and the blank lines embedded in the text stay visible.
function = {
    'A': (
        "You are an extraction model designed to identify and extract Personally Identifiable Information (PII) from text and format it into a structured JSON. Your task is to analyze the provided text and extract the relevant data fields as specified below. \n"
        "\n"
        "Ensure that the output adheres to the exact JSON structure, including all fields, even if they are empty. If no data is found, output an empty JSON with the same structure.\n"
        "\n"
    ),
    'B': (
        "\n"
        "Please provide only the JSON output, with no additional comments or explanations. Use the following format:\n"
    ),
}
# Empty extraction schema, kept as a JSON-formatted string. It is inserted
# verbatim into the model prompt (under "### Template:") and is also used as
# the initial "current" state of the sliding-window extraction, where it is
# parsed with json.loads to validate the structure of each model answer.
# Every leaf value is an empty string; the literal starts and ends with a
# newline and is indented with tab characters.
template = """
{
	"Personal_Information": {
		"Person": {
			"Name": "",
			"National_ID": "",
			"Passport_Number": "",
			"Social_Security_Number": "",
			"Birth_Date": "",
			"Age": "",
			"Height": "",
			"Weight": "",
			"Gender": "",
			"Marital_Status": "",
			"Number_of_Children": "",
			"Nationality_Citizenship": "",
			"Place_of_Birth": "",
			"Mother's_Maiden_Name": "",
			"Race_Ethnic": "",
			"Religion": "",
			"Philosophical_Belief": "",
			"Political_Affiliation": "",
			"Trade_Union_Affiliation": "",
			"Sexual_Preference": "",
			"Sex_Life": ""
		},
		"Appearance": {
			"Picture_of_Face": "",
			"Distinguishing_Characteristic": ""
		},
		"Contact_Information": {
			"Home_Address": {
				"Street_Address": "",
				"City": "",
				"State": "",
				"ZIP_Code": "",
				"Country": ""
			},
			"Phone_Number": "",
			"Email_Address": "",
			"Family_Friend_Contact_Information": ""
		},
		"Online_Identifiers": {
			"Screen_Name": "",
			"Social_Network_Profile": "",
			"Social_Network_Activity": "",
			"URLs": "",
			"Online_Identifiers": ""
		},
		"Location_Information": {
			"Home_Town_City": "",
			"Geographical_Indicators": "",
			"Geo_Location": "",
			"Country": "",
			"ZIP_Code": "",
			"Address": "",
			"Date_Time": ""
		}
	},
	"Work_Information": {
		"Job_Title": "",
		"Occupation": "",
		"Work_ID": "",
		"Work_Address": {
			"Street_Address": "",
			"City": "",
			"State": "",
			"ZIP_Code": "",
			"Country": ""
		},
		"Work_Contact_Information": {
			"Work_Phone_Number": "",
			"Work_Email_Address": ""
		},
		"Employment_Information": {
			"Employment_Status": "",
			"Work_Experience": "",
			"Skills": "",
			"Education": ""
		},
		"Income_Level": ""
	},
	"Financial_Information": {
		"Banking_Details": {
			"Credit_Card_Number": "",
			"Credit_Score": "",
			"ABA_Routing_Number": "",
			"Bank_Account_Number": "",
			"Individual_Taxpayer_Identification": "",
			"SWIFT_Code": "",
			"Crypto": ""
		},
		"Invoice_Payments": "",
		"Financial_Information": ""
	},
	"Security_Information": {
		"Digital_Signature": "",
		"Password": "",
		"License_Numbers": {
			"Drivers_License_Number": "",
			"Vehicle_Registration_Number": "",
			"License_Plate_Number": ""
		},
		"Biometric_Data": {
			"Fingerprint_Data": "",
			"Voice_Print": "",
			"Handwriting_Sample": "",
			"Physiological_Data": "",
			"Genetic_Data": "",
			"X_Ray": "",
			"Biometric_Data": ""
		}
	},
	"Health_Information": {
		"Health_Insurance_ID": "",
		"Medical_History": "",
		"Physiological_Data": ""
	},
	"Cultural_and_Social_Identity": {
		"Cultural_Social_Identity": "",
		"Shopping_Behavior": "",
		"Survey_Answers": "",
		"Signed_Petitions": "",
		"Activities": "",
		"Law_Enforcement_Records": ""
	}
}
"""


In [None]:
import json
import csv
import re

label_name = "./ShareGPT sample (N=200) - Sheet1.csv"
save_file = 'test.json'

dataset_name = "./sg_90k_part1.json"
dataset_name2 = "./sg_90k_part2.json"

# Label names searched for in the CSV's 'PII types' column. The position of
# each label is its index in the 0/1 'PII' flag vector stored per record.
# Matching is a plain substring test on the raw CSV cell.
PII_LABELS = (
    'DATE_TIME',
    'EMAIL_ADDRESS',
    'LOCATION',
    'NRP',
    'PASSPORT_NUMBER',
    'PERSON',
    'PHONE_NUMBER',
    'URL',
)

# Read both dataset parts. The encoding is explicit so the result does not
# depend on the platform's default locale encoding.
with open(dataset_name, 'r', encoding='utf-8') as file:
    data_1 = json.load(file)

with open(dataset_name2, 'r', encoding='utf-8') as file:
    data_2 = json.load(file)

data_1 = data_1 + data_2
data_2 = []  # drop the reference to the second part

# Read the manual labels: one chat id and one free-text PII list per row.
# 'utf-8-sig' also accepts a file without a BOM, and strips one if present so
# the first header is read as 'Chat ID'.
IDs = []
PIIs = []
with open(label_name, newline='', encoding='utf-8-sig') as csvfile:
    for row in csv.DictReader(csvfile):
        IDs.append(row['Chat ID'])
        PIIs.append(row['PII types'])

# Constant-time lookup instead of scanning `IDs` for every dataset record.
# setdefault keeps the FIRST row for a repeated id, exactly like IDs.index().
pii_by_id = {}
for chat_id, pii_types in zip(IDs, PIIs):
    pii_by_id.setdefault(chat_id, pii_types)

ii = 0  # number of dataset records that have no label row
Valid_Data = []
dataset_classes = []
for dat in data_1:
    if dat['id'] not in pii_by_id:
        ii = ii + 1
        continue
    PII = pii_by_id[dat['id']]

    k = [1 if label in PII else 0 for label in PII_LABELS]
    Valid_Data.append({'id': dat['id'],
                       'PII': k,
                       'conversations': dat['conversations']})

data_1 = []  # release the full dataset; only Valid_Data is used afterwards



In [None]:
from vllm import LLM, SamplingParams,AsyncLLMEngine,EngineArgs
from vllm import TokensPrompt
# Token budgets shared by the sampler and the engine.
MAX_INPUT_SIZE = 8_192   # prompt truncation limit and engine context length
MAX_NEW_TOKENS = 6000    # upper bound on generated tokens per request

# Sampling configuration used for every generation request in this notebook.
# NOTE(review): the prompt truncation limit equals the engine's max_model_len,
# so a maximally long prompt would leave no room for generated tokens - confirm
# the intended budget split.
# NOTE(review): temperature 1.0 with top_k 50 and no seed makes runs
# non-deterministic - confirm this is wanted for an extraction task.
sampling_p = SamplingParams(
    truncate_prompt_tokens=MAX_INPUT_SIZE,
    temperature=1.0,
    top_p=1,
    min_p=0,
    top_k=50,
    skip_special_tokens=True,
    max_tokens=MAX_NEW_TOKENS,
)

# Offline vLLM engine running the NuExtract model, one sequence at a time.
llm = LLM(
    model="numind/NuExtract-1.5-smol",
    gpu_memory_utilization=0.95,
    dtype='bfloat16',
    max_model_len=MAX_INPUT_SIZE,
    max_num_seqs=1,
)




In [None]:

def clean_json_text(text):
    """Trim surrounding whitespace and undo backslash-escaping of '#' and '&'.

    Args:
        text: String to normalise.

    Returns:
        The stripped string with every ``\\#`` replaced by ``#`` and every
        ``\\&`` replaced by ``&``.
    """
    return text.strip().replace("\\#", "#").replace("\\&", "&")

def predict_chunk(text, template,current,model, tokenizer):
    """Run one extraction request for a single text chunk.

    The prompt contains the template, the cleaned ``current`` state and the
    chunk text, and ends with ``<|output|>{`` so that the generated text is
    the continuation after the opening brace.

    Args:
        text: Chunk of the document to extract from.
        template: Template string placed under "### Template:".
        current: Extraction state so far, placed under "### Current:".
        model: Object exposing ``generate(prompt, sampling_params)``
            (presumably a vLLM ``LLM`` - verify against the caller).
        tokenizer: Object exposing ``decode(token_ids, skip_special_tokens=...)``.

    Returns:
        The cleaned decoded text up to the first "<|output|>" marker, if any.

    Note:
        Uses the module-level ``sampling_p`` for the generation settings.
    """
    previous_state = clean_json_text(current)
    prompt = f"<|input|>\n### Template:\n{template}\n### Current:\n{previous_state}\n### Text:\n{text}\n\n<|output|>" + "{"

    request_outputs = model.generate(prompt, sampling_p)
    generated_ids = request_outputs[0].outputs[0].token_ids
    decoded = tokenizer.decode(generated_ids, skip_special_tokens=True)

    # Keep only what precedes a repeated output marker.
    answer, _, _ = decoded.partition("<|output|>")
    return clean_json_text(answer)
def split_document(document, window_size, overlap,tokenizer):
    """Split a document into overlapping token windows.

    Args:
        document: Text to split.
        window_size: Maximum number of tokens per chunk.
        overlap: Number of tokens shared by two consecutive chunks.
        tokenizer: Object exposing ``tokenize(str)`` and
            ``convert_tokens_to_string(tokens)``.

    Returns:
        A list of chunk strings. When the document has at most
        ``window_size`` tokens the list holds the original document unchanged.

    Raises:
        ValueError: If the document must be split and ``overlap`` is not
            smaller than ``window_size``. The window could not advance, and
            previously a larger overlap silently produced no chunks at all.
    """
    tokens = tokenizer.tokenize(document)
    total = len(tokens)
    print(f"\tLength of document: {total} tokens")

    chunks = []
    if total > window_size:
        step = window_size - overlap
        if step <= 0:
            raise ValueError(
                f"overlap ({overlap}) must be smaller than window_size ({window_size})"
            )
        for start in range(0, total, step):
            window = tokens[start:start + window_size]
            end = start + len(window)
            print(f"\t{start} to {end}")
            chunks.append(tokenizer.convert_tokens_to_string(window))

            # The current window already reaches the last token; a further
            # window would only repeat the overlap region.
            if end >= total:
                break
    else:
        chunks.append(document)
    print(f"\tSplit into {len(chunks)} chunks")

    return chunks
    
def check_structure(dic1,dic2):
    """Tell whether two nested dicts have the same key layout.

    Two dicts match when they have the same set of keys and, for every key,
    either both values are dicts that match recursively or neither value is a
    dict. Non-dict values are not compared. The first mismatch found is
    printed.

    Args:
        dic1: Reference dictionary.
        dic2: Dictionary to compare against the reference.

    Returns:
        True when the layouts match, False otherwise.
    """
    if dic1.keys() != dic2.keys():
        print(f"{dic1.keys()} is not {dic2.keys()}")
        return False

    for key, left in dic1.items():
        right = dic2[key]
        left_is_dict = isinstance(left, dict)
        right_is_dict = isinstance(right, dict)

        # One side is a nested object and the other is a leaf.
        if left_is_dict != right_is_dict:
            print(f"{left} is not {right}" )
            return False

        if left_is_dict and not check_structure(left, right):
            return False

    return True
    
def handle_broken_output(pred, prev):
    """Accept a model answer only if it is valid JSON with the expected layout.

    Args:
        pred: Raw JSON text produced by the model, possibly without its
            opening brace.
        prev: JSON text of the last accepted state; it is the fallback and
            also the reference layout.

    Returns:
        ``pred`` (with the opening brace restored if needed) when both strings
        parse and ``check_structure`` reports matching layouts; otherwise
        ``prev`` unchanged.
    """
    # startswith() instead of pred[0]: an empty answer must fall back to
    # `prev` rather than raise IndexError before validation begins.
    if not pred.startswith("{"):
        pred = "\n{\n\t" + pred

    try:
        prev_obj = dict(json.loads(prev))
        pred_obj = dict(json.loads(pred))
        structures_match = check_structure(prev_obj, pred_obj)
    except Exception:
        # Still best-effort for any parsing/validation error, but no longer
        # swallows KeyboardInterrupt or SystemExit as the bare except did.
        print("Data Malformation detected.")
        return prev

    if not structures_match:
        print("structures does not match")
        return prev

    print("matching structures and new data acquired.")
    return pred

def sliding_window_prediction(text, template, model, window_size=4000, overlap=128):
    """Extract from a long text chunk by chunk, carrying the state forward.

    The text is split into overlapping token windows. Each window is sent to
    the model together with the state accepted so far, and an answer replaces
    that state only when it passes ``handle_broken_output``.

    Args:
        text: Document to process.
        template: JSON template string; also the initial state.
        model: Object exposing ``get_tokenizer()`` and whatever
            ``predict_chunk`` needs (presumably a vLLM ``LLM``).
        window_size: Maximum number of tokens per chunk.
        overlap: Number of tokens shared by consecutive chunks.

    Returns:
        A ``(state, seconds)`` tuple: the last accepted JSON text and the
        wall-clock time spent in the prediction loop. If no chunk was
        produced the state is ``template``.
    """
    # Imported here because this function is defined in a cell that comes
    # before the notebook's `import time` cell.
    import time

    tokenizer = model.get_tokenizer()
    # split_document tokenizes the text itself; no separate tokenization pass
    # is needed here.
    chunks = split_document(text, window_size, overlap, tokenizer)

    started = time.time()
    prev = template
    for i, chunk in enumerate(chunks):
        print(f"Processing chunk {i}...")
        pred = predict_chunk(chunk, template, prev, model, tokenizer)

        # Keep the previous state when the answer is malformed or has the
        # wrong layout.
        prev = handle_broken_output(pred, prev)
    elapsed = time.time() - started

    # Returning `prev` (always bound) equals the old `pred` after the last
    # iteration and avoids an UnboundLocalError when there are no chunks.
    return prev, elapsed
def call_llm(llm, text, template , sampling_p):
    """Run extraction over all chunks of a text in one batched request.

    The text is split with the module-level ``text_splitter`` and one
    independent prompt is built per chunk.

    Args:
        llm: Object exposing ``generate(prompts, sampling_params)``
            (presumably a vLLM ``LLM``).
        text: Document to process.
        template: JSON template string placed under "### Template:".
        sampling_p: Sampling parameters forwarded to ``llm.generate``.

    Returns:
        A ``(out, seconds, parts)`` tuple: whatever ``llm.generate`` returned
        for the batch, the wall-clock time of that call, and the number of
        chunks the text was split into.
    """
    # Imported here because this function is defined in a cell that comes
    # before the notebook's `import time` cell.
    import time

    sub_texts = text_splitter.split_text(text)
    parts = len(sub_texts)
    if parts > 1:
        print(f"Split text in {parts} parts")

    # One prompt per chunk. Each prompt carries only its own chunk under
    # "### Text:". Previously the chunk was placed under "### Current:" and
    # the complete, unsplit `text` was appended to every prompt, which
    # defeated the splitting.
    prompts_PIIs = [
        f"<|input|>\n### Template:\n{template}\n### Text:\n{chunk}\n\n<|output|>\n"
        for chunk in sub_texts
    ]

    start = time.time()
    out = llm.generate(prompts_PIIs, sampling_p)
    t = time.time() - start

    return out, t, parts

In [None]:
import time
from IPython.display import clear_output
from vllm.sampling_params import GuidedDecodingParams
from langchain_text_splitters import TokenTextSplitter
import vllm
outputs = []
prompts = []
texts = []
size = 4000      # tokens per window
overlap = 128    # tokens shared by consecutive windows
text_splitter = TokenTextSplitter(chunk_size=size , chunk_overlap=overlap)
out = []
MyData = Valid_Data
hh = len(MyData)
out_conv = []  # one entry per processed record; this is what gets saved
for ii, Data in enumerate(MyData):
    if 'conversations' in Data:
        # Only non-gpt turns are sent to the extractor.
        user_turns = [turn for turn in Data['conversations'] if 'gpt' not in turn['from']]
        jj = len(user_turns)
        out_text = []
        for i, tt in enumerate(user_turns):
            # Progress is reported once per processed prompt, so the counter
            # can no longer run past the total on gpt turns.
            clear_output(wait=True)
            print(f"Conversation: {ii+1}/{hh} - Prompt: {i+1}/{jj} ID: {Data['id']}")
            text = tt['value']
            part_out, t = sliding_window_prediction(text, template, llm, window_size=size, overlap=overlap)
            out_text.append({'out': part_out, 'times': t})
        out_conv.append({"id": Data['id'], "data": out_text})

    else:
        # Record without a 'conversations' entry: treated as plain text and
        # sent through the batched path.
        # NOTE(review): unreachable for Valid_Data as built in the loading
        # cell (every record there has 'conversations') - confirm intended use.
        text = Data
        # call_llm takes (llm, text, template, sampling_p) and returns three
        # values; the old call passed undefined names and unpacked two.
        request_outputs, t, _parts = call_llm(llm, text, template, sampling_p)
        # Keep only the generated text so the result stays JSON-serialisable.
        part_out = [req.outputs[0].text for req in request_outputs]
        # Fresh list: previously this appended to a stale or undefined out_text.
        out_text = [{'out': part_out, 'times': t}]
        out_conv.append(out_text)
    # NOTE(review): this appends the same growing list object on every
    # iteration, so `out` ends up holding repeated references to out_conv.
    # Kept as-is because the intended shape of `out` is unclear.
    out.append(out_conv)


In [None]:
# Display the current value of `ii`. When the cells are run in order this is
# the zero-based index of the last record visited by the inference loop (that
# loop rebinds `ii`), not the skipped-record counter from the loading cell.
ii

In [None]:

nn = 0
import datetime
import os

# datetime.now() rather than date.today(): a date object has no time of day,
# so the %H and %M fields in the file name always came out as "00_00".
cur_time = datetime.datetime.now().strftime("%H_%M_%B_%d_%Y")
output_file = f"./outputs/t_vllm_tiny_test_single_template_single_out_date_{cur_time}_n_{nn}_6.json"
out_parsed = []

# Create the target directory first so a long inference run is not lost to a
# FileNotFoundError at save time.
os.makedirs(os.path.dirname(output_file), exist_ok=True)

# Persist the per-record extraction results collected by the inference loop.
with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(out_conv, f, indent=4)