In [1]:
from huggingface_hub import login

import os

# Authenticate against the Hugging Face Hub without embedding the secret in
# the notebook source. The token is read from the HF_TOKEN environment
# variable; when it is unset or empty, `token=None` is passed so that `login`
# asks for the token interactively instead.
login(token=os.environ.get("HF_TOKEN") or None)

  from .autonotebook import tqdm as notebook_tqdm


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/karthick/.cache/huggingface/token
Login successful


In [2]:
import torch 
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, BitsAndBytesConfig

# Hub identifier shared by the model and tokenizer loads below.
model_name = "microsoft/Phi-3-mini-4k-instruct"

# Seed torch's RNG before anything else in this cell runs.
torch.random.manual_seed(0)

# 4-bit NF4 weights with double quantization; bfloat16 is the compute dtype.
nf4_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load the quantized causal LM onto the GPU.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=nf4_config,
    device_map="cuda",
    torch_dtype="auto",
    trust_remote_code=True,
)

# Tokenizer that matches the checkpoint above.
tokenizer = AutoTokenizer.from_pretrained(model_name)


Loading checkpoint shards: 100%|████████████████████████████████████████████████████████████| 2/2 [00:02<00:00,  1.29s/it]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
import transformers
from datasets import load_dataset
from typing import Dict, List, Optional

# Token limit applied by the length filter further down the notebook.
max_len = 1024

# Load only the "train" split of the task dataset and show its summary.
dataset_name = "tryhighlight/task_dataset"
input_dataset = load_dataset(dataset_name, split="train")
print(input_dataset)

Downloading readme: 100%|████████████████████████████████████████████████████████████████| 455/455 [00:00<00:00, 1.79MB/s]
Downloading data: 100%|██████████████████████████████████████████████████████████████| 20.6M/20.6M [00:01<00:00, 10.5MB/s]
Downloading data: 100%|██████████████████████████████████████████████████████████████| 5.69M/5.69M [00:01<00:00, 3.54MB/s]
Generating train split: 100%|████████████████████████████████████████████| 30200/30200 [00:00<00:00, 182748.32 examples/s]
Generating test split: 100%|███████████████████████████████████████████████| 7688/7688 [00:00<00:00, 201739.31 examples/s]

Dataset({
    features: ['NAME', 'CONVERSATION', 'TASK'],
    num_rows: 30200
})





In [4]:
# Keep only the first 100 rows for a quick run. The count is clamped to the
# dataset size so the selection stays valid for datasets with fewer than 100
# rows (re-running this cell is also a no-op once the subset has been taken).
num_eval_samples = min(100, len(input_dataset))
input_dataset = input_dataset.select(range(num_eval_samples))

In [5]:
# Instruction text used as the "system" message for every inference request
# and included when measuring sample length. It tells the model to reply with
# a single-line TODO or the exact phrase "No task".
# NOTE(review): presumably this must match the prompt the adapter was
# fine-tuned with -- confirm before editing the wording.
system_prompt = "You are a helpful ai assistant. User will provide full name followed by some email or messaging conversation seen on his/her computer screen. Looking at the conversation, detect if there are any TODOs that the above user has to complete as a result of the conversation. If yes, just provide the short single line task that can be directly added to the todo list. If there is no task detected as a TODO, just output the exact phrase \"No task\". If the conversation is about a promotional or advertisement related, please output \"No task\". If the conversation is directed or addressed to someone else, then output \"No task\"."

In [6]:
# Report how many samples are available before length filtering.
dataset_size = len(input_dataset)
print(f"Size of the dataset: {dataset_size} samples")


def check_length(row):
    """Return True when the row's tokenized text fits within the token budget.

    The measured text is the system prompt concatenated with the row's
    ``NAME``, ``CONVERSATION`` and ``TASK`` fields. The budget is ``max_len``
    minus 20 tokens held back for the chat template.
    """
    full_text = system_prompt + row["NAME"] + row["CONVERSATION"] + row["TASK"]
    token_ids = tokenizer.encode(
        full_text,
        add_special_tokens=True,
        truncation=False,
        return_length=True,
        max_length=None,
    )
    token_budget = max_len - 20  # extra 20 tokens for the chat template
    return len(token_ids) <= token_budget


# Drop every row whose tokenized text exceeds the budget.
filtered_dataset = input_dataset.filter(check_length)
print(f"Size of the dataset after filtering: {len(filtered_dataset)} samples")

Size of the dataset: 100 samples


Filter: 100%|███████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 826.68 examples/s]

Size of the dataset after filtering: 87 samples





In [None]:
# Bare expression as the last line of the cell: the notebook displays the
# repr of `model` as the cell output (this cell has no recorded execution).
model

In [7]:
# Wrap the loaded model and tokenizer in a text-generation pipeline.
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)



In [8]:
# Keyword arguments forwarded to every `pipe(...)` call: at most 20 new
# tokens, only the generated continuation is returned, and sampling is
# enabled with temperature 1.0.
generation_args = dict(
    max_new_tokens=20,
    return_full_text=False,
    do_sample=True,
    temperature=1.0,
)

In [9]:
from peft import PeftModel, PeftConfig
# Hub identifier of the fine-tuned adapter to apply on top of the base model.
adapter_model_name = "tryhighlight/phi3-mini-task-dataset"

# Adapter configuration; loaded here but not consumed by this cell.
config = PeftConfig.from_pretrained(adapter_model_name)

# Wrap the base model with the adapter and make the pipeline use it.
peft_model = PeftModel.from_pretrained(model, adapter_model_name)
pipe.model = peft_model

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [10]:
# Run the pipeline on each filtered sample and print the reference task next
# to the generated one for manual comparison.
for sample in filtered_dataset:
    name, conversation = sample["NAME"], sample["CONVERSATION"]

    # Chat-style prompt: fixed system instruction plus the user's name and
    # the conversation text.
    system_turn = {"role": "system", "content": system_prompt}
    user_turn = {"role": "user", "content": "My name is " + name + " \n" + conversation}
    messages = [system_turn, user_turn]

    print('name : ', name)
    print('CONVERSATION :', conversation)

    output = pipe(messages, **generation_args)
    generated_text = output[0]['generated_text']

    print('Expected output : ', sample['TASK'])
    print('Inference output : ', generated_text, '\n')

name :  Phillip Allen
CONVERSATION : RE: West Position
Dunton, Heather <heather.dunton@enron.com>  Fri, Dec 07, 2001 at 12:06 PM
to Allen, Phillip K. <k..allen@enron.com>
Please let me know if you still need Curve Shift.
Thanks,
Heather
On Fri, Dec 07, 2001 at 05:14 AM Allen, Phillip K. wrote:
Heather,
Did you attach the file to this email?

On Wed, Dec 05, 2001 at 01:43 PM Dunton, Heather wrote:
Attached is the Delta position for 1/16, 1/30, 6/19, 7/13, 9/21

On Wed, Dec 05, 2001 at 06:41 AM Allen, Phillip K. wrote:
Heather,
This is exactly what we need.  Would it possible to add the prior day for each of the dates below to the pivot table.  In order to validate the curve shift on the dates below we also need the prior days ending positions.
Thank you,
Phillip Allen

On Tue, Dec 04, 2001 at 03:12 PM Dunton, Heather wrote:
Attached is the Delta position for 1/18, 1/31, 6/20, 7/16, 9/24

Let me know if you have any questions.
Heather



The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` model input instead.
You are not running the flash-attention implementation, expect numerical differences.


Expected output :  Task : Please let me know if you still need Curve Shift.
Inference output :  No task 

name :  Phillip Allen
CONVERSATION : NGI Publications - Monday, December 31st 2001
subscriptions@intelligencepress.com  Mon, Dec 31, 2001 at 01:42 AM
to pallen@enron.com
Dear phillip,

This e-mail is automated notification of the availability of your current Natural Gas Intelligence Newsletter(s). Please use your username of "pallen" and your password to access

       NGI's Daily Gas Price Index

       NGI's Weekly Gas Price Index

       Natural Gas Intelligence, the Weekly Newsletter

http://intelligencepress.com/subscribers/index.html

If you have forgotten your password please visit 
  http://intelligencepress.com/password.html
and we will send it to you.

If you would like to stop receiving e-mail notifications when your publications are available, please reply to this message with REMOVE E-MAIL in the subject line.

Thank you for your subscription.

For information about ot

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Expected output :  Task : Let Jeff know if you have an interest in the three property deals he will fax.
Inference output :  No task 

name :  Phillip Allen
CONVERSATION : Huntley/question
Robert W. Huntley, CFP <wise.counsel@lpl.com>  Thu, Oct 25, 2001 at 03:49 PM
to Allen, Phillip K. <k..allen@enron.com>
Phillip,

Could you please do me a favor?  I would like  to read your current title policy to see what it says about easements.  You  should have received a copy during your closing.  I don't know how many  pages it will be but let me know how you want to handle getting a copy  made.  I'll be happy to make the copy, or whatever makes it easy for  you.

Thanks,

Bob Huntley
Expected output :  Task : Provide a copy of the current title policy to Bob Huntley.
Inference output :  No task 

name :  Phillip Allen
CONVERSATION : Distribution Form
Ratcliff, Renee <renee.ratcliff@enron.com>  Thu, Oct 25, 2001 at 02:04 PM
to Allen, Phillip K. <k..allen@enron.com>
Phillip,
Pursuant to your requ

KeyboardInterrupt: 