## The purpose of this notebook is to fintune mosaic ml's 7B model to extract executives who are involved in IPO dealings or are High networth individuals

# Installations

In [None]:
!pip install transformers

In [2]:
!pip install einops

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# !pip install flash-attn==1.0.3.post0

In [None]:
# !pip install triton==2.0.0.dev20221202

In [None]:
# !pip install OpenAI

In [None]:
!pip install datasets

In [5]:
import transformers, torch
from transformers import AutoTokenizer,GenerationConfig
import pandas as pd

In [None]:
# model = transformers.AutoModelForCausalLM.from_pretrained(
#   'mosaicml/mpt-7b-instruct',
#   trust_remote_code=True
# )

In [None]:
config = transformers.AutoConfig.from_pretrained(
  'mosaicml/mpt-7b',
  trust_remote_code=True
)
# config.attn_config['attn_impl'] = 'flash'
config.update({"max_seq_len": 70000})
model = transformers.AutoModelForCausalLM.from_pretrained(
  'mosaicml/mpt-7b',
  config=config,
  torch_dtype=torch.bfloat16,
  trust_remote_code=True
)
model.to(device='cuda:0')

In [9]:
template = """Your task is to extract the person's name from the sentence attached as input in this prompt
along with their details mentioned below:
1. their designation, 
2. their companies ,
3. Number or percentage of shares bought
4. Number or percentage of shares sold
5. Type of shares
6. acquistions

The person should be either:
1. A person involved in sell or buying of any company's share at a large scale.
2. A person who is equivalent to CEO, chairman, board of directors,founder etc. 

If person name is not present then do not extract anything and respond 'None'
Extract the data in the list of python dictionaries format.
If the values are not available mark it as 'NA'.

"""
template1 = """
Your task is to extract the following details of the person from the sentence provided in the input:

1.Name (if available)
2.Designation (if available)
3.Company (if available)
4.Net Worth (if available)
5.Shares bought
6.Shares sold
7.Type of Shares (if available)
8.Acquisitions by the Company (if available)

Do not create codes or any garbage values that are not in the input provided

Extract the data in the list of python dictionaries format.If the values are not available, mark them as 'NA'.

The person whose details are to be extracted should meet any or all of the following criteria:

1.Involved in the buying or selling of any company's shares on a large scale.
2.Has a designation equivalent to CEO, chairman, board of directors, founder, etc.
3.Has a high net worth as indicated by indicators such as business ownership or executive leadership, 
real estate holdings, philanthropy, luxury lifestyle, or a diversified investment portfolio.


If the person's name is not present, but the designation, company, and net worth are mentioned, extract those details. 
If none of the details are available, respond with 'None'.

"""

template2 = """
Your task is to extract the following details from the sentence delimited by triple backticks:

1. Company from where shares were sold
2. Company from which shares were bought
3. Company of person who bought the share
4. Number of shares sold
5. Price at which shares were sold
6. Number of shares bought
7. Price at which shares were bought
8. Type of Shares (if available)

If the values are not available, mark them as 'NA'.

The sentence to be analyzed should mention the sale or purchase of shares by a company or individual. 
If none of the details are available, respond with 'None'.

The sentence to be analyzed is:

```{text}```

"""

In [None]:
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b")

In [None]:
df = pd.read_excel(r"/content/check.xlsx")

In [None]:
df.columns

Index(['Unnamed: 0.1', 'Unnamed: 0', 'sentence', 'extraction', 'extraction1'], dtype='object')

In [None]:
text = df.iloc[4]['sentence']

In [None]:
text

'Foreign investors Segantii India Mauritius, Morgan Stanley, and Goldman Sachs have picked shares worth Rs 640 crore in Zee Entertainment Enterprises via open market transactions on April 17.'

In [None]:
PROMPT = f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.
### Instruction:
{template}
### Input:
{text}
### Response:"""

In [None]:
inputs = tokenizer(
    PROMPT,
    return_tensors="pt",
)

In [None]:
input_ids = inputs["input_ids"].cuda()

generation_config = GenerationConfig(
    temperature=0.6,
    top_p=0.95,
    repetition_penalty=1.15,
)

In [None]:
print("Generating...")
generation_output = model.generate(
    input_ids=input_ids,
    generation_config=generation_config,
    return_dict_in_generate=True,
    output_scores=True,
    max_new_tokens=256,
)
for s in generation_output.sequences:
    print(tokenizer.decode(s))

Generating...
Below is an instruction that describes a task. Write a response that appropriately completes the request.
### Instruction:
Your task is to extract the person's name from the sentence attached as input in this prompt
along with their details mentioned below:
1. their designation, 
2. their companies,
3. Number or percentage of shares bought
4. Number or percentage of shares sold
5. Type of shares
6. acquistions

The person should be either:
1. A person involved in sell or buying of any company's share at a large scale.
2. A person who is equivalent to CEO, chairman, board of directors,founder etc. 

If person name is not present then do not extract anything and respond 'None'
Extract the data in the list of python dictionaries format.
If the values are not available mark it as 'NA'.


### Input:
Foreign investors Segantii India Mauritius, Morgan Stanley, and Goldman Sachs have picked shares worth Rs 640 crore in Zee Entertainment Enterprises via open market transactions on

## Attempting finetuning

In [4]:
import transformers, torch
from transformers import AutoTokenizer,GenerationConfig
import pandas as pd
import json,os,re
from datasets import load_dataset

# Data Preparation

In [5]:
df = pd.read_excel(r"/content/checkv1.0.xlsx")

In [8]:
df.columns

Index(['Unnamed: 0.1', 'Unnamed: 0', 'sentence', 'extraction', 'extraction1'], dtype='object')

In [None]:
examples = []
for idx, row in df.iterrows():
  text = row['sentence']
  PROMPT = f"""Your task is to extract the person's name from the sentence delimited by triple backticks
along with their details mentioned below:
1. their designation, 
2. their companies ,
3. Number or percentage of shares bought
4. Number or percentage of shares sold
5. Type of shares
6. acquistions

Output should be list of dictionaries only. No python code in output 

```{text}```
"""
  extraction = row['extraction1']
  examples.append({"question":PROMPT,"answers":extraction})
data = json.dumps(examples)

In [None]:
with open("mpt_data,json","w") as f:
  f.write(data)

## Multifurcate data into train test and validation

In [None]:
test_dataset = load_dataset("json", data_files="/content/mpt_data,json",split=['train[:20%]'])

In [None]:
val_dataset = load_dataset("json", data_files="/content/mpt_data,json",split=['train[20%:40%]'])



  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
train_dataset = load_dataset("json", data_files="/content/mpt_data,json",split=['train[40%:]'])

In [None]:
train_dataset[0]['question'][0]

In [None]:
train_dataset[0]['answers'][0]

"Output:\n[{'Name': 'Mihir B Manek', 'Designation': 'Research Analyst', 'Company': 'Aditya Birla Capital', 'Net Worth': 'NA', 'Shares bought': 'NA', 'Shares sold': 'NA', 'Type of Shares': 'NA', 'Acquisitions by the Company': 'NA'}]"

In [None]:
val_dataset

[Dataset({
     features: ['answers', 'question'],
     num_rows: 30
 })]

## Load model & Tokenization

In [None]:
config = transformers.AutoConfig.from_pretrained(
  'mosaicml/mpt-7b-instruct',
  trust_remote_code=True
)
# config.attn_config['attn_impl'] = 'flash'
config.update({"max_seq_len": 70000})
model = transformers.AutoModelForCausalLM.from_pretrained(
  'mosaicml/mpt-7b-instruct',
  config=config,
  torch_dtype=torch.bfloat16,
  trust_remote_code=True
)
model.to(device='cuda:0')

## load finetuned model

In [None]:
# config = transformers.AutoConfig.from_pretrained(
#   'gouravsinha/MPT-financial-NER',
#   trust_remote_code=True
# )
# # config.attn_config['attn_impl'] = 'flash'
# config.update({"max_seq_len": 70000})
# model1 = transformers.AutoModelForCausalLM.from_pretrained(
#   'gouravsinha/MPT-financial-NER',
#   config=config,
#   torch_dtype=torch.bfloat16,
#   trust_remote_code=True
# )
# model1.to(device='cuda:0')

In [8]:
model_checkpoint = "mosaicml/mpt-7b-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

## Generate output through instruct model

In [9]:
def generate_output(text,model):
  PROMPT = f"""Your task is to extract the person's name from the sentence delimited by triple backticks
along with their details mentioned below:
1. their designation, 
2. their companies ,
3. Number or percentage of shares bought
4. Number or percentage of shares sold
5. Type of shares
6. acquistions

Output should be list of dictionaries only. No python code in output 

```{text}```
# """

  input_text = PROMPT
  tokenized_input = tokenizer.encode(input_text, return_tensors = "pt").cuda()
  # tokenized_ouptut = tokenizer.encode(output_text, return_tensors = "pt").cuda()


  generation_config = GenerationConfig(
      temperature=0.6,
      top_p=0.95,
      top_k = 5,
      repetition_penalty=1.15,
  )
  generation_output = model.generate(
      input_ids=tokenized_input,
      generation_config=generation_config,
      return_dict_in_generate=True,
      output_scores=True,
      max_new_tokens=200,
  )
  for s in generation_output.sequences:
  # delim = tokenizer.decode(s[135])
  # print(s)
  # prediction = tokenizer.encode(tokenizer.decode(s[-200:]).split(delim)[0])
    prediction = tokenizer.encode(tokenizer.decode(s[-200:]))
  return tokenizer.decode(prediction).split("<|endoftext|>")[0]

In [10]:
from tqdm import tqdm

In [22]:
# df.drop(columns=['MPT_without_finetune'],inplace=True)

In [11]:
for idx, row in tqdm(df.iterrows()):
  df.loc[idx,'MPT_with_finetune'] = generate_output(row['sentence'],model1)

151it [15:39,  6.22s/it]


In [12]:
df.to_excel("checkv2.0.xlsx")

In [21]:
df.loc[0,'MPT_without_finetune']

'<|endoftext|># Extract Data From Sentence Using Python - Problem Statement #\n\n\n\nYou need to write code that extracts information about people (name + other info)  from this text using regular expressions/NLP techniques. The output will look like following dictionary structure where each key represents one piece of extracted infromation \n\n    {   "person_name": "Manish Choudhary",     \n        "designation":"Head Of Reseach At StoXBox ",       \n         "company":"StoxboX","shares_bought":0,"percentage_of_share_sold": 0,"type_of_stock":"equity ","acquisitions": None}\n \nHere you can see there were multiple persons named Manish chowdhery so we used regex pattern matching technique which returned all possible matches for us alongwith some additional fields such as designations & Companies they work currently / previously worked\n\n\n\n\n\n\n\n\n\nThis problem statement was contributed under CodeFor'

In [None]:
# df.iloc[0:1]['sentence'].apply(lambda x : generate_output(x)).values[0]

In [None]:
!nvidia-smi

Mon May  8 22:09:18 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   32C    P0    54W / 400W |  13667MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
total_params = sum(
	param.numel() for param in model.parameters()
)
print(f"Total Parameters = {total_params/1000**2:.2f} Million parameters")

Total Parameters = 6649.29 Million parameters


In [None]:
trainable_params = sum(
	p.numel() for p in model.parameters() if p.requires_grad
)
print(f"Total trainable Parameters = {trainable_params/1000**2:.2f} Million parameters")

Total trainable Parameters = 6649.29 Million parameters


## observe **layers**

In [None]:
for name, params in model.named_parameters():
  print(f"{name} num_of_parameters:{params.numel()//1000**2:.2f} Million parameters")

In [None]:
model.transformer.blocks[-1]

MPTBlock(
  (norm_1): LPLayerNorm((4096,), eps=1e-05, elementwise_affine=True)
  (attn): MultiheadAttention(
    (Wqkv): Linear(in_features=4096, out_features=12288, bias=False)
    (out_proj): Linear(in_features=4096, out_features=4096, bias=False)
  )
  (norm_2): LPLayerNorm((4096,), eps=1e-05, elementwise_affine=True)
  (ffn): MPTMLP(
    (up_proj): Linear(in_features=4096, out_features=16384, bias=False)
    (act): GELU(approximate='none')
    (down_proj): Linear(in_features=16384, out_features=4096, bias=False)
  )
  (resid_attn_dropout): Dropout(p=0, inplace=False)
  (resid_ffn_dropout): Dropout(p=0, inplace=False)
)

## total number of parameters in last MPT block

In [None]:
total_params = sum(
	param.numel() for param in model.transformer.blocks[-1].parameters()
)
print(f"Total Parameters = {total_params/1000**2:.2f} Million parameters")

Total Parameters = 201.33 Million parameters


we can see that total number of parameters in the last block of mpt is ~ 200 million

## freeze all parameters except last block

In [None]:
for name, params in model.named_parameters():
  if '31' not in name:
    params.requires_grad = False
  print(f"{name} num_of_parameters:{params.numel()//1000**2:.2f} Million parameters")

#### unfreeze

In [None]:
# for param in model.transformer.blocks[:-2].parameters():
#   param.requires_grad = False

#### no. of trainable parameters

In [None]:
trainable_params = sum(
	p.numel() for p in model.parameters() if p.requires_grad
)
print(f"Total trainable Parameters = {trainable_params/1000**2:.2f} Million parameters")

Total trainable Parameters = 201.33 Million parameters


## Model Training

## Note : Training needs GPU of 40 GBs or higher

In [None]:
train_dataset[0]

Dataset({
    features: ['answers', 'question'],
    num_rows: 91
})

### Add padding

In [None]:
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

1

In [None]:
train_data = train_dataset[0].shuffle().map(
    lambda data_point: tokenizer(
        text = data_point['question'],
        text_target = data_point['answers'],
        padding="max_length",
    )
)

test_data = test_dataset[0].shuffle().map(
    lambda data_point: tokenizer(
        text = data_point['question'],
        text_target = data_point['answers'],
        padding="max_length",
    )
) 

val_data = val_dataset[0].shuffle().map(
    lambda data_point: tokenizer(
        text = data_point['question'],
        text_target = data_point['answers'],
        padding="max_length",
    )
) 

Map:   0%|          | 0/91 [00:00<?, ? examples/s]

Map:   0%|          | 0/30 [00:00<?, ? examples/s]

Map:   0%|          | 0/30 [00:00<?, ? examples/s]

In [None]:
#training arguments
args = transformers.TrainingArguments(
    learning_rate = 2e-5,
    evaluation_strategy = 'no',
    save_strategy = 'epoch',
    num_train_epochs = 3,
    weight_decay = 0.01,
    logging_steps=1,
    # warmup_steps = 5,
    output_dir = 'mpt_outputs'
)



In [None]:
trainer = transformers.Trainer(
    model = model, 
    args = args,
    train_dataset = train_data,
    eval_dataset = val_data,
    tokenizer = tokenizer,
)

In [None]:
trainer.train()

You're using a GPTNeoXTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
1,28.125
2,26.5
3,24.375
4,21.25
5,19.125
6,17.125
7,16.0
8,13.3125
9,12.3125
10,10.875


TrainOutput(global_step=36, training_loss=10.50607638888889, metrics={'train_runtime': 182.6793, 'train_samples_per_second': 1.494, 'train_steps_per_second': 0.197, 'total_flos': 2.1612893690658816e+16, 'train_loss': 10.50607638888889, 'epoch': 3.0})

### Test the model

In [None]:
text = df.iloc[3]['sentence']
print(text)
output_text = df.iloc[3]['extraction1']
PROMPT = f"""Your task is to extract the person's name from the sentence delimited by triple backticks
along with their details mentioned below:
1.Name (if available)
2.Designation (if available)
3.Company (if available)
4.Net Worth (if available)
5.Shares bought
6.Shares sold
7.Type of Shares (if available)
8.Acquisitions by the Company (if available)

Output should be list of dictionaries only. No python code in output

```{text}```
"""

input_text = PROMPT
tokenized_input = tokenizer.encode(input_text, return_tensors = "pt").cuda()
tokenized_ouptut = tokenizer.encode(output_text, return_tensors = "pt").cuda()


generation_config = GenerationConfig(
    temperature=0.6,
    top_p=0.95,
    top_k = 5,
    repetition_penalty=1.15,
)
generation_output = model.generate(
    input_ids=tokenized_input,
    generation_config=generation_config,
    return_dict_in_generate=True,
    output_scores=True,
    max_new_tokens=200,
)

# for s in generation_output.sequences:
#   for i, token in enumerate(s):
#     if tokenizer.decode(s[i]) == "<|endoftext|>":
#       print(i)
  # print(tokenizer.decode(s[-200:-39]))


for s in generation_output.sequences:
  # delim = tokenizer.decode(s[135])
  # print(s)
  # prediction = tokenizer.encode(tokenizer.decode(s[-200:]).split(delim)[0])
  prediction = tokenizer.encode(tokenizer.decode(s[-200:]))
  # print(delim)

print(tokenizer.decode(prediction))

The electronics manufacturing industry is known for its volatility, and raw material costs may adversely affect the business," Krishna Raghavan, Founder of Unlistedkart, said.

{'name': 'Krishna', 
  'designation':'Founder','company':'Unlistedkart'}<|endoftext|>#import pandas as pd #df =pd.read_csv('data/sample-input') print(list([dict({'acquisition':x})for xin df['acquisitions']])) {'acquisition': ['Sensex'],} [{'sharesbought': 100,'sharesold': 200}]

 [****,ÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂ145145001451450014514500ÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂ

## <li> I ran out of computing resources on colab thats why I could train the model only till 3 epochs.
## <li> With only 3 epochs the model was able to extract customised entities.
## <li> With more epochs the model can be trained to discard hallucinations/unwanted characters as well.

## Saving the model locally

In [None]:
trainer.save_model("MPT-finetuned")

## Loading the model from local disk

In [None]:
config = transformers.AutoConfig.from_pretrained(
  '/content/MPT-finetuned',
  trust_remote_code=True
)
# config.attn_config['attn_impl'] = 'flash'
config.update({"max_seq_len": 70000})
model1 = transformers.AutoModelForCausalLM.from_pretrained(
  '/content/MPT-finetuned',
  config=config,
  torch_dtype=torch.bfloat16,
  trust_remote_code=True
)
model1.to(device='cuda:0')

## Saving the model to huggingface

In [None]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
model.push_to_hub("gouravsinha/MPT-financial-NER", use_auth_token=True)

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/3.36G [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/gouravsinha/MPT-financial-NER/commit/cf7612319dacccab947576690d61dbdb76a9b12f', commit_message='Upload MPTForCausalLM', commit_description='', oid='cf7612319dacccab947576690d61dbdb76a9b12f', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
output = trainer.predict(val_data)

In [None]:
type(output)

transformers.trainer_utils.PredictionOutput

In [None]:
output[2]

{'test_loss': 6.633333206176758,
 'test_runtime': 12.944,
 'test_samples_per_second': 2.318,
 'test_steps_per_second': 0.309}

In [None]:
predictions,_,_ = output

In [None]:
predictions.shape

(30, 2048, 50432)