In [1]:
import pandas as pd
from transformers import AutoTokenizer, LlamaForCausalLM
from transformers import AutoTokenizer, OpenLlamaForCausalLM
from transformers import LlamaTokenizer
import pandas as pd
import torch
import loralib as lora
import numpy as np
import matplotlib.pyplot as plt
import random
import copy
from torch import nn
import os


def seed_everything(seed: int = 42):
    """Seed every random number generator this notebook uses.

    Seeds Python's ``random``, NumPy's legacy global generator and PyTorch
    (CPU and all CUDA devices), and configures cuDNN for repeatable kernels.

    Args:
        seed: Value used for every generator.
    """
    random.seed(seed)
    np.random.seed(seed)
    # Only affects interpreters started after this point (e.g. subprocesses);
    # the hash seed of the running kernel is fixed at start-up.
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    # Safe to call without CUDA; covers every visible GPU, not just the current one.
    torch.cuda.manual_seed_all(seed)  # type: ignore
    torch.backends.cudnn.deterministic = True  # type: ignore
    # Benchmark mode auto-tunes kernels per input shape and may pick different
    # algorithms between runs, so it must be off for reproducible results.
    torch.backends.cudnn.benchmark = False  # type: ignore


seed_everything()



def prompt_setting(train_first_party, train_second_party, fact, max_chars=1000):
    """Build the O/X classification prompt for one legal case.

    If the full prompt fits in ``max_chars`` characters it is returned as is.
    Otherwise the prompt is rebuilt with a truncated ``fact`` so that
    instruction + party header + fact stays within ``max_chars`` (the trailing
    answer marker is appended on top of that budget).

    Args:
        train_first_party: Name of the first party.
        train_second_party: Name of the second party.
        fact: Free-text description of the case.
        max_chars: Character budget that triggers and bounds truncation.

    Returns:
        The prompt string, always ending in the answer marker.
    """
    answer_suffix = '\n\nanswer : '

    instruction = "Q : Guess if the first party can win in a legal case. Let me know the answer with O or X.\n\n"
    prompt = (
        instruction
        + 'first_party : ' + train_first_party
        + '\n\nsecond_partys : "' + train_second_party
        + '"\n\nfact : ' + fact
        + answer_suffix
    )
    if len(prompt) <= max_chars:
        return prompt

    # NOTE(review): the truncated variant has no "Q : " prefix and does not
    # quote the second party. Kept as is so prompts stay identical to the ones
    # used so far; confirm whether the difference is intentional.
    instruction = "Guess if the first party can win in a legal case. Let me know the answer with O or X.\n\n"
    prefix = 'first_party : ' + train_first_party + '\n\nsecond_partys : ' + train_second_party + '\n\nfact : '

    # Clamp at zero: a negative bound would slice from the END of the string
    # and keep almost the whole fact when the header alone exceeds the budget.
    fact_budget = max(0, max_chars - len(instruction) - len(prefix))
    return instruction + prefix + fact[:fact_budget] + answer_suffix



Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


# model

In [2]:
# Labelled cases; later cells read the columns first_party, second_party,
# facts and first_party_winner.
train = pd.read_csv('train.csv')
model_select = "AlekseyKorshuk/vicuna-7b" #openlm-research/open_llama_3b
# model_select = "chainyo/alpaca-lora-7b"
# model_select = "openlm-research/open_llama_3b"

tokenizer = LlamaTokenizer.from_pretrained(model_select)
# Ask for bfloat16 at load time instead of materializing the checkpoint in
# full precision and casting afterwards; this avoids the large host-memory
# peak while ending up with the same bfloat16 model on the GPU.
model = LlamaForCausalLM.from_pretrained(model_select, torch_dtype=torch.bfloat16).to("cuda")


Loading checkpoint shards:   0%|          | 0/14 [00:00<?, ?it/s]

Some weights of LlamaForCausalLM were not initialized from the model checkpoint at AlekseyKorshuk/vicuna-7b and are newly initialized: ['model.layers.12.self_attn.v_proj.lora_A', 'model.layers.5.self_attn.v_proj.lora_B', 'model.layers.17.self_attn.v_proj.lora_A', 'model.layers.8.self_attn.v_proj.lora_B', 'model.layers.14.self_attn.v_proj.lora_B', 'model.layers.18.self_attn.q_proj.bias', 'model.layers.2.self_attn.q_proj.lora_A', 'model.layers.0.self_attn.v_proj.lora_A', 'model.layers.12.self_attn.q_proj.lora_A', 'model.layers.23.self_attn.q_proj.lora_A', 'model.layers.21.self_attn.v_proj.bias', 'model.layers.15.self_attn.v_proj.lora_B', 'model.layers.27.self_attn.v_proj.bias', 'model.layers.9.self_attn.q_proj.bias', 'model.layers.12.self_attn.q_proj.bias', 'model.layers.7.self_attn.v_proj.lora_A', 'model.layers.5.self_attn.q_proj.lora_B', 'model.layers.30.self_attn.q_proj.lora_B', 'model.layers.27.self_attn.v_proj.lora_B', 'model.layers.25.self_attn.q_proj.lora_A', 'model.layers.3.self_

# inference test

In [3]:
# Smoke test: greedy-generate a short continuation for a trivial question to
# check that the tokenizer and the loaded model work together.
instruction = "Q: What is 1+1?\nA:"
prompt = instruction  # no additional input context is attached to the prompt

encoded = tokenizer(prompt, return_tensors="pt")
input_ids = encoded.input_ids.to(model.device)

# Inference only, so gradients are not tracked.
with torch.no_grad():
    outputs = model.generate(input_ids=input_ids, max_new_tokens=32)

# outputs[0] holds the prompt tokens followed by the generated ones.
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(response)



Q: What is 1+1?
A: One plus one is equal to two.
Q: What is the capital of France?
A: The capital of France is Paris.
Q: What


# train setting

In [4]:
# Freeze the base network: loralib turns off requires_grad on every parameter
# whose name does not contain "lora_" and leaves the LoRA matrices untouched.
# Do NOT call model.requires_grad_(False) before this: it would freeze the
# LoRA matrices as well, and mark_only_lora_as_trainable never re-enables them.
# (Assigning `model.requires_grad = ...` only sets a plain attribute on the
# module and has no effect on any parameter.)
lora.mark_only_lora_as_trainable(model)

# if you don't want binary return, then remove lm_head code
# Swap the vocabulary head for a fresh 2-way classifier (index 0 / index 1);
# its input width is read from the model config rather than hard-coded.
model.lm_head = nn.Linear(model.config.hidden_size, 2)
# requires_grad_ propagates to the head's weight and bias.
model.lm_head.requires_grad_(True)


# The new head is created in float32 on the CPU, so cast/move the whole model again.
my_model = model.to(torch.bfloat16).to('cuda')

# data setting

In [5]:
# Pull the four columns used by the notebook out of the DataFrame as lists.
columns = ['first_party', 'second_party', 'facts', 'first_party_winner']
first_partys, second_partys, facts, first_party_winners = (
    train[column].tolist() for column in columns
)

# Position of the 70/30 train/test boundary in the shuffled data.
index = int(len(first_partys) * 0.7)

# Shuffle whole cases (rows) so the four fields stay aligned, then split the
# rows back into four parallel tuples.
train_temp = list(zip(first_partys, second_partys, facts, first_party_winners))
random.shuffle(train_temp)
first_partys, second_partys, facts, first_party_winners = zip(*train_temp)

In [6]:
# Training portion: the first `index` shuffled cases, as arrays so they can be
# filtered with boolean masks.
train_first_partys = np.array(first_partys[:index])
train_second_partys = np.array(second_partys[:index])
train_facts = np.array(facts[:index])
train_first_party_winners = np.array(first_party_winners[:index])

# One mask per class, computed once and reused for every field.
train_zero_mask = train_first_party_winners == 0
train_one_mask = train_first_party_winners == 1
num_zero = np.sum(train_zero_mask)

zero_train_facts = train_facts[train_zero_mask]
zero_train_first_partys = train_first_partys[train_zero_mask]
zero_train_second_partys = train_second_partys[train_zero_mask]
zero_train_first_party_winners = train_first_party_winners[train_zero_mask]

# Balance the classes by keeping at most `num_zero` label-1 cases.
one_train_facts = train_facts[train_one_mask][:num_zero]
one_train_first_partys = train_first_partys[train_one_mask][:num_zero]
one_train_second_partys = train_second_partys[train_one_mask][:num_zero]
one_train_first_party_winners = train_first_party_winners[train_one_mask][:num_zero]

# Report both class sizes (label 0 first, then label 1).
print(len(zero_train_first_party_winners))
print(len(one_train_first_party_winners))

train_first_partys = zero_train_first_partys.tolist() + one_train_first_partys.tolist()
train_second_partys = zero_train_second_partys.tolist() + one_train_second_partys.tolist()
train_facts = zero_train_facts.tolist() + one_train_facts.tolist()
train_first_party_winners = zero_train_first_party_winners.tolist() + one_train_first_party_winners.tolist()

# Interleave the two classes; `train_temp` is reshuffled by the training cell
# at the start of every epoch.
train_temp = list(zip(train_first_partys, train_second_partys, train_facts, train_first_party_winners))
random.shuffle(train_temp)
train_first_partys, train_second_partys, train_facts, train_first_party_winners = zip(*train_temp)

568
568


In [7]:
# Held-out portion: every shuffled case from position `index` onwards,
# converted to arrays so boolean masks can be applied.
test_first_partys = np.array(first_partys[index:])
test_second_partys = np.array(second_partys[index:])
test_facts = np.array(facts[index:])
test_first_party_winners = np.array(first_party_winners[index:])

test_zero_mask = test_first_party_winners == 0
test_one_mask = test_first_party_winners == 1
num_zero = test_zero_mask.sum()

# Label-0 cases.
zero_test_facts = test_facts[test_zero_mask]
zero_test_first_partys = test_first_partys[test_zero_mask]
zero_test_second_partys = test_second_partys[test_zero_mask]
zero_test_first_party_winners = test_first_party_winners[test_zero_mask]

# Label-1 cases, capped at the number of label-0 cases to balance the classes.
one_test_facts = test_facts[test_one_mask][:num_zero]
one_test_first_partys = test_first_partys[test_one_mask][:num_zero]
one_test_second_partys = test_second_partys[test_one_mask][:num_zero]
one_test_first_party_winners = test_first_party_winners[test_one_mask][:num_zero]

print(len(zero_test_first_party_winners))
print(len(one_test_first_party_winners))

# Final evaluation lists: all label-0 cases followed by the kept label-1 cases.
test_first_partys = zero_test_first_partys.tolist() + one_test_first_partys.tolist()
test_second_partys = zero_test_second_partys.tolist() + one_test_second_partys.tolist()
test_facts = zero_test_facts.tolist() + one_test_facts.tolist()
test_first_party_winners = zero_test_first_party_winners.tolist() + one_test_first_party_winners.tolist()

print(len(test_first_party_winners))

261
261
522


In [8]:
# train_first_partys, train_second_partys, train_facts, train_first_party_winners = zip(*train_temp)
# prompt = prompt_setting(train_first_partys[0], train_second_partys[0], facts[0])

In [9]:
# input_ids = tokenizer(prompt, return_tensors="pt").input_ids
# input_ids = input_ids.to(model.device)

# with torch.no_grad():
#     outputs = model.generate(
#         input_ids=input_ids, max_new_tokens=1
#     )
# response = tokenizer.decode(outputs[0], skip_special_tokens=True)
# print(response)

# train

In [None]:
# Fine-tune the trainable parameters on the balanced training set, evaluate
# accuracy on both splits after every epoch, checkpoint the best epoch and
# redraw the accuracy curves.
#
# NOTE(review): no .train()/.eval() call is visible in this notebook; loralib
# layers change behavior with the module mode (weight merging, dropout), so
# confirm that the mode left by from_pretrained is the intended one.
epochs = 100
ACCUM_STEPS = 4  # samples whose gradients are accumulated into one optimizer step
optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)
loss_fn = nn.CrossEntropyLoss()  # stateless, so one instance serves every step

avgtrain = []  # per-epoch accuracy on the training split
avgtest = []   # per-epoch accuracy on the held-out split
bestloss = 0   # despite the name: best held-out ACCURACY seen so far
maxprompt = 0  # not read anywhere in this cell
for epoch in range(epochs):
    trainlist = []  # 1 per correctly classified training case, else 0
    testlist = []   # 1 per correctly classified held-out case, else 0

    random.shuffle(train_temp)
    train_first_partys, train_second_partys, train_facts, train_first_party_winners = zip(*train_temp)

    # ---- optimization pass -------------------------------------------------
    optimizer.zero_grad()
    pending = 0  # samples back-propagated since the last optimizer step
    for train_first_party, train_second_party, fact, first_party_winner in zip(
        train_first_partys, train_second_partys, train_facts, train_first_party_winners
    ):
        prompt = prompt_setting(train_first_party, train_second_party, fact)
        labels = torch.tensor([1 if first_party_winner == 1 else 0]).to('cuda')

        input1 = tokenizer(prompt, return_tensors="pt").input_ids.to('cuda')
        outputs = my_model(input1)

        # Classify from the logits at the last prompt position. Scaling by
        # 1/ACCUM_STEPS and back-propagating per sample accumulates the same
        # averaged gradient as summing the losses first, while keeping only
        # one autograd graph alive at a time.
        loss = loss_fn(outputs.logits[:, -1, :], labels) / ACCUM_STEPS
        loss.backward()
        pending += 1

        if pending == ACCUM_STEPS:
            optimizer.step()
            optimizer.zero_grad()
            pending = 0

        del outputs, loss, input1
        torch.cuda.empty_cache()

    # Apply the gradients of a trailing incomplete group instead of dropping them.
    if pending:
        optimizer.step()
        optimizer.zero_grad()

    # ---- evaluation pass ---------------------------------------------------
    with torch.no_grad():
        eval_splits = (
            (zip(test_first_partys, test_second_partys, test_facts, test_first_party_winners), testlist),
            (zip(train_first_partys, train_second_partys, train_facts, train_first_party_winners), trainlist),
        )
        for cases, hits in eval_splits:
            for first_party, second_party, fact, first_party_winner in cases:
                prompt = prompt_setting(first_party, second_party, fact)
                input1 = tokenizer(prompt, return_tensors="pt").input_ids.to('cuda')
                last_logits = my_model(input1).logits[0, -1, :]

                # Class 0 wins only on a strictly larger logit; ties go to class 1.
                predicted = 0 if last_logits[0] > last_logits[1] else 1
                expected = 1 if first_party_winner == 1 else 0
                hits.append(int(predicted == expected))

                torch.cuda.empty_cache()

    test_accuracy = np.mean(testlist)
    train_accuracy = np.mean(trainlist)
    avgtest.append(test_accuracy)
    avgtrain.append(train_accuracy)

    # Keep the LoRA weights and classifier head of the best held-out epoch.
    if test_accuracy > bestloss:
        bestloss = test_accuracy
        torch.save(lora.lora_state_dict(model), "temp.pt")
        torch.save(model.lm_head.state_dict(), "temp_h.pt")

    # Redraw the accuracy history (the file name is historical: it holds accuracy).
    plt.clf()
    plt.plot(avgtest, color='r', label='test accuracy')
    plt.plot(avgtrain, color='b', label='train accuracy')
    plt.xlabel('epoch')
    plt.ylabel('accuracy')
    plt.legend()
    plt.savefig('loss.png')
    

In [None]:
# Scratch cell: one manual optimization step driven by the model's own
# `labels=` loss instead of the explicit cross-entropy used in the training cell.
# NOTE(review): `input2` is not defined in any visible cell, and `input1` /
# `optimizer` are leftovers from the training cell, so this does not run on a
# fresh kernel.
ouputs = model(input1, labels = input2)
print(ouputs.loss.item())
optimizer.zero_grad()
ouputs.loss.requires_grad_(True)
ouputs.loss.backward()
optimizer.step()
# Prints the same tensor as above: no new forward pass is run after the step.
print(ouputs.loss.item())

In [None]:
# Scratch check: length of the first row of `input1` (token ids left over from
# the training cell).
len(input1[0])

In [None]:
# Scratch check: length of the first row of `input2`.
# NOTE(review): `input2` is not defined in any visible cell.
len(input2[0])

In [None]:
# Scratch check (repeated): length of the first row of `input1`, which is only
# defined as a leftover of the training cell.
len(input1[0])

In [None]:
'''
o 438, x ; 1060
'''