<a href="https://colab.research.google.com/github/donghuna/PromptGenerate/blob/main/mergeCSV%2BpushHub.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install peft
!pip install datasets
!pip install bitsandbytes
!pip install sentence_transformers



In [None]:
from datetime import datetime
import transformers
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import prepare_model_for_kbit_training, LoraConfig, PeftModel, get_peft_model
from datasets import load_dataset

In [None]:
from google.colab import userdata
from huggingface_hub import login

# Authenticate against the Hugging Face Hub using the value stored under
# the 'HUG_TOKEN' key of Colab's userdata store.
login(userdata.get('HUG_TOKEN'))

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
# Checkpoint used to generate the paraphrased questions.
model_path = "meta-llama/Llama-3.1-8B-Instruct"

# Use CUDA when it is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Tokenizer: left padding, 512-token model max length, EOS reused as the pad token.
tokenizer = AutoTokenizer.from_pretrained(
    model_path,
    padding_side="left",
    model_max_length=512,
    add_eos_token=True,
)
tokenizer.pad_token = tokenizer.eos_token

# 4-bit NF4 quantization with double quantization and bfloat16 compute.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load the quantized causal LM (device_map / trust_remote_code / use_auth_token
# are intentionally left at their defaults).
model = AutoModelForCausalLM.from_pretrained(model_path, quantization_config=bnb_config)

# Switch to inference mode; as the last expression this also displays the module tree.
model.eval()

`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps

In [None]:
from sentence_transformers import SentenceTransformer

# Sentence encoder used later to score how close a generated question is to the original.
embedding_model_path = "sentence-transformers/paraphrase-MiniLM-L6-v2"
model_embedding = SentenceTransformer(
    model_name_or_path=embedding_model_path,
    device=device,
)

# Inference mode; as the last expression this also displays the module layout.
model_embedding.eval()

SentenceTransformer(
  (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)

In [None]:
# Load the 'sca_python' configuration of the StaQC dataset from the Hub
# and display its splits/features.
dataset = load_dataset(path="koutch/staqc", name="sca_python")
dataset

DatasetDict({
    train: Dataset({
        features: ['question_id', 'question', 'snippet'],
        num_rows: 85294
    })
})

In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np

# Compute the cosine similarity between two vectors
def cosine_similarity(vec1, vec2):
    """Return the cosine of the angle between ``vec1`` and ``vec2``.

    Args:
        vec1: 1-D array-like of numbers.
        vec2: 1-D array-like of numbers with the same length as ``vec1``.

    Returns:
        The dot product of the two L2-normalised vectors. If either vector
        has zero norm the similarity is undefined; ``0.0`` is returned
        instead of propagating NaN from a division by zero.
    """
    vec1 = np.asarray(vec1)
    vec2 = np.asarray(vec2)

    norm1 = np.linalg.norm(vec1)
    norm2 = np.linalg.norm(vec2)
    # Guard against division by zero for all-zero vectors.
    if norm1 == 0 or norm2 == 0:
        return 0.0

    return np.dot(vec1 / norm1, vec2 / norm2)

# Compute the cosine similarity between two sentences
def get_similarity(sentence1, sentence2):
    """Embed both sentences with ``model_embedding`` and compare them.

    Args:
        sentence1: First sentence.
        sentence2: Second sentence.

    Returns:
        The value of ``cosine_similarity`` applied to the two embeddings.
    """
    # Encode the sentences one after the other, first sentence first.
    first_vec, second_vec = (
        model_embedding.encode(text) for text in (sentence1, sentence2)
    )
    return cosine_similarity(first_vec, second_vec)

# # Example
# sentence1 = "What is the capital of France?"
# sentence2 = "Which city is the capital of France?"

# similarity_score = get_similarity(sentence1, sentence2)
# print(f"Cosine Similarity: {similarity_score}")


In [None]:
model.eval()

from tqdm import tqdm
import csv

# Output file: one row per successfully generated paraphrase.
csv_file = "generated_similarQuestion.csv"

# Generation stops on either the tokenizer's EOS token or the "<|eot_id|>" token.
terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>")
]

start_row = 0
max_snippet_length = 1024   # snippet cap, in characters
max_prompt_tokens = 512     # prompts longer than this are skipped, never truncated
max_new_tokens = 128        # generation budget on top of the prompt
sliced_dataset = dataset['train'].select(range(10000, 12000))

skipped_too_long = 0
skipped_empty = 0

# Explicit UTF-8 so non-ASCII questions/snippets are written the same way on
# every platform and can be read back by pandas with its default encoding.
with open(csv_file, 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["question_id", "question", "snippet", "similarQuestion", "Confidence"])

    for row in tqdm(sliced_dataset, desc="Generating similar questions", unit="sample"):
        question_id = row['question_id']
        question = row['question']
        snippet = row['snippet']

        if len(snippet) > max_snippet_length:
            snippet = snippet[:max_snippet_length]

        prompt = f"""given a code-related question and the code snippet that answers it, generate a new question that asks about the same code functionality but in a different way.
The new question should have a similar meaning, and both the original and the new question should be answerable with the same code snippet.
Once the new question is generated, indicate completion by generating the token <|eot_id|>.

### Original Question :
{question}
### Code Snippet :
{snippet}
### New Question :
"""
        inputs = tokenizer(prompt, return_tensors="pt")

        # Number of prompt tokens
        input_length = inputs['input_ids'].shape[1]

        # Right-truncating an over-long prompt would cut off the trailing
        # "### New Question :" cue, so the model would just continue the snippet.
        # Skip such rows up front instead of paying for a useless generation.
        if input_length > max_prompt_tokens:
            skipped_too_long += 1
            continue

        inputs = inputs.to(device)

        with torch.no_grad():
            output_ids = model.generate(
                **inputs,
                # Same budget as max_length = input_length + 128.
                max_new_tokens=max_new_tokens,
                eos_token_id=terminators,
                pad_token_id=tokenizer.eos_token_id,
                # Slightly raised temperature for diversity.
                # NOTE(review): temperature only matters when sampling is enabled;
                # do_sample is not set here, so this relies on the model's own
                # generation config - confirm.
                temperature=0.3,
            )

        # Decode only the newly generated tokens, so text inside the question or
        # snippet can never be mistaken for the model's answer.
        completion = tokenizer.decode(output_ids[0][input_length:], skip_special_tokens=True)

        # Keep everything up to the first "###" section marker the model emits.
        generated_question = completion.split("###")[0].strip()

        if not generated_question:
            skipped_empty += 1
            continue

        similarity = get_similarity(question, generated_question)
        writer.writerow([question_id, question, snippet, generated_question, similarity])

print(f"Skipped {skipped_too_long} over-long prompts and {skipped_empty} empty generations.")



Generating similar questions: 100%|██████████| 2000/2000 [4:44:32<00:00,  8.54s/sample]


In [None]:
# Reload generated_similarQuestion.csv and print the first two stored snippets
# as a quick sanity check of what was written.
import pandas as pd

df = pd.read_csv('generated_similarQuestion.csv')
print(df['snippet'][0], df['snippet'][1], sep="\n")

>>> ut = 1460505599.0; frac = 0.720321;
>>> ufloat = ut + frac
>>> print str(ufloat)
1460505599.72
>>> print repr(ufloat)
1460505599.720321

strings = ['abc', 'abcde', 'abcd', 'abcdefgh']
outputs = map(lambda x: "%d %s" % (len(x), x), strings) # ['3 abc', '5 abcde', '4 abcd', '8 abcdefgh']
f = open("file.out", 'w')
data = '\n'.join(outputs) # Concat all strings in list, separated by line break
f.write(data)
f.close()



In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive.
drive.mount(mountpoint='/content/drive')


Mounted at /content/drive


In [None]:
# Back up the generated CSV to Google Drive. shutil.copy raises if the source is
# missing or the destination is not writable, whereas a failing `!cp` would pass silently.
import shutil

shutil.copy("generated_similarQuestion.csv", "/content/drive/MyDrive/")


In [None]:
import pandas as pd

# Reload the CSV and show the first original question next to its generated paraphrase.
df = pd.read_csv('generated_similarQuestion.csv')
print(df.loc[0, 'question'])
print(df.loc[0, 'similarQuestion'])

Preventing rounding to 2 decimal places
How to preserve the full precision of floating point numbers in Python, preventing them from being rounded to 2 decimal places when converted to strings?


In [None]:
from datasets import Dataset
from google.colab import userdata

# Wrap the DataFrame loaded from the CSV in a Dataset.
# NOTE: this rebinds `dataset`, which earlier in the notebook held the StaQC
# DatasetDict; reload that before re-running the generation cell.
dataset = Dataset.from_pandas(df)

# Upload to the Hub using the token stored under 'HUG_TOKEN'; the returned
# commit info is displayed as the cell output.
dataset.push_to_hub(
    repo_id="donghuna/StaQC-similarQuestion",
    token=userdata.get('HUG_TOKEN'),
)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/donghuna/StaQC-similarQuestion/commit/b137b5ab2711e8fa72a89aff0b9c251424922e5a', commit_message='Upload dataset', commit_description='', oid='b137b5ab2711e8fa72a89aff0b9c251424922e5a', pr_url=None, pr_revision=None, pr_num=None)