In [1]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'chat-dataset:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F5454516%2F9046883%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240805%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240805T204823Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D1c28b0fb28a1410134c014fc223043e381d55ab74939acdd0a996f1f9a26ef7d01db608cb996b0f304661b1b105fbfb12da530db48b3df6882a76e33c876444cab2b814083642106af5605867a387ba7d8beee23ec9456c87ae0c7d25d638e111f46e2d52af05f7a919c7027c0364582d90e02379401e7178d0d4cb77e7a143308a404188cb3b0b5080db307524cab3cc7163953a33231ea7d8884d7eac1dcc2aeb825d16deeb051b982168cc18b6fd4869bb367943fddfc8234e7f923ad3f16181e78d78c36fc996eb6ce1a37b5b147c2c9c91f5ab4f2e0bde56fd876a4ac28c9222d1719d36a71c9509f59197c53f5b9683b7d201c99da035d3eea53f7d75f'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    directory, download_url_encoded = data_source_mapping.split(':')
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = int(50 * dl / int(total_length))
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            if filename.endswith('.zip'):
              with ZipFile(tfile) as zfile:
                zfile.extractall(destination_path)
            else:
              with tarfile.open(tfile.name) as tarfile:
                tarfile.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError as e:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError as e:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


Downloading chat-dataset, 123016976 bytes compressed
Downloaded and uncompressed: chat-dataset
Data source import complete.


In [2]:
pip install datasets

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.5.0,>=2023.1.0 (from fsspec[http]<=2024.5.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.5.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m22.5 MB/s[0m eta [36m0:00:00

In [3]:
import ast
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
from datasets import Dataset

In [4]:
file_path = '/kaggle/input/chat-dataset/chat_data.csv'
data = pd.read_csv(file_path)

In [5]:
data.columns = ['conversation','id']

In [6]:
data = data.drop(0)

In [7]:
data = data[:10]

In [8]:
def extract_qa_pairs(conversation):
  conversation = conversation.replace('""','"')
  conversation = conversation.replace('}','},')
  conversation = conversation.replace('},]','}]')
  conversation_list = ast.literal_eval(conversation)
  qa_pairs = []
  for i in range(len(conversation_list)-1):
    if conversation_list[i]['from'] == "human" and conversation_list[i+1]["from"] =="gpt":
      question = conversation_list[i]['value']
      answer = conversation_list[i+1]['value']
      qa_pairs.append((question,answer))
  return qa_pairs

In [9]:
data['qa_pairs'] = data['conversation'].apply(extract_qa_pairs)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['qa_pairs'] = data['conversation'].apply(extract_qa_pairs)


In [10]:
#flatten the list of QA pairs
qa_pairs = [pair for pairs in data['qa_pairs'] for pair in pairs]


In [11]:
qa_df = pd.DataFrame(qa_pairs, columns = ['Question','Answer'])

In [12]:
print(qa_df.head())

                                            Question  \
0  Hi, I'm feeling really scared about something ...   
1  Well, lately, I've been feeling like my friend...   
2  No, I haven't. I'm afraid that they'll think I...   
3  You're right, I should talk to them. But how d...   
4  I see, so it's important to frame it as a dial...   

                                              Answer  
0  Hello there! I'm here to listen and offer supp...  
1  I understand how disheartening that can be. It...  
2  It's normal to feel hesitant about having diff...  
3  That's a great question! It can be helpful to ...  
4  Absolutely! Opening up a conversation can lead...  


In [13]:
qa_df.to_csv('qa_pairs.csv', index=False)

In [14]:
gpt2_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

if gpt2_tokenizer.pad_token is None:
  gpt2_tokenizer.add_special_tokens({'pad_token': '[PAD]'})

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [15]:
#tokenize the question-answer pairs

def tokenize_qa_pairs(qa_df):
  input_texts=[]
  output_texts = []
  for _, row in qa_df.iterrows():
    question = row['Question']
    answer = row['Answer']
    input_texts.append(gpt2_tokenizer.encode(question, add_special_tokens=True))
    output_texts.append(gpt2_tokenizer.encode(answer, add_special_tokens=True))
  return input_texts, output_texts

In [16]:
input_texts, output_texts = tokenize_qa_pairs(qa_df)

In [17]:
max_length = 512
input_ids = [text + [gpt2_tokenizer.pad_token_id] * (max_length - len(text)) if len(text) < max_length else text[:max_length] for text in input_texts]
output_ids = [text + [gpt2_tokenizer.pad_token_id] * (max_length - len(text)) if len(text) < max_length else text[:max_length] for text in output_texts]

In [18]:
import torch

In [19]:
dataset = Dataset.from_dict({'input_ids':input_ids,'labels':output_ids})
train_test_split = dataset.train_test_split(test_size=0.2)
train_dataset = train_test_split['train']
test_dataset = train_test_split['test']

In [20]:
model = GPT2LMHeadModel.from_pretrained('gpt2')
model.resize_token_embeddings(len(gpt2_tokenizer))

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Embedding(50258, 768)

In [21]:
# training_args = TrainingArguments(
#     output_dir = './results',
#     num_train_epochs = 3,
#     per_device_train_batch_size = 4,
#     per_device_eval_batch_size = 4,
#     warmup_steps = 500,
#     weight_decay = 0.01,
#     logging_dir = './logs',
#     logging_steps =  10,
#     save_steps = 1000,
#     evaluation_strategy = 'epoch',
#     save_strategy = 'epoch'
# )
training_args = TrainingArguments(
    output_dir = './results',
    num_train_epochs = 3,
    per_device_train_batch_size = 4,
    per_device_eval_batch_size = 4,
    warmup_steps = 500,
    weight_decay = 0.01,
    logging_dir = './logs',
    logging_steps =  10
)

In [22]:
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = train_dataset,
    eval_dataset = test_dataset
)

In [23]:
trainer.train()

Step,Training Loss
10,10.1514
20,9.5369
30,9.4718
40,8.1135


TrainOutput(global_step=45, training_loss=9.154967753092448, metrics={'train_runtime': 35.9186, 'train_samples_per_second': 4.928, 'train_steps_per_second': 1.253, 'total_flos': 46248689664000.0, 'train_loss': 9.154967753092448, 'epoch': 3.0})

In [24]:
eval_results = trainer.evaluate()
print(eval_results)

{'eval_loss': 9.716632843017578, 'eval_runtime': 0.6621, 'eval_samples_per_second': 22.657, 'eval_steps_per_second': 6.042, 'epoch': 3.0}


In [25]:
model.save_pretrained('./fine_tuned_model')
gpt2_tokenizer.save_pretrained('./fine_tuned_model')

('./fine_tuned_model/tokenizer_config.json',
 './fine_tuned_model/special_tokens_map.json',
 './fine_tuned_model/vocab.json',
 './fine_tuned_model/merges.txt',
 './fine_tuned_model/added_tokens.json')

In [26]:
model_path = './fine_tuned_model'
model = GPT2LMHeadModel.from_pretrained(model_path)
tokenizer = GPT2Tokenizer.from_pretrained(model_path)

In [27]:
def generate_response(model, tokenizer, prompt, max_new_tokens = 50):
  input_ids = tokenizer.encode(prompt, return_tensors='pt')
  attention_mask = torch.ones(input_ids.shape, dtype=torch.long)
  print(2)
  output = model.generate(input_ids,  attention_mask=attention_mask,max_length=max_length,pad_token_id=tokenizer.eos_token_id)
  print(3)
  response = tokenizer.decode(output[0], skip_special_tokens=True)
  return response

In [28]:
# List to store predictions and references
predictions = []
references = []

# Generate predictions
for i in range(len(test_dataset)):
    input_text = tokenizer.decode(test_dataset[i]['input_ids'], skip_special_tokens=True)
    reference_text = tokenizer.decode(test_dataset[i]['labels'], skip_special_tokens=True)
    generated_text = generate_response(model, tokenizer, input_text)
    predictions.append(generated_text)
    references.append(reference_text)

2
3
2
3
2
3
2
3
2
3
2
3
2
3
2
3
2
3
2
3
2
3
2
3
2
3
2
3
2
3


In [29]:
pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=ba1215f238ce23a81eebfd62f4badd8759f01d94d3d697734c0e49e2563b943f
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [30]:
from rouge_score import rouge_scorer

# Initialize the ROUGE scorer
scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

# Calculate ROUGE-L scores
rouge_scores = [scorer.score(ref, pred)['rougeL'] for ref, pred in zip(references, predictions)]

# Average ROUGE-L score
average_rougeL = sum(score.fmeasure for score in rouge_scores) / len(rouge_scores)

print(f'Average ROUGE-L Score: {average_rougeL}')


Average ROUGE-L Score: 0.09092127675859593


In [34]:
pip install bert_score

Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.0.0->bert_score)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.0.0->bert_score)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.0.0->bert_score)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.0.0->bert_score)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.0.0->bert_score)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch>=1.0.0->bert_s

In [35]:
import bert_score

# Calculate BERT scores
P, R, F1 = bert_score.score(predictions, references, lang='en', rescale_with_baseline=True)

# Average BERT scores
average_bert_P = P.mean().item()
average_bert_R = R.mean().item()
average_bert_F1 = F1.mean().item()

print(f'Average BERT Precision: {average_bert_P}')
print(f'Average BERT Recall: {average_bert_R}')
print(f'Average BERT F1 Score: {average_bert_F1}')


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Average BERT Precision: -0.8409816026687622
Average BERT Recall: 0.15765587985515594
Average BERT F1 Score: -0.4148997664451599


In [36]:
from sklearn.metrics import confusion_matrix

# Define a threshold for correctness
similarity_threshold = 0.7

# Function to determine if the generated response is correct
def is_correct(pred, ref, threshold):
    score = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True).score(ref, pred)['rougeL'].fmeasure
    return score >= threshold

# Generate labels
y_true = [1] * len(references)  # 1 for all references (correct)
y_pred = [1 if is_correct(pred, ref, similarity_threshold) else 0 for pred, ref in zip(predictions, references)]

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_true, y_pred)

print(f'Confusion Matrix:\n{conf_matrix}')


Confusion Matrix:
[[ 0  0]
 [15  0]]


In [37]:
pip install transformers datasets




In [41]:
import os
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Set the TRANSFORMERS_CACHE environment variable
os.environ['TRANSFORMERS_CACHE'] = os.path.expanduser('~/hf_cache')

# Load your fine-tuned model and tokenizer
model = GPT2LMHeadModel.from_pretrained('./fine_tuned_model')
tokenizer = GPT2Tokenizer.from_pretrained('./fine_tuned_model')

# Save the model and tokenizer
model.save_pretrained('./fine-tuned-gpt2')
tokenizer.save_pretrained('./fine-tuned-gpt2')

# Verify the directory contents
print("Directory contents:", os.listdir('./fine-tuned-gpt2'))

# Log into Hugging Face (ensure you're logged in)
!huggingface-cli login

# Push the model and tokenizer to Hugging Face
model.push_to_hub("dpatel9923/fine-tuned-gpt2")
tokenizer.push_to_hub("dpatel9923/fine-tuned-gpt2")


Directory contents: ['special_tokens_map.json', 'config.json', 'model.safetensors', 'merges.txt', 'tokenizer_config.json', 'generation_config.json', 'vocab.json', 'added_tokens.json']

    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To login, `huggingface_hub` require

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/dpatel9923/fine-tuned-gpt2/commit/b5ef315c6253c621dac857a68b6280e35f48ee98', commit_message='Upload tokenizer', commit_description='', oid='b5ef315c6253c621dac857a68b6280e35f48ee98', pr_url=None, pr_revision=None, pr_num=None)