In [4]:
import json
import re
from transformers import TextDataset

### Load and process data

In [22]:
def extract_and_process_messages(json_file, output_file):
    """Condense a chat export into one entry per uninterrupted sender turn.

    Reads ``json_file`` (UTF-8 JSON) and walks the list stored under its
    ``'messages'`` key. A message is discarded when its ``'text'`` is not a
    plain ``str``, is blank after stripping, or matches the emoji-only
    pattern below. The remaining messages are processed in order, and
    consecutive messages with the same ``'from'`` value are joined with a
    single space into one entry, regardless of their length.

    The result, a list of ``{'from': ..., 'text': ...}`` dicts, is written to
    ``output_file`` as indented UTF-8 JSON with non-ASCII characters kept
    as-is.

    Args:
        json_file: Path of the JSON export to read.
        output_file: Path of the JSON file to write.
    """
    emoji_only = r'^[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF\U0001F680-\U0001F6FF\U0001F1E0-\U0001F1FF]+$'

    with open(json_file, 'r', encoding='utf-8') as source:
        export = json.load(source)

    turns = []
    pending = None  # turn still being accumulated; flushed when the sender changes

    for entry in export.get('messages', []):
        body = entry.get('text')

        # Keep only plain, non-blank strings that are more than a run of emoji
        if type(body) is not str or not body.strip():
            continue
        if re.match(emoji_only, body):
            continue

        sender = entry.get('from', '')

        # Same sender as the turn in progress: extend it instead of starting a new one
        if pending is not None and pending['from'] == sender:
            pending['text'] += ' ' + body
            continue

        # Sender changed: flush the finished turn and start the next
        if pending is not None:
            turns.append(pending)
        pending = {'from': sender, 'text': body}

    # The final turn has no following sender change to flush it
    if pending is not None:
        turns.append(pending)

    with open(output_file, 'w', encoding='utf-8') as sink:
        json.dump(turns, sink, ensure_ascii=False, indent=4)

In [23]:
# Run the extraction step: read the raw export and write the condensed,
# one-entry-per-turn history into the same Dataset/ directory.
json_file = 'Dataset/result.json'
output_file = 'Dataset/processed_chat_history.json'
extract_and_process_messages(json_file, output_file)

### Format Dataset as Instruction/Input/Output

In [31]:
def format_for_finetuning(json_file, output_file, speaker_a="Person A", speaker_b="Person B"):
    """Turn a chat history into instruction/input/output records.

    Reads ``json_file`` (UTF-8 JSON), which must hold a list of dicts with
    ``'from'`` and ``'text'`` keys. Every pair of adjacent messages that forms
    an exchange between the two speakers, in either direction, becomes one
    record: the earlier message is the ``'instruction'``, the later one is the
    ``'output'``, and ``'input'`` is always the empty string. Adjacent
    messages from the same speaker, or involving any other sender, produce
    no record.

    The records are written to ``output_file`` as a single JSON array.

    Args:
        json_file: Path of the processed chat history to read.
        output_file: Path of the JSON dataset to write.
        speaker_a: ``'from'`` value of the first participant.
        speaker_b: ``'from'`` value of the second participant.

    Raises:
        KeyError: If a message that needs to be inspected lacks ``'from'``
            or ``'text'``.
    """
    with open(json_file, 'r', encoding='utf-8') as file:
        messages = json.load(file)

    formatted_data = []

    # Walk adjacent (message, reply) pairs; zip stops before the last message,
    # which has no reply.
    for current_message, next_message in zip(messages, messages[1:]):
        sender = current_message['from']
        if sender == speaker_a:
            expected_replier = speaker_b
        elif sender == speaker_b:
            expected_replier = speaker_a
        else:
            # Not one of the two participants: never the start of a pair
            continue

        if next_message['from'] != expected_replier:
            continue

        formatted_data.append({
            'instruction': current_message['text'],
            'input': "",
            'output': next_message['text']
        })

    with open(output_file, 'w', encoding='utf-8') as outfile:
        json.dump(formatted_data, outfile)
        # outfile.write('\n')

In [32]:
# Run the formatting step on the processed history.
# Note: json_file and output_file are rebound here, replacing the values
# used for the extraction step.
json_file = 'Dataset/processed_chat_history.json'
output_file = 'Dataset/dataset.json'
format_for_finetuning(json_file, output_file)

### Upload to Hugging Face

In [None]:
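# from huggingface_hub import notebook_login
# notebook_login()
# Do not store the access token in this notebook: enter it at the notebook_login()
# prompt or supply it through an environment variable. Any token that was
# previously saved on this line should be revoked.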

In [None]:
def load_dataset(file_path):
    """Load a JSON Lines file into a list of parsed records.

    Each non-blank line of the file is parsed as one JSON document. The file
    is read as UTF-8 so the result does not depend on the platform's default
    encoding, and whitespace-only lines are skipped rather than raising.

    Args:
        file_path: Path of the JSON Lines file to read.

    Returns:
        A list with one parsed value per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        # line.strip() filters out blank separator/trailing lines
        return [json.loads(line) for line in f if line.strip()]

In [None]:
# NOTE(review): this reads 'Dataset/dataset.jsonl' one JSON document per line,
# whereas the formatting step above writes 'Dataset/dataset.json' as a single
# JSON array — confirm which file and format are intended.
train_data = load_dataset('Dataset/dataset.jsonl')

In [None]:
def prepare_data(examples):
    """Tokenize the ``input``/``output`` text of each example.

    For every example, the ``'input'`` and ``'output'`` values are joined
    with a single space; the resulting list of strings is passed to the
    global ``tokenizer`` with truncation and ``max_length`` padding at 512.

    NOTE(review): ``tokenizer`` is not defined in the cells visible here and
    must exist in the kernel before this is called. The ``'instruction'``
    field is not used — confirm that is intended.

    Args:
        examples: Iterable of mappings with ``'input'`` and ``'output'`` keys.

    Returns:
        Whatever ``tokenizer`` returns for the joined texts.
    """
    joined = []
    for record in examples:
        joined.append(f"{record['input']} {record['output']}")

    return tokenizer(
        joined,
        truncation=True,
        padding="max_length",
        max_length=512,
    )

In [None]:
# Build a TextDataset over "training_data.jsonl" using blocks of 128.
# NOTE(review): `tokenizer` is not defined in the cells visible here, and this
# path differs from the 'Dataset/dataset.jsonl' file loaded earlier — confirm
# both before running.
dataset = TextDataset(
    tokenizer=tokenizer,
    file_path="training_data.jsonl",
    block_size=128
)

In [None]:
# Publish the dataset to the Hub as a private repository.
# NOTE(review): `dataset` is a transformers TextDataset here, while push_to_hub
# is normally a method of datasets.Dataset — confirm TextDataset provides it,
# or build a datasets.Dataset from the records first.
dataset.push_to_hub("e1010101/text_chat", private=True)