In [None]:
# Source and destination names
TICKER = "AAPL"
YEAR = 2023
MASK_NAME = "ORENIJI"
DATASET_SIZE = 10

In [5]:
# This cell will use financial_datasets to generate a dataset from a 10-K report
# for the given TICKER company and year.
import os
from dotenv import load_dotenv

load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")

from financial_datasets.generator import DatasetGenerator

# Create dataset generator
generator = DatasetGenerator(model="gpt-4o-mini", api_key=api_key)

# Generate dataset from 10-K
dataset = generator.generate_from_10K(
    ticker=TICKER,
    year=YEAR,
    max_questions=DATASET_SIZE,
)

Generating questions: 100%|[32m██████████[0m| 10/10 [00:31<00:00,  3.18s/it]


In [6]:
# Helper function for getting the next file name in directory
def get_latest_file(JSON_DIR, FILE_PREFIX):
    try:
        files = [f for f in os.listdir(JSON_DIR) if f.startswith(FILE_PREFIX) and f.endswith('.json')]
    except FileNotFoundError:
        os.makedirs(JSON_DIR, exist_ok=True)
        files = []
    print(files)
    return max(files) if files else FILE_PREFIX + '00.json'

In [None]:
os.makedirs(os.path.join("datasets", TICKER, "raw"), exist_ok=True)

import json

data = []
for id,line in enumerate(dataset.items):
    item = {}
    item['question'] = line.question
    item['answer'] = line.answer
    item['context'] = line.context
    item['id'] = id
    data.append(item)

# For keeping our directory organized
FILE_PREFIX = "dataset_"
JSON_DIR = os.path.join("datasets", TICKER, "raw")
latest_file = get_latest_file(JSON_DIR, FILE_PREFIX)
file_num = int(latest_file[-6:-5])
next_file_num = file_num + 1
next_output_file = f"{FILE_PREFIX}{next_file_num:02d}.json"
next_output_path = os.path.join(JSON_DIR, next_output_file)
# Save the data to JSON
with open (next_output_path, "w") as f:
    json.dump(data, f, indent=4)


[]


In [8]:
# Merge all separately generated files into one
all_data = []
current_id = 0
for file in os.listdir():
    if file.endswith(".json"):
        with open(file) as f:
            data = json.load(f)
            for idx in range(len(data)):
                data[idx]['id'] = current_id
                current_id += 1
            all_data.extend(data)

all_data_path = os.path.join("datasets", TICKER, TICKER+".json")
with open(all_data_path, "w") as f:
    json.dump(all_data, f, indent=4)

In [9]:
# Convert the JSON file to JSONL format for fine-tuning
from pathlib import Path

def load_json(filename):
    with open(filename, 'r', encoding='utf-8') as f:
        return json.load(f)

def convert_to_jsonl(input_data, output_file):
    """
    Convert JSON array to JSONL format and save to file.
    
    Args:
        input_data (list): List of dictionaries containing the dataset
        output_file (str): Path to save the JSONL file
    """
    # Ensure output directory exists
    output_path = Path(output_file)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    
    # Write each JSON object on a new line
    with open(output_file, 'w', encoding='utf-8') as f:
        for item in input_data:
            json.dump(item, f, ensure_ascii=False)
            f.write('\n')

def prepare_dataset_files(filename):
    """
    Prepare the dataset files in the required structure.
    """
    # Sample data (replace with your actual data)
    data = load_json(filename)
    
    # Create the data directory
    data_dir = Path('datasets', TICKER, MASK_NAME+"-finetune-dataset")
    data_dir.mkdir(exist_ok=True)
    
    # Convert and save the training data
    convert_to_jsonl(data, str(data_dir / 'train.jsonl'))
    
    # Create the dataset_dict configuration
    dataset_dict = {
        "train": str(data_dir / 'train.jsonl')
    }
    
    # Save the dataset configuration
    with open(data_dir / 'dataset_dict.json', 'w') as f:
        json.dump(dataset_dict, f, indent=2)

# Usage example
if __name__ == "__main__":
    filepath = os.path.join("datasets", TICKER, TICKER+".json")
    prepare_dataset_files(filepath)