In [2]:
import csv
import json

## What platform can {product} be used with?

In [9]:
def process_csv_to_jsonl(input_csv, output_jsonl):
    def format_platforms(platforms):
        platforms_list = platforms.split('|')
        if len(platforms_list) == 1:
            return platforms
        elif len(platforms_list) == 2:
            return ' and '.join(platforms_list)
        else:
            return ', '.join(platforms_list[:-1]) + ', and ' + platforms_list[-1]

    print(f"Opening CSV file: {input_csv}")
    with open(input_csv, 'r', encoding='utf-8') as csv_file, open(output_jsonl, 'w', encoding='utf-8') as jsonl_file:
        csv_reader = csv.reader(csv_file)
        print("Skipping first 3 rows...")
        next(csv_reader)  # Skip the first 3 rows
        next(csv_reader)
        next(csv_reader)
        product_names = next(csv_reader)[4:]  # Get product names from row 4, starting from column E
        print(f"Found {len(product_names)} product names: {product_names[:5]}...")

        print("Searching for the platforms row...")
        for row_num, row in enumerate(csv_reader, start=5):
            print(f"Checking row {row_num}: {row[:5]}...")
            if row and len(row) > 2 and "What platforms can" in row[2]:
                print(f"Found platforms row: {row[:5]}...")
                for i, product in enumerate(product_names):
                    if i + 4 < len(row) and row[i+4]:  # Check if there's a value for this product
                        question = f"What platforms can {product} be used on?"
                        answer = f"{product} can be used on {format_platforms(row[i+4])}"
                        json_line = json.dumps({"question": question, "answer": answer})
                        jsonl_file.write(json_line + '\n')
                        print(f"Wrote entry for {product}")
                print("Finished processing platforms row")
                break  # We've found the row we need, no need to continue
        else:
            print("WARNING: Did not find a row containing platform information!")

    print("JSONL file creation process completed.")



In [None]:
process_csv_to_jsonl('/home/dave/Desktop/AI/pretraining/data/simpler/finetuning/simplerrawdata.csv', 'products2.jsonl')

In [12]:

def process_csv_to_jsonl(input_csv, output_jsonl):
    def format_platforms(platforms):
        if not platforms or platforms == "N/A":
            return platforms
        platforms_list = [p.strip() for p in platforms.split('|')]
        if len(platforms_list) == 1:
            return platforms
        elif len(platforms_list) == 2:
            return ' and '.join(platforms_list)
        else:
            return ', '.join(platforms_list[:-1]) + ', and ' + platforms_list[-1]

    question_templates = [
        "What platforms can the {product} be used on?",
        "On which platforms can {product} be used?",
        "Where is {product} available for use?",
        "If I want to use {product}, which platforms should I have access to?",
        "What platform features are required to run {product}?",
        "How does the platform compatibility of {product} compare to other products?",
        "Are there any major platforms that don't support {product}?",
        "Which platforms are compatible with {product}?"
    ]

    print(f"Opening CSV file: {input_csv}")
    with open(input_csv, 'r', encoding='utf-8') as csv_file, open(output_jsonl, 'w', encoding='utf-8') as jsonl_file:
        csv_reader = csv.reader(csv_file)
        rows = list(csv_reader)

        print("Processing product names...")
        product_names = [name for name in rows[3][4:] if name]
        print(f"Found {len(product_names)} product names.")

        print("Searching for the platforms row...")
        for row in rows:
            if row and len(row) > 2 and "What platforms can the" in row[2]:
                print("Found platforms row.")
                for i, product in enumerate(product_names):
                    if i + 4 < len(row) and row[i+4]:
                        platforms = format_platforms(row[i+4])
                        for template in question_templates:
                            question = template.format(product=product)
                            answer = f"{product} can be used on {platforms}"
                            json_line = json.dumps({"question": question, "answer": answer})
                            jsonl_file.write(json_line + '\n')
                        print(f"Wrote entries for {product}")
                print("Finished processing platforms row")
                break
        else:
            print("WARNING: Did not find a row containing platform information!")

    print("JSONL file creation process completed.")


In [13]:

# Usage
process_csv_to_jsonl('/home/dave/Desktop/AI/pretraining/data/simpler/finetuning/simplerrawdata.csv', 'multiqproducts.jsonl')

Opening CSV file: /home/dave/Desktop/AI/pretraining/data/simpler/finetuning/simplerrawdata.csv
Processing product names...
Found 68 product names.
Searching for the platforms row...
Found platforms row.
Wrote entries for John Carter's Sandbox Strategy
Wrote entries for Top Tier Pro System
Wrote entries for The PMZ System
Wrote entries for The New “Big 3” Squeeze Master Class
Wrote entries for Day Trading with Tr3ndy Zones
Wrote entries for Micro Voodoo Line Strategy
Wrote entries for DPMR Masterclass
Wrote entries for Chart Patterns Secrets
Wrote entries for The Stacked Profit Strategy
Wrote entries for The Multi Squeeze Pro System
Wrote entries for Raghee's Day Trading Options Strategy Course
Wrote entries for Tr3ndy Jon’s Supply
& Demand System
Wrote entries for The Ready. Aim. Fire!® Pro System

Wrote entries for True Momentum System
Wrote entries for Bulletproof Butterflies 2.0
Wrote entries for Decoding Volume
Wrote entries for 5 Star Options Income Plan
Wrote entries for The Moxi

### asset classes

In [5]:
import csv
import json

def process_asset_classes_to_jsonl(input_csv, output_jsonl):
    def format_asset_classes(classes):
        if not classes or classes == "N/A":
            return classes
        class_list = [c.strip() for c in classes.split(',')]
        if len(class_list) == 1:
            return classes
        elif len(class_list) == 2:
            return ' and '.join(class_list)
        else:
            return ', '.join(class_list[:-1]) + ', and ' + class_list[-1]

    question_templates = [
        "What asset classes is {product} traded in?",
        "Which financial instruments can be traded using {product}?",
        "Does {product} support trading in stocks, options, futures, or other asset classes?",
        "What types of securities can be traded with {product}?",
        "In terms of asset classes, what is the scope of {product}?",
        "For which asset classes is {product} designed?",
        "Can you tell me about the asset classes compatible with {product}?",
        "What range of financial instruments does {product} cover?"
    ]

    print(f"Opening CSV file: {input_csv}")
    with open(input_csv, 'r', encoding='utf-8') as csv_file, open(output_jsonl, 'w', encoding='utf-8') as jsonl_file:
        csv_reader = csv.reader(csv_file)
        rows = list(csv_reader)

        print("Processing product names...")
        product_names = [name for name in rows[3][4:] if name]
        print(f"Found {len(product_names)} product names.")

        print("Processing asset classes...")
        asset_class_row = rows[6][4:]  # Start from column E (index 4)
        
        for i, (product, asset_classes) in enumerate(zip(product_names, asset_class_row)):
            if asset_classes:
                formatted_classes = format_asset_classes(asset_classes)
                print(f"Processing {product}: Asset classes = {formatted_classes}")
                
                for template in question_templates:
                    question = template.format(product=product)
                    answer = f"{product} is traded in {formatted_classes}"
                    json_line = json.dumps({"question": question, "answer": answer})
                    jsonl_file.write(json_line + '\n')
                
                print(f"Wrote entries for {product}")
            else:
                print(f"Warning: No asset class information for {product}")

    print("JSONL file creation process completed.")

# Usage
process_asset_classes_to_jsonl('/home/dave/Desktop/AI/pretraining/data/simpler/finetuning/simplerrawdata.csv', 'asset_classes_output.jsonl')

Opening CSV file: /home/dave/Desktop/AI/pretraining/data/simpler/finetuning/simplerrawdata.csv
Processing product names...
Found 68 product names.
Processing asset classes...
Processing John Carter's Sandbox Strategy: Asset classes = Options
Wrote entries for John Carter's Sandbox Strategy
Processing Top Tier Pro System: Asset classes = All asset classes but most trading done in Options
Wrote entries for Top Tier Pro System
Processing The PMZ System: Asset classes = Options, Futures, Stocks, and 0DTE
Wrote entries for The PMZ System
Processing The New “Big 3” Squeeze Master Class: Asset classes = Options, Stocks, and Futures
Wrote entries for The New “Big 3” Squeeze Master Class
Processing Day Trading with Tr3ndy Zones: Asset classes = 0DTE, Options, Futures, and Stocks
Wrote entries for Day Trading with Tr3ndy Zones
Processing Micro Voodoo Line Strategy: Asset classes = Options and Futures
Wrote entries for Micro Voodoo Line Strategy
Processing DPMR Masterclass: Asset classes = Option

## product description

In [14]:
import pandas as pd
import json

# Load the CSV file
file_path = '/home/dave/Desktop/AI/pretraining/data/simpler/finetuning/simplerrawdata.csv'
df = pd.read_csv(file_path, header=None)

# Function to generate question-answer pairs with updated logic using E14 for the detailed description
def generate_qa_pairs_updated_v3(df):
    qa_pairs = []
    
    for col in range(4, df.shape[1]):  # Starting from column E (index 4)
        product_name = df.iloc[3, col]
        detailed_description = df.iloc[13, col]

        questions = [
            f"What is {product_name}?",
            f"Can you explain {product_name}?",
            f"What are the benefits of {product_name}?",
            f"How does {product_name} work?",
            f"Can you describe the {product_name} strategy?"
        ]
        
        answer = f"{product_name} - {detailed_description}"
        
        for question in questions:
            qa_pairs.append({"question": question, "answer": answer})
    
    return qa_pairs

# Generate the QA pairs
qa_pairs = generate_qa_pairs_updated_v3(df)

# Save the QA pairs to a JSONL file
output_file_path = 'qa_pairs_product_description.jsonl'
with open(output_file_path, 'w') as outfile:
    for pair in qa_pairs:
        json.dump(pair, outfile)
        outfile.write('\n')

# Output the path of the generated file
output_file_path


'qa_pairs_product_description.jsonl'

### cleanup nans

In [15]:
import json

# Function to clean up the JSONL file by removing entries with "can be used on nan"
def cleanup_jsonl(input_file_path, output_file_path):
    cleaned_data = []

    # Read the JSONL file
    with open(input_file_path, 'r') as infile:
        for line in infile:
            entry = json.loads(line)
            if "can be used on nan" not in entry['answer']:
                cleaned_data.append(entry)

    # Write the cleaned data back to a new JSONL file
    with open(output_file_path, 'w') as outfile:
        for entry in cleaned_data:
            json.dump(entry, outfile)
            outfile.write('\n')

# Define input and output file paths
input_file_path = '/home/dave/Desktop/AI/pretraining/datasetformater/simpler/qa_pairs_product_description.jsonl'
output_file_path = 'qa_pairs_cleaned.jsonl'

# Run the cleanup function
cleanup_jsonl(input_file_path, output_file_path)

# Output the path of the cleaned file
output_file_path


'qa_pairs_cleaned.jsonl'

## Bulleted list of what tool/strategy/service is designed to do

In [23]:
import pandas as pd
import json

# Load the CSV file
file_path = '/home/dave/Desktop/AI/pretraining/data/simpler/finetuning/simplerrawdata.csv'
df = pd.read_csv(file_path, header=None)

# Function to generate question-answer pairs with variations using E4 and E16
def generate_qa_pairs_e4_e16_variations(df):
    qa_pairs = []
    
    for col in range(4, df.shape[1]):  # Starting from column D (index 3)
        product_name = df.iloc[3, col]
        detailed_description = df.iloc[15, col]

        questions = [
            f"What is {product_name} designed to do?",
            f"What are the key features of {product_name}?",
            f"What functionalities does {product_name} offer?",
            f"How does {product_name} work?",
            f"Can you describe the primary purpose of {product_name}?",
            f"What is the main goal of {product_name}?",
            f"What benefits does {product_name} provide?",
            f"How is {product_name} designed to help users?",
            f"What problems does {product_name} aim to solve?"
        ]
        
        answer = f"The {product_name} - {detailed_description}"
        
        for question in questions:
            qa_pairs.append({"question": question, "answer": answer})
    
    return qa_pairs

# Generate the QA pairs
qa_pairs = generate_qa_pairs_e4_e16_variations(df)

# Save the QA pairs to a JSONL file
output_file_path = 'qa_pairs_e4_e16.jsonl'
with open(output_file_path, 'w') as outfile:
    for pair in qa_pairs:
        json.dump(pair, outfile)
        outfile.write('\n')

# Output the path of the generated file
output_file_path


'qa_pairs_e4_e16.jsonl'

## cleanup nans bulleted list

In [24]:
import json

# Function to clean up the JSONL file by removing entries with "can be used on nan"
def cleanup_jsonl(input_file_path, output_file_path):
    cleaned_data = []

    # Read the JSONL file
    with open(input_file_path, 'r') as infile:
        for line in infile:
            entry = json.loads(line)
            if "- nan" not in entry['answer']:
                cleaned_data.append(entry)

    # Write the cleaned data back to a new JSONL file
    with open(output_file_path, 'w') as outfile:
        for entry in cleaned_data:
            json.dump(entry, outfile)
            outfile.write('\n')

# Define input and output file paths
input_file_path = '/home/dave/Desktop/AI/pretraining/datasetformater/simpler/qa_pairs_e4_e16.jsonl'
output_file_path = 'cleaned_bulleted_qa_pairs_e4_e16.jsonl'

# Run the cleanup function
cleanup_jsonl(input_file_path, output_file_path)

# Output the path of the cleaned file
output_file_path


'cleaned_bulleted_qa_pairs_e4_e16.jsonl'

In [1]:
import pandas as pd
import json

# Load the CSV file
file_path = '/home/dave/Desktop/AI/pretraining/data/simpler/finetuning/simplerrawdata.csv'
df = pd.read_csv(file_path, header=None)

# Function to generate question-answer pairs with variations using E4 and E16
def generate_qa_pairs_e4_e5_variations(df):
    qa_pairs = []
    
    for col in range(4, df.shape[1]):  # Starting from column D (index 3)
        product_name = df.iloc[3, col]
        detailed_description = df.iloc[4, col]

        questions = [
            f"Who created {product_name}?",
            f"who are the traders of {product_name}?",
          
        ]
        
        answer = f"{detailed_description} is the brains behind {product_name}"
        
        for question in questions:
            qa_pairs.append({"question": question, "answer": answer})
    
    return qa_pairs

# Generate the QA pairs
qa_pairs = generate_qa_pairs_e4_e5_variations(df)

# Save the QA pairs to a JSONL file
output_file_path = './raw/who_are_tradersqa_pairs_e4_e5.jsonl'
with open(output_file_path, 'w') as outfile:
    for pair in qa_pairs:
        json.dump(pair, outfile)
        outfile.write('\n')

# Output the path of the generated file
output_file_path


'./raw/who_are_tradersqa_pairs_e4_e5.jsonl'

## Who is this strategy for

In [4]:
import pandas as pd
import json

# Load the CSV file
file_path = '/home/dave/Desktop/AI/pretraining/data/simpler/finetuning/simplerrawdata.csv'
df = pd.read_csv(file_path, header=None)

# Function to generate question-answer pairs with variations using E4 and E16
def generate_qa_pairs_e4_e17_variations(df):
    qa_pairs = []
    
    for col in range(4, df.shape[1]):  # Starting from column D (index 3)
        product_name = df.iloc[3, col]
        detailed_description = df.iloc[16, col]

        questions = [
            f"Who is {product_name} for?",
            f"what kind of trader is {product_name} best geared to?",
            f"Who would benefit the most from using {product_name}?",
          
        ]
        
        answer = f"{detailed_description}"
        
        for question in questions:
            qa_pairs.append({"question": question, "answer": answer})
    
    return qa_pairs

# Generate the QA pairs
qa_pairs = generate_qa_pairs_e4_e17_variations(df)

# Save the QA pairs to a JSONL file
output_file_path = './raw/who_is_product_for_tradersqa_pairs_e4_e17.jsonl'
with open(output_file_path, 'w') as outfile:
    for pair in qa_pairs:
        json.dump(pair, outfile)
        outfile.write('\n')

# Output the path of the generated file
output_file_path


'./raw/who_is_product_for_tradersqa_pairs_e4_e17.jsonl'

In [5]:
import json

# Function to clean up the JSONL file by removing entries with "can be used on nan"
def cleanup_jsonl(input_file_path, output_file_path):
    cleaned_data = []

    # Read the JSONL file
    with open(input_file_path, 'r') as infile:
        for line in infile:
            entry = json.loads(line)
            if "nan" not in entry['answer']:
                cleaned_data.append(entry)

    # Write the cleaned data back to a new JSONL file
    with open(output_file_path, 'w') as outfile:
        for entry in cleaned_data:
            json.dump(entry, outfile)
            outfile.write('\n')

# Define input and output file paths
input_file_path = './raw/who_is_product_for_tradersqa_pairs_e4_e17.jsonl'
output_file_path = './raw/cleaned_who_is_product_for_tradersqa_pairs_e4_e17.jsonl'

# Run the cleanup function
cleanup_jsonl(input_file_path, output_file_path)

# Output the path of the cleaned file
output_file_path


'./raw/cleaned_who_is_product_for_tradersqa_pairs_e4_e17.jsonl'

## Continued-Pretraining

In [6]:
import pandas as pd
import json

# Load the CSV file
file_path = '/home/dave/Desktop/AI/pretraining/data/simpler/finetuning/simplerrawdata.csv'
df = pd.read_csv(file_path, header=None)

# Function to generate the desired JSONL format
def generate_jsonl_format(df):
    jsonl_data = []
    
    for col in range(4, df.shape[1]):  # Starting from column E (index 4)
        product = f"Simpler Trading Product: {df.iloc[3, col]}"  # E4 (index 3 - 1)
        body = (
            f"{df.iloc[3, col]}\n\n"  # E4
            f"By: \n{df.iloc[4, col]}\n\n"  # E5
            f"Available on: \n{df.iloc[5, col]}\n\n"  # E6
            f"This product is designed for: \n{df.iloc[6, col]}\n\n"  # E7
            f"{df.iloc[7, col]}\n\n"  # E8
            f"{df.iloc[11, col]}\n\n"  # E12
            f"{df.iloc[12, col]}\n\n"  # E13
            f"{df.iloc[13, col]}\n\n"  # E14
            f"{df.iloc[14, col]}\n\n"  # E15
            f"{df.iloc[15, col]}\n\n"  # E16
            f"{df.iloc[16, col]}\n\n"  # E17
            f"{df.iloc[17, col]}\n\n"  # E18
            f"{df.iloc[18, col]}\n\n"  # E19
            f"{df.iloc[19, col]}\n\n"  # E20
            f"{df.iloc[20, col]}\n\n"  # E21
            f"{df.iloc[21, col]}\n\n"  # E22
            f"{df.iloc[22, col]}\n\n"  # E23
            f"{df.iloc[23, col]}\n\n"  # E24
            f"{df.iloc[24, col]}\n\n"  # E25
            f"{df.iloc[25, col]}\n\n"  # E26
            f"{df.iloc[26, col]}\n\n"  # E27
            f"{df.iloc[27, col]}\n\n"  # E28
            f"{df.iloc[28, col]}\n\n"  # E29
            f"{df.iloc[29, col]}\n\n"  # E30
            f"{df.iloc[30, col]}\n\n"  # E31
            f"{df.iloc[31, col]}\n\n"  # E32
            f"{df.iloc[32, col]}\n\n"  # E33
            f"{df.iloc[33, col]}\n\n"  # E34
            f"{df.iloc[34, col]}\n\n"  # E35
            f"{df.iloc[35, col]}\n\n"  # E36
            f"{df.iloc[36, col]}\n\n"  # E37
            f"{df.iloc[37, col]}\n\n"  # E38
            f"{df.iloc[38, col]}\n\n"  # E39
            f"{df.iloc[3, col]} Testimonials: \n\n"  # E4
            f"{df.iloc[39, col]}\n\n"  # E40
            f"{df.iloc[40, col]}\n\n"  # E41
            f"{df.iloc[41, col]}\n\n"  # E42
            f"{df.iloc[42, col]}\n\n"  # E43
            f"{df.iloc[43, col]}\n\n"  # E44
            f"{df.iloc[44, col]}\n\n"  # E45
            f"{df.iloc[45, col]}\n\n"  # E46
            f"{df.iloc[46, col]}\n\n"  # E47
            f"{df.iloc[47, col]}\n\n"  # E48
            f"{df.iloc[48, col]}\n\n"  # E49
            f"{df.iloc[49, col]}\n\n"  # E50
            f"{df.iloc[40, col]}\n\n"  # E51
            f"{df.iloc[3, col]} Frequently Asked Questions: \n\n"  # E4
            f"{df.iloc[51, col]}\n\n"  # E52
            f"{df.iloc[52, col]}\n\n"  # E53
            f"{df.iloc[53, col]}\n\n"  # E54
            f"{df.iloc[54, col]}\n\n"  # E55
            f"{df.iloc[55, col]}\n\n"  # E56
            f"{df.iloc[56, col]}\n\n"  # E57
            f"{df.iloc[57, col]}\n\n"  # E58
            f"{df.iloc[58, col]}\n\n"  # E59
        )

        jsonl_data.append({"product": product, "body": body})
    
    return jsonl_data

# Generate the JSONL data
jsonl_data = generate_jsonl_format(df)

# Save the data to a JSONL file
output_file_path = 'pretrain.jsonl'
with open(output_file_path, 'w') as outfile:
    for item in jsonl_data:
        json.dump(item, outfile)
        outfile.write('\n')

# Output the path of the generated file
print(output_file_path)


pretrain.jsonl


## add a reference to simpler trading

In [2]:
import json

def modify_questions(input_file, output_file):
    with open(input_file, 'r') as infile, open(output_file, 'w') as outfile:
        for line in infile:
            data = json.loads(line)
            data['question'] = f"According to Simpler Trading, {data['question']}"
            json.dump(data, outfile)
            outfile.write('\n')

# Usage
input_file = '/home/dave/Desktop/AI/pretraining/datasetformater/simpler/finetune/all.jsonl'
output_file = 'all-qa-final.jsonl'
modify_questions(input_file, output_file)

JSONDecodeError: Expecting value: line 2 column 1 (char 1)