# Generate dataset of sample responses for each proficiency level.

In [6]:
import os

# Directory containing the guideline description files.
folder_path = "../data/guideline-descriptions"

files = os.listdir(folder_path)

# Maps each file's base name (extension removed) to that file's full text.
guidelines = {}

for entry in files:
    entry_path = os.path.join(folder_path, entry)

    # Anything that is not a regular file (e.g. a sub-directory) is ignored.
    if not os.path.isfile(entry_path):
        continue

    with open(entry_path, 'r') as handle:
        text = handle.read()

    stem, _extension = os.path.splitext(entry)
    guidelines[stem] = text

In [8]:
import os

# Make sure an output directory exists for every guideline key.
# exist_ok=True keeps this idempotent without a separate exists() check,
# which removes the check-then-create race.
for key in guidelines:
    os.makedirs(f"../data/guideline-data/{key}", exist_ok=True)

In [49]:
# Prompt template for generating sample statements at one proficiency level.
# The {level} and {description} placeholders are filled in with str.format.
# NOTE(review): 'an "{level}"' reads ungrammatically for level names that start
# with a consonant (e.g. "Distinguished"); left as-is because it is runtime text.
PROMPT = """
Your role is to generate a potential statement from a Spanish speaker that would be categorized as "{level}".

Here is the description for "{level}":

"{description}"

Using this context, generate 5 statements that would be classified as coming from an "{level}" Spanish speaker. The statements should be rather brief, as if they were part of a conversation, but need not only be one sentence. Each statement should vary in length, from 1–3 sentences. Only provide the statements–do NOT provide English translations.
"""

In [51]:
from dotenv import load_dotenv
# Load settings from a local .env file into the process environment
# (presumably including OPENAI_API_KEY -- confirm).
load_dotenv()

True

In [52]:
import os
from concurrent.futures import ThreadPoolExecutor
from openai import OpenAI

# API client used for all requests; the key is read from the OPENAI_API_KEY
# environment variable (None is passed if the variable is unset).
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

def process_level(level, description, i):
    """Request one batch of sample statements for a level and save it to disk.

    Fills PROMPT with ``level`` and ``description``, sends it as a single
    user message, and writes the text of the first returned choice to
    ``../data/guideline-data/{level}/{i}.txt``.

    Args:
        level: Proficiency level name; also the output sub-directory name.
        description: Guideline text inserted into the prompt.
        i: Iteration number, used as the output file's base name.
    """
    print(f"Processing level {level}: iter {i}")

    prompt_text = PROMPT.format(level=level, description=description)
    user_message = {"role": "user", "content": prompt_text}

    # High temperature plus a frequency penalty -- presumably chosen to
    # diversify the generated statements.
    completion = client.chat.completions.create(
        messages=[user_message],
        model="gpt-3.5-turbo",
        temperature=1.3,
        frequency_penalty=0.7,
    )
    generated = completion.choices[0].message.content

    output_path = f"../data/guideline-data/{level}/{i}.txt"
    with open(output_path, 'w') as out:
        out.write(generated)

In [55]:
import time

with ThreadPoolExecutor() as pool:
    for level, description in guidelines.items():
        # Fan out 20 generation requests for this level on the thread pool.
        pending = []
        for iteration in range(20):
            task = pool.submit(process_level, level, description, iteration)
            pending.append(task)
            # Pause between submissions to keep the request rate down.
            time.sleep(.5)

        # Finish the whole level before starting the next one; result() also
        # re-raises any exception raised inside a worker.
        for task in pending:
            task.result()

Processing level Advanced Mid: iter 0
Processing level Advanced Mid: iter 1
Processing level Advanced Mid: iter 2
Processing level Advanced Mid: iter 3
Processing level Advanced Mid: iter 4
Processing level Advanced Mid: iter 5
Processing level Advanced Mid: iter 6
Processing level Advanced Mid: iter 7
Processing level Advanced Mid: iter 8
Processing level Advanced Mid: iter 9
Processing level Advanced Mid: iter 10
Processing level Advanced Mid: iter 11
Processing level Advanced Mid: iter 12
Processing level Advanced Mid: iter 13
Processing level Advanced Mid: iter 14
Processing level Advanced Mid: iter 15
Processing level Advanced Mid: iter 16
Processing level Advanced Mid: iter 17
Processing level Advanced Mid: iter 18
Processing level Advanced Mid: iter 19
Processing level Distinguished: iter 0
Processing level Distinguished: iter 1
Processing level Distinguished: iter 2
Processing level Distinguished: iter 3
Processing level Distinguished: iter 4
Processing level Distinguished: ite

In [61]:
def remove_blank_lines(file_path):
    """Rewrite ``file_path`` in place without blank lines.

    Every line is stripped of leading and trailing whitespace; lines that are
    empty after stripping are dropped. The remaining lines are joined with
    single newlines, with no trailing newline.
    """
    with open(file_path, 'r') as source:
        stripped = (raw.strip() for raw in source)
        kept = [text for text in stripped if text]

    with open(file_path, 'w') as target:
        target.write('\n'.join(kept))

def process_folder(folder_path):
    """Remove blank lines from every ``.txt`` file under ``folder_path``.

    The directory tree is walked recursively; each matching file is rewritten
    in place and its path is printed.
    """
    for current_dir, _subdirs, names in os.walk(folder_path):
        for name in names:
            if not name.endswith('.txt'):
                continue
            target = os.path.join(current_dir, name)
            remove_blank_lines(target)
            print(f"Processed: {target}")

# Strip blank lines from every .txt file under the guideline data directory.
folder_path = '../data/guideline-data'
process_folder(folder_path)


Processed: ../data/guideline-data/Advanced High/15.txt
Processed: ../data/guideline-data/Advanced High/14.txt
Processed: ../data/guideline-data/Advanced High/16.txt
Processed: ../data/guideline-data/Advanced High/17.txt
Processed: ../data/guideline-data/Advanced High/13.txt
Processed: ../data/guideline-data/Advanced High/12.txt
Processed: ../data/guideline-data/Advanced High/10.txt
Processed: ../data/guideline-data/Advanced High/11.txt
Processed: ../data/guideline-data/Advanced High/9.txt
Processed: ../data/guideline-data/Advanced High/8.txt
Processed: ../data/guideline-data/Advanced High/5.txt
Processed: ../data/guideline-data/Advanced High/4.txt
Processed: ../data/guideline-data/Advanced High/6.txt
Processed: ../data/guideline-data/Advanced High/7.txt
Processed: ../data/guideline-data/Advanced High/3.txt
Processed: ../data/guideline-data/Advanced High/2.txt
Processed: ../data/guideline-data/Advanced High/0.txt
Processed: ../data/guideline-data/Advanced High/1.txt
Processed: ../data/g

In [62]:
import re

# Leading list marker such as "1. ", "12) ", "- " or "* ".
_LIST_MARKER = re.compile(r'^(?:\d+[.)]|[-*])\s+')


def split_markdown_list(file_path, items_per_file=5):
    """Split a list file into one ``item<N>.txt`` file per entry.

    The input file must be named ``<number>.txt``. Each non-blank line becomes
    its own file next to the input, named ``item<number * items_per_file + k>.txt``
    where ``k`` counts the entries from zero.

    A leading list marker is removed only when one is actually present, so
    unnumbered lines and two-digit markers are no longer mangled by a fixed
    three-character cut. Blank lines are skipped instead of producing empty
    item files that would take the next batch's index.

    Note: a file with more than ``items_per_file`` entries still spills into
    the index range of the following batch file.

    Args:
        file_path: Path to the ``<number>.txt`` batch file.
        items_per_file: Number of index slots reserved per batch file.

    Raises:
        ValueError: If the file's base name does not start with an integer.
    """
    with open(file_path, 'r') as file:
        content = file.read()

    # basename() rather than splitting on '/' so other path separators work.
    file_num = int(os.path.basename(file_path).split('.')[0])

    out_dir = os.path.dirname(file_path)
    item_index = file_num * items_per_file
    for line in content.split('\n'):
        line = line.strip()
        if not line:
            continue

        text = _LIST_MARKER.sub('', line)
        item_file_path = os.path.join(out_dir, f"item{item_index}.txt")
        with open(item_file_path, 'w') as item_file:
            item_file.write(text)

        print(f"Created: {item_file_path}")
        item_index += 1

def process_folder(folder_path):
    """Split every numbered batch file under ``folder_path`` into item files.

    The tree is walked recursively. Only ``.txt`` files whose name starts with
    a run of digits before the first dot (e.g. ``15.txt``) are treated as
    inputs. Everything else is skipped, in particular the ``item<N>.txt``
    files this step itself generates, so running it again no longer fails when
    such a name is converted to an integer.
    """
    for root, dirs, files in os.walk(folder_path):
        for file_name in files:
            if not file_name.endswith('.txt'):
                continue
            # Same stem rule the splitter uses to derive the batch number.
            if not file_name.split('.')[0].isdecimal():
                continue
            file_path = os.path.join(root, file_name)
            split_markdown_list(file_path)
            print(f"Processed: {file_path}")

# Split every batch file under the guideline data directory into per-item files.
folder_path = '../data/guideline-data'
process_folder(folder_path)

Created: ../data/guideline-data/Advanced High/item75.txt
Created: ../data/guideline-data/Advanced High/item76.txt
Created: ../data/guideline-data/Advanced High/item77.txt
Created: ../data/guideline-data/Advanced High/item78.txt
Created: ../data/guideline-data/Advanced High/item79.txt
Processed: ../data/guideline-data/Advanced High/15.txt
Created: ../data/guideline-data/Advanced High/item70.txt
Created: ../data/guideline-data/Advanced High/item71.txt
Created: ../data/guideline-data/Advanced High/item72.txt
Created: ../data/guideline-data/Advanced High/item73.txt
Created: ../data/guideline-data/Advanced High/item74.txt
Processed: ../data/guideline-data/Advanced High/14.txt
Created: ../data/guideline-data/Advanced High/item80.txt
Created: ../data/guideline-data/Advanced High/item81.txt
Created: ../data/guideline-data/Advanced High/item82.txt
Created: ../data/guideline-data/Advanced High/item83.txt
Created: ../data/guideline-data/Advanced High/item84.txt
Processed: ../data/guideline-data/A