# Description
Takes a jsonl file, splits it into 3 portions. Further split these sub portions into smaller chunks so chatGPT can "digest." Finally, these chunks are appended with a "prompt" to input into chatGPT.

### Split 3 portions

In [1]:
import json
import os

def split_dataset(input_file, num_splits):
    # Create output directory
    output_dir = os.path.splitext(input_file)[0] + "_split"
    os.makedirs(output_dir, exist_ok=True)

    # Open the input file
    with open(input_file, 'r', encoding='utf-8') as f:
        # Read all lines from the input file
        lines = f.readlines()
        total_lines = len(lines)
        lines_per_split = total_lines // num_splits

        # Split the lines into chunks
        chunks = [lines[i:i + lines_per_split] for i in range(0, total_lines, lines_per_split)]

        # Write chunks to separate files
        for i, chunk in enumerate(chunks):
            output_file = os.path.join(output_dir, f'dataset_{i+1}.jsonl')
            with open(output_file, 'w', encoding='utf-8') as out:
                out.writelines(chunk)


In [2]:
input_file = "shuffled_oberlin_data_with_clubs.jsonl"  # Change this to your dataset file
num_splits = 3  # Change this to the number of splits you want

split_dataset(input_file, num_splits)

### Further split into chunks

In [4]:
def jsonl_to_chunks(input_file):
    chunks = []
    chunk = []
    with open(input_file, 'r', encoding='utf-8') as f:
        for line in f:
            chunk.append(json.loads(line))
            if len(chunk) == 10:
                chunks.append(chunk)
                chunk = []
        # Add the remaining lines as a last chunk if they are less than 10
        if chunk:
            chunks.append(chunk)
    return chunks

TODO: In the below code chunk, please replace with the right `input_file` assigned to you

In [17]:
'''
An: dataset_1.jsonl
Danha: dataset_2.jsonl
Yen: dataset_3.jsonl and dataset_4.jsonl
'''
input_file = "./shuffled_oberlin_data_with_clubs_split/dataset_3.jsonl"  # Change this to your JSONL file.
chunks = jsonl_to_chunks(input_file)
chunks

[[{'input': 'TO BE COMPLETED',
   'output': 'Apply! 👇🏼\nhttps://jobs.oberlin.edu/postings/7703',
   'org': 'scoberlin'},
  {'input': 'TO BE COMPLETED',
   'output': 'Check in with Zahida and Ilianna to hear about their Fall Highlights and get a sneak peek into what the MRC has planned for the Spring Term.',
   'org': 'obiemrc'},
  {'input': 'TO BE COMPLETED',
   'output': 'Mid-march of this year, we all had to pack our bags and adjust to a new normal, scrambling to understand what it means to live, learn, and create art in the time of COVID. Students coping with the shock of relocation. Faculty adapting to the shift of virtual teaching. Those still in Oberlin dealt with a campus laid bare. All grieving for rituals lost.\n\nYet, as the spring and summer unfolded, and in the wake of social and cultural revolution, Oberlin’s students resilience, adaptivity, and care shines through. Watch our Department reflect on possibilities of art in this moment.\n\nStudents, welcome back to this new y

# Create prompts from chunks

In [18]:
n = len(chunks)
new_chunks = []

for i in range(n):
    data = chunks[i]
    prompt = """
    Below is a portion from a dataset that will be used to finetune mistral7b model.
    The dataset is intended to make the model help Oberlin College's student organizations write Instagram posts for their events.  
    Each observation consists of 3 fields: 'input,' 'output,' and 'org.' 
    'input' is a prompt given to the text generation model. 'output' is the desired output based on that prompt, and it is a real post taken from Instagram. 'org' is the student org that actually wrote the output.
    Right now, all inputs needed to be filled in. Please fill in each input.
    Please give me jsonl format in return. 

    """ + str(data)

    new_chunks.append(prompt)
new_chunks    

['\n    Below is a portion from a dataset that will be used to finetune mistral7b model.\n    The dataset is intended to make the model help Oberlin College\'s student organizations write Instagram posts for their events.  \n    Each observation consists of 3 fields: \'input,\' \'output,\' and \'org.\' \n    \'input\' is a prompt given to the text generation model. \'output\' is the desired output based on that prompt, and it is a real post taken from Instagram. \'org\' is the student org that actually wrote the output.\n    Right now, all inputs needed to be filled in. Please fill in each input.\n    Please give me jsonl format in return. \n\n    [{\'input\': \'TO BE COMPLETED\', \'output\': \'Apply! 👇🏼\\nhttps://jobs.oberlin.edu/postings/7703\', \'org\': \'scoberlin\'}, {\'input\': \'TO BE COMPLETED\', \'output\': \'Check in with Zahida and Ilianna to hear about their Fall Highlights and get a sneak peek into what the MRC has planned for the Spring Term.\', \'org\': \'obiemrc\'}, {\'

# Write the prompts to a txt file for easier copying

In [31]:
def write_lines_to_txt(lines, output_file):
    with open(output_file, 'w', encoding='utf-8') as f:
        for line in lines:
            f.write(str(line) + '\n')
            f.write('\n')

In [32]:
prompts_file = 'prompts.txt'

write_lines_to_txt(new_chunks, prompts_file)

Each line in `prompts.txt` is a prompt that you can input in chatGPT/Bing.
Please check out `prompts.txt`.

For each output produced by chatGPT, please copy it into a file. Synthesize all outputs and then push to our repo!