## Imports

In [1]:
import os
import pandas as pd

from pathlib import Path

## Constants

In [2]:
# Root folder of the raw M4 dataset; it is searched recursively for .jsonl files.
M4_FOLDER_PATH = '../../data/raw/m4'
# Root folder that receives the unified, single-column ('text') .jsonl files.
M4_UNIFIED_FOLDER_PATH = '../../data/raw/m4-unified'

## Get all jsonl files

In [45]:
# Recursively collect every .jsonl file below the raw M4 folder.
# Path.glob returns a one-shot generator: re-run this cell before iterating
# over `pathlist` a second time.
pathlist = Path(M4_FOLDER_PATH).glob('**/*.jsonl')

## Extract data from files

In [36]:
def get_json_machine_text_field_name(file_name):
    """Return the JSON key that holds the machine-generated text of a file.

    Args:
        file_name: Base name of the dataset file, e.g. ``'arxiv_bloomz.jsonl'``.
            The comparison is exact and case-sensitive.

    Returns:
        ``'machine_answer'`` for the reddit bloomz file, ``'machine_abstract'``
        for the arxiv, wikihow and wikipedia bloomz files, and
        ``'machine_text'`` for every other file name.
    """
    if file_name == 'reddit_bloomz.jsonl':
        return 'machine_answer'

    # These bloomz exports all share the same non-default key.
    abstract_files = (
        'arxiv_bloomz.jsonl',
        'wikihow_bloomz.jsonl',
        'wikipedia_bloomz.jsonl',
    )
    if file_name in abstract_files:
        return 'machine_abstract'

    return 'machine_text'

In [46]:
# Convert every raw M4 file into a single-column ('text') JSONL file that holds
# only the machine-generated text, written to one output folder per domain.
for path in pathlist:
    file_name = path.name

    json_machine_text_field = get_json_machine_text_field_name(file_name)
    # The domain is the file-name prefix before the first underscore,
    # e.g. 'arxiv_chatGPT.jsonl' -> 'arxiv'.
    folder_name = file_name.split('_')[0]

    input_df = pd.read_json(path_or_buf=path, lines=True)

    # Keep only the machine-generated text, under the unified column name.
    unified_df = pd.DataFrame({'text': input_df[json_machine_text_field]})

    output_folder = f'{M4_UNIFIED_FOLDER_PATH}/{folder_name}'
    # makedirs also creates missing parent folders (os.mkdir would fail when
    # M4_UNIFIED_FOLDER_PATH itself does not exist yet), and exist_ok makes a
    # separate existence check unnecessary.
    os.makedirs(output_folder, exist_ok=True)

    unified_df.to_json(f'{output_folder}/{file_name}', orient='records', lines=True)

## Extract human texts per domain from ChatGPT datasets

In [13]:
# Collect the ChatGPT dataset file of every domain.
# - The suffix is compared case-insensitively so that a file such as
#   'wikipedia_chatgpt.jsonl' is also found on case-sensitive file systems,
#   where the glob pattern '*_chatGPT.jsonl' would skip it.
# - The matches are stored as a sorted list rather than the one-shot generator
#   returned by Path.glob, so they can be displayed and still be iterated
#   afterwards.
chat_gpt_datasets_paths = sorted(
    path
    for path in Path(M4_FOLDER_PATH).glob('**/*.jsonl')
    if path.name.lower().endswith('_chatgpt.jsonl')
)

In [5]:
# Inspection only: show which dataset files were matched.
# Calling list() on a raw Path.glob generator consumes it; if
# chat_gpt_datasets_paths is still such a generator, re-run the cell that
# defines it before looping over it.
list(chat_gpt_datasets_paths)

[WindowsPath('../../data/raw/m4/arxiv/arxiv_chatGPT.jsonl'),
 WindowsPath('../../data/raw/m4/reddit/reddit_chatGPT.jsonl'),
 WindowsPath('../../data/raw/m4/wikihow/wikihow_chatGPT.jsonl'),
 WindowsPath('../../data/raw/m4/wikipedia/wikipedia_chatgpt.jsonl')]

In [14]:
# Key read from every matched file to obtain the human-written text.
json_human_text_field = 'human_text'

# Write one '<domain>_human.jsonl' file per domain, containing only the human
# texts in a single 'text' column.
for path in chat_gpt_datasets_paths:
    file_name = path.name
    # The domain is the file-name prefix before the first underscore.
    folder_name = file_name.split('_')[0]

    input_df = pd.read_json(path_or_buf=path, lines=True)

    unified_df = pd.DataFrame({'text': input_df[json_human_text_field]})

    output_folder = f'{M4_UNIFIED_FOLDER_PATH}/{folder_name}'
    # Create the folder here as well, so this cell does not depend on another
    # cell having created it first.
    os.makedirs(output_folder, exist_ok=True)

    unified_df.to_json(f'{output_folder}/{folder_name}_human.jsonl', orient='records', lines=True)