# Code for data preprocessing

Function to handle missing values:

In [None]:
import json

def handle_missing_values(json_file='your_json_file.json', output_file='cleaned_json.json'):
    """
    Replaces missing 'text' and 'code' values in a JSON file with empty strings.

    Args:
    - json_file (str): The path to the input JSON file. It is expected to hold a list of objects.
    - output_file (str): The path to the output cleaned JSON file.

    After this function runs, every entry has both a 'text' and a 'code' key. A key that is absent,
    or whose value is null (or any other falsy value), is set to ''. All other keys of an entry are
    written back unchanged. The result is written to output_file with a 4-space indent.
    """
    # JSON is UTF-8 by specification; relying on the platform default encoding
    # breaks on non-ASCII content when the locale is not UTF-8.
    with open(json_file, 'r', encoding='utf-8') as file:
        data = json.load(file)

    for entry in data:
        for field in ('text', 'code'):
            # `or ''` covers both a missing key and an explicit null.
            entry[field] = entry.get(field) or ''

    with open(output_file, 'w', encoding='utf-8') as outfile:
        json.dump(data, outfile, indent=4)

# Run the cleaning step with its default file names. These are placeholders:
# 'your_json_file.json' must exist in the working directory or open() raises FileNotFoundError.
handle_missing_values()

Function to remove entries that contain Chinese characters:

In [None]:
import pandas as pd
import json

def process_json_data(input_file='data.json', output_file='filtered.json'):
    """
    Removes every entry whose 'text' or 'code' field contains Chinese characters.

    Args:
    - input_file (str): The path to the input JSON file. It is expected to hold a list of objects
      that have 'text' and 'code' keys.
    - output_file (str): The path to the output filtered JSON file.

    A character counts as Chinese when its code point lies in the range U+4E00 to U+9FFF.
    Values that are not strings (for example null) are treated as containing no Chinese characters.
    Only the 'text' and 'code' fields of the surviving entries are written; any other keys are dropped.
    The output is a JSON list of objects, one per surviving entry, so that it can be fed to the
    other preprocessing functions, which iterate over a list of entries.

    Raises:
    - KeyError: If the input has entries but no 'text' or no 'code' column.
    """
    # JSON is UTF-8 by specification; do not depend on the platform default encoding.
    with open(input_file, 'r', encoding='utf-8') as file:
        data = json.load(file)

    df = pd.DataFrame(data)

    def has_chinese(value):
        # Null/NaN cells are not iterable strings; they cannot contain Chinese text.
        return isinstance(value, str) and any('\u4e00' <= char <= '\u9fff' for char in value)

    if len(df.index) == 0:
        # Nothing to filter; an empty frame has no 'text'/'code' columns to index.
        json_output = '[]'
    else:
        # Build one boolean mask instead of adding a helper column to a sliced
        # frame, which would trigger pandas' chained-assignment warning.
        in_text = df['text'].apply(has_chinese).astype(bool)
        in_code = df['code'].apply(has_chinese).astype(bool)
        filtered_df = df.loc[~(in_text | in_code), ['text', 'code']]

        # 'records' yields a list of {"text": ..., "code": ...} objects.
        json_output = filtered_df.to_json(orient='records')

    with open(output_file, 'w', encoding='utf-8') as json_file:
        json_file.write(json_output)

# Run the filter with its default file names: reads 'data.json' and writes 'filtered.json',
# both relative to the working directory.
process_json_data()


Function to standardize indentation (replace four spaces with a tab):

In [None]:
import json

def standardize_tabs(json_file='your_json_file.json', output_file='standardized_json.json'):
    """
    Replaces every run of four spaces in the 'text' and 'code' fields of a JSON file with a tab character.

    Args:
    - json_file (str): The path to the input JSON file. It is expected to hold a list of objects.
    - output_file (str): The path to the output standardized JSON file.

    The replacement applies to four consecutive spaces anywhere in the string, not only to leading
    indentation. A field that is missing or null is written as ''. All other keys of an entry are
    written back unchanged. The result is written to output_file with a 4-space indent.
    """
    # JSON is UTF-8 by specification; do not depend on the platform default encoding.
    with open(json_file, 'r', encoding='utf-8') as file:
        data = json.load(file)

    for entry in data:
        for field in ('text', 'code'):
            value = entry.get(field)
            # dict.get's default only covers a missing key; an explicit null
            # comes back as None and has no .replace().
            if value is None:
                value = ''
            entry[field] = value.replace('    ', '\t')

    with open(output_file, 'w', encoding='utf-8') as outfile:
        json.dump(data, outfile, indent=4)

# Run the tab standardization with its default file names: reads 'your_json_file.json'
# (a placeholder path) and writes 'standardized_json.json'.
standardize_tabs()


Function to standardize the line breaks in the data:

In [None]:
import json

def standardize_linebreaks(json_file='your_json_file.json', output_file='standardized_json.json'):
    """
    Escapes line breaks and carriage returns in the 'text' and 'code' fields of a JSON file.

    Args:
    - json_file (str): The path to the input JSON file. It is expected to hold a list of objects.
    - output_file (str): The path to the output standardized JSON file.

    Each newline character is replaced by the two characters backslash + 'n', and each carriage
    return by backslash + 'r', so the strings no longer contain real line breaks. No dedenting is
    performed. A field that is missing or null is written as ''. All other keys of an entry are
    written back unchanged. The result is written to output_file with a 4-space indent.
    """
    # JSON is UTF-8 by specification; do not depend on the platform default encoding.
    with open(json_file, 'r', encoding='utf-8') as file:
        data = json.load(file)

    for entry in data:
        for field in ('text', 'code'):
            value = entry.get(field)
            # dict.get's default only covers a missing key; an explicit null
            # comes back as None and has no .replace().
            if value is None:
                value = ''
            # '\\n' is a literal backslash followed by 'n', not a newline.
            entry[field] = value.replace('\n', '\\n').replace('\r', '\\r')

    with open(output_file, 'w', encoding='utf-8') as outfile:
        json.dump(data, outfile, indent=4)

# Run the line-break step with its default file names. NOTE: the default output path,
# 'standardized_json.json', is the same one standardize_tabs writes to by default,
# so this call overwrites that file.
standardize_linebreaks()


Function to merge all the JSON files in a folder into one file:

In [None]:
import json
import glob

def merge_json_files(input_folder='folder_with_json_files', output_file='merged_file.json'):
    """
    Merges the JSON files of a folder into a single flat JSON list.

    Args:
    - input_folder (str): The path to the folder containing JSON files to be merged.
    - output_file (str): The path to the output merged JSON file.

    Every '*.json' file directly inside input_folder is read in sorted path order. When a file's
    top-level value is a list, its items are added individually to the merged list; any other
    top-level value is added as one item. If output_file itself lies inside input_folder it is
    skipped, so running the merge again does not fold the previous result into the new one.
    When no file matches, an empty list is written.
    """
    import os  # local import: only this function needs it

    output_path = os.path.abspath(output_file)

    # glob returns matches in arbitrary order; sort for a reproducible result.
    files = sorted(glob.glob(os.path.join(input_folder, '*.json')))

    merged_data = []
    for path in files:
        if os.path.abspath(path) == output_path:
            continue
        # JSON is UTF-8 by specification; do not depend on the platform default encoding.
        with open(path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        if isinstance(data, list):
            # extend, not append: appending would produce a list of lists.
            merged_data.extend(data)
        else:
            merged_data.append(data)

    with open(output_file, 'w', encoding='utf-8') as outfile:
        json.dump(merged_data, outfile)

# Run the merge with its default names: reads every '*.json' file in 'folder_with_json_files'
# (a placeholder path) and writes 'merged_file.json'. If nothing matches, an empty list is written.
merge_json_files()

Function to convert a JSON file to JSON Lines (JSONL):

In [None]:
import json

def convert_json_to_jsonl(input_file='data.json', output_file='data.jsonl'):
    """
    Converts a JSON file to a JSON lines file.

    Args:
    - input_file (str): The path to the input JSON file.
    - output_file (str): The path to the output JSON lines file.

    When the top-level value of the input is a list, each item is serialized on its own line of the
    output file. When it is a single object, that object is written as one line.
    """
    # JSON is UTF-8 by specification; do not depend on the platform default encoding.
    with open(input_file, 'r', encoding='utf-8') as f:
        json_data = json.load(f)

    # Iterating a dict yields its keys, which would write the key names rather
    # than the record; treat a lone object as a one-record file.
    if isinstance(json_data, dict):
        json_data = [json_data]

    with open(output_file, 'w', encoding='utf-8') as jsonl_output:
        jsonl_output.writelines(json.dumps(entry) + '\n' for entry in json_data)

# Run the conversion with its default file names: reads 'data.json' and writes 'data.jsonl',
# both relative to the working directory.
convert_json_to_jsonl()