In [1]:
import json
def read_jsonfile(file_path="toy_testset.jsonl"):
    all_samples = []
    # Open and read the file line by line
    with open(file_path, "r", encoding="utf-8") as file:
        for line in file:
            data = json.loads(line)  # Parse the JSON object from each line
            # print(data)              # Do something with the JSON object
            all_samples += [data]
    return all_samples

In [20]:
model_samples = read_jsonfile() 
model_samples[33]

{'input': 'Context: The Panthers finished the regular season with a 15–1 record, and quarterback Cam Newton was named the NFL Most Valuable Player (MVP). They defeated the Arizona Cardinals 49–15 in the NFC Championship Game and advanced to their second Super Bowl appearance since the franchise was founded in 1995. The Broncos finished the regular season with a 12–4 record, and denied the New England Patriots a chance to defend their title from Super Bowl XLIX by defeating them 20–18 in the AFC Championship Game. They joined the Patriots, Dallas Cowboys, and Pittsburgh Steelers as one of four teams that have made eight appearances in the Super Bowl. Question: What team did the Panthers defeat? Answer:',
 'output': 'Arizona Cardinals',
 'original_answers': ['Arizona Cardinals',
  'the Arizona Cardinals',
  'Arizona Cardinals']}

In [1]:
import os
import pandas as pd

def read_all_tsv_in_folder(folder_path):
    """
    Reads all .tsv files in a folder and returns a list of DataFrames.
    
    Args:
        folder_path (str): Path to the folder containing .tsv files.
    
    Returns:
        list: A list of pandas DataFrames, each representing a .tsv file.
    """
    dataframes = []
    for file_name in os.listdir(folder_path):
        if file_name.endswith(".tsv"):  # Check if the file has a .tsv extension
            file_path = os.path.join(folder_path, file_name)
            if not file_name in ["DREsS_New.tsv", "DREsS_Std.tsv"]: continue
            try:
                df = pd.read_csv(file_path, sep='\t')  # Read the TSV file
                dataframes.append(df)  # Add it to the list
                print(f"Successfully read {file_name}")
            except Exception as e:
                print(f"Error reading {file_name}: {e}")
    return dataframes

# Example usage
folder_path = "/workspace/datasets/DREsS"  # Replace with the path to your folder
tsv_data = read_all_tsv_in_folder(folder_path)


Successfully read DREsS_Std.tsv
Successfully read DREsS_New.tsv


In [2]:
import pandas as pd

def concatenate_with_missing_columns(dataframes, required_columns):
    """
    Concatenates a list of DataFrames, ensuring each has the required columns.
    
    Args:
        dataframes (list): List of pandas DataFrames to concatenate.
        required_columns (list): List of required column names.
    
    Returns:
        pandas.DataFrame: A single DataFrame with all required columns.
    """
    updated_dataframes = []

    for df in dataframes:
        # Add missing columns with NaN values
        missing_columns = [col for col in required_columns if col not in df.columns]
        for col in missing_columns:
            df[col] = None  # Add missing column with NaN values
        
        # Ensure the DataFrame has the correct column order
        df = df[required_columns]
        updated_dataframes.append(df)
    
    # Concatenate all DataFrames into one
    result = pd.concat(updated_dataframes, ignore_index=True)
    return result

# Example usage
required_columns = ["id", "prompt", "essay", "content", "organization", "language", "total"]
# Assuming `tsv_data` is your list of DataFrames
final_dataframe = concatenate_with_missing_columns(tsv_data, required_columns)

# Check the result
print(final_dataframe.head())

   id                                             prompt  \
0   1  More and more people use computers, but not ev...   
1   2  More and more people use computers, but not ev...   
2   3  More and more people use computers, but not ev...   
3   4  More and more people use computers, but not ev...   
4   5  More and more people use computers, but not ev...   

                                               essay   content  organization  \
0  Dear local newspaper, I think effects computer...  3.333333      2.500000   
1  Dear @CAPS1 @CAPS2, I believe that using compu...  3.333333      3.333333   
2  Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...  2.500000      2.500000   
3  Dear Local Newspaper, @CAPS1 I have found that...  4.166667      3.333333   
4  Dear @LOCATION1, I know having computers has a...  3.333333      2.500000   

   language      total  
0  2.500000   8.333333  
1  3.055556   9.722222  
2  3.055556   8.055556  
3  3.611111  11.111111  
4  3.333333   9.166667  


In [33]:
final_dataframe.fillna(-1)

Unnamed: 0,id,prompt,essay,content,organization,language,total
0,1,"More and more people use computers, but not ev...","Dear local newspaper, I think effects computer...",3.333333,2.500000,2.500000,8.333333
1,2,"More and more people use computers, but not ev...","Dear @CAPS1 @CAPS2, I believe that using compu...",3.333333,3.333333,3.055556,9.722222
2,3,"More and more people use computers, but not ev...","Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",2.500000,2.500000,3.055556,8.055556
3,4,"More and more people use computers, but not ev...","Dear Local Newspaper, @CAPS1 I have found that...",4.166667,3.333333,3.611111,11.111111
4,5,"More and more people use computers, but not ev...","Dear @LOCATION1, I know having computers has a...",3.333333,2.500000,3.333333,9.166667
...,...,...,...,...,...,...,...
8782,2275,If you could change one important aspect about...,"What's the matter?\n\n\n Nowadays, many count...",2.000000,2.000000,2.500000,6.500000
8783,2276,If you could change one important aspect about...,I think there are lots of things to change a...,2.000000,2.000000,2.500000,6.500000
8784,2277,If you could change one important aspect about...,"My country, Korea is a nice country. It is w...",4.000000,3.500000,4.000000,11.500000
8785,2278,If you could change one important aspect about...,Every country have its own culture and special...,3.500000,3.000000,3.500000,10.000000


In [3]:
import pandas as pd
import re



def round_score(x):
    """
    Rounds the score according to the custom rule:
      - If the fractional part is <= 0.25, round down.
      - If the fractional part is > 0.25 and <= 0.75, round to the nearest 0.5.
      - If the fractional part is > 0.75, round up.
    
    Examples:
      3.25 -> 3
      3.26 -> 3.5
      3.75 -> 3.5
      3.8  -> 4
    """
    try:
        score = float(x)
    except (ValueError, TypeError):
        return x  # Return as is if it's not numeric
    integer_part = int(score)
    remainder = score - integer_part
    if remainder <= 0.25:
        return integer_part
    elif remainder <= 0.75:
        return integer_part + 0.5
    else:
        return integer_part + 1

def generate_training_samples(dataframe):
    """
    Generate training samples for an LLM for automated TOEFL essay scoring.
    
    Expected DataFrame columns:
      - 'prompt': The question prompt.
      - 'essay': The student's essay.
      - 'content': Content score (range 0-5).
      - 'organization': Organization score (range 0-5).
      - 'language': Language score (range 0-5).
      - 'total': (Optional) Composite total score.
    
    Rubrics:
      - Organization: (range 0-5) The argument is very effectively structured and developed, 
        making it easy for the reader to follow the ideas and understand how the writer is building the argument. 
        Paragraphs use coherence devices effectively while focusing on a single main idea.
      - Content: (range 0-5) The paragraph is well-developed and relevant to the argument, 
        supported with strong reasons and examples.
      - Language: (range 0-5) The writing displays sophisticated control of a wide range of vocabulary and collocations. 
        The essay follows grammar and usage rules throughout the paper. Spelling and punctuation are correct throughout the paper.
    
    The function applies a custom rounding to the score values as follows:
      - If the fractional part is ≤ 0.25, round down.
      - If the fractional part is > 0.25 and ≤ 0.75, round to the nearest 0.5.
      - If the fractional part is > 0.75, round up.
    
    It then dynamically builds the training prompt based on the available score columns.
    
    Args:
        dataframe (pd.DataFrame): The merged DataFrame containing the necessary columns.
        
    Returns:
        list: A list of training sample dictionaries with 'input', 'output', and 'original_answers' keys.
    """
    
    training_samples = []
    
    # Define the rubric text to include with every prompt.
    # rubric_text = (
    #     "Rubrics:\n"
    #     "- Organization: (range 0-5) The argument is very effectively structured and developed, "
    #     "making it easy for the reader to follow the ideas and understand how the writer is building the argument. "
    #     "Paragraphs use coherence devices effectively while focusing on a single main idea.\n"
    #     "- Content: (range 0-5) The paragraph is well-developed and relevant to the argument, "
    #     "supported with strong reasons and examples.\n"
    #     "- Language: (range 0-5) The writing displays sophisticated control of a wide range of vocabulary and collocations. "
    #     "The essay follows grammar and usage rules throughout the paper. Spelling and punctuation are correct throughout the paper.\n"
    # )
    t1=t2=t3=t4=0
    for _, row in dataframe.iterrows():
        # Skip rows missing essential fields.
        # Build the base context with the student essay and rubric definitions.
        input_text = f"Question: {row['prompt']}\n\n"
        input_text += f"Essay: {row['essay']}\n\n"
        
        # Determine which scores are available and build the task instruction accordingly.
        if row.get('content') > -1 and row.get('organization') == -1 and row.get('language')==-1:
            task_description = "Please score the essay for the rubric dimension: Content (0-5)."
            input_text += f"\n{task_description}"
            output = f"""The content_score for this essay is {round_score(row['content'])}"""
            t1+=1

        elif row.get('content') == -1 and row.get('organization') > -1 and row.get('language')==-1:
            task_description = "Please score the essay for the rubric dimension: Organization (0-5)."
            input_text += f"\n{task_description}"
            output = f"""The organization_score for this essay is {round_score(row['organization'])}"""
            t2+=1
        elif row.get('content') == -1 and row.get('organization') == -1 and row.get('language')>-1:
            task_description = "Please score the essay for the rubric dimension: Language (0-5)."
            input_text += f"\n{task_description}"
            output = f"""The language_score for this essay is {round_score(row['language'])}"""
            t3+=1
        elif row.get('content') > -1 and row.get('organization') > -1 and row.get('language')>-1:
            task_description = "Please score the essay for the rubric dimensions: Content (0-5), Organization (0-5), and Language (0-5)."
            input_text += f"\n{task_description}"
            output = f"""For this essay, I will mark content_score as {round_score(row['content'])}, organization_score as {round_score(row['organization'])} and language_score as {round_score(row['language'])} """
            t4+=1
        else:
            # Skip rows that do not match any recognized scenario.
            print("skip", row)
            continue

        training_sample = {
            'input': input_text.strip(),
            'output': output,
            'category': "EssayScoring",
            "taskname": "EssayScoring"
        }
        training_samples.append(training_sample)
    print("type 1 | 2 | 3 | 4  = ", t1, t2, t3, t4)
    return training_samples

# Example usage:
# Assuming 'merged_dataframe' is your DataFrame with the merged TSV data and proper score columns.
# training_data = generate_training_samples(merged_dataframe)
# for sample in training_data[:3]:
#     print(sample)

# Example usage
# Assuming 'merged_dataframe' is the DataFrame with the required columns
training_data = generate_training_samples(final_dataframe.fillna(-1))

# Inspect the first few training samples
print(len(training_data))
for sample in training_data[-3:-1]:
    print(sample['output'])

type 1 | 2 | 3 | 4  =  0 0 0 8787
8787
For this essay, I will mark content_score as 4, organization_score as 3.5 and language_score as 4 
For this essay, I will mark content_score as 3.5, organization_score as 3 and language_score as 3.5 


In [5]:
import pandas as pd
import random, json
def extract_scores(text):
    """
    Extracts content_score, language_score, and organization_score from the given text.

    Args:
        text (str): The input string containing the scores.

    Returns:
        dict: A dictionary with keys 'content_score', 'language_score', and 'organization_score'.
              Values are floats if found, otherwise None.
    """
    # Define the regular expressions for each score
    content_pattern = r"content_score (?:for this essay )?\w{2} (\d+(\.\d+)?)"
    language_pattern = r"language_score (?:for this essay )?\w{2} (\d+(\.\d+)?)"
    organization_pattern = r"organization_score (?:for this essay )?\w{2} (\d+(\.\d+)?)"
    
    # Search for matches in the text
    content_match = re.search(content_pattern, text)
    language_match = re.search(language_pattern, text)
    organization_match = re.search(organization_pattern, text)
    
    # Extract values or set to None if not found
    content_score = float(content_match.group(1)) if content_match else None
    language_score = float(language_match.group(1)) if language_match else None
    organization_score = float(organization_match.group(1)) if organization_match else None
    def level_catgorize(point):
        if point is None: return "U"
        if point >= 4: return "H"
        if point >= 2: return "M" 
        return "L"
    total_score = level_catgorize(content_score) + level_catgorize(language_score) + level_catgorize(organization_score)
    # Return results as a dictionary
    return {
        "content_score": content_score,
        "language_score": language_score,
        "organization_score": organization_score,
        "total_score": total_score
    }

# Function to split dataset into train/validate/test
def split_dataset(samples, test_size=100, valid_ratio=0.2):
    """
    Splits the samples into training, validation, and test sets, ensuring equal label distribution in the test set.

    Args:
        samples (list of dict): The list of sample dictionaries.
        test_size (int): Number of samples in the test set.
        valid_ratio (float): Proportion of the remaining data to allocate to validation.

    Returns:
        tuple: (training_set, validation_set, test_set)
    """
    # Extract labels for each sample and group by label
    labeled_samples = {}
    for sample in samples:
        # print(sample['output'])
        label = extract_scores(sample['output'])['total_score']
        if label is not None:
            labeled_samples.setdefault(label, []).append(sample)
    print(labeled_samples.keys())
    # Create test set ensuring balanced label distribution
    test_set = []
    for label, group in labeled_samples.items():
        # Proportional sampling for each label
        num_samples = min(test_size // len(labeled_samples), len(group))
        test_set.extend(random.sample(group, num_samples))
    
    # Remove test samples from the original dataset
    remaining_samples = [sample for sample in samples if sample not in test_set]

    # Split remaining samples into training and validation
    valid_size = int(len(remaining_samples) * valid_ratio)
    validation_set = random.sample(remaining_samples, valid_size)
    training_set = [sample for sample in remaining_samples if sample not in validation_set]

    return training_set, validation_set, test_set

# Save dataset to JSONL files
def split_and_save_jsonl(
    data,
    test_size=100,
    valid_ratio=0.2,
    train_file="training.jsonl",
    valid_file="validation.jsonl",
    test_file="testing.jsonl"
):
    """
    Splits the data into training, validation, and testing sets, and saves them as JSONL files.

    Args:
        data (list): The list of samples to split, where each sample is a dictionary.
        test_size (int): The number of samples to include in the test set.
        valid_ratio (float): Proportion of the remaining data (after test split) to allocate to validation.
        train_file (str): The filename for the training set.
        valid_file (str): The filename for the validation set.
        test_file (str): The filename for the testing set.

    Returns:
        None
    """
    # Shuffle the data to ensure randomness.
    train_data, valid_data, test_data = split_dataset(data, test_size=100, valid_ratio=0.2)

    # Save the test data to a JSONL file.
    with open(test_file, "w", encoding="utf-8") as test_f:
        for sample in test_data:
            json.dump(sample, test_f)
            test_f.write("\n")

    # Save the validation data to a JSONL file.
    with open(valid_file, "w", encoding="utf-8") as valid_f:
        for sample in valid_data:
            json.dump(sample, valid_f)
            valid_f.write("\n")

    # Save the training data to a JSONL file.
    with open(train_file, "w", encoding="utf-8") as train_f:
        for sample in train_data:
            json.dump(sample, train_f)
            train_f.write("\n")

    print(f"Training data saved to {train_file}")
    print(f"Validation data saved to {valid_file}")
    print(f"Testing data saved to {test_file}")

# Example usage:
# Assuming 'training_data' is your list of generated samples:
split_and_save_jsonl(
    training_data,
    test_size=100,
    valid_ratio=0.1,
    train_file="/workspace/datasets/hey2/training.jsonl",
    valid_file="/workspace/datasets/hey2/validation.jsonl",
    test_file="/workspace/datasets/hey2/testing.jsonl"
)



dict_keys(['MMM', 'HMM', 'HHM', 'HHH', 'MMH', 'LLL', 'MML', 'MHH', 'LML', 'MLM', 'MHM', 'HMH', 'LMM', 'MLL', 'LLM', 'LMH', 'HLH', 'LHH', 'LHM'])
Training data saved to /workspace/datasets/hey2/training.jsonl
Validation data saved to /workspace/datasets/hey2/validation.jsonl
Testing data saved to /workspace/datasets/hey2/testing.jsonl
