### CEFR evlaution with 2 variations of ground truth

In [18]:
import ktrain
import tensorflow

In [1]:
model_path = '../Evaluation/models'
model = ktrain.load_predictor(model_path)

In [None]:
def get_cefr_level(text, model):
    
    predictions = model.predict(text)

    # Map the prediction to CEFR levels (customize this mapping as needed)
    cefr_levels = {
        'label_0':'Elementary',
        'label_1':'Intermediate',
        'label_2':'Advanced'
                }

    # Return the CEFR level based on the prediction
    return [cefr_levels[prediction] for prediction in predictions]

def map_instruct_level(level):
    if 'Level 1' in level:
        return 'Elementary'
    elif 'Level 2' in level:
        return 'Intermediate'
    elif 'Level 3' in level:
        return 'Advanced'
    elif 'beginner' in level:
        return 'Elementary'
    elif 'intermediate' in level:
        return 'Intermediate'
    elif 'advanced' in level:
        return 'Advanced'
    else:
        return 'Unknown'

def eval_cefr(df, model_response_columns):
    # Batch processing for context and response columns
    df['train_CEFR'] = get_cefr_level(df['context'].tolist(), model)
    df['response_CEFR'] = get_cefr_level(df['response'].tolist(), model)
    
    # Loop through each model_response_column and process in batches
    for col in model_response_columns:
        new_col_name = f"{col}_CEFR"
        df[new_col_name] = get_cefr_level(df[col].tolist(), model)
    
    df_cefr_no_instruct = df.copy()
    df_cefr_no_instruct = df_cefr_no_instruct.rename(
        columns={'train_CEFR': 'ground_truth_context', 'response_CEFR': 'ground_truth_target'}
    )
    
    for col in model_response_columns:
        new_col_name = f"{col}_CEFR"
        result_col_name = f"result_{col}"
        df_cefr_no_instruct[result_col_name] = (df_cefr_no_instruct['ground_truth_target'] == df_cefr_no_instruct[new_col_name]).astype(int)
    
    columns_to_keep = ['ground_truth_context', 'ground_truth_target'] + [f"result_{col}" for col in model_response_columns]
    df_cefr_no_instruct = df_cefr_no_instruct[columns_to_keep]
    df_cefr_no_instruct = df_cefr_no_instruct.groupby(['ground_truth_context', 'ground_truth_target']).mean().reset_index()
    
    df_cefr_instruct = df.copy()
    
    df_cefr_instruct['ground_truth_context'] = df_cefr_instruct['instruction'].apply(lambda x: map_instruct_level(x.split(' to ')[0]))
    df_cefr_instruct['ground_truth_target'] = df_cefr_instruct['instruction'].apply(lambda x: map_instruct_level(x.split(' to ')[1]))
    
    for col in model_response_columns:
        new_col_name = f"{col}_CEFR"
        result_col_name = f"result_{col}"
        df_cefr_instruct[result_col_name] = (df_cefr_instruct['ground_truth_target'] == df_cefr_instruct[new_col_name]).astype(int)
    
    columns_to_keep = ['ground_truth_context', 'ground_truth_target'] + [f"result_{col}" for col in model_response_columns]
    df_cefr_instruct = df_cefr_instruct[columns_to_keep]
    df_cefr_instruct = df_cefr_instruct.groupby(['ground_truth_context', 'ground_truth_target']).mean().reset_index()
    
    return df_cefr_no_instruct, df_cefr_instruct
    

In [None]:
model_response_columns = ['model_7b_chat_response']  # Add more column names as needed
df_cefr_no_instruct, df_cefr_instruct = eval_cefr(df_clean, model_response_columns)

In [50]:
df_cefr_no_instruct

Unnamed: 0,ground_truth_context,ground_truth_target,result_model_7b_chat_response
0,Advanced,Advanced,0.730769
1,Advanced,Elementary,0.175
2,Advanced,Intermediate,0.514706
3,Elementary,Advanced,0.368421
4,Elementary,Intermediate,0.25
5,Intermediate,Advanced,0.5
6,Intermediate,Elementary,1.0
7,Intermediate,Intermediate,0.862637


In [51]:
df_cefr_instruct

Unnamed: 0,ground_truth_context,ground_truth_target,result_model_7b_chat_response
0,Advanced,Elementary,0.11625
1,Advanced,Intermediate,0.431498
2,Intermediate,Elementary,0.13125


In [13]:
df_cefr_no_instruct

Unnamed: 0,ground_truth_context,ground_truth_target,result_model_response,result_oob_model_response,result_model_v2_response
0,Advanced,Advanced,0.845361,0.865979,0.773196
1,Advanced,Elementary,0.0,0.125,0.375
2,Advanced,Intermediate,0.504673,0.295327,0.500935
3,Elementary,Advanced,0.5,0.5,0.5
4,Elementary,Intermediate,0.35,0.35,0.3
5,Intermediate,Advanced,1.0,0.5,0.75
6,Intermediate,Elementary,0.2,0.4,0.8
7,Intermediate,Intermediate,0.925595,0.78869,0.800595


In [14]:
df_cefr_instruct

Unnamed: 0,ground_truth_context,ground_truth_target,result_model_response,result_oob_model_response,result_model_v2_response
0,Advanced,Elementary,0.025,0.046875,0.221875
1,Advanced,Intermediate,0.46875,0.278125,0.4
2,Intermediate,Elementary,0.03937,0.060367,0.183727


In [83]:
def evaluate_text(text):
    # text = text.replace('"', "") 
    response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": "You are an English language evaluator. Your role is to evaluate the readability of any text based on language complexity, length of text, and vocabulary. The texts are categorized into three levels as defined by CEFR: Advanced, Intermediate, and Beginner. You have to score the text between 1 to 100, where 1-33 represents Beginner level, 34-66 represents Intermediate level, and 67-100 represents Advanced level. Return the overall readability score only."},
            {"role": "user", "content": f'Evaluate the readability of the following text: "{text}"'}
            ]
        )
    return response['choices'][0]['message']['content']
    time.sleep(0.5)
    
    
def process_columns(df, model_response_columns):
    
    # Create new evaluation columns for each specified model response column
    for col in model_response_columns:
        new_col_name = f"{col}_gpt4_eval"
        df[new_col_name] = df[col].apply(evaluate_text)
        print(f"{col} processed")
        time.sleep(10)  # Adding sleep to avoid hitting rate limits of the API
    
    # Convert evaluation results to float values
    eval_cols = [f"{col}_gpt4_eval" for col in model_response_columns]
    for col in eval_cols:
        df[col] = df[col].apply(lambda value: float(value) if value.replace('.', '', 1).isdigit() else 0.0)
    
    # Group by 'instruction' and calculate the mean of the evaluations
    df_result = df.groupby(['instruction'])[eval_cols].mean().reset_index()
    
    return df_result, df

In [108]:
df_smpl = df_clean.groupby('instruction').apply(lambda x: x.sample(n=20, replace=False)).reset_index(drop=True)
df_smpl.shape

(60, 6)

In [84]:
%%time

model_response_columns = ['context', 'response', 'model_response', 'model_v2_response', 'oob_model_response']
df_result, df_score = process_columns(df_smpl, model_response_columns)

context processed
response processed
model_response processed
model_v2_response processed
oob_model_response processed
CPU times: user 383 ms, sys: 56.8 ms, total: 440 ms
Wall time: 3min 31s


In [85]:
df_result

Unnamed: 0,instruction,context_gpt4_eval,response_gpt4_eval,model_response_gpt4_eval,model_v2_response_gpt4_eval,oob_model_response_gpt4_eval
0,Simplify the following context from CEFR Level 2 to CEFR Level 1,40.3,35.1,32.4,26.3,15.0
1,Simplify the following context from CEFR Level 3 to CEFR Level 1,45.1,30.7,35.1,31.9,20.6
2,Simplify the following context from CEFR Level 3 to CEFR Level 2,54.7,39.1,44.5,29.8,44.0


In [None]:
def process_batch(batch):
    global total_tokens_used  # Declare the variable as global to modify its value
    
    # Create a new column to store the evaluation results
    batch_results = batch.apply(evaluate_text)
    
    # Check if the token limit is going to be exceeded in the next batch, assuming a maximum of 2000 tokens per request
    # if total_tokens_used + 2000 > 10000:
    #     # If the limit is going to be exceeded, pause the processing for the rest of the minute
    #     time_remaining = 60 - (time.time() % 60)
    #     print(f"Sleep for {time_remaining} sec") 
    #     time.sleep(time_remaining)
    #     total_tokens_used = 0  # Reset the total tokens used counter
    return batch_results

In [None]:
model_response_columns = ['context', 'response', 'model_response', 'model_v2_response', 'oob_model_response']
batch_size = 20

for col in model_response_columns:
    new_col_name = f"{col}_gpt4_eval"
    # Split the column data into batches
    batches = np.array_split(df_smpl[col], np.ceil(len(df_smpl[col]) / float(batch_size)))
    # Process each batch individually
    results = pd.concat([process_batch(batch) for batch in batches])
    df_smpl[new_col_name] = results
    print(f"{col} processed")

# Convert evaluation results to float values
eval_cols = [f"{col}_gpt4_eval" for col in model_response_columns]
for col in eval_cols:
    df_smpl[col] = df_smpl[col].apply(lambda value: float(value) if value.replace('.', '', 1).isdigit() else 0.0)

# Group by 'instruction' and calculate the mean of the evaluations
df_result = df_smpl.groupby(['instruction'])[eval_cols].mean().reset_index()
df_result