In [51]:
import re
from pathlib import Path

import pandas as pd
from datasets import load_dataset
# Load the 'train' split of the IELTS task-2 evaluation dataset and convert it
# to a pandas DataFrame. The split is kept under its own name so that
# `huggingface_ds` is only ever bound to a DataFrame.
ielts_train_split = load_dataset("chillies/IELTS-writing-task-2-evaluation")['train']
huggingface_ds = ielts_train_split.to_pandas()

In [52]:
# Raw evaluation text for every essay.
evaluation = huggingface_ds["evaluation"]

# Working frame starts with the two input columns; .copy() gives an independent
# frame so columns added below do not touch huggingface_ds.
task_achievement_df = huggingface_ds[['prompt', 'essay']].copy()
len(task_achievement_df)

9833

In [53]:
def extract_task_achievement(text):
    """Return the text found between the "Task Achievement" heading and the
    "Coherence and Cohesion" heading of an evaluation.

    Parameters
    ----------
    text : str
        Full evaluation text.

    Returns
    -------
    str
        The whitespace-stripped text between the first "Task Achievement"
        occurrence and the first "Coherence and Cohesion" occurrence that
        follows it. An empty string is returned when either heading is
        missing or when ``text`` is not a string.
    """
    start_keyword = "Task Achievement"
    end_keyword = "Coherence and Cohesion"

    # Missing values (None / NaN) have no section to extract.
    if not isinstance(text, str):
        return ""

    start_index = text.find(start_keyword)
    if start_index == -1:
        return ""

    section_start = start_index + len(start_keyword)
    # Search for the end heading only AFTER the start heading; an earlier
    # mention of "Coherence and Cohesion" must not truncate the section.
    end_index = text.find(end_keyword, section_start)
    if end_index == -1:
        return ""

    return text[section_start:end_index].strip()

def clean_text(text):
    """Strip markdown markers and section labels from an extracted evaluation
    section, leaving any numeric score in place.

    Parameters
    ----------
    text : str
        Section text, e.g. the value returned by ``extract_task_achievement``.

    Returns
    -------
    str
        The text with '*', '#' and '-' removed, newlines turned into spaces,
        the labels "Suggested Band Score" and "(Task Achievement)" removed,
        whitespace runs collapsed to one space, and leading/trailing
        ':', '-' and spaces stripped.
    """
    # Newlines become spaces (not nothing) so words on adjacent lines are not
    # fused together.
    cleaned_text = text.replace('\n', ' ')

    # NOTE(review): removing every '-' also drops hyphens inside words and
    # between digits; kept as in the original cleaning rules.
    for marker in ('*', '#', '-'):
        cleaned_text = cleaned_text.replace(marker, '')

    for label in ("Suggested Band Score", "(Task Achievement)"):
        cleaned_text = cleaned_text.replace(label, "")

    # Collapse the whitespace runs left behind by the removals above.
    cleaned_text = " ".join(cleaned_text.split())

    # Strip separators last, so punctuation exposed by the label removal
    # (e.g. ": 6.5 ...") is trimmed as well.
    return cleaned_text.strip(": -").strip()

In [54]:
# Extract the Task Achievement section of every evaluation, then clean it,
# in a single chained pass.
task_achievement_df['text'] = (
    huggingface_ds["evaluation"]
    .apply(extract_task_achievement)
    .apply(clean_text)
)
task_achievement_df.loc[10, 'text']

'4.5 The candidate has effectively addressed the given task by providing a clear and coherent response to the prompt. The essay adequately covers all aspects of the task, providing arguments and evidence to support both sides of the debate. The candidate demonstrates a good understanding of the topic and presents a balanced perspective.'

In [55]:
# Spot-check the cleaned text of the row labelled 20.
task_achievement_df.loc[20, 'text']

'The candidate has adequately addressed the task by providing a clear stance and supporting arguments. The ideas presented are relevant to the topic and demonstrate a basic understanding of the issue. However, there are some aspects of the task that have not been fully covered, such as exploring alternative methods of employee selection and providing specific examples to support the arguments.  : 3.5'

In [56]:
# Spot-check the cleaned text of the row labelled 30.
task_achievement_df.loc[30, 'text']

"[6]The candidate has addressed the task by discussing both the views that interviews are a reliable method of choosing whom to employ and that there are other better methods. They have provided relevant arguments and examples to support their opinion that there are other reliable methods to assess a person for a job. However, the essay could have benefited from a more explicit statement of the candidate's position in the introduction."

In [57]:
# Count rows whose extracted text is exactly the empty string.
empty_string_count = task_achievement_df['text'].eq('').sum()
print(f"Number of rows with empty '': {empty_string_count}")

Number of rows with empty '': 0


In [58]:
# Take the first number (integer or decimal) found in each text as the score;
# rows without any number become NaN.
# NOTE(review): the first number in the text is assumed to be the band score -
# confirm against the data, since a number inside the feedback prose would
# be picked up instead.
task_achievement_df["score"] = (
    task_achievement_df['text']
    .str.extract(r'(\d+\.?\d*)', expand=False)
    .astype(float)
)
task_achievement_df["score"]

0       7.0
1       5.0
2       6.5
3       6.0
4       3.5
       ... 
9828    6.5
9829    4.0
9830    8.0
9831    8.0
9832    6.5
Name: score, Length: 9833, dtype: float64

In [59]:
# Number of rows for which no score could be extracted.
nan_count = task_achievement_df['score'].isnull().sum()
nan_count

42

In [60]:
def clean_text_v2(text):
    """Clean an evaluation section and additionally remove every number from
    it, leaving only the feedback prose.

    Parameters
    ----------
    text : str
        Section text (raw, or already processed by ``clean_text``).

    Returns
    -------
    str
        The text with '*', '#' and '-' removed, newlines turned into spaces,
        bracketed scores such as "[6]" and all other numbers removed, the
        labels "Suggested Band Score" and "(Task Achievement)" removed,
        whitespace runs collapsed to one space, and leading/trailing
        ':', '-' and spaces stripped.

    Notes
    -----
    Every number is removed, including numbers that are part of the feedback
    prose itself, not just the score.
    """
    # Newlines become spaces so words on adjacent lines are not fused.
    cleaned_text = text.replace('\n', ' ')
    for marker in ('*', '#', '-'):
        cleaned_text = cleaned_text.replace(marker, '')

    # Bracketed scores must be removed BEFORE bare numbers: once the digits
    # are gone the bracket pattern can no longer match.
    cleaned_text = re.sub(r"\[\d+\.*\d*\]", "", cleaned_text)
    cleaned_text = re.sub(r"\d+\.*\d*", "", cleaned_text)

    # "[]" covers brackets that were already empty in the input.
    for label in ("Suggested Band Score", "(Task Achievement)", "[]"):
        cleaned_text = cleaned_text.replace(label, "")

    # Collapse the gaps left by the removals above.
    cleaned_text = " ".join(cleaned_text.split())

    # Strip separators last so punctuation exposed by any removal above
    # (e.g. "[6]: ..." -> ": ...") is trimmed too.
    return cleaned_text.strip(": -").strip()

In [61]:
# The score has already been captured in its own column, so the numbers can
# now be removed from the feedback text.
task_achievement_df['text'] = task_achievement_df['text'].map(clean_text_v2)

In [62]:
# Re-check the row labelled 10 after the second cleaning pass.
task_achievement_df.loc[10, 'text']

'The candidate has effectively addressed the given task by providing a clear and coherent response to the prompt. The essay adequately covers all aspects of the task, providing arguments and evidence to support both sides of the debate. The candidate demonstrates a good understanding of the topic and presents a balanced perspective.'

In [63]:
# Re-check the row labelled 20 after the second cleaning pass.
task_achievement_df.loc[20, 'text']

'The candidate has adequately addressed the task by providing a clear stance and supporting arguments. The ideas presented are relevant to the topic and demonstrate a basic understanding of the issue. However, there are some aspects of the task that have not been fully covered, such as exploring alternative methods of employee selection and providing specific examples to support the arguments.'

In [64]:
# Re-check the row labelled 30 after the second cleaning pass.
task_achievement_df.loc[30, 'text']

"The candidate has addressed the task by discussing both the views that interviews are a reliable method of choosing whom to employ and that there are other better methods. They have provided relevant arguments and examples to support their opinion that there are other reliable methods to assess a person for a job. However, the essay could have benefited from a more explicit statement of the candidate's position in the introduction."

In [65]:
# Fix the column order of the frame that will be exported.
task_achievement_df = task_achievement_df.loc[:, ['prompt', 'essay', 'text', 'score']]
task_achievement_df

Unnamed: 0,prompt,essay,text,score
0,Interviews form the basic criteria for most la...,It is believed by some experts that the tradit...,The essay effectively addresses the given task...,7.0
1,Interviews form the basic selecting criteria f...,Nowadays numerous huge firms allocate an inter...,The candidate has effectively addressed the gi...,5.0
2,Interview form the basic selection criteria fo...,The interview section is the most vital part o...,The candidate has effectively addressed the gi...,6.5
3,Interviews form the basic selection criteria f...,It is argued that the best method to recruit e...,The candidate has adequately addressed the tas...,6.0
4,Interviews from the basic selecting criteria f...,Nowadays many companies conduct interviews bef...,The essay adequately addresses the task by dis...,3.5
...,...,...,...,...
9828,Nations should spend more money on skills and ...,"These days, many countries use a huge amount o...",The essay addresses the task by discussing the...,6.5
9829,Nations should spend more money on skills and ...,Skills are required in order to achieve succes...,The candidate has not effectively addressed th...,4.0
9830,Nations should spend more money on skills and ...,There is a contradictory view among the people...,The essay adequately addresses the given task ...,8.0
9831,Nations should spend more money on skills and ...,Many today feel that countries should prioriti...,The candidate has effectively addressed the ta...,8.0


In [66]:
# Number of rows that contain a missing value in at least one column.
nan_count = task_achievement_df.isnull().any(axis='columns').sum()
nan_count

42

In [67]:
# Keep only rows that still have feedback text after cleaning.
task_achievement_df = task_achievement_df.loc[task_achievement_df['text'].ne('')]

In [68]:
# Discard rows without an extracted score, then confirm that no row with a
# missing value remains.
task_achievement_df = task_achievement_df[task_achievement_df['score'].notna()]
nan_count = task_achievement_df.isnull().any(axis='columns').sum()
nan_count

0

In [69]:
# Write the cleaned Task Achievement dataset to disk, without the index.
csv_path = 'data/Task_Achievement.csv'
# to_csv does not create missing directories, so make sure the target
# folder exists first.
Path(csv_path).parent.mkdir(parents=True, exist_ok=True)
task_achievement_df.to_csv(csv_path, index=False)

In [70]:
# Number of rows in the exported frame.
task_achievement_df.shape[0]

9791