In [25]:
import pandas as pd
from datasets import load_dataset
import re
huggingface_ds = load_dataset("chillies/IELTS-writing-task-2-evaluation")['train']
huggingface_ds = pd.DataFrame(huggingface_ds)

In [26]:
evaluation = huggingface_ds["evaluation"]
evaluation.head()
Coherence_df = pd.DataFrame()
Coherence_df['prompt'] = huggingface_ds['prompt']
Coherence_df['essay'] = huggingface_ds['essay']
len(Coherence_df)

9833

In [27]:
def extract(text):
    start_keyword = "Coherence and Cohesion"
    end_keyword = "Lexical Resource"
    start_index = text.find(start_keyword)
    end_index = text.find(end_keyword)
    if start_index != -1 and end_index != -1:
        return text[start_index + len(start_keyword):end_index].strip()
    else:
        return ""

def clean_text(text):
    cleaned_text = text.replace('*', '').replace('#', '').replace('\n', '').replace('-', '').replace('*', '').strip()
    cleaned_text = cleaned_text.strip(": -")
    cleaned_text = cleaned_text.replace("Suggested Band Score", "").strip()
    cleaned_text = cleaned_text.replace("(Coherence and Cohesion)", "").strip()
    return cleaned_text

In [28]:
Coherence_df['text'] = huggingface_ds["evaluation"].apply(extract)
Coherence_df['text'] = Coherence_df['text'].apply(clean_text)
Coherence_df['text'][10]

'4.0 The essay is wellorganized and easy to follow, with clear transitions between sentences and paragraphs. The use of connecting words and phrases helps maintain a smooth progression of ideas. The information is presented in a logical sequence, with each paragraph building on the previous one.'

In [29]:
Coherence_df['text'][20]

'The essay lacks a clear structure and organization, with ideas presented in a disjointed manner. Transitions between sentences and paragraphs are often abrupt and unclear. The logical sequence of information is not always evident, making the essay difficult to follow.  : 2.5'

In [30]:
Coherence_df['text'][30]

'[6]The essay is generally wellorganized and easy to follow. The candidate has used transition words and phrases to connect sentences and paragraphs, and the ideas flow logically. However, there are some instances where the transitions could be smoother, and the essay could benefit from more explicit signposting to guide the reader through the different sections.'

In [31]:
empty_string_count = (Coherence_df['text'] == '').sum()
print(f"Number of rows with empty '': {empty_string_count}")

Number of rows with empty '': 0


In [32]:
Coherence_df["score"] = Coherence_df['text'].str.extract(r'(\d+\.?\d*)').astype(float)
Coherence_df["score"]

0       7.5
1       4.5
2       6.5
3       5.5
4       3.0
       ... 
9828    6.5
9829    4.0
9830    7.5
9831    8.5
9832    6.0
Name: score, Length: 9833, dtype: float64

In [33]:
nan_count = Coherence_df['score'].isna().sum()
nan_count

43

In [34]:
def clean_text_v2(text):
    cleaned_text = text.replace('*', '').replace('#', '').replace('\n', '').replace('-', '').replace('*', '').strip()
    cleaned_text = re.sub(r"\d+\.*\d*", "", cleaned_text).strip()
    cleaned_text = re.sub(r"\[\d+\.*\d*\]", "", cleaned_text).strip()
    cleaned_text = cleaned_text.strip(": -")
    cleaned_text = cleaned_text.replace("Suggested Band Score", "").strip()
    cleaned_text = cleaned_text.replace("(Coherence and Cohesion)", "").strip()
    cleaned_text = cleaned_text.replace("[]", "").strip()
    return cleaned_text

In [35]:
Coherence_df['text'] = Coherence_df['text'].apply(clean_text_v2)

In [36]:
Coherence_df['text'][10]

'The essay is wellorganized and easy to follow, with clear transitions between sentences and paragraphs. The use of connecting words and phrases helps maintain a smooth progression of ideas. The information is presented in a logical sequence, with each paragraph building on the previous one.'

In [37]:
Coherence_df['text'][20]

'The essay lacks a clear structure and organization, with ideas presented in a disjointed manner. Transitions between sentences and paragraphs are often abrupt and unclear. The logical sequence of information is not always evident, making the essay difficult to follow.'

In [38]:
Coherence_df['text'][30]

'The essay is generally wellorganized and easy to follow. The candidate has used transition words and phrases to connect sentences and paragraphs, and the ideas flow logically. However, there are some instances where the transitions could be smoother, and the essay could benefit from more explicit signposting to guide the reader through the different sections.'

In [39]:
Coherence_df = Coherence_df[['prompt', 'essay', 'text', 'score']]
Coherence_df

Unnamed: 0,prompt,essay,text,score
0,Interviews form the basic criteria for most la...,It is believed by some experts that the tradit...,The essay is wellorganized and easy to follow....,7.5
1,Interviews form the basic selecting criteria f...,Nowadays numerous huge firms allocate an inter...,The essay is wellorganized and easy to follow....,4.5
2,Interview form the basic selection criteria fo...,The interview section is the most vital part o...,The essay is wellorganized and presents a logi...,6.5
3,Interviews form the basic selection criteria f...,It is argued that the best method to recruit e...,"The essay lacks a clear and logical structure,...",5.5
4,Interviews from the basic selecting criteria f...,Nowadays many companies conduct interviews bef...,The essay lacks coherence and cohesion. Ideas ...,3.0
...,...,...,...,...
9828,Nations should spend more money on skills and ...,"These days, many countries use a huge amount o...","The essay is generally coherent and cohesive, ...",6.5
9829,Nations should spend more money on skills and ...,Skills are required in order to achieve succes...,The essay lacks clarity and fluidity in transi...,4.0
9830,Nations should spend more money on skills and ...,There is a contradictory view among the people...,Transitions between sentences and paragraphs a...,7.5
9831,Nations should spend more money on skills and ...,Many today feel that countries should prioriti...,The essay flows smoothly with effective transi...,8.5


In [40]:
nan_count = Coherence_df.isna().any(axis=1).sum()
nan_count

43

In [41]:
empty_string_count = (Coherence_df['text'] == '').sum()
print(f"Number of rows with empty '': {empty_string_count}")
Coherence_df = Coherence_df[Coherence_df['text'] != '']

Number of rows with empty '': 0


In [42]:
Coherence_df = Coherence_df.dropna(subset=['score'])
nan_count = Coherence_df.isna().any(axis=1).sum()
nan_count

0

In [43]:
len(Coherence_df)

9790

In [44]:
csv_path = 'data/Coherence.csv'
Coherence_df.to_csv(csv_path, index=False)