### Step 1, grab all URLs that each URL contains one example essay, their prompt and comment that including band score

In [1]:
import requests
from bs4 import BeautifulSoup
from pprint import pprint


def crawl_and_find_element(url, target_element, target_class=None, target_id=None):
    """
    Fetches the HTML content of the specified URL and searches for the target element.

    Parameters:
    - url (str): The URL of the web page to crawl.
    - target_element (str): The HTML tag to search for (e.g., 'div', 'p', 'h1').
    - target_class (str, optional): The class attribute to filter the target element.
    - target_id (str, optional): The id attribute to filter the target element.

    Returns:
    - str: The HTML content of the target element(s) found.
    """
    try:
        # Send a GET request to the specified URL
        response = requests.get(url)
        response.raise_for_status()  # Raise an exception for HTTP errors

        # Parse the HTML content using BeautifulSoup
        soup = BeautifulSoup(response.content, 'html.parser')

        # Search for the target element with optional class and id filters
        if target_class and target_id:
            elements = soup.find_all(target_element, class_=target_class, id=target_id)
        elif target_class:
            elements = soup.find_all(target_element, class_=target_class)
        elif target_id:
            elements = soup.find_all(target_element, id=target_id)
        else:
            elements = soup.find_all(target_element)

        # Extract and return the HTML content of the found elements
        return '\n'.join(str(element) for element in elements)

    except requests.exceptions.RequestException as e:
        return f"Error fetching {url}: {e}"


In [2]:
def crawl_URL(url):
    matching_links = []
    try:
        # Send a GET request to the specified URL
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36'}
        response = requests.get(url, headers=headers)
        response.raise_for_status()  # Raise an exception for HTTP errors

        # Parse the HTML content using BeautifulSoup
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find all <a> tags with the specified 'rel' and 'target' attributes
        for link in soup.find_all('a', rel="noopener", target="_blank"):
            href = link.get('href')
            rel = link.get('rel')
            target = link.get('target')
            matching_links.append({'href': href, 'rel': rel, 'target': target})

    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
    
    return matching_links


In [3]:
score_to_urls = {}
scores = [5, 6, 7, 8, 9]
for score in scores:
    url = f"https://www.ielts-blog.com/category/ielts-writing-samples/ielts-essays-band-{score}/"
    raw_data = crawl_URL(url)
    links = [each['href'] for each in raw_data]
    score_to_urls[score] = links

In [4]:
pprint(score_to_urls)


{5: ['https://www.ielts-blog.com/ielts-writing-samples/ielts-essays-band-5/ielts-essay-topic-capital-punishment/',
     'https://www.ielts-blog.com/ielts-writing-samples/ielts-essays-band-5/ielts-essay-topic-education-with-or-without-a-teacher/',
     'https://www.ielts-blog.com/ielts-writing-samples/ielts-essays-band-5/ielts-essay-topic-popular-hobby-rather-than-favorite-passtime/',
     'https://www.ielts-blog.com/ielts-writing-samples/ielts-essays-band-5/ielts-essay-topic-university-money-better-spent-on-libraries-or-sports/',
     'https://www.ielts-blog.com/ielts-writing-samples/ielts-essays-band-5/ielts-essay-topic-modern-medicine-helps-to-live-longer/',
     'https://www.ielts-blog.com/ielts-writing-samples/ielts-essays-band-5/ielts-essay-topic-mothers-and-fathers-role-in-a-family/',
     'https://www.ielts-blog.com/ielts-writing-samples/ielts-essays-band-5/ielts-essay-topic-keeping-pets-to-live-a-more-enjoyable-life/',
     'https://www.ielts-blog.com/ielts-writing-samples/ielt

In [5]:
# https://www.ielts-blog.com/category/ielts-writing-samples/ielts-essays-band-6/ is diff, treat accordingly and add to dict score_to_urls
def crawl_URL_for_band_6(url="https://www.ielts-blog.com/category/ielts-writing-samples/ielts-essays-band-6/"):
    matching_links = []
    try:
        # Send a GET request to the specified URL
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36'}
        response = requests.get(url, headers=headers)
        response.raise_for_status()  # Raise an exception for HTTP errors

        # Parse the HTML content using BeautifulSoup
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find all <a> tags with the specified 'rel' and 'target' attributes
        for link in soup.find_all('a'):
            href = link.get('href')
            matching_links.append({'href': href})

    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
    
    return matching_links

In [6]:
URLS = crawl_URL_for_band_6()

In [7]:
pprint(URLS)

[{'href': '#content'},
 {'href': 'https://www.ielts-blog.com/'},
 {'href': 'https://www.ielts-blog.com/ielts-preparation-services/'},
 {'href': 'https://www.ielts-blog.com/check-your-ielts-writing/'},
 {'href': 'https://test.ielts-blog.com'},
 {'href': 'https://www.ielts-blog.com/ielts-preparation-highly-effective-and-with-proven-results/'},
 {'href': 'https://www.ielts-blog.com/improve-your-ielts-speaking/'},
 {'href': 'https://www.ielts-blog.com/download-free-ielts-general-academic-study-books/'},
 {'href': 'https://www.ielts-blog.com/ace-the-ielts-offer-page/'},
 {'href': 'https://www.ielts-blog.com/ielts-target-band-7-offer-page/'},
 {'href': 'https://www.ielts-blog.com/estimate-your-band-score/'},
 {'href': 'https://www.ielts-blog.com/ielts-success-formula/'},
 {'href': 'https://www.ielts-blog.com/ielts-writing-samples-essays-letters-reports/'},
 {'href': 'https://www.ielts-blog.com/ielts-writing-samples-essays-letters-reports/'},
 {'href': 'https://www.ielts-blog.com/category/iel

In [8]:
# notice all wanted URL has common prefix

common_prefix = "https://www.ielts-blog.com/ielts-writing-samples/ielts-essays-band-6/"
clean_data_band6 = []
for each in URLS:
    the_url = each['href']
    if the_url and common_prefix in the_url:
        clean_data_band6.append(the_url.strip())

pprint(clean_data_band6)

['https://www.ielts-blog.com/ielts-writing-samples/ielts-essays-band-6/ielts-essay-topic-globalization/',
 'https://www.ielts-blog.com/ielts-writing-samples/ielts-essays-band-6/ielts-essay-topic-leisure-time-activities/',
 'https://www.ielts-blog.com/ielts-writing-samples/ielts-essays-band-6/ielts-essay-topic-financial-education/',
 'https://www.ielts-blog.com/ielts-writing-samples/ielts-essays-band-6/ielts-essay-learning-about-past/',
 'https://www.ielts-blog.com/ielts-writing-samples/ielts-essays-band-6/ielts-essay-topic-education-critical-factor/',
 'https://www.ielts-blog.com/ielts-writing-samples/ielts-essays-band-6/ielts-essay-topic-reasons-to-attend-college/',
 'https://www.ielts-blog.com/ielts-writing-samples/ielts-essays-band-6/ielts-essay-topic-dieting-changes-a-persons-life/',
 'https://www.ielts-blog.com/ielts-writing-samples/ielts-essays-band-6/ielts-essay-topic-aspects-of-globalization/',
 'https://www.ielts-blog.com/ielts-writing-samples/ielts-essays-band-6/ielts-essay-t

In [9]:
score_to_urls[6] = clean_data_band6
url_count = 0
for each in score_to_urls:
    url_count += len(score_to_urls[each])
print(f"found {url_count} example essays from band score 5-9")

found 210 example essays from band score 5-9


In [10]:
pprint(score_to_urls)

{5: ['https://www.ielts-blog.com/ielts-writing-samples/ielts-essays-band-5/ielts-essay-topic-capital-punishment/',
     'https://www.ielts-blog.com/ielts-writing-samples/ielts-essays-band-5/ielts-essay-topic-education-with-or-without-a-teacher/',
     'https://www.ielts-blog.com/ielts-writing-samples/ielts-essays-band-5/ielts-essay-topic-popular-hobby-rather-than-favorite-passtime/',
     'https://www.ielts-blog.com/ielts-writing-samples/ielts-essays-band-5/ielts-essay-topic-university-money-better-spent-on-libraries-or-sports/',
     'https://www.ielts-blog.com/ielts-writing-samples/ielts-essays-band-5/ielts-essay-topic-modern-medicine-helps-to-live-longer/',
     'https://www.ielts-blog.com/ielts-writing-samples/ielts-essays-band-5/ielts-essay-topic-mothers-and-fathers-role-in-a-family/',
     'https://www.ielts-blog.com/ielts-writing-samples/ielts-essays-band-5/ielts-essay-topic-keeping-pets-to-live-a-more-enjoyable-life/',
     'https://www.ielts-blog.com/ielts-writing-samples/ielt

**Step 1 done**
### Step 2: extract useful information from each URL
At this point found 200+ URL each contain an essay example (prompt + essay + comment). 
The organization is messy, need to do a lot of work of cleaning.

##### Experiment cell are closed up. They are useless

In [38]:



def extract_paragraphs_between_divs(url, start_div_class, end_div_class):
    """
    Fetches the HTML content of the specified URL and extracts all <p> elements
    located between the first occurrence of start_div_class and end_div_class.

    Parameters:
    - url (str): The URL of the web page to fetch.
    - start_div_class (str): The class attribute of the starting <div>.
    - end_div_class (str): The class attribute of the ending <div>.

    Returns:
    - list: A list of strings, each representing the text content of a <p> element.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36'
    }

    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()  # Raises an HTTPError for bad responses

        soup = BeautifulSoup(response.content, 'html.parser')

        # Find the starting and ending <div> elements
        start_div = soup.find('div', class_=start_div_class)
        end_div = soup.find('div', class_=end_div_class)

        if not start_div or not end_div:
            print("One or both of the specified <div> elements were not found.")
            return []

        # Initialize a list to hold the text of <p> elements
        paragraphs = []

        # Traverse the elements between start_div and end_div
        current_element = start_div.find_next_sibling()
        while current_element and current_element != end_div:
            if isinstance(current_element, Tag) and current_element.name == 'p':
                paragraphs.append(current_element.get_text(strip=True))
            current_element = current_element.find_next_sibling()

        return paragraphs

    except requests.exceptions.HTTPError as http_err:
        print(f"HTTP error occurred: {http_err}")
        return []
    except Exception as err:
        print(f"An error occurred: {err}")
        return []

In [32]:
# url = 'https://www.ielts-blog.com/ielts-writing-samples/ielts-essays-band-9/ielts-essay-topic-do-schools-still-need-to-teach-handwriting-and-mental-mathematics-skills/'
# div_class = 'nv-content-wrap entry-content'
# content = fetch_div_content(url, div_class)


In [42]:

url = 'https://www.ielts-blog.com/ielts-writing-samples/ielts-essays-band-9/ielts-essay-topic-do-schools-still-need-to-teach-handwriting-and-mental-mathematics-skills/'
start_div_classes = [
    'nv-content-wrap', 
    'entry-content'
]
end_div_classes = [
        'yarpp',
        'yarpp-related',
        'yarpp-related-website',
        'yarpp-related-none',
        'yarpp-template-list'
    ]

paragraphs = extract_paragraphs_between_divs(url, start_div_classes, end_div_classes)


In [43]:
print(paragraphs)
for para in paragraphs:
    print(para)

[]


##### Useful from here

In [85]:
from bs4 import Tag
import re

In [13]:
def url_find_text_and_clean(url):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36'}
    response = requests.get(url, headers=headers)
    response.raise_for_status()  # Raises an HTTPError for bad responses

    soup = BeautifulSoup(response.content, 'html.parser')
    start_div_class = 'nv-content-wrap entry-content'
    start_div = soup.find('div', class_=start_div_class)
    print(start_div)

    # Step 1: remove div which class is tippy. Useless to our task.
    for div in start_div.find_all('div', class_='tippy'):
        div.decompose()  # Removes the tag from the tree

    # Step 2: find end div. This div tells where prompt + essay + comment ends. everything from there is useless to our task.
    end_div = start_div.find('div', class_=['yarpp', 'yarpp-related', 'yarpp-related-website', 'yarpp-related-none', 'yarpp-template-list'])
    if end_div:
        # Remove all siblings after end_div, including end_div itself
        for sibling in end_div.find_next_siblings():
            sibling.decompose()
        end_div.decompose()  # Remove end_div itself

    # Step 3: get rid of HTML elements like <a> and <strong>, extract all text
    text_content = start_div.get_text(separator=" ").strip()

    # Step 4: bad char exists in extract text due to different encoding methods. replace all with correct char
    bad_char = {
        "‘": "'",
        "’": "'",
        "–": "-",
        "   ": " ",
        "  ": " ",
        "\n ": "\n",
    }
    for bad in bad_char:
        text_content = text_content.replace(bad, bad_char[bad])

    return text_content

In [14]:
url = 'https://www.ielts-blog.com/ielts-writing-samples/ielts-essays-band-5/ielts-essay-topic-popular-hobby-rather-than-favorite-passtime/'
print(url_find_text_and_clean(url))

<div class="nv-content-wrap entry-content"><p>Nowadays people like to change their day by day activities according to the latest trends and also they are following popular things <a id="tippy_tip0_7272_anchor"></a>. This essay will explain the reason why the people are spending more time<br/>
<a id="tippy_tip1_6985_anchor"></a>popular hobbies rather than their individual activities.<br/>
<div class="g g-1"><div class="g-single a-11"><a class="gofollow" data-track="MTEsMSwxLDYw" href="https://vip.ielts-blog.com/?utm_source=ielts-blog&amp;utm_medium=adrotate&amp;utm_campaign=vipclub_ad1"><img data-lazy-src="https://www.ielts-blog.com/images/banners/vip-club-banner-try-free.jpg" decoding="async" height="280" src="data:image/svg+xml,%3Csvg%20xmlns='http://www.w3.org/2000/svg'%20viewBox='0%200%20336%20280'%3E%3C/svg%3E" width="336"/><noscript><img decoding="async" height="280" src="https://www.ielts-blog.com/images/banners/vip-club-banner-try-free.jpg" width="336"/></noscript></a></div></di

### Step 3: Crawl essays from all URLs
now we have a function that take single URL and extract raw text from URL, cleaned, no HTML tags included, but no seperation, prompt, essay, comment are together. Consider human labour

In [35]:
import csv
import time
import random
from tqdm import tqdm

In [97]:
data = [
    ["score", "raw_data"]
]

total = url_count
with tqdm(total=total) as pbar:
    for score in score_to_urls:
        for url in score_to_urls[score]:
            # print(score, url)
            data.append([score, url_find_text_and_clean(url)])
            time.sleep(random.uniform(0, 3))  # avoid banned by firewall 
            pbar.update(1)

100%|██████████| 210/210 [05:46<00:00,  1.65s/it]


In [99]:
pprint(data)

[['score', 'raw_data'],
 [5,
  'Without capital punishment our lives are less secure and crimes or violence '
  'increase. To what extent do you agree or disagree with this opinion? \n'
  'Serious crimes need capital punishment so that the are unable to get '
  'involved in the crime in the future. However, they want to stop the acts of '
  'violation in the future then it would be better to . \n'
  '\n'
  'Overall, I agree with the fact that punishment is the way to avoid the and '
  'hence our lives become more secure. If the wrongdoer wants to be a good and '
  'there is a particular financial or personal problem that led him to the '
  'wrong way, then it would be the nice option to forgive him and try to solve '
  'the problem he . Although by this way, some bad may become effective of the '
  'society but some do not bring themselves to the right path because they are '
  '. The person that involved in the crime and never to stop the law-breaking '
  'act should be punished in ex

In [102]:
with open('output.csv', 'w', newline='') as file:
    writer = csv.writer(file, delimiter=';')
    writer.writerows(data)

### playground

In [15]:
for each in score_to_urls:
    print(f"band score {each}: {len(score_to_urls[each])} entries")

band score 5: 8 entries
band score 6: 22 entries
band score 7: 18 entries
band score 8: 109 entries
band score 9: 53 entries


### Step 4: Advance cleaning. Clean useless commertial text in data like "click here for more band score x essays"
Used Google sheets to help cleaning

### Step 5: manual seperation of band 5 6 7
Used Google sheets to help 


### Step 6: Regex to match band 8 and 9 patterns

band 8:

In [18]:
band_8_url = score_to_urls[8]
pprint(band_8_url)

['https://www.ielts-blog.com/ielts-writing-samples/ielts-essays-band-8/ielts-essay-topic-nowadays-celebrities-earn-more-money-than-politicians-what-are-the-reasons-for-this/',
 'https://www.ielts-blog.com/ielts-writing-samples/ielts-essays-band-8/ielts-essay-topic-celebrities-can-be-poor-role-models-for-teenagers-agree-disagree/',
 'https://www.ielts-blog.com/ielts-writing-samples/ielts-essays-band-8/ielts-essay-topic-why-do-criminals-commit-another-offence-after-being-punished/',
 'https://www.ielts-blog.com/ielts-writing-samples/ielts-essays-band-8/ielts-essay-topic-crime-rates-decline-due-to-advancements-technology-agree-disagree/',
 'https://www.ielts-blog.com/ielts-writing-samples/ielts-essays-band-8/ielts-essay-topic-if-child-commits-crime-who-should-be-punished-opinion/',
 'https://www.ielts-blog.com/ielts-writing-samples/ielts-essays-band-8/ielts-essay-topic-criminals-commit-crimes-after-release-from-prison-reasons-solutions/',
 'https://www.ielts-blog.com/ielts-writing-samples

In [121]:
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def get_div_content(url):
    # Set up Chrome options for headless mode
    chrome_options = Options()
    # chrome_options.add_argument("--headless")  # Run in headless mode
    chrome_options.add_argument("--disable-gpu")  # Disable GPU for headless mode
    chrome_options.add_argument("--no-sandbox")  # For environments without sandbox
    chrome_options.add_argument("--disable-dev-shm-usage")  # Avoid shared memory issues

    # Set up the WebDriver
    driver = webdriver.Chrome(options=chrome_options)
    
    try:
        # Navigate to the URL
        driver.get(url)
        
        # Wait for the content to load
        wait = WebDriverWait(driver, 10)  # Wait up to 10 seconds
        wait.until(
            EC.presence_of_element_located((By.CLASS_NAME, "nv-content-wrap"))
        )
        time.sleep(1)

        # Get the content of the target div
        div_content = driver.find_element(By.CLASS_NAME, "nv-content-wrap").get_attribute("outerHTML")
        return div_content
    
    finally:
        # Close the browser
        driver.quit()

In [136]:
def clean_text(text):
    bad_char = {
        "‘": "'",
        "’": "'",
        "–": "-",
        "   ": " ",
        "  ": " ",
        "\n ": "\n",
    }
    for bad in bad_char:
        text = text.replace(bad, bad_char[bad])
    return text
    

def content_cleaner(content):
    # headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36'}
    # response = requests.get(url, headers=headers)
    # response.raise_for_status()  # Raises an HTTPError for bad responses

    # soup = BeautifulSoup(response.content, 'html.parser')
    start_div_class = 'nv-content-wrap entry-content'
    start_div = content.find('div', class_=start_div_class)
    # print(start_div)

    # Step 1: remove div which class is tippy. Useless to our task.
    for div in start_div.find_all('div', class_='tippy'):
        div.decompose()  # Removes the tag from the tree

    # Step 2: find end div. This div tells where prompt + essay + comment ends. everything from there is useless to our task.
    end_div = start_div.find('div', class_=['yarpp', 'yarpp-related', 'yarpp-related-website', 'yarpp-related-none', 'yarpp-template-list'])
    if end_div:
        # Remove all siblings after end_div, including end_div itself
        for sibling in end_div.find_next_siblings():
            sibling.decompose()
        end_div.decompose()  # Remove end_div itself
    # print(start_div)
    
    result = []
    if start_div:
        # Iterate through all child elements within the start div
        for element in start_div.find_all(True):  # find_all(True) gets all tags
            # Check if the element has no child elements (leaf node)
            if not element.find_all(True) or element.name in ['strong', 'p', 'a']:
                text = element.get_text(strip=False)  # Get the text content of the element
                if text:  # Only process elements that have non-empty text
                    # print(f"Element: {element.name}, Text: {text}")
                    
                    result.append((element.name, clean_text(text)))
    return result
    # Step 3: get rid of HTML elements like <a> and <strong>, extract all text
    text_content = start_div.get_text(separator=" ").strip()

    # Step 4: bad char exists in extract text due to different encoding methods. replace all with correct char
    

    return text_content

def state_machine_process(contents):
    skip_key_words = [
        "Sample Band",
        "Sample Essay", 
        "Download the",
        "Band 8",
    ]
    state_machine = ['strong', 'essay', 'done']
    state = 0
    prompt, essay, comment = "", "", ""
    for element, text in contents:
        for each in skip_key_words:
            if text.startswith(each):
                continue
        # print(state, element, text[:100])
        if state == 0:  # before strong
            if element == "strong":
                prompt = text
                state += 1
        elif state == 1:  # strong
            if element == "p":
                essay += text + "\n"
            elif element == "strong":
                comment = text
                state += 1
        
        elif state == 2:  # essay
            break
        else:
            print("wrong")

    return prompt, essay, comment

In [107]:
url = "https://www.ielts-blog.com/ielts-writing-samples/ielts-essays-band-8/ielts-essay-topic-the-development-of-technology-causes-traditional-skills-to-die-out-agree-or-disagree/"
content = get_div_content(url)
soup = BeautifulSoup(content, "html.parser")

In [108]:
contents = content_cleaner(soup)
# pprint(contents)
pprint(state_machine_process(contents))

('When a country develops its technology, the traditional skills and ways of '
 'life die out. It is pointless to try and keep them alive.To what extent do '
 'you agree or disagree with this opinion?',
 'Nowadays, technological advances and their rapid and wide applications are '
 "having a significant impact on a nation's traditional skills and ways of "
 'life. Some argue that such impact is so extraordinary that it would make '
 'conventional skills and life styles obsolete. However, I believe they would '
 'continue to thrive by providing alternatives to modern ways of life, and '
 'innovative ideas for modern technologies.\n'
 'First of all, traditional skills and ways of life are becoming an '
 'alternative solution to the problems caused by “mainstreamed” ways of life '
 'which are greatly influenced by modern technologies. For instance, a cozy '
 'restaurant where traditional, home-brewed beer is served, offers another '
 'experience to people who are bored with branded beers 

**Now we have everything to retrieve everything from every band 8 examples, run all band 8**, and store to csv file.

In [109]:
print(len(band_8_url))

109


In [112]:
data = [
    ["score", "prompt", "essay", "comment", "URL"]
]

total = len(band_8_url)
with tqdm(total=total) as pbar:
    for url in band_8_url:
        # print(score, url)
        # url = "https://www.ielts-blog.com/ielts-writing-samples/ielts-essays-band-8/ielts-essay-topic-the-development-of-technology-causes-traditional-skills-to-die-out-agree-or-disagree/"
        content = get_div_content(url)
        soup = BeautifulSoup(content, "html.parser")

        contents = content_cleaner(soup)
        # pprint(contents)
        # pprint(state_machine_process(contents))
        prompt, essay, comment = state_machine_process(contents)
        
        data.append([8, prompt, essay, comment, url])
        time.sleep(random.uniform(0, 3))  # avoid banned by firewall 
        pbar.update(1)

 34%|███▍      | 37/109 [07:25<14:26, 12.03s/it]


SessionNotCreatedException: Message: session not created
from chrome not reachable
Stacktrace:
	GetHandleVerifier [0x00007FF601DB38A5+3004357]
	(No symbol) [0x00007FF601A49970]
	(No symbol) [0x00007FF6018F5659]
	(No symbol) [0x00007FF6018E729D]
	(No symbol) [0x00007FF6019332A2]
	(No symbol) [0x00007FF60192D56F]
	(No symbol) [0x00007FF601928DBE]
	(No symbol) [0x00007FF601976E01]
	(No symbol) [0x00007FF601976480]
	(No symbol) [0x00007FF60196B983]
	(No symbol) [0x00007FF601937628]
	(No symbol) [0x00007FF601938791]
	GetHandleVerifier [0x00007FF601DDA00D+3161901]
	GetHandleVerifier [0x00007FF601E2E060+3506048]
	GetHandleVerifier [0x00007FF601E2400D+3465005]
	GetHandleVerifier [0x00007FF601BA0EEB+830987]
	(No symbol) [0x00007FF601A5467F]
	(No symbol) [0x00007FF601A509D4]
	(No symbol) [0x00007FF601A50B6D]
	(No symbol) [0x00007FF601A40149]
	BaseThreadInitThunk [0x00007FFF72C2259D+29]
	RtlUserThreadStart [0x00007FFF74F4AF38+40]


**37 examples are successfully retrieved before the module crushed. Safe and munual cleanup**

In [116]:
print(len(data))
with open('band8half.csv', 'w', newline='') as file:
    writer = csv.writer(file, delimiter=';')
    writer.writerows(data)

38


In [119]:
re_crawl =[
    "https://www.ielts-blog.com/ielts-writing-samples/ielts-essays-band-8/ielts-essay-topic-in-the-past-people-wore-their-traditional-clothes-but-these-days-most-people-wear-similar-clothes-is-this-a-positive-or-negative-development/",
    "https://www.ielts-blog.com/ielts-writing-samples/ielts-essays-band-8/ielts-essay-topic-some-think-that-children-should-leave-their-family-home-as-soon-as-possible-while-others-believe-they-should-stay-for-as-long-as-they-like-discuss/",
]

band_8_url_cont = re_crawl + band_8_url[37:]

print(len(band_8_url_cont))

74


In [123]:
data2 = [
    ["score", "prompt", "essay", "comment", "URL"]
]

total = len(band_8_url_cont)
with tqdm(total=total) as pbar:
    for url in band_8_url:
        # print(score, url)
        # url = "https://www.ielts-blog.com/ielts-writing-samples/ielts-essays-band-8/ielts-essay-topic-the-development-of-technology-causes-traditional-skills-to-die-out-agree-or-disagree/"
        content = get_div_content(url)
        soup = BeautifulSoup(content, "html.parser")

        contents = content_cleaner(soup)
        # pprint(contents)
        # pprint(state_machine_process(contents))
        prompt, essay, comment = state_machine_process(contents)
        
        data.append([8, prompt, essay, comment, url])
        time.sleep(random.uniform(0, 3))  # avoid banned by firewall 
        pbar.update(1)

109it [18:50, 10.37s/it]                       


In [135]:
print(data[35 + 35 + 3][4])
new_data = [data[0]] + data[35 + 35 + 3:]

print(new_data[1])

https://www.ielts-blog.com/ielts-writing-samples/ielts-essays-band-8/ielts-essay-topic-childcare-training-courses-should-be-mandatory-for-all-parents-agree-disagree/
[8, 'Childcare training courses should be mandatory for all parents. Do you agree or disagree with this statement? Give your own opinion and include relevant examples.', "It is true that parents need some knowledge and experience in order to raise their in an appropriate way. In this regard, many families attend different training courses. While I agree that these courses might be useful for some of them, I totally disagree with the recommended obligatory of such training.\nOn the one hand, training courses usually offered by governments could be very useful for those guardians who have a with special needs. For example, if a child has a birth defect or has been diagnosed with any mental or musculoskeletal disorders, these kinds of training could be lifesaving. Additionally, as we know, the first few months after birth are

In [133]:
print(data[35 + 35 + 3][4])
new_data = [data[0]] + data[35 + 35 + 3:]
# pprint(new_data)

print(len(new_data))
with open('band8secondhalf.csv', 'w', newline='') as file:
    writer = csv.writer(file, delimiter=';')
    writer.writerows(new_data)

https://www.ielts-blog.com/ielts-writing-samples/ielts-essays-band-8/ielts-essay-topic-childcare-training-courses-should-be-mandatory-for-all-parents-agree-disagree/
75


band 9:

In [38]:
import re
import pandas as pd


pattern = ".*(Write about the following topic: )(.*)(Sample Band 9 Essay )(.*)(Go here for more IELTS Band 9 Essays)"

file_path = 'data_v1.csv'
csv_read = pd.read_csv(file_path, delimiter=';')
data = [
    ['score', 'prompt', 'essay', 'comment']
]
bad_collection = []
good, bad = 0, 0
try:
    for index, row in csv_read.iterrows():
        # print(f"Row {index + 1}:")
        # for column_name, item in row.items():  # Access each item in the row
        #     print(f"  {column_name}: {item}")
        score = row['score']
        if score != 9:
            continue
        raw = row['raw_data']
        match = re.match(pattern, raw, re.DOTALL)  # Check if it matches the pattern
            
        if match:
            # Print group 2 and group 4 if there's a match
            # print(f"Group 2: {match.group(2)}, Group 4: {match.group(4)}")
            data.append([9, match.group(2), match.group(4), "N/A"])
            good += 1
        else:
            # print(f"Row {index + 1}: No match")
            bad += 1
            bad_collection.append(raw)
except Exception as e:
    print(f"An error occurred: {e}")
    
print(f"{good} good and {bad} bad")

32 good and 21 bad


In [37]:
# We successfully match 32 examples from band 9 collection, store them up
with open('band9_1.csv', 'w', newline='') as file:
    writer = csv.writer(file, delimiter=';')
    writer.writerows(data)

In [40]:
for each in bad_collection:
    print(each)
    break

This is a model response to a Writing Task 2 topic from High Scorer's Choice IELTS Practice Tests book series (reprinted with permission). This answer is likely to score IELTS Band 9. 
Set 1 Academic book, Practice Test 1 
Writing Task 2 
You should spend about 40 minutes on this task. 

Write about the following topic: 
Today's schools should teach their students how to survive financially in the world today. To what extent do you agree or disagree with this statement? 
Give reasons for your answer and include any relevant examples from your knowledge or experience. 

You should write at least 250 words. 

Answer 
In schools, students learn to analyse literature, calculate using trigonometry and understand how photosynthesis works, but often students are inexperienced and helpless after graduation when encountering the real world. It is therefore argued that schools should teach their students how to survive financially in the world. 
This topic is difficult, though. Many educators be

In [50]:
# band 9 batch 2
pattern2 = "(.*)(Write about the following topic: )(.*)(Answer )(.*)(Go here for more IELTS Band 9 Essays)"
data_b2 = [
    ['score', 'prompt', 'essay', 'comment']
]
bad_collection2 = []
good, bad = 0, 0
try:
    for text in bad_collection:
        # print(text)
        # print(f"Row {index + 1}:")
        # for column_name, item in row.items():  # Access each item in the row
        #     print(f"  {column_name}: {item}")
        match = re.match(pattern2, text, re.DOTALL)  # Check if it matches the pattern
            
        if match:
            # Print group 2 and group 4 if there's a match
            # print(f"Group 2: {match.group(2)}, Group 4: {match.group(4)}")
            data_b2.append([9, match.group(3).strip(), match.group(5).strip(), "N/A"])
            good += 1
        else:
            # print(f"Row {index + 1}: No match")
            bad += 1
            bad_collection2.append(text)
except Exception as e:
    print(f"An error occurred: {e}")
    
print(f"{good} good and {bad} bad")

21 good and 0 bad


In [54]:
# We successfully match 32 examples from band 9 collection, store them up
with open('band9_2.csv', 'w', newline='') as file:
    writer = csv.writer(file, delimiter=';')
    writer.writerows(data_b2)