# Scrapping Kwayedza for Shona dataset
I will be scrapping [https://www.kwayedza.co.zw/](https://www.kwayedza.co.zw/) to build a Shona language dataset which I will use to build the vocabulary for
the autocompletion web application which I will call **ShonaSense**.

In [1]:
# Required libraries
from bs4 import BeautifulSoup
import requests

In [2]:
DATASETS_DIR = "../datasets/"

In [9]:
# blog categories on kwayedza.co.zw
blog_categories = [
    "https://www.kwayedza.co.zw/category/nhau-dzemuno/",
]

In [10]:
# Get all blog links in each category
def get_blog_links(categories):
    links = set()

    for category in categories:
        try:
            res = requests.get(category)
            res.raise_for_status()  # Raise an exception for HTTP errors (e.g., 404)

            soup = BeautifulSoup(res.text, 'html.parser')
            news_cards = soup.find_all('div', attrs={'itemtype': 'http://schema.org/NewsArticle'})
            headings = [card.find('h3', attrs={'class': 'entry-title list-article-title'}) for card in news_cards]
            links.update([heading.a['href'] for heading in headings])

        except requests.exceptions.RequestException as e:
            # Handle HTTP request errors (e.g., connection error)
            print(f"Error making HTTP request for {category}: {e}")
            continue
        except Exception as e:
            # Handle other exceptions (e.g., parsing error)
            print(f"Error parsing {category}: {e}")
            continue

    return list(links)


blog_links = get_blog_links(blog_categories)
blog_links[:5]

['https://www.kwayedza.co.zw/psl-yoshora-mhirizhonga-munhabvu/',
 'https://www.kwayedza.co.zw/musungwa-akarova-gadhijeri-mucourt/',
 'https://www.kwayedza.co.zw/bishop-wekukiira-mwana-mumba-atiza/',
 'https://www.kwayedza.co.zw/vanhu-12-vasungirwa-mhirizhonga-yekubabourfields/',
 'https://www.kwayedza.co.zw/vechidiki-musangogara-zvishandirei/']

In [11]:
# How many blog links were scrapped?
len(blog_links)

20

In [12]:
# Save links to file (checkpoint)
def save_links_to_file(links, file_path):
    try:
        with open(file_path, "w") as file:
            for link in links:
                file.write(link + "\n")
        print("Links have been saved to", file_path)
    except Exception as e:
        print("An error occurred:", str(e))

file_path = f"{DATASETS_DIR}/blog_links.txt"
save_links_to_file(blog_links, file_path)

Links have been saved to ../datasets//blog_links.txt


In [9]:
# Get the content of all the blogs
def get_blog_content(links):
    content = []

    for link in links:
        try:
            res = requests.get(link)
            res.raise_for_status()  # Raise an exception for HTTP errors (e.g., 404)

            soup = BeautifulSoup(res.text, 'html.parser')
            blog_content = soup.find('div', attrs={"class":"entry-content", "id":"article-content"})
            paragraphs = blog_content.find_all('p')

            for paragraph in paragraphs:
                if 'class' in paragraph.attrs:
                    if paragraph['class'] in [['fn', 'org'], ['wp-caption-text'], ['']]:
                        continue

                if paragraph.strong:
                    continue

                content.append(paragraph.string)

        except requests.exceptions.RequestException as e:
            # Handle HTTP request errors (e.g., connection error)
            print(f"Error making HTTP request for {link}: {e}")
            continue
        except Exception as e:
            # Handle other exceptions (e.g., parsing error)
            print(f"Error parsing {link}: {e}")
            continue

    return content


content = get_blog_content(blog_links)
content[:5]

['MUFUNDISI wekuMakokoba, kuBulawayo, akashamisa vagari venzvimbo iyi apo akarova muroja wake uyo anove n’anga asina kupfeka sechirango chekuridza mimhanzi zvine ruzha rwakanyanyisa apo iye aive pakati pekuitisa svondo.',
 'Mufundisi uye muvambi wechechi yeTsime Rase Birthsider, Pastor Khumbulani Mzizi, uyo aiitisa chechi pamba pake, akarova Sikhululiwe Dube apo airidza mimhanzi zvine ruzha pawairesi yake.',
 'Zvinonzi Mzizi paaiita svondo kumba kwake, Dube akatanga kuridza mimhanzi zvine ruzha.',
 'Izvi hazvina kufadza mufundisi uyu, ndokutanga kupomera Dube mhosva yekumuvhiringa iye neveungano rake vari pakati pekunamata.',
 'Mzizi akaudza Dube kuti adzikise ruzha rwewairesi yake asi akaramba.']

In [11]:
# Save the content to file (checkpoint)
# This will be used later to build the vocabulary for the model
def save_content(content, file_path):
    # Filter out None values and join the remaining strings into a single document
    document_text = "\n".join([text for text in content if text is not None])

    # Save the document to the specified file path
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(document_text)



file_path = f"{DATASETS_DIR}/content.txt"
save_content(content, file_path)