In [3]:
!pip install datasets
!pip install langdetect

from datasets import load_dataset
import pandas as pd
import re
from bs4 import BeautifulSoup
from langdetect import detect


Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25l[?25hdone
  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993227 sha256=99f87743768d3091d199260764163ecaaac1680dc4f228c7baa05f13a0cfa46c
  Stored in directory: /root/.cache/pip/wheels/95/03/7d/59ea870c70ce4e5a370638b5462a7711ab78fba2f655d05106
Successfully built langdetect
Installing collected packages: langdetect
Successfully installed langdetect-1.0.9


In [4]:
# Fetch the "derekiya/swahili_news" dataset with `load_dataset`; the progress
# output below reports a train split and a test split being generated.
dataset = load_dataset("derekiya/swahili_news")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/411 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/32.6M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/9.36M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/24210 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7338 [00:00<?, ? examples/s]

In [5]:
# Show the feature names and row count of the train split.
print(dataset['train'])

Dataset({
    features: ['text', 'label'],
    num_rows: 24210
})


In [6]:
# Show the feature names and row count of the test split.
print(dataset['test'])

Dataset({
    features: ['text', 'label'],
    num_rows: 7338
})


In [7]:
# Materialize the train split as a pandas DataFrame (columns: text, label).
df_train = pd.DataFrame(dataset['train'])

In [8]:
# Materialize the test split as a pandas DataFrame (columns: text, label).
df_test = pd.DataFrame(dataset['test'])

In [10]:
# Preview the first rows of the raw (uncleaned) train frame.
print(df_train.head())

                                                text  label
0  Chanzo cha picha, Getty Images\nPazia limeshus...      2
1  Chanzo cha picha, Getty Images\nManchester Uni...      2
2  Chanzo cha picha, Getty Images\nMeneja wa Burn...      2
3  Chanzo cha picha, Getty Images\nManchester Uni...      2
4  Chanzo cha picha, Getty Images\nAston Villa wa...      2


In [23]:
# Preview the first rows of the raw (uncleaned) test frame.
print(df_test.head())

                                                text  label
0   BUNGE limehakikishiwa kuwa hakuna changamoto ...      1
1   Twiga ilicheza mechi ya kirafiki na Kenya kwe...      2
2  ['Miaka mitano iliyopita Harry Maguire alikuwa...      2
3  Bethsheba Wambura, Dar es Salaam Msanii wa Bon...      4
4  \nMwekezaji wa Klabu ya Simba, Mohammed Dewji ...      2


In [15]:
def remove_punctuation_and_normalize_whitespace(text):
    """Strip punctuation from ``text`` and squeeze its whitespace.

    Every character that is neither a word character (``\\w``, which includes
    the underscore) nor whitespace is deleted outright; nothing is inserted in
    its place. Each run of whitespace is then replaced by a single space, and
    leading/trailing whitespace is trimmed.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    without_punctuation = re.sub(r'[^\w\s]', '', text)
    single_spaced = re.sub(r'\s+', ' ', without_punctuation)
    return single_spaced.strip()

def remove_html_tags_and_special_characters(text):
    """Drop HTML markup from ``text`` and keep only ASCII alphanumerics and whitespace.

    The input is parsed with BeautifulSoup's ``html.parser`` and reduced to its
    text content. Every character outside ``A-Z``, ``a-z``, ``0-9`` and
    whitespace is then deleted, which also removes accented and other
    non-ASCII letters.

    Args:
        text: The string to clean; may contain HTML.

    Returns:
        The text content with all other characters removed. Whitespace is not
        collapsed or trimmed here.
    """
    plain_text = BeautifulSoup(text, "html.parser").get_text()
    return re.sub(r'[^A-Za-z0-9\s]', '', plain_text)

def is_swahili(text):
    """Report whether ``detect`` classifies ``text`` as Swahili.

    Args:
        text: The string to classify.

    Returns:
        True if ``detect(text)`` returns ``'sw'``. False if it returns any
        other code or raises an ordinary exception (best-effort: a detection
        failure is treated as "not Swahili").

    NOTE(review): the langdetect README describes detection as
    non-deterministic unless ``DetectorFactory.seed`` is set; confirm whether
    the row filtering needs to be reproducible across runs.
    """
    try:
        return detect(text) == 'sw'
    except Exception:
        # Deliberately not a bare `except:`. That would also swallow
        # KeyboardInterrupt/SystemExit, so interrupting the kernel during a
        # long cleaning loop would just mark one row as non-Swahili and carry on.
        return False

# Photo-credit and byline phrases stripped from article text by remove_credits().
_CREDIT_PHRASES = (
    'Chanzo cha picha, Getty Images',
    'Chanzo cha picha, GETTY IMAGES',
    'Chanzo cha picha, BBC Sport',
    'Chanzo cha picha, Mwenda',
    'Chanzo cha picha, Dkt Berno Mwambe',
    'Chanzo cha picha, PACE Sports Management',
    'Chanzo cha picha, Alamy',
    'Chanzo cha picha, ADELE JOHNSTON',
    'Chanzo cha picha, Reuters',
    'bbc news swahili',
    'BBC  Edinburgh',
    'BBC News, Lusaka',
    'Chanzo cha picha, Andrew Kazadi',
    'BBC Africa',
    'BBC Korea',
    'Chanzo cha picha, BMJ',
    'BBC News',
    'Chanzo cha picha, BBC / TWO RIVERS MEDIA',
    'Chanzo cha picha, AFP',
    'Chanzo cha picha, NEWS1',
    'Chanzo cha picha, ANG JUN-HA',
    'Chanzo cha picha, ADRIAN PEACOCK',
    'BBC News, Norfolk',
    'Chanzo cha picha, Google',
    'Chanzo cha picha, ISABEL WALTON',
    'Chanzo cha picha, BBC/FIRECRACKER FILMS',
    'Chanzo cha picha, PERSONAL ARCHIVES',
    'Chanzo cha picha, KAREN COOPER',
    'Chanzo cha picha, STEVE HUNTLEY/BBC',
    'Chanzo cha picha, MINISTRY OF PUBLIC HEALTH OF CAMEROON',
    'Chanzo cha picha, ZEBA GUFRAN',
    'Chanzo cha picha, ALPANA SHARMA',
    'Chanzo cha picha, ABHINAND',
    'Chanzo cha picha, PERSONAL FILE',
    'Chanzo cha picha, MIT',
    'Chanzo cha picha, Lucy Owen/BBC',
    'Chanzo cha picha, Hirima Mbilu',
    'Chanzo cha picha, Shirika la Women Empowerment Network',
    'Chanzo cha picha, Amina Abdhalla',
    'Chanzo cha picha, AJ_Watt/Getty Images',
    'Chanzo cha picha, Rubens Alarcon/Alamy',
    'Chanzo: TUKO.co.ke',
    'Chanzo cha picha, IKEA',
    'Chanzo cha picha, iStock',
    'Chanzo cha picha, Thinkstock',
    'Chanzo cha picha, Martha Sepúlveda',
    'Chanzo cha picha, AFP',
    'Chanzo cha picha, BBC news',
    'Chanzo cha picha, Loretta Herms',
    'Chanzo cha picha, RAINER JUATI',
    'Chanzo cha picha, KENT POLICE',
    'Chanzo cha picha, DR DAKSHAYANI PURINI',
    'Chanzo cha picha',
)

# Regex alternation takes the first alternative that matches, not the longest.
# Ordering the phrases longest-first guarantees that a phrase is never shadowed
# by one of its own prefixes (e.g. 'BBC News' vs. 'BBC News, Norfolk').
# dict.fromkeys drops duplicate entries while keeping a deterministic order.
# Compiled once at definition time instead of being rebuilt on every call.
_CREDIT_PATTERN = re.compile(
    '|'.join(
        re.escape(phrase)
        for phrase in sorted(dict.fromkeys(_CREDIT_PHRASES), key=len, reverse=True)
    )
)


def remove_credits(text):
    """Delete known photo-credit and byline phrases from ``text``.

    Matching is literal and case-sensitive against the phrases in
    ``_CREDIT_PHRASES``. Where several phrases could match at the same
    position, the longest one is removed. Surrounding whitespace and
    punctuation are left in place.

    Args:
        text: The article text to clean.

    Returns:
        ``text`` with every occurrence of a listed phrase removed.
    """
    return _CREDIT_PATTERN.sub('', text)


def preprocess_text(text):
    """Run the full cleaning pipeline on one article.

    Steps, in order:
      1. Coerce the input to ``str``.
      2. Remove known photo-credit/byline phrases. This must come first
         because the phrases contain commas, slashes and other punctuation
         that later steps delete.
      3. Strip HTML and non-alphanumeric characters. This must run before
         the punctuation pass; otherwise the ``<`` and ``>`` delimiters are
         already gone and tags can no longer be recognized as markup.
      4. Remove any remaining punctuation and normalize whitespace. Doing
         this last means gaps left by the earlier removals are collapsed and
         the result is trimmed.

    Args:
        text: The raw article text; any value is accepted and stringified.

    Returns:
        The cleaned text.
    """
    text = str(text)
    text = remove_credits(text)
    text = remove_html_tags_and_special_characters(text)
    text = remove_punctuation_and_normalize_whitespace(text)
    return text



In [16]:
# Clean every train article and keep only those that is_swahili() accepts,
# collecting the surviving texts and their labels in parallel lists.
cleaned_texts, labels = [], []

for text, label in zip(df_train['text'], df_train['label']):
    cleaned_text = preprocess_text(text)

    # Skip rows whose cleaned text is not detected as Swahili.
    if not is_swahili(cleaned_text):
        continue

    cleaned_texts.append(cleaned_text)
    labels.append(label)

In [17]:
# Assemble the kept train texts and their labels into a new DataFrame.
cleaned_train = pd.DataFrame({'text': cleaned_texts, 'label': labels})

In [30]:
# Preview the first rows of the cleaned train frame.
print(cleaned_train.head())

                                                text  label
0  Pazia limeshushwa katika kampeni za Ligi Kuu y...      2
1  Manchester United wanaweza kubadilisha meneja ...      2
2  Meneja wa Burnley Vincent Kompany 38 ni miongo...      2
3  Manchester United wanataka takriban asilimia 7...      2
4  Aston Villa wana nia ya kumsajili kiungo wa ka...      2


In [22]:
# Clean every test article and keep only those that is_swahili() accepts,
# collecting the surviving texts and their labels in parallel lists.
cleaned_text_test, labels_test = [], []

for text, label in zip(df_test['text'], df_test['label']):
    cleaned_text = preprocess_text(text)

    # Skip rows whose cleaned text is not detected as Swahili.
    if not is_swahili(cleaned_text):
        continue

    cleaned_text_test.append(cleaned_text)
    labels_test.append(label)

In [24]:
# Assemble the kept test texts and their labels into a new DataFrame.
cleaned_test = pd.DataFrame({'text': cleaned_text_test, 'label': labels_test})

In [25]:
# Preview the last rows of the cleaned test frame.
print(cleaned_test.tail())

                                                   text  label
7331  Kamati hiyo ilibainisha kuwa moja ya mapungufu...      0
7332  ARODIA PETERDODOMA HOSPITALI ya Rufaa ya Benja...      1
7333  WAKATI mazoezi ya timu ya taifa ya Tanzania Ta...      2
7334  Na Suleiman Rashid OmarPemba WIZARA ya Afya na...      1
7335  BAO pekee lililofungwa na mshambuliaji wa Yang...      2
