In [23]:
import pandas as pd
import re
from bs4 import BeautifulSoup
from langdetect import detect

In [10]:
# Load the article dataset that the rest of the notebook works on
df = pd.read_csv(filepath_or_buffer='cleaned-df.csv')

In [44]:
# Show the last rows of the loaded frame as a quick sanity check
print(df.tail())

                                                  Title  \
2177  Mama Amtelekeza Mtoto Wake Mchanga Kanisani, A...   
2178  Homa Bay: Wanakijiji Waingia Baridi Baba Kumzi...   
2179  Kirinyaga: Shamba Boi Mganda Anayehusishwa na ...   
2180  Jamaa Murang'a Ajisalimisha kwa Polisi baada y...   
2181  Watoto 7 wa William Ruto, Kazi Wanazofanya na ...   

                                                Content  
2177  Lucy Chege, mama wa mtoto mmoja, alimwacha bin...  
2178  Wanakijiji wa Nyaburu huko Gwasi, Kaunti ya Ho...  
2179  Baada ya miezi saba ya kukaa mafichoni, Philli...  
2180  Polisi huko Kagumo-ini wanachunguza kisa cha k...  
2181  Rais mteule William Ruto ni baba wa watoto sab...  


In [16]:
# Pull the 'Title' and 'Content' columns out of the already-loaded DataFrame
# as plain Python lists (nothing is read from disk here).
titles, content = df['Title'].tolist(), df['Content'].tolist()

In [45]:
# Define preprocessing functions
def normalize_text(text):
    """Return ``text`` converted to lowercase.

    Args:
        text: The string to normalize.

    Returns:
        The lowercased copy of ``text``.
    """
    lowered = text.lower()
    return lowered

def remove_punctuation_and_normalize_whitespace(text):
    """Strip punctuation and squeeze whitespace runs down to single spaces.

    Every character that is neither a word character nor whitespace is
    deleted (not replaced by a space), so "Kagumo-ini" becomes "Kagumoini".
    Leading and trailing whitespace is removed from the result.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    without_punctuation = re.sub(r'[^\w\s]', '', text)
    single_spaced = re.sub(r'\s+', ' ', without_punctuation)
    return single_spaced.strip()

def remove_html_tags_and_special_characters(text):
    """Drop HTML markup, then keep only ASCII letters, digits and whitespace.

    The input is parsed with BeautifulSoup's "html.parser" backend and reduced
    to its text content; afterwards every character outside ``A-Z``, ``a-z``,
    ``0-9`` and whitespace is deleted.

    Args:
        text: The string to clean; may contain HTML markup.

    Returns:
        The filtered text. Whitespace is left as-is (not collapsed).
    """
    plain_text = BeautifulSoup(text, "html.parser").get_text()
    return re.sub(r'[^A-Za-z0-9\s]', '', plain_text)

def is_swahili(text):
    """Report whether langdetect classifies ``text`` as Swahili.

    Args:
        text: The string to classify.

    Returns:
        True when ``detect(text)`` returns the language code 'sw'; False for
        any other code or when detection raises an ordinary exception.
    """
    # NOTE(review): langdetect documents its results as non-deterministic for
    # short or ambiguous text unless DetectorFactory.seed is set; consider
    # seeding it once at the top of the notebook for reproducible filtering.
    try:
        return detect(text) == 'sw'
    except Exception:
        # Best-effort on purpose: text that cannot be classified counts as
        # "not Swahili". Catching Exception (instead of a bare except) lets
        # KeyboardInterrupt / SystemExit propagate so a long run can be stopped.
        return False

def preprocess_text(text):
    """Run the full cleaning pipeline on one piece of text.

    Steps, in order:
      1. lowercase the text;
      2. strip HTML markup and non-ASCII-alphanumeric characters;
      3. remove any remaining punctuation and collapse whitespace.

    HTML stripping must happen before punctuation removal: once ``<``, ``>``
    and ``/`` have been deleted the parser can no longer recognise tags, and
    tag names end up glued onto the surrounding words.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned, lowercased, single-spaced string.
    """
    text = normalize_text(text)
    text = remove_html_tags_and_special_characters(text)
    # Runs last so the whitespace left behind by removed characters and tags
    # is collapsed in the final result.
    text = remove_punctuation_and_normalize_whitespace(text)
    return text

def tokenize_text(text):
    """Split ``text`` into a list of whitespace-separated tokens.

    Args:
        text: The string to tokenize.

    Returns:
        A list of tokens; empty when ``text`` has no non-whitespace characters.
    """
    return text.split()

In [46]:
# Preprocess every (title, content) pair and keep only rows where both
# fields are detected as Swahili; kept rows are stored as token lists.
cleaned_titles = []
cleaned_contents = []

# The loop variables are named raw_* so they do not overwrite the `content`
# list created earlier in the notebook.
for raw_title, raw_content in zip(df['Title'], df['Content']):
    # pandas represents missing cells as float NaN; skip such rows instead of
    # crashing inside the string-based preprocessing.
    if not isinstance(raw_title, str) or not isinstance(raw_content, str):
        continue

    cleaned_title = preprocess_text(raw_title)
    cleaned_content = preprocess_text(raw_content)

    if is_swahili(cleaned_title) and is_swahili(cleaned_content):
        cleaned_titles.append(tokenize_text(cleaned_title))
        cleaned_contents.append(tokenize_text(cleaned_content))

In [47]:
# Collect the token lists into a two-column DataFrame and write it to disk.
# NOTE(review): each cell holds a Python list, which CSV stores as its string
# representation; reading the file back yields strings, not lists — confirm
# that downstream consumers expect this.
cleaned_df = pd.DataFrame(
    data={'cleaned_title': cleaned_titles, 'cleaned_content': cleaned_contents}
)
cleaned_df.to_csv(path_or_buf='cleaned_data.csv', index=False)

In [48]:
# Show the last rows of the cleaned, tokenized frame to confirm the result
print(cleaned_df.tail())

                                          cleaned_title  \
1825  [mama, amtelekeza, mtoto, wake, mchanga, kanis...   
1826  [homa, bay, wanakijiji, waingia, baridi, baba,...   
1827  [kirinyaga, shamba, boi, mganda, anayehusishwa...   
1828  [jamaa, muranga, ajisalimisha, kwa, polisi, ba...   
1829  [watoto, 7, wa, william, ruto, kazi, wanazofan...   

                                        cleaned_content  
1825  [lucy, chege, mama, wa, mtoto, mmoja, alimwach...  
1826  [wanakijiji, wa, nyaburu, huko, gwasi, kaunti,...  
1827  [baada, ya, miezi, saba, ya, kukaa, mafichoni,...  
1828  [polisi, huko, kagumoini, wanachunguza, kisa, ...  
1829  [rais, mteule, william, ruto, ni, baba, wa, wa...  
