In [1]:
import pandas as pd
import re
import unicodedata
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk.stem import WordNetLemmatizer

# nltk.download('stopwords')
# nltk.download('wordnet')

import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to C:\Users\MY
[nltk_data]     PC\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\MY
[nltk_data]     PC\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Load datasets

In [2]:
df_sample = pd.read_csv("data/sample_submission.csv")
df_sample.head()

Unnamed: 0,index,lang_id
0,1,tsn
1,2,nbl


In [4]:
df_test = pd.read_csv("data/test_set.csv")
df_test.head()

Unnamed: 0,index,text
0,1,"Mmasepala, fa maemo a a kgethegileng a letlele..."
1,2,Uzakwaziswa ngokufaneleko nakungafuneka eminye...
2,3,Tshivhumbeo tshi fana na ngano dza vhathu.
3,4,Kube inja nelikati betingevakala kutsi titsini...
4,5,Winste op buitelandse valuta.


In [5]:
df_train = pd.read_csv("data/train_set.csv")

In [7]:
pd.set_option('display.max_colwidth', None)
df_train.head()

Unnamed: 0,lang_id,text
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko axhasa ulawulo lwesininzi kunye nokuthath inxaxheba kwabafazi ezi ziquka phakathi kwezinye zazo ikomishoni yokulingana ngokwesini ikomishoni yamalungelo oluntu lomzantsi afrika
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi naphi na kwisebe ngokusekwe kwiimfuno zokusebenza zalo emva kokubonana nomsebenzi kunye okanye imanyano yakhe ukuba ulandulo lomntu onjalo alufanelekanga i-dha mayibize uncedo olufanelekileyo elungelweni layo
2,eng,the province of kwazulu-natal department of transport invites tenders from established contractors experienced in bridge construction for the construction of the kwajolwayo tugela river pedestrian bridge near tugela ferry the duration of the project will be months
3,nso,o netefatša gore o ba file dilo ka moka tše le dumelelanego ka tšona mohlala maleri a magolo a a šomišwago go fihlelela meagong e metelele scaffolds a a bolokegilego lefelo la maleba la go šomela go phela gabotse bjbj
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana u ya nga mulayo wa khomishini ya ndinganyiso ya mbeu u thetshelesa mbilaelo dzine dza tshimbilelana na tshialula u ya nga mbeu nahone i ivhea sa foramu ya thungo u ya nga mulayo wa ndinganyiso


In [8]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33000 entries, 0 to 32999
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   lang_id  33000 non-null  object
 1   text     33000 non-null  object
dtypes: object(2)
memory usage: 515.8+ KB


In [9]:
# Print the unique items in the 'lang_id' column
unique_lang_ids = df_train['lang_id'].unique()
unique_lang_ids

array(['xho', 'eng', 'nso', 'ven', 'tsn', 'nbl', 'zul', 'ssw', 'tso',
       'sot', 'afr'], dtype=object)

In [None]:
## Clean datasets using regular expressions

In [44]:
def clean_text(text, lang):
    text = re.sub(r'\b\w{1,2}\b', '', text)  # Remove words with length 1 or 2
    text = re.sub(r'\s+', ' ', text)  # Remove extra whitespaces
    text = re.sub(r'^\s+|\s+$', '', text)  # Remove leading and trailing whitespaces
    text = re.sub(r"\d+", "", text)    # Remove numbers
    text = re.sub(r"[^\w\s]", "", text)    # Remove punctuation and special characters
    
    # Check language and remove stopwords accordingly
    stop_words = set(stopwords.words('english'))
    if lang == 'xhosa':
        stop_words = set(stopwords.words('xhosa'))
    elif lang == 'nsotho':
        stop_words = set(stopwords.words('nsotho'))
    elif lang == 'venetan':
        stop_words = set(stopwords.words('italian'))
    elif lang == 'tsonga':
        stop_words = set(stopwords.words('tsonga'))
    elif lang == 'southernsotho':
        stop_words = set(stopwords.words('southernsotho'))
    elif lang == 'zulu':
        stop_words = set(stopwords.words('zulu'))
    elif lang == 'swazi':
        stop_words = set(stopwords.words('swazi'))
    elif lang == 'sesotho':
        stop_words = set(stopwords.words('sesotho'))
    elif lang == 'afrikaans':
        stop_words = set(stopwords.words('dutch'))

    tokens = text.split()
    tokens = [token for token in tokens if token.lower() not in stop_words]

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    return ' '.join(tokens)

In [45]:
df = df_train

In [46]:
# Get unique languages from the 'lang' column
languages = df['lang_id'].unique()

In [47]:
# Apply the clean_text function to the 'text' column for each language
for lang in languages:
    df.loc[df['lang_id'] == lang, 'text'] = df.loc[df['lang_id'] == lang, 'text'].apply(lambda x: clean_text(x, lang))

In [48]:
df.head()

Unnamed: 0,lang_id,text
0,xho,umgaqosiseko wenza amalungiselelo kumaziko axhasa ulawulo lwesininzi kunye nokuthath inxaxheba kwabafazi ezi ziquka phakathi kwezinye zazo ikomishoni yokulingana ngokwesini ikomishoni yamalungelo oluntu lomzantsi afrika
1,xho,dha iya kuba nobulumko bokubeka umsebenzi naphi kwisebe ngokusekwe kwiimfuno zokusebenza zalo emva kokubonana nomsebenzi kunye okanye imanyano yakhe ukuba ulandulo lomntu onjalo alufanelekanga dha mayibize uncedo olufanelekileyo elungelweni layo
2,eng,province kwazulunatal department transport invite tender established contractor experienced bridge construction construction kwajolwayo tugela river pedestrian bridge near tugela ferry duration project month
3,nso,netefatša gore file dilo moka tše dumelelanego tšona mohlala maleri magolo šomišwago fihlelela meagong metelele scaffold bolokegilego lefelo maleba šomela phela gabotse bjbj
4,ven,khomishini ndinganyiso mbeu ewa maana nga mulayo khomishini ndinganyiso mbeu thetshelesa mbilaelo dzine dza tshimbilelana tshialula nga mbeu nahone ivhea foramu thungo nga mulayo ndinganyiso


In [69]:
def clean_text_2(text):
    text = re.sub(r'\b\w{1,2}\b', '', text)  # Remove words with length 1 or 2
    text = re.sub(r'\s+', ' ', text)  # Remove extra whitespaces
    text = re.sub(r'^\s+|\s+$', '', text)  # Remove leading and trailing whitespaces
    text = re.sub(r"\d+", "", text)    # Remove numbers
    text = re.sub(r"[^\w\s]", "", text)
    text = text.lower()  # Convert text to lowercase
    return text

In [70]:
df_test['text'] = df_test['text'].apply(clean_text_2)

In [71]:
df_test.head()

Unnamed: 0,index,text
0,1,mmasepala maemo kgethegileng letlelela kgato
1,2,uzakwaziswa ngokufaneleko nakungafuneka eminye imitlolo engezelelako ukuqedelela ukutloliswa kwesibawo sakho
2,3,tshivhumbeo tshi fana ngano dza vhathu
3,4,kube inja nelikati betingevakala kutsi titsini naticocisana
4,5,winste buitelandse valuta


### Save cleaned dataset

In [72]:
# Save the cleaned DataFrame as a CSV file
df.to_csv('data/cleaned_data.csv', index=False)

In [73]:
df_test.to_csv('data/cleaned_test_data.csv', index=False)