## EAST AFRICA VIRTUAL HACKATHON 2021: SWAHILI NEWS ML CHALLENGE

## Let's Get Started 

In [93]:
# import important modules
import numpy as np
import pandas as pd

# sklearn modules
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB # classifier 

from sklearn.metrics import log_loss #evaluation metric
from sklearn.feature_extraction.text import CountVectorizer

# text preprocessing modules
import re 
from string import punctuation 

import warnings
warnings.filterwarnings("ignore")
# seeding
np.random.seed(123)

In [94]:
# Read the three competition CSV files.
# `path` is a plain string prefix for the data folder ('' means the current
# working directory); it is reused later when the submission file is written.
path = ''
train, test, submission = (
    pd.read_csv(path + file_name)
    for file_name in ("train.csv", "test.csv", "sample_submission.csv")
)

In [95]:
# show top five rows of train data
train.head() 

Unnamed: 0,id,content,category
0,SW4670,"Bodi ya Utalii Tanzania (TTB) imesema, itafan...",uchumi
1,SW30826,"PENDO FUNDISHA-MBEYA RAIS Dk. John Magufuri, ...",kitaifa
2,SW29725,Mwandishi Wetu -Singida BENKI ya NMB imetoa ms...,uchumi
3,SW20901,"TIMU ya taifa ya Tanzania, Serengeti Boys jan...",michezo
4,SW12560,Na AGATHA CHARLES – DAR ES SALAAM ALIYEKUWA K...,kitaifa


In [96]:
# show top five rows of test data
test.head()

Unnamed: 0,id,content
0,SW4255,WAZIRI MKUU Kassim Majaliwa amep okea leseni ...
1,SW15677,RAIS John Magufuli amewataka viongozi wa Halm...
2,SW15925,"NEW YORK, MAREKANI MKALI wa hip hop nchini Mar..."
3,SW7615,"WAZIRI wa Kilimo, Dk Charles Tizeba amelitaka..."
4,SW28011,"Mwandishi wetu, Tanga WAFANYABIASHARA wa Mkoa ..."


In [97]:
# show top five rows of the sample submission file
submission.head()

Unnamed: 0,test_id,kitaifa,michezo,burudani,uchumi,kimataifa,afya
0,SW4255,1.0,0.0,0.0,0.0,0.0,0.0
1,SW15677,0.0,0.0,0.0,1.0,0.0,0.0
2,SW15925,,,,,,
3,SW7615,,,,,,
4,SW28011,,,,,,


In [98]:
# check the shape of the train data
train.shape

(23268, 3)

In [99]:
# check the shape of the test data
test.shape

(7756, 2)

In [100]:
# check missing values in train data
train.isnull().sum()

id          0
content     0
category    0
dtype: int64

In [101]:
# check missing values in test data
test.isnull().sum()

id         0
content    0
dtype: int64

In [102]:
# evaluate news category distribution
train.category.value_counts()

kitaifa      10242
michezo       6004
burudani      2229
uchumi        2028
kimataifa     1906
afya           859
Name: category, dtype: int64

### Data Preparation 

In [103]:
# Integer codes (0-5) for the six news categories; the classifier is trained
# on these codes instead of the raw category names.
category_mapping = {
    "kitaifa": 0,
    "michezo": 1,
    "burudani": 2,
    "uchumi": 3,
    "kimataifa": 4,
    "afya": 5,
}

# Encode only while the column still holds raw category names. Without this
# guard, re-running the cell would map the already-encoded integers through
# the dictionary again and silently turn every label into NaN.
if not train["category"].isin(list(category_mapping.values())).all():
    encoded_category = train["category"].map(category_mapping)

    # .map() yields NaN for any value missing from the dictionary; fail loudly
    # rather than passing NaN labels on to the model.
    unmapped = train.loc[encoded_category.isna(), "category"].unique()
    if len(unmapped) > 0:
        raise ValueError(f"Unmapped categories in train data: {list(unmapped)}")

    train["category"] = encoded_category.astype("int64")

train.head()

Unnamed: 0,id,content,category
0,SW4670,"Bodi ya Utalii Tanzania (TTB) imesema, itafan...",3
1,SW30826,"PENDO FUNDISHA-MBEYA RAIS Dk. John Magufuri, ...",0
2,SW29725,Mwandishi Wetu -Singida BENKI ya NMB imetoa ms...,3
3,SW20901,"TIMU ya taifa ya Tanzania, Serengeti Boys jan...",1
4,SW12560,Na AGATHA CHARLES – DAR ES SALAAM ALIYEKUWA K...,0


In [104]:
# a simple function to clean text data 

def text_cleaning(text):
    """Normalise one raw article into lowercase, space-separated tokens.

    Processing steps:
      1. every character other than A-Z, a-z and 0-9 is replaced by a space
         (this removes punctuation, symbols and non-ASCII characters);
      2. tokens made up only of digits are dropped, wherever they occur in
         the text (including at the very end);
      3. the text is lowercased;
      4. runs of whitespace are collapsed to a single space and the result
         is stripped.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned text (a single string, not a list of words).
    """
    # Punctuation is removed here already, so no separate pass over
    # string.punctuation is needed afterwards.
    text = re.sub(r"[^A-Za-z0-9]", " ", text)

    # Remove stand-alone numbers. The word boundaries keep mixed tokens such
    # as "u17" intact, and the pattern does not require trailing whitespace,
    # so a number at the end of the text is removed as well.
    text = re.sub(r"\b\d+\b", " ", text)

    text = text.lower()  # set in lowercase

    # Collapse the gaps left behind by the substitutions above.
    return " ".join(text.split())

In [105]:
# Run the text cleaner over the article body of both data sets, replacing
# the raw "content" column with its cleaned version.
for news_frame in (train, test):
    news_frame["content"] = news_frame["content"].apply(text_cleaning)

In [106]:
# Target: the encoded category column as an array.
y = train["category"].values
# Features: the cleaned article text.
X = train.content

In [107]:
# Bag-of-words features. lowercase=False because text_cleaning has already
# lowercased the text.
# NOTE(review): the vocabulary is learned from all of X, i.e. before the
# train/validation split below, so validation articles contribute to it.
vectorizer = CountVectorizer(lowercase=False)

# learn the vocabulary and encode the train data in one step
X_transformed = vectorizer.fit_transform(X)

# encode the test data with the vocabulary learned above
test_transformed = vectorizer.transform(test["content"])

In [108]:
# Hold out 20% of the labelled data for validation. The split is shuffled
# with a fixed seed and stratified on y so both parts keep the same class mix.
split_options = dict(test_size=0.20, random_state=42, shuffle=True, stratify=y)

X_train, X_valid, y_train, y_valid = train_test_split(
    X_transformed, y, **split_options
)

### Create Classifier 

In [109]:
# Create a classifier
news_classifier = MultinomialNB()    

In [110]:
# train the news_classifier 
news_classifier.fit(X_train,y_train)

MultinomialNB()

In [111]:
# test model performance on valid data 
y_probas = news_classifier.predict_proba(X_valid)

In [112]:
# Evaluate the model on the validation split with multi-class log loss
# (lower is better).
# NOTE(review): with six classes, predicting a uniform 1/6 for every class
# scores ln(6) ~= 1.79. The value recorded below is higher than that, which
# suggests confident wrong predictions - consider checking probability
# calibration or tuning the classifier's smoothing.
log_loss(y_true=y_valid, y_pred=y_probas)

4.170148843669263

In [113]:
# create prediction from the test data
test_probas = news_classifier.predict_proba(test_transformed)

### Create Submission File

In [114]:
# create submission file
# predict_proba returns one column per entry of news_classifier.classes_, in
# that order. Deriving the column names from the fitted classes (through the
# inverse of category_mapping) keeps names and probabilities aligned even if
# the mapping is edited later.
code_to_category = {code: name for name, code in category_mapping.items()}
submission_cols = [code_to_category[int(code)] for code in news_classifier.classes_]
submission_df = pd.DataFrame(test_probas, columns=submission_cols)

# Take the ids from the test frame the predictions were computed on, so each
# row of probabilities is paired with its own article regardless of the row
# order of the sample submission file.
submission_df.insert(0, 'test_id', test['id'].to_numpy())

# rearrange columns into the order used by the sample submission
submission_df = submission_df[['test_id','kitaifa', 'michezo', 'burudani','uchumi', 'kimataifa', 'afya']]

# save submission file
submission_df.to_csv(path+"first_submission.csv",index=False)

Now upload your first submission file on the hackathon page 👍