# **South African Language Identification Hackathon**

As a multicultural society, South Africa is characterised by its rich linguistic diversity with 11 official languages. The aim of this machine learning model is to characterize pieces of text to the natural language in which it belongs.

# 1. Import Packages

In [1]:
# Packages for data analysis
import pandas as pd
import numpy as np
import time

# Packages for visualizations
import seaborn as sns
import matplotlib.style as style

# Packages for preprocessing
import textblob
import nltk
import string
import re
import spacy.cli
from textblob import TextBlob
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
nltk.download('stopwords')
nltk.download('punkt')

# Packages for training models
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.svm import LinearSVC, SVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn import metrics
import xgboost as xgb

# Model Evaluation Packages
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix, classification_report, f1_score
from sklearn.metrics import make_scorer

import matplotlib.pyplot as plt
%matplotlib inline

# Style
sns.set(font_scale=1.5)
style.use('seaborn-pastel')
style.use('seaborn-poster')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nobuhle.skakane\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nobuhle.skakane\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
spacy.load('en_core_web_sm')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
stop = nltk.corpus.stopwords.words('english')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nobuhle.skakane\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nobuhle.skakane\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nobuhle.skakane\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
nlp = spacy.load('en_core_web_sm')

In [4]:
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\nobuhle.skakane\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

# 2.Loading Dataset

In [5]:
df_train = pd.read_csv('train_set.csv')
df_test = pd.read_csv('test_set.csv')
df_sample_submission = pd.read_csv('sample_submission.csv')

In [6]:
df_train_copy = df_train.copy()
df_test_copy = df_test.copy()
sample_submission_copy = df_sample_submission.copy()

In [7]:
df_train_copy.head()

Unnamed: 0,lang_id,text
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko ax...
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi na...
2,eng,the province of kwazulu-natal department of tr...
3,nso,o netefatša gore o ba file dilo ka moka tše le...
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...


In [8]:
df_test_copy.head()

Unnamed: 0,index,text
0,1,"Mmasepala, fa maemo a a kgethegileng a letlele..."
1,2,Uzakwaziswa ngokufaneleko nakungafuneka eminye...
2,3,Tshivhumbeo tshi fana na ngano dza vhathu.
3,4,Kube inja nelikati betingevakala kutsi titsini...
4,5,Winste op buitelandse valuta.


In [9]:
df_sample_submission.head()

Unnamed: 0,index,lang_id
0,1,tsn
1,2,nbl


# 3. Data Overview

In [10]:
df_train_copy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33000 entries, 0 to 32999
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   lang_id  33000 non-null  object
 1   text     33000 non-null  object
dtypes: object(2)
memory usage: 515.8+ KB


# 4.Data Preprocessing

In [11]:
"""
Knowing that we are dealing with text data, we decided to first clean the data by making all tweets lower-case, removing punctuation marks and removing white spaces before doing anything else. Also, replacing all links with the word 'LINK' and all user handles with 'USER_REF'
"""
def clean_text(df_copy):
    i = 0
    for text in df_copy['text']:
        text = text.lower()
        text = re.sub(r'http\S+', 'LINK', text)
        text = re.sub(r'@\S+', 'USER_REF', text)
        text = re.sub(r'[^\w\s]', '', text)
        text = text.lstrip()
        text = text.rstrip()
        text = text.replace('  ', ' ')
        df_copy.loc[i, 'text'] = text
        i += 1

In [12]:
def remove_stopwords(df_copy):
    my_stop_words = stopwords.words('english')
    my_stop_words.append('LINK')
    my_stop_words.append('USER_REF')

    df_copy_index = 0

    for text in df_copy['text']:
        text = word_tokenize(text)
        text = [word for word in text if not word in my_stop_words]
        text = ' '.join(text)

        df_copy.loc[df_copy_index, 'text'] = text
        df_copy_index += 1

    return df_copy

In [13]:
def lem_text(df_copy):
    df_copy_index = 0

    for text in df_copy['text']:
        text = nlp(text)
      
        for token in text:
            df_copy.loc[df_copy_index, 'text'] = df_copy.loc[df_copy_index, 'text'].replace(str(token.text), str(token.lemma_))

            df_copy_index += 1

        return df_copy

In [14]:
clean_text(df_train_copy)
remove_stopwords(df_train_copy)
lem_text(df_train_copy)

Unnamed: 0,lang_id,text
0,xho,umgaqosiseko wenza amalungiselelo kumaziko axh...
1,xho,idha iya kuba nobulumko bokubeka umsebenzi nap...
2,eng,province kwazulunatal department transport inv...
3,nso,netefatša gore ba file dilo ka moka tše le dum...
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...
...,...,...
32995,tsn,popo ya dipolateforomo tse ke go tlisa boetele...
32996,sot,modise mosadi na ntse sa utlwe hore thabang ra...
32997,eng,closing date submission completed tenders augu...
32998,xho,nawuphina umntu ofunyenwe enetyala phantsi kwa...


In [15]:
clean_text(df_test_copy)
remove_stopwords(df_test_copy)
lem_text(df_test_copy)

Unnamed: 0,index,text
0,1,mmasepala fa maemo kgethegileng letlelela kgat...
1,2,uzakwaziswa ngokufaneleko nakungafuneka eminye...
2,3,tshivhumbeo tshi fana na ngano dza vhathu
3,4,kube inja nelikati betingevakala kutsi titsini...
4,5,winste op buitelandse valuta
...,...,...
5677,5678,mark ballot private
5678,5679,ge ka kgetha ka bowena go se šomiše mofani ka ...
5679,5680,e ka kopo etsa kgetho ya hao ka hloko hobane h...
5680,5681,tb ke bokudi ba pmb mme morero tla lefella tlh...


In [16]:
# Replace '.txt' with 'text file'
df_train_copy["text"] = df_train_copy["text"].str.replace(".txt", " text file")
df_test_copy["text"] = df_test_copy["text"].str.replace(".txt", " text file")

  df_train_copy["text"] = df_train_copy["text"].str.replace(".txt", " text file")
  df_test_copy["text"] = df_test_copy["text"].str.replace(".txt", " text file")


# 5. Feature Engineering

In [17]:
X = df_train_copy['text']
y = df_train_copy['lang_id']

In [18]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.10)

# 6. Model Building

In [19]:
# These are the different Classification models we will train our data on
# Creating a list of names so we can print metrics for the entire list at once
names = ['Logistic Regression'
         , 'Nearest Neighbors'
         , 'Multinomial Naive Bayes'
         #, 'Linear SVC'
         , 'RBF SVC'
         , 'Linear SVM'
         , 'Decision Tree'
         #, 'Random Forest'
         #, 'AdaBoost'
]

In [20]:
# These are the different Classification models we will train our data on
# Creating a list of names so we can print metrics for the entire list at once
classifiers = [
    LogisticRegression(random_state=42, multi_class='ovr', n_jobs=1, C=1e5, max_iter=4000)
    , KNeighborsClassifier(n_neighbors=5)
    , MultinomialNB()
    #, SVC(kernel="linear", C=0.025)
    , SVC(gamma=2, C=1)
    , LinearSVC(random_state=42)
    , DecisionTreeClassifier(max_depth=5)
    #, RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1)
    #, AdaBoostClassifier()
]

In [21]:
def models_building(classifiers, X_train, y_train, X_val, y_val):
    """
    This function takes in a list of classifiers
    and both the train and validation sets
    and return a summary of F1-score and
    processing time as a dataframe

    Input:
    classifiers: a list of classifiers to train
                 datatype: list
    X_train: independent variable for training
             datatype: series
    y_train: dependent variable for training
             datatype: series
    X_val: independent variable for validation
           datatype: series
    y_val: dependent variable for validation
           datatype: series

    Output:
    model_summary: F1 Score for all the classifiers
                   datatype: dataframe
    """

    models_summary = {}

    # Pipeline to balance the classses and then to build the model
    for clf in classifiers:
        clf_text = Pipeline([('tfidf', TfidfVectorizer(min_df=1,
                                                       max_df=0.9,
                                                       ngram_range=(1, 2))),
                             ('clf', clf)])

        # Logging the Execution Time for each model
        start_time = time.time()
        clf_text.fit(X_train, y_train)
        predictions = clf_text.predict(X_val)
        run_time = time.time()-start_time

        # Output for each model
        models_summary[clf.__class__.__name__] = {
            'F1-Macro': metrics.f1_score(y_val,
                                         predictions,
                                         average='macro'),
            'F1-Accuracy': metrics.f1_score(y_val, predictions,
                                            average='micro'),
            'F1-Weighted': metrics.f1_score(y_val,
                                            predictions,
                                            average='weighted'),
            'Execution Time': run_time}

    return pd.DataFrame.from_dict(models_summary, orient='index')


In [22]:
classifiers_df = models_building(classifiers, X_train, y_train, X_val, y_val)
ordered_df = classifiers_df.sort_values('F1-Macro', ascending=False)
ordered_df

Unnamed: 0,F1-Macro,F1-Accuracy,F1-Weighted,Execution Time
MultinomialNB,0.999092,0.999091,0.999091,10.002316
LogisticRegression,0.996055,0.996061,0.99606,201.736941
LinearSVC,0.99575,0.995758,0.995756,15.511536
SVC,0.991615,0.991515,0.99155,1389.323175
KNeighborsClassifier,0.967434,0.967576,0.967332,15.656766
DecisionTreeClassifier,0.511836,0.546364,0.505642,14.652031


As can be seen above, Multinomial Naive Bayes is the best performing model and we perform further hyperparameter tuning below;

# 7. Hyperparameter Tuning

In [23]:
# Refining the train-test split for validation
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.01)

In [24]:
# Creating a pipeline for the gridsearch
param_grid = {'alpha': [0.1, 1, 5, 10]}  # setting parameter grid

tuned_mnb = Pipeline([('tfidf', TfidfVectorizer(min_df=2,
                                                max_df=0.9,
                                                ngram_range=(1, 2))),
                      ('mnb', GridSearchCV(MultinomialNB(),
                                           param_grid=param_grid,
                                           cv=5,
                                           n_jobs=-1,
                                           scoring='f1_weighted'))
                      ])

tuned_mnb.fit(X_train, y_train)  # Fitting the model

y_pred_mnb = tuned_mnb.predict(X_val)  # predicting the fit on validation set

print(classification_report(y_val, y_pred_mnb))

              precision    recall  f1-score   support

         afr       1.00      1.00      1.00        28
         eng       1.00      1.00      1.00        36
         nbl       1.00      1.00      1.00        19
         nso       1.00      1.00      1.00        31
         sot       1.00      1.00      1.00        32
         ssw       1.00      1.00      1.00        28
         tsn       1.00      1.00      1.00        29
         tso       1.00      1.00      1.00        21
         ven       1.00      1.00      1.00        31
         xho       1.00      1.00      1.00        31
         zul       1.00      1.00      1.00        44

    accuracy                           1.00       330
   macro avg       1.00      1.00      1.00       330
weighted avg       1.00      1.00      1.00       330



# 8. Submission

In [26]:
submission = pd.DataFrame(df_test_copy['index'])
submission['lang_id'] = tuned_mnb.predict(df_test_copy['text'])
submission.to_csv('submission_final.csv', index=False)