# South African Language Identification Code Challenge 

### by Ethan Macrae












# Introduction



### Problem Statement

Given text data, classify the text as one of the 11 Official South African Languages.

# Table of Contents

1. Importing Libraries
2. Importing Dataset
3. Data Preprocessing
4. Exploratory Data Analysis
5. Feature Engineering and Selection
6. Model Selection
7. Submission
8. Conclusion

# Importing Libraries

In [15]:
# Packages for data analysis
import pandas as pd
import numpy as np
import time

# Packages for visualizations
import seaborn as sns
import matplotlib.style as style

# Packages for preprocessing
import nltk
import string
import re
from sklearn.feature_extraction.text import TfidfVectorizer

# Packages for training models
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.svm import LinearSVC, SVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer

# Model Evaluation Packages
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix, classification_report, f1_score
from sklearn.metrics import make_scorer




# Importing Dataset

In [3]:
# Importing the dataset
train = pd.read_csv('train_set.csv')
test = pd.read_csv('test_set.csv')
sample_submission = pd.read_csv('sample_submission.csv')

# Data Preprocessing

In [28]:
# view the first 5 entries in the training data
train.head()

Unnamed: 0,lang_id,text
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko ax...
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi na...
2,eng,the province of kwazulu-natal department of tr...
3,nso,o netefatša gore o ba file dilo ka moka tše le...
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...


In [6]:
# view the first 5 entries of the test data
test.head()

Unnamed: 0,index,text
0,1,"Mmasepala, fa maemo a a kgethegileng a letlele..."
1,2,Uzakwaziswa ngokufaneleko nakungafuneka eminye...
2,3,Tshivhumbeo tshi fana na ngano dza vhathu.
3,4,Kube inja nelikati betingevakala kutsi titsini...
4,5,Winste op buitelandse valuta.


In [8]:
# Check how many entires of each language do we have in the training data
train['lang_id'].value_counts()

xho    3000
eng    3000
nso    3000
ven    3000
tsn    3000
nbl    3000
zul    3000
ssw    3000
tso    3000
sot    3000
afr    3000
Name: lang_id, dtype: int64

# Exploratory Data Analysis

## Data Cleaning

In [10]:
def remove_pattern(input_txt):         
    return re.sub('@[\w]+','',input_txt)


punctuations_list = string.punctuation
def cleaning_punctuations(text):
    translator = str.maketrans('', '', punctuations_list)
    return text.translate(translator)

def cleaning_repeating_char(text):
    return re.sub(r'(.)1+', r'1', text)

def cleaning_URLs(data):
    return re.sub('((www.[^s]+)|(https?://[^s]+))',' ',data)

def cleaning_numbers(data):
    return re.sub('[0-9]+', '', data)


In [11]:
#clean the data

train['text'] = train['text'].apply(remove_pattern)
test['text'] = test['text'].apply(remove_pattern)

train['text'] = train['text'].apply(cleaning_punctuations)
test['text'] = test['text'].apply(cleaning_punctuations)

train['text'] = train['text'].apply(cleaning_repeating_char)
test['text'] = test['text'].apply(cleaning_repeating_char)


train['text'] = train['text'].apply(cleaning_numbers)
test['text'] = test['text'].apply(cleaning_numbers)

# Feature Engineering and Selection

In [61]:
# Separate X and Y
X = train['text']
y = train['lang_id']

In [62]:
# Creating a train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [63]:
cv = CountVectorizer(ngram_range=(1,3), max_df = 0.9, min_df = 3)
tf = TfidfVectorizer(min_df=1, max_df=0.9, ngram_range=(3, 6), analyzer='char')

In [64]:
# X_train = cv.fit_transform(X_train)
# X_test = cv.transform(X_test)

X_train = tf.fit_transform(X_train)
X_test = tf.transform(X_test)

## Setting up Classifiers for Model Training

In [53]:
# List all the models
mnb = MultinomialNB(alpha = 0.1)
sgc = SGDClassifier(loss='log', penalty='l2',alpha=8e-05, random_state=42, max_iter=2000, tol=None, n_jobs = 6)
lr = LogisticRegression(max_iter=2000)

In [40]:
sgc.fit(X_train, y_train)

SGDClassifier(alpha=8e-05, loss='log', max_iter=2000, n_jobs=6, random_state=42,
              tol=None)

In [65]:
mnb.fit(X_train, y_train)

MultinomialNB(alpha=0.1)

In [42]:
lr.fit(X_train, y_train)

LogisticRegression(max_iter=2000)

In [43]:
pred_sgc = sgc.predict(X_test)

In [44]:
pred_mnb = mnb.predict(X_test)

In [45]:
pred_lr = lr.predict(X_test)

### Comparing Classification Scores


In [46]:
print(classification_report(y_test,pred_sgc))
print(classification_report(y_test,pred_mnb))
print(classification_report(y_test,pred_lr))

              precision    recall  f1-score   support

         afr       1.00      1.00      1.00       583
         eng       1.00      1.00      1.00       615
         nbl       0.99      0.99      0.99       583
         nso       1.00      1.00      1.00       625
         sot       1.00      1.00      1.00       618
         ssw       1.00      0.99      1.00       584
         tsn       1.00      1.00      1.00       598
         tso       1.00      1.00      1.00       561
         ven       1.00      1.00      1.00       634
         xho       0.99      0.99      0.99       609
         zul       0.97      0.98      0.98       590

    accuracy                           0.99      6600
   macro avg       0.99      0.99      0.99      6600
weighted avg       0.99      0.99      0.99      6600

              precision    recall  f1-score   support

         afr       1.00      1.00      1.00       583
         eng       1.00      1.00      1.00       615
         nbl       0.99 

## Hyperparameter Tuning on best models

In [48]:
# Refining the train-test split for validation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

In [49]:
X_train = cv.fit_transform(X_train)
X_test = cv.transform(X_test)

### Multinomial Naive Bayes

In [52]:
#gridsearch
param_grid = {'alpha': [0, 0.01, 0.03, 0.05, 0.1, 0,3, 1, 5]}  # setting parameter grid

grid = GridSearchCV(mnb,param_grid,refit=True,verbose=2)
grid.fit(X_train,y_train)
print(grid.best_estimator_)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV] END ............................................alpha=0; total time=   0.0s




[CV] END ............................................alpha=0; total time=   0.0s
[CV] END ............................................alpha=0; total time=   0.0s




[CV] END ............................................alpha=0; total time=   0.0s
[CV] END ............................................alpha=0; total time=   0.0s
[CV] END .........................................alpha=0.01; total time=   0.0s
[CV] END .........................................alpha=0.01; total time=   0.0s
[CV] END .........................................alpha=0.01; total time=   0.0s
[CV] END .........................................alpha=0.01; total time=   0.0s
[CV] END .........................................alpha=0.01; total time=   0.0s
[CV] END .........................................alpha=0.03; total time=   0.0s
[CV] END .........................................alpha=0.03; total time=   0.0s
[CV] END .........................................alpha=0.03; total time=   0.0s
[CV] END .........................................alpha=0.03; total time=   0.0s
[CV] END .........................................alpha=0.03; total time=   0.0s
[CV] END ...................



[CV] END ............................................alpha=0; total time=   0.0s
[CV] END ............................................alpha=0; total time=   0.0s
[CV] END ............................................alpha=0; total time=   0.0s




[CV] END ............................................alpha=3; total time=   0.0s
[CV] END ............................................alpha=3; total time=   0.0s
[CV] END ............................................alpha=3; total time=   0.0s
[CV] END ............................................alpha=3; total time=   0.0s
[CV] END ............................................alpha=3; total time=   0.0s
[CV] END ............................................alpha=1; total time=   0.0s
[CV] END ............................................alpha=1; total time=   0.0s
[CV] END ............................................alpha=1; total time=   0.0s
[CV] END ............................................alpha=1; total time=   0.0s
[CV] END ............................................alpha=1; total time=   0.0s
[CV] END ............................................alpha=5; total time=   0.0s
[CV] END ............................................alpha=5; total time=   0.0s
[CV] END ...................

# Submission

In [66]:
submission = pd.DataFrame(test['index'])

submission['lang_id'] = mnb.predict(tf.transform(test['text']))
submission.to_csv('Ethan Macrae SA language Sub.csv', index=False)


In [67]:
submission

Unnamed: 0,index,lang_id
0,1,tsn
1,2,nbl
2,3,ven
3,4,ssw
4,5,afr
...,...,...
5677,5678,eng
5678,5679,nso
5679,5680,sot
5680,5681,sot


# 6. Conclusion

The Multinomial Naive Bayes classifier was the best performing model. It had 99% F1 Scores for the test data. 