# EDSA Classification Hackathon 
#### 18 June 2021


### DANIEL BRUINTJIES                                           


In [2]:
# Standard
import pandas as pd
import numpy as np
import time
from tqdm import tqdm

import matplotlib.pyplot as plt
%matplotlib inline

# Natural language Processing
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

# Models
from sklearn.svm import LinearSVC, SVC
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, KFold
from sklearn.linear_model import LogisticRegression, SGDClassifier, RidgeClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV, RepeatedStratifiedKFold
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier, StackingClassifier

# Performance
from sklearn.metrics import f1_score
from sklearn import metrics


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Daniel\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
pd.set_option('display.max_colwidth', None)
train = pd.read_csv("train_set.csv")
test = pd.read_csv("test_set.csv")

In [4]:
print(train.shape)
print(test.shape)

(33000, 2)
(5682, 2)


In [5]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33000 entries, 0 to 32999
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   lang_id  33000 non-null  object
 1   text     33000 non-null  object
dtypes: object(2)
memory usage: 515.8+ KB


In [6]:
print(train.lang_id.value_counts())
print(train.lang_id.unique())

ven    3000
nso    3000
nbl    3000
eng    3000
xho    3000
tso    3000
afr    3000
tsn    3000
zul    3000
ssw    3000
sot    3000
Name: lang_id, dtype: int64
['xho' 'eng' 'nso' 'ven' 'tsn' 'nbl' 'zul' 'ssw' 'tso' 'sot' 'afr']


In [178]:
train.head()

Unnamed: 0,lang_id,text
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko axhasa ulawulo lwesininzi kunye nokuthath inxaxheba kwabafazi ezi ziquka phakathi kwezinye zazo ikomishoni yokulingana ngokwesini ikomishoni yamalungelo oluntu lomzantsi afrika
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi naphi na kwisebe ngokusekwe kwiimfuno zokusebenza zalo emva kokubonana nomsebenzi kunye okanye imanyano yakhe ukuba ulandulo lomntu onjalo alufanelekanga i-dha mayibize uncedo olufanelekileyo elungelweni layo
2,eng,the province of kwazulu-natal department of transport invites tenders from established contractors experienced in bridge construction for the construction of the kwajolwayo tugela river pedestrian bridge near tugela ferry the duration of the project will be months
3,nso,o netefatša gore o ba file dilo ka moka tše le dumelelanego ka tšona mohlala maleri a magolo a a šomišwago go fihlelela meagong e metelele scaffolds a a bolokegilego lefelo la maleba la go šomela go phela gabotse bjbj
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana u ya nga mulayo wa khomishini ya ndinganyiso ya mbeu u thetshelesa mbilaelo dzine dza tshimbilelana na tshialula u ya nga mbeu nahone i ivhea sa foramu ya thungo u ya nga mulayo wa ndinganyiso


In [179]:
def clean_text(text):
    """
    A function to preprocess the text
    :param message: String, text to be cleaned
    return: String, text
    """
    text = text.lower()
    # removing punctuation and digits
    text = re.sub(r'[-]',' ',text)
    text = re.sub(r'[_]', ' ', text)
    text = re.sub(r'[^\w\s]','',text)
    text = re.sub('[0-9]+', '', text) 
    
    stop_words = stopwords.words('english')
    text = ' '.join([word for word in text.split(' ') if not word in stop_words])
    text = re.sub("\\s+", " ", text)  # fills white spaces
    text = text.lstrip()  # removes whitespaces before string
    text = text.rstrip()  # removes whitespaces after string
    

    return text

In [180]:
%%time

train['text_clean']=train['text'].apply(clean_text) 
test['text_clean']=test['text'].apply(clean_text)

CPU times: user 7.69 s, sys: 391 ms, total: 8.08 s
Wall time: 8.07 s


In [181]:
test.head()

Unnamed: 0,index,text,text_clean
0,1,"Mmasepala, fa maemo a a kgethegileng a letlelela kgato eo.",mmasepala fa maemo kgethegileng letlelela kgato eo
1,2,Uzakwaziswa ngokufaneleko nakungafuneka eminye imitlolo engezelelako ukuqedelela ukutloliswa kwesibawo sakho.,uzakwaziswa ngokufaneleko nakungafuneka eminye imitlolo engezelelako ukuqedelela ukutloliswa kwesibawo sakho
2,3,Tshivhumbeo tshi fana na ngano dza vhathu.,tshivhumbeo tshi fana na ngano dza vhathu
3,4,Kube inja nelikati betingevakala kutsi titsini naticocisana.,kube inja nelikati betingevakala kutsi titsini naticocisana
4,5,Winste op buitelandse valuta.,winste op buitelandse valuta


In [182]:
train.head()

Unnamed: 0,lang_id,text,text_clean
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko axhasa ulawulo lwesininzi kunye nokuthath inxaxheba kwabafazi ezi ziquka phakathi kwezinye zazo ikomishoni yokulingana ngokwesini ikomishoni yamalungelo oluntu lomzantsi afrika,umgaqo siseko wenza amalungiselelo kumaziko axhasa ulawulo lwesininzi kunye nokuthath inxaxheba kwabafazi ezi ziquka phakathi kwezinye zazo ikomishoni yokulingana ngokwesini ikomishoni yamalungelo oluntu lomzantsi afrika
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi naphi na kwisebe ngokusekwe kwiimfuno zokusebenza zalo emva kokubonana nomsebenzi kunye okanye imanyano yakhe ukuba ulandulo lomntu onjalo alufanelekanga i-dha mayibize uncedo olufanelekileyo elungelweni layo,dha iya kuba nobulumko bokubeka umsebenzi naphi na kwisebe ngokusekwe kwiimfuno zokusebenza zalo emva kokubonana nomsebenzi kunye okanye imanyano yakhe ukuba ulandulo lomntu onjalo alufanelekanga dha mayibize uncedo olufanelekileyo elungelweni layo
2,eng,the province of kwazulu-natal department of transport invites tenders from established contractors experienced in bridge construction for the construction of the kwajolwayo tugela river pedestrian bridge near tugela ferry the duration of the project will be months,province kwazulu natal department transport invites tenders established contractors experienced bridge construction construction kwajolwayo tugela river pedestrian bridge near tugela ferry duration project months
3,nso,o netefatša gore o ba file dilo ka moka tše le dumelelanego ka tšona mohlala maleri a magolo a a šomišwago go fihlelela meagong e metelele scaffolds a a bolokegilego lefelo la maleba la go šomela go phela gabotse bjbj,netefatša gore ba file dilo ka moka tše le dumelelanego ka tšona mohlala maleri magolo šomišwago go fihlelela meagong e metelele scaffolds bolokegilego lefelo la maleba la go šomela go phela gabotse bjbj
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana u ya nga mulayo wa khomishini ya ndinganyiso ya mbeu u thetshelesa mbilaelo dzine dza tshimbilelana na tshialula u ya nga mbeu nahone i ivhea sa foramu ya thungo u ya nga mulayo wa ndinganyiso,khomishini ya ndinganyiso ya mbeu yo ewa maana u ya nga mulayo wa khomishini ya ndinganyiso ya mbeu u thetshelesa mbilaelo dzine dza tshimbilelana na tshialula u ya nga mbeu nahone ivhea sa foramu ya thungo u ya nga mulayo wa ndinganyiso


### Model Building

In [65]:
X = train['text_clean']
y = train['lang_id'] 

X_train , X_test , y_train , y_test = train_test_split(X , y, test_size =0.4, random_state=1)

In [66]:
classifiers = [ ComplementNB(),
               LinearSVC(random_state=1),
               #ExtraTreesClassifier(),
               #LogisticRegression(random_state=1, n_jobs=1, max_iter=5000),
               SGDClassifier(random_state=1),
               #AdaBoostClassifier(),
               #KNeighborsClassifier(),
               MultinomialNB(),
               RidgeClassifier(),
               #RandomForestClassifier(random_state=1)
              ]

In [67]:
def model_assessment(classifiers, X_train , X_test , y_train , y_test):
    """
    A function to train and test different classifiers and return a performance summary. 
    :param classifiers: List of models
    :param Train Test Split
    return: Dataframe of models as index, and metrics as columns
    """  
    model_summary = {} 
    
    for clf in classifiers:
        model = Pipeline([
                    ('tfidf', TfidfVectorizer(stop_words = 'english', max_df=0.9, ngram_range=(1, 5), analyzer= 'char')),
                    ('clf', clf)
                          ])
        
        start_time = time.time()      # Logging Current Time 
        model.fit(X_train, y_train)        # Training
        predictions = model.predict(X_test)    # Testing 
        run_time = time.time() - start_time        # Calculate Execuation Time
      
    # Create Dictionary Of Each Models Performance
        model_summary[clf.__class__.__name__] = {
          'F1-Macro':metrics.f1_score(y_test, predictions, average='macro'),
          'F1-Accuracy':metrics.f1_score(y_test, predictions, average='micro'),
          'F1-Weighted':metrics.f1_score(y_test, predictions, average='weighted'),
          'Execution Time': run_time }
      
        
    return pd.DataFrame.from_dict(model_summary, orient='index') # Create/Return Dataframe

In [68]:
clf_performance_df = model_assessment(classifiers, X_train , X_test , y_train , y_test)
clf_performance_df.to_csv('clf_performance_df.csv')

In [69]:
clf_performance_df = pd.read_csv('clf_performance_df.csv', index_col = 0) 

In [70]:
clf_performance_df.sort_values('F1-Weighted', ascending=False)

Unnamed: 0,F1-Macro,F1-Accuracy,F1-Weighted,Execution Time
LinearSVC,0.999549,0.999545,0.999546,28.811248
SGDClassifier,0.999549,0.999545,0.999546,29.772962
MultinomialNB,0.999547,0.999545,0.999545,25.663713
RidgeClassifier,0.999469,0.99947,0.99947,73.897409
ComplementNB,0.99862,0.998636,0.998639,24.207253


### Hyperparameter Tuning

In [72]:
classifiers = [
               LinearSVC(random_state=1),
               SGDClassifier(random_state=1),
               MultinomialNB(),
               RidgeClassifier()
              ]

In [74]:
X = train['text_clean']
y = train['lang_id'] 

X_train , X_test , y_train , y_test = train_test_split(X , y, test_size =0.4, random_state=1)

In [78]:
def get_hyperparams(classifiers, X_train, y_train):
    """
    A function to train different classifiers and return their parameters.
    :classifiers: List of models
    return: Dictionary with model names as keys and the values are a dictionary with parameter names as keys 
            and parameter value as values.
    """  
    model_params = {}
  
    for clf in classifiers:
        model = Pipeline([
                    ('tfidf', TfidfVectorizer(stop_words = 'english', max_df=0.9, ngram_range=(1, 5), analyzer= 'char')),
                    ('clf',clf)
                          ])
        
        model.fit(X_train, y_train)
        
        params = model.get_params()
        model_name = clf.__class__.__name__ 
        model_name = {}
        
        for key in params:
            if key.startswith("clf"):
                if len(key) < 5:
                    model_name['model'] = params[key]
                else:
                    model_name[key[5:]] = params[key]
        
        model_params[clf.__class__.__name__] = model_name
        
    return model_params

In [79]:
%%time
model_params = get_hyperparams(classifiers, X_train, y_train)

CPU times: user 2min 53s, sys: 44 s, total: 3min 37s
Wall time: 2min 52s


#### Grid Search

In [80]:
X = train['text_clean']
y = train['lang_id'] 

vect = TfidfVectorizer(stop_words = 'english', max_df=0.9, ngram_range=(1, 5), analyzer= 'char')

X_train , X_test , y_train , y_test = train_test_split(X, y, test_size=0.4, random_state =1)
X_train = vect.fit_transform(X_train)
X_test = vect.transform(X_test)

#### LinearSVC

In [82]:
#Current Hyperparameters
model_params[classifiers[0].__class__.__name__]

{'C': 1.0,
 'class_weight': None,
 'dual': True,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'loss': 'squared_hinge',
 'max_iter': 1000,
 'model': LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
           intercept_scaling=1, loss='squared_hinge', max_iter=1000,
           multi_class='ovr', penalty='l2', random_state=1, tol=0.0001,
           verbose=0),
 'multi_class': 'ovr',
 'penalty': 'l2',
 'random_state': 1,
 'tol': 0.0001,
 'verbose': 0}

In [87]:
%%time
# define model and parameters to loop through
model = LinearSVC()
C = [0.9, 0.7]
max_iter = [1000]
grid = dict(C=C, max_iter = max_iter)

GS = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=5, scoring='f1_weighted', error_score=0)
GS.fit(X_train, y_train)

y_pred = GS.predict(X_test)
f1_macro = metrics.f1_score(y_test, y_pred, average='macro')
f1_weighted = metrics.f1_score(y_test, y_pred, average='weighted')


CPU times: user 7.6 s, sys: 99 ms, total: 7.7 s
Wall time: 56.1 s


In [88]:
print(GS.best_score_)
print(GS.best_params_)     # The parameters we want
print(GS.best_estimator_) 

0.9989898921697524
{'C': 0.9, 'max_iter': 1000}
LinearSVC(C=0.9, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)


#### RidgeClassifier

In [89]:
#Current Hyperparameters
model_params[classifiers[3].__class__.__name__]

{'alpha': 1.0,
 'class_weight': None,
 'copy_X': True,
 'fit_intercept': True,
 'max_iter': None,
 'model': RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True,
                 max_iter=None, normalize=False, random_state=None,
                 solver='auto', tol=0.001),
 'normalize': False,
 'random_state': None,
 'solver': 'auto',
 'tol': 0.001}

In [90]:
%%time
# define model and parameters to loop through
model = RidgeClassifier()
alpha = list(np.linspace(0.15,0.4, 5))
grid = dict(alpha=alpha)

GS = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=5, scoring='f1_weighted', error_score=0)
GS.fit(X_train, y_train)

y_pred = GS.predict(X_test)
f1_macro = metrics.f1_score(y_test, y_pred, average='macro')
f1_weighted = metrics.f1_score(y_test, y_pred, average='weighted')


CPU times: user 1min 31s, sys: 1min 15s, total: 2min 47s
Wall time: 20min 4s


In [92]:
print(GS.best_score_)
print(GS.best_params_)     # The parameters we want
print(GS.best_estimator_) 

0.9990909703043116
{'alpha': 0.2125}
RidgeClassifier(alpha=0.2125, class_weight=None, copy_X=True,
                fit_intercept=True, max_iter=None, normalize=False,
                random_state=None, solver='auto', tol=0.001)


#### SGDClassifier(random_state=1)

In [99]:
#Current Hyperparameters
model_params[classifiers[1].__class__.__name__]

{'alpha': 0.0001,
 'average': False,
 'class_weight': None,
 'early_stopping': False,
 'epsilon': 0.1,
 'eta0': 0.0,
 'fit_intercept': True,
 'l1_ratio': 0.15,
 'learning_rate': 'optimal',
 'loss': 'hinge',
 'max_iter': 1000,
 'model': SGDClassifier(alpha=0.0001, average=False, class_weight=None,
               early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
               l1_ratio=0.15, learning_rate='optimal', loss='hinge',
               max_iter=1000, n_iter_no_change=5, n_jobs=None, penalty='l2',
               power_t=0.5, random_state=1, shuffle=True, tol=0.001,
               validation_fraction=0.1, verbose=0, warm_start=False),
 'n_iter_no_change': 5,
 'n_jobs': None,
 'penalty': 'l2',
 'power_t': 0.5,
 'random_state': 1,
 'shuffle': True,
 'tol': 0.001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}

In [109]:
%%time
# define model and parameters to loop through
model = SGDClassifier(random_state=1)

model = SGDClassifier()
alpha = [0.000017]
max_iter = [10000]

# define grid search
grid = dict(alpha=alpha, max_iter=max_iter, power_t=power_t)

GS = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=5, scoring='f1_weighted', error_score=0)
GS.fit(X_train, y_train)

y_pred = GS.predict(X_test)
f1_macro = metrics.f1_score(y_test, y_pred, average='macro')
f1_weighted = metrics.f1_score(y_test, y_pred, average='weighted')


CPU times: user 4.05 s, sys: 1.2 s, total: 5.25 s
Wall time: 41.6 s


In [110]:
print(GS.best_score_)
print(GS.best_params_)     # The parameters we want
print(GS.best_estimator_) 

0.9990404670260455
{'alpha': 1.7e-05, 'max_iter': 10000, 'power_t': 0.32}
SGDClassifier(alpha=1.7e-05, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge',
              max_iter=10000, n_iter_no_change=5, n_jobs=None, penalty='l2',
              power_t=0.32, random_state=None, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)


#### MultinomialNB

In [112]:
#Current Hyperparameters
model_params[classifiers[2].__class__.__name__]

{'alpha': 1.0,
 'class_prior': None,
 'fit_prior': True,
 'model': MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)}

In [116]:
%%time
# define model and parameters to loop through
model = MultinomialNB()

alpha = list(np.linspace(0.1,0.02,4))
grid = dict(alpha=alpha)

GS = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=5, scoring='f1_weighted', error_score=0)
GS.fit(X_train, y_train)

y_pred = GS.predict(X_test)
f1_macro = metrics.f1_score(y_test, y_pred, average='macro')
f1_weighted = metrics.f1_score(y_test, y_pred, average='weighted')

CPU times: user 767 ms, sys: 65.6 ms, total: 832 ms
Wall time: 8.38 s


In [117]:
print(GS.best_score_)
print(GS.best_params_)     # The parameters we want
print(GS.best_estimator_) 

0.9996969689237549
{'alpha': 0.1}
MultinomialNB(alpha=0.1, class_prior=None, fit_prior=True)


### Choosing The Best Models after Tuning

In [123]:
X = train['text_clean']
y = train['lang_id'] 

X_train , X_test , y_train , y_test = train_test_split(X, y, test_size=0.4, random_state =1)

In [124]:
classifiers = [ MultinomialNB(alpha=0.1),
                StackingClassifier(estimators = estimators, final_estimator = final_est, 
                                  passthrough = True),
                LinearSVC(C=0.9, random_state=1),
                ComplementNB(alpha=0.18),
                SGDClassifier(alpha=1.7e-05, max_iter=10000, power_t=0.32, random_state=1),
                RidgeClassifier(alpha=0.2125),
              ]

In [125]:
best_performance_df = model_assessment(classifiers, X_train , X_test , y_train , y_test)
best_performance_df.to_csv('best_performance_df.csv')

In [141]:
best_performance_df = pd.read_csv('best_performance_df.csv', index_col = 0) 
best_performance_df.sort_values('F1-Weighted', ascending = False)

Unnamed: 0,F1-Macro,F1-Accuracy,F1-Weighted,Execution Time
MultinomialNB,0.999848,0.999848,0.999848,22.385005
StackingClassifier,0.999848,0.999848,0.999848,94.524046
LinearSVC,0.999549,0.999545,0.999546,27.261466
SGDClassifier,0.999473,0.99947,0.99947,26.462303
RidgeClassifier,0.999322,0.999318,0.999318,100.041523
ComplementNB,0.998159,0.998182,0.998186,23.271741


### Submission

#### Stacking Classifier

In [None]:
%%time
multiNB1 = MultinomialNB(alpha=0.1)
multiNB2 = MultinomialNB(alpha=0.1)

estimators = [('multiNB1', multiNB1), ('multiNB2', multiNB2)]
final_est = RidgeClassifier(alpha=0.2125)

stacking_clf = StackingClassifier(estimators = estimators, final_estimator = final_est, 
                                  passthrough = True)

X = train['text_clean']
y = train['lang_id'] 

vect = TfidfVectorizer(stop_words = 'english', max_df=0.9, ngram_range=(2, 6), analyzer= 'char')

X_train , X_test , y_train , y_test = train_test_split(X, y, stratify=y,test_size=0.05, random_state =1)
X_train = vect.fit_transform(X_train)
X_test = vect.transform(X_test)

stacking_clf.fit(X_train, y_train)

y_pred = stacking_clf.predict(X_test)
f1_weighted = metrics.f1_score(y_test, y_pred, average='weighted')
print(f1_weighted)

#### MultinomialNB

In [210]:
model = MultinomialNB(alpha=0.1)

X = train['text_clean']
y = train['lang_id'] 

vect = TfidfVectorizer(stop_words = 'english', max_df=0.9, ngram_range=(2, 6), analyzer= 'char')

X_train , X_test , y_train , y_test = train_test_split(X, y, test_size=0.05, stratify=y,shuffle=True, random_state =1)
X_train = vect.fit_transform(X_train)
X_test = vect.transform(X_test)

model.fit(X_train, y_train)
y_pred = model.predict(X_test)
f1_weighted = metrics.f1_score(y_test, y_pred, average='weighted')
print(f1_weighted)

1.0


In [208]:
X = test['text_clean']

X_vect = vect.transform(X)

# Predict
test['lang_id'] = stacking_clf.predict(X_vect)  # or   test['lang_id'] = model.predict(X_vect)

In [209]:
submission = test[['index', 'lang_id']]
submission.to_csv('stacking_clf_0.csv',index=False)

In [199]:
submission

Unnamed: 0,index,lang_id
0,1,tsn
1,2,nbl
2,3,ven
3,4,ssw
4,5,afr
...,...,...
5677,5678,eng
5678,5679,nso
5679,5680,sot
5680,5681,sot
