# Classification Hackathon

### DANIEL BRUINTJIES

In [67]:
# Standard
import pandas as pd
import numpy as np
import time

import matplotlib.pyplot as plt
%matplotlib inline

# Natural language Processing
import re
import nltk
import spacy
import string
from nltk.stem import WordNetLemmatizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# Models
from sklearn.svm import LinearSVC, SVC
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, KFold
from sklearn.linear_model import LogisticRegression, SGDClassifier, RidgeClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV, RepeatedStratifiedKFold
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier, StackingClassifier

# Performance
from sklearn.metrics import f1_score
from sklearn import metrics

# Saving
import pickle

In [68]:
pd.set_option('display.max_colwidth', None)
train = pd.read_csv("train_set.csv")
test = pd.read_csv("test_set.csv")

In [69]:
print(train.shape)
print(test.shape)

(33000, 2)
(5682, 2)


In [70]:
test_size = 5682/33000 
print(test_size)

0.17218181818181819


In [71]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33000 entries, 0 to 32999
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   lang_id  33000 non-null  object
 1   text     33000 non-null  object
dtypes: object(2)
memory usage: 515.8+ KB


In [72]:
test.head()

Unnamed: 0,index,text
0,1,"Mmasepala, fa maemo a a kgethegileng a letlelela kgato eo."
1,2,Uzakwaziswa ngokufaneleko nakungafuneka eminye imitlolo engezelelako ukuqedelela ukutloliswa kwesibawo sakho.
2,3,Tshivhumbeo tshi fana na ngano dza vhathu.
3,4,Kube inja nelikati betingevakala kutsi titsini naticocisana.
4,5,Winste op buitelandse valuta.


In [73]:
def clean_text(text):
    """
    A function to preprocess the text
    :param message: String, text to be cleaned
    return: String, text
    """
    text = text.lower()
    # removing punctuation and digits
    text = re.sub(r'[-]',' ',text)
    text = re.sub(r'[_]', ' ', text)
    text = re.sub(r'[^\w\s]','',text)
    text = re.sub('[0-9]+', '', text) 
    text = re.sub("\\s+", " ", text)  # fills white spaces
    text = text.lstrip()  # removes whitespaces before string
    text = text.rstrip()  # removes whitespaces after string

    return text

In [74]:
%%time
train['text_clean']=train['text'].apply(clean_text) 
test['text_clean']=test['text'].apply(clean_text)

Wall time: 1.23 s


In [75]:
train.head()

Unnamed: 0,lang_id,text,text_clean
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko axhasa ulawulo lwesininzi kunye nokuthath inxaxheba kwabafazi ezi ziquka phakathi kwezinye zazo ikomishoni yokulingana ngokwesini ikomishoni yamalungelo oluntu lomzantsi afrika,umgaqo siseko wenza amalungiselelo kumaziko axhasa ulawulo lwesininzi kunye nokuthath inxaxheba kwabafazi ezi ziquka phakathi kwezinye zazo ikomishoni yokulingana ngokwesini ikomishoni yamalungelo oluntu lomzantsi afrika
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi naphi na kwisebe ngokusekwe kwiimfuno zokusebenza zalo emva kokubonana nomsebenzi kunye okanye imanyano yakhe ukuba ulandulo lomntu onjalo alufanelekanga i-dha mayibize uncedo olufanelekileyo elungelweni layo,i dha iya kuba nobulumko bokubeka umsebenzi naphi na kwisebe ngokusekwe kwiimfuno zokusebenza zalo emva kokubonana nomsebenzi kunye okanye imanyano yakhe ukuba ulandulo lomntu onjalo alufanelekanga i dha mayibize uncedo olufanelekileyo elungelweni layo
2,eng,the province of kwazulu-natal department of transport invites tenders from established contractors experienced in bridge construction for the construction of the kwajolwayo tugela river pedestrian bridge near tugela ferry the duration of the project will be months,the province of kwazulu natal department of transport invites tenders from established contractors experienced in bridge construction for the construction of the kwajolwayo tugela river pedestrian bridge near tugela ferry the duration of the project will be months
3,nso,o netefatša gore o ba file dilo ka moka tše le dumelelanego ka tšona mohlala maleri a magolo a a šomišwago go fihlelela meagong e metelele scaffolds a a bolokegilego lefelo la maleba la go šomela go phela gabotse bjbj,o netefatša gore o ba file dilo ka moka tše le dumelelanego ka tšona mohlala maleri a magolo a a šomišwago go fihlelela meagong e metelele scaffolds a a bolokegilego lefelo la maleba la go šomela go phela gabotse bjbj
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana u ya nga mulayo wa khomishini ya ndinganyiso ya mbeu u thetshelesa mbilaelo dzine dza tshimbilelana na tshialula u ya nga mbeu nahone i ivhea sa foramu ya thungo u ya nga mulayo wa ndinganyiso,khomishini ya ndinganyiso ya mbeu yo ewa maana u ya nga mulayo wa khomishini ya ndinganyiso ya mbeu u thetshelesa mbilaelo dzine dza tshimbilelana na tshialula u ya nga mbeu nahone i ivhea sa foramu ya thungo u ya nga mulayo wa ndinganyiso


In [127]:
test.tail()

Unnamed: 0,index,text,text_clean,lang_id
5677,5678,You mark your ballot in private.,you mark your ballot in private,eng
5678,5679,"Ge o ka kgetha ka bowena go se šomiše Mofani ka tirelo yo a kgethilwego wa GEMS (DSP), gona tefelo mmogo e tla šomišwa. Tefelo mmogo yeo e ka nna ya ba phapano magareng ga hlwahlwa ya tirelo le yeo GEMS e ka bego e e lefeletše ge nkabe o šomišitše GEMS DSP Goba persente ya tefelo mmogo go ya ka Melao ya Setlamo ya GEMS.",ge o ka kgetha ka bowena go se šomiše mofani ka tirelo yo a kgethilwego wa gems dsp gona tefelo mmogo e tla šomišwa tefelo mmogo yeo e ka nna ya ba phapano magareng ga hlwahlwa ya tirelo le yeo gems e ka bego e e lefeletše ge nkabe o šomišitše gems dsp goba persente ya tefelo mmogo go ya ka melao ya setlamo ya gems,nso
5679,5680,"E Ka kopo etsa kgetho ya hao ka hloko, hobane ha o na ho kgona ho fetola kgetho ena hara selemo ntle le tumello ya Boto ya Batshwaredi.",e ka kopo etsa kgetho ya hao ka hloko hobane ha o na ho kgona ho fetola kgetho ena hara selemo ntle le tumello ya boto ya batshwaredi,sot
5680,5681,"TB ke bokudi ba PMB, mme Morero o tla lefella tlhahlobo ya ho sheba lefu lena. Leha ho le jwalo, o tlameha ho fumana kalafo ya hao ya TB ditsing tsa Mmuso.",tb ke bokudi ba pmb mme morero o tla lefella tlhahlobo ya ho sheba lefu lena leha ho le jwalo o tlameha ho fumana kalafo ya hao ya tb ditsing tsa mmuso,sot
5681,5682,Vakatjhela iwebhusayidi yethu ku-www.,vakatjhela iwebhusayidi yethu ku www,nbl


In [195]:
print(train.lang_id.value_counts())
print(train.lang_id.unique())

ssw    3000
tso    3000
ven    3000
zul    3000
sot    3000
nbl    3000
eng    3000
tsn    3000
nso    3000
xho    3000
afr    3000
Name: lang_id, dtype: int64
['xho' 'eng' 'nso' 'ven' 'tsn' 'nbl' 'zul' 'ssw' 'tso' 'sot' 'afr']


In [47]:
train['lang_id'] = train['lang_id'].astype('string')
train['text'] = train['text'].astype('string')

train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33000 entries, 0 to 32999
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   lang_id     33000 non-null  string
 1   text        33000 non-null  string
 2   text_clean  33000 non-null  object
dtypes: object(1), string(2)
memory usage: 773.6+ KB


### Model Building

In [18]:
X = train['text_clean']
y = train['lang_id'] 

X_train , X_test , y_train , y_test = train_test_split(X , y, test_size =0.17, random_state=1)

In [33]:
classifiers = [ ComplementNB(),
               LinearSVC(random_state=1),
               ExtraTreesClassifier(),
               LogisticRegression(random_state=1, n_jobs=1, max_iter=5000),
               SGDClassifier(random_state=1),
               AdaBoostClassifier(),
               KNeighborsClassifier(),
               MultinomialNB(),
               RidgeClassifier(),
               RandomForestClassifier(random_state=1)
              ]

In [34]:
def model_assessment(classifiers, X_train , X_test , y_train , y_test):
    """
    A function to train and test different classifiers and return a performance summary. 
    :param classifiers: List of models
    :param Train Test Split
    return: Dataframe of models as index, and metrics as columns
    """  
    model_summary = {} 
    
    for clf in classifiers:
        model = Pipeline([
                    ('tfidf', TfidfVectorizer(max_df=0.9, ngram_range=(1, 2))),
                    ('clf', clf)
                          ])
        
        start_time = time.time()      # Logging Current Time 
        model.fit(X_train, y_train)        # Training
        predictions = model.predict(X_test)    # Testing 
        run_time = time.time() - start_time        # Calculate Execuation Time
      
    # Create Dictionary Of Each Models Performance
        model_summary[clf.__class__.__name__] = {
          'F1-Macro':metrics.f1_score(y_test, predictions, average='macro'),
          'F1-Accuracy':metrics.f1_score(y_test, predictions, average='micro'),
          'F1-Weighted':metrics.f1_score(y_test, predictions, average='weighted'),
          'Execution Time': run_time }
      
        
    return pd.DataFrame.from_dict(model_summary, orient='index') # Create/Return Dataframe

In [35]:
clf_performance_df = model_assessment(classifiers, X_train , X_test , y_train , y_test)
clf_performance_df.to_csv('clf_performance_df.csv')

In [None]:
compNB = model_assessment(classifiers, X_train , X_test , y_train , y_test)
compNB.to_csv('compNB.csv')

In [37]:
all_classifiers = pd.concat([compNB, clf_performance_df], axis = 0)

In [39]:
all_classifiers.to_csv('all_classifiers.csv')

In [22]:
clf_performance_df = pd.read_csv('clf_performance_df.csv', index_col = 0) 

In [38]:
all_classifiers.sort_values('F1-Weighted', ascending=False)

Unnamed: 0,F1-Macro,F1-Accuracy,F1-Weighted,Execution Time
MultinomialNB,0.998249,0.998217,0.998216,5.762156
ComplementNB,0.997905,0.997861,0.997859,5.786962
RidgeClassifier,0.996884,0.996791,0.996791,14.186019
LinearSVC,0.996714,0.996613,0.996614,9.610314
SGDClassifier,0.996033,0.9959,0.995902,7.865045
LogisticRegression,0.995003,0.994831,0.994839,200.28663
ExtraTreesClassifier,0.991342,0.990909,0.990976,301.837464
RandomForestClassifier,0.989625,0.989127,0.989193,180.862793
KNeighborsClassifier,0.967741,0.967201,0.966948,11.070413
AdaBoostClassifier,0.479997,0.547415,0.471852,36.791643


### Hyperparameter Tuning

In [47]:
classifiers = [
               LinearSVC(random_state=1),
               ComplementNB(),
               MultinomialNB(),
               RidgeClassifier()
              ]

In [52]:
X = train['text_clean']
y = train['lang_id'] 

X_train , X_test , y_train , y_test = train_test_split(X , y, test_size =0.17, random_state=1)

In [53]:
def get_hyperparams(classifiers, X_train, y_train):
    """
    A function to train different classifiers and return their parameters.
    :classifiers: List of models
    return: Dictionary with model names as keys and the values are a dictionary with parameter names as keys 
            and parameter value as values.
    """  
    model_params = {}
  
    for clf in classifiers:
        model = Pipeline([
                    ('tfidf', TfidfVectorizer(max_df=0.9, ngram_range=(1, 2))),
                    ('clf',clf)
                          ])
        
        model.fit(X_train, y_train)
        
        params = model.get_params()
        model_name = clf.__class__.__name__ 
        model_name = {}
        
        for key in params:
            if key.startswith("clf"):
                if len(key) < 5:
                    model_name['model'] = params[key]
                else:
                    model_name[key[5:]] = params[key]
        
        model_params[clf.__class__.__name__] = model_name
        
    return model_params

In [54]:
%%time
model_params = get_hyperparams(classifiers, X_train, y_train)

Wall time: 33.8 s


#### Grid Search

In [46]:
X = train['text_clean']
y = train['lang_id'] 

vect = TfidfVectorizer(stop_words = 'english', ngram_range=(1, 1))

X_train , X_test , y_train , y_test = train_test_split(X, y, test_size=0.4, random_state =1)
X_train = vect.fit_transform(X_train)
X_test = vect.transform(X_test)

#### LinearSVC

In [55]:
#Current Hyperparameters
model_params[classifiers[0].__class__.__name__]

{'model': LinearSVC(random_state=1),
 'C': 1.0,
 'class_weight': None,
 'dual': True,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'loss': 'squared_hinge',
 'max_iter': 1000,
 'multi_class': 'ovr',
 'penalty': 'l2',
 'random_state': 1,
 'tol': 0.0001,
 'verbose': 0}

In [65]:
%%time
# define model and parameters to loop through
model = LinearSVC()
C = [15, 25]
max_iter = [1000]
grid = dict(C=C, max_iter = max_iter)

lsvc = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=5, scoring='f1_weighted', error_score=0)
lsvc.fit(X_train, y_train)

y_pred = lsvc.predict(X_test)
f1_macro = metrics.f1_score(y_test, y_pred, average='macro')
f1_weighted = metrics.f1_score(y_test, y_pred, average='weighted')


Wall time: 2min 57s


In [66]:
print(lsvc.best_score_)
print(lsvc.best_params_)     # The parameters we want
print(lsvc.best_estimator_) 

0.997041247699951
{'C': 15, 'max_iter': 1000}
LinearSVC(C=15)


#### RidgeClassifier

In [67]:
#Current Hyperparameters
model_params[classifiers[3].__class__.__name__]

{'model': RidgeClassifier(),
 'alpha': 1.0,
 'class_weight': None,
 'copy_X': True,
 'fit_intercept': True,
 'max_iter': None,
 'normalize': False,
 'random_state': None,
 'solver': 'auto',
 'tol': 0.001}

In [74]:
%%time
# define model and parameters to loop through
model = RidgeClassifier()
alpha = list(np.linspace(0.17,0.24,8))
grid = dict(alpha=alpha)

lsvc = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=5, scoring='f1_weighted', error_score=0)
lsvc.fit(X_train, y_train)

y_pred = lsvc.predict(X_test)
f1_macro = metrics.f1_score(y_test, y_pred, average='macro')
f1_weighted = metrics.f1_score(y_test, y_pred, average='weighted')


Wall time: 8min 1s


In [75]:
print(lsvc.best_score_)
print(lsvc.best_params_)     # The parameters we want
print(lsvc.best_estimator_) 

0.9974795369157897
{'alpha': 0.2}
RidgeClassifier(alpha=0.2)


#### ComplementNB

In [70]:
#Current Hyperparameters
model_params[classifiers[2].__class__.__name__]

{'model': MultinomialNB(),
 'alpha': 1.0,
 'class_prior': None,
 'fit_prior': True}

In [79]:
%%time
# define model and parameters to loop through
model = ComplementNB()

alpha = list(np.linspace(0.17,0.24,8))
grid = dict(alpha=alpha)

lsvc = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=5, scoring='f1_weighted', error_score=0)
lsvc.fit(X_train, y_train)

y_pred = lsvc.predict(X_test)
f1_macro = metrics.f1_score(y_test, y_pred, average='macro')
f1_weighted = metrics.f1_score(y_test, y_pred, average='weighted')


Wall time: 14.1 s


In [80]:
print(lsvc.best_score_)
print(lsvc.best_params_)     # The parameters we want
print(lsvc.best_estimator_) 

0.9981725476022678
{'alpha': 0.18000000000000002}
ComplementNB(alpha=0.18000000000000002)


#### MultinomialNB

In [71]:
#Current Hyperparameters
model_params[classifiers[1].__class__.__name__]

{'model': ComplementNB(),
 'alpha': 1.0,
 'class_prior': None,
 'fit_prior': True,
 'norm': False}

In [55]:
%%time
# define model and parameters to loop through
model = MultinomialNB()

alpha = list(np.linspace(0.05,0.17,8))
grid = dict(alpha=alpha)

lsvc = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=5, scoring='f1_weighted', error_score=0)
lsvc.fit(X_train, y_train)

y_pred = lsvc.predict(X_test)
f1_macro = metrics.f1_score(y_test, y_pred, average='macro')
f1_weighted = metrics.f1_score(y_test, y_pred, average='weighted')

Wall time: 3.23 s


In [56]:
print(lsvc.best_score_)
print(lsvc.best_params_)     # The parameters we want
print(lsvc.best_estimator_) 

0.9992423422148715
{'alpha': 0.10142857142857144}
MultinomialNB(alpha=0.10142857142857144)


#### Stacking Classifier

In [None]:
%%time
compNB = ComplementNB(alpha=0.18)
multiNB = MultinomialNB(alpha=0.18)
estimators = [('compNB', compNB), ('multiNB', multiNB)]
final_est = RidgeClassifier(alpha=0.2)

stacking_clf = StackingClassifier(estimators = estimators, final_estimator = final_est, 
                                  passthrough = True)

X = train['text_clean']
y = train['lang_id'] 

vect = TfidfVectorizer(max_df=0.9, ngram_range=(1, 2))

X_train , X_test , y_train , y_test = train_test_split(X, y, stratify=y,test_size=0.05, random_state =1)
X_train = vect.fit_transform(X_train)
X_test = vect.transform(X_test)


stacking_clf.fit(X_train, y_train)

y_pred = stacking_clf.predict(X_test)
f1_weighted = metrics.f1_score(y_test, y_pred, average='weighted')
print(f1_weighted)

#### Choosing The Best Models after Tuning

In [89]:
X = train['text_clean']
y = train['lang_id'] 

X_train , X_test , y_train , y_test = train_test_split(X , y, test_size =0.17, random_state=1)

In [90]:
classifiers = [
                StackingClassifier(estimators = estimators, final_estimator = final_est, 
                                  passthrough = True),
                LinearSVC(random_state=1, C=15),
                ComplementNB(alpha=0.18),
                MultinomialNB(alpha=0.17285714285714288),
                RidgeClassifier(alpha=0.2),  
              ]

In [91]:
best_performance_df = model_assessment(classifiers, X_train , X_test , y_train , y_test)
best_performance_df.to_csv('best_performance_df.csv')

In [92]:
best_performance_df = pd.read_csv('best_performance_df.csv', index_col = 0) 
best_performance_df.sort_values('F1-Weighted', ascending = False)

Unnamed: 0,F1-Macro,F1-Accuracy,F1-Weighted,Execution Time
StackingClassifier,0.998941,0.99893,0.99893,11.261713
MultinomialNB,0.998936,0.99893,0.99893,5.92037
ComplementNB,0.998258,0.998217,0.998216,6.238325
RidgeClassifier,0.997578,0.997504,0.997505,19.722058
LinearSVC,0.997221,0.997148,0.997149,23.101878


### Submissions

In [185]:
#model = StackingClassifier(estimators = estimators, final_estimator = final_est, 
#                                  passthrough = True)

model = MultinomialNB(alpha=0.17285714285714288)
#model = ComplementNB(alpha=0.18)

X = train['text_clean']
y = train['lang_id'] 

vect = TfidfVectorizer(max_df=0.9, ngram_range=(1, 2))

X_train , X_test , y_train , y_test = train_test_split(X, y, test_size=0.05, random_state =1)
X_train = vect.fit_transform(X_train)
X_test = vect.transform(X_test)

model.fit(X_train, y_train)
y_pred = model.predict(X_test)
f1_weighted = metrics.f1_score(y_test, y_pred, average='weighted')
print(f1_weighted)

0.9993938561922088


In [186]:
len(vect.get_feature_names())

1487633

In [116]:
X = test['text_clean']

X_vect = vect.transform(X)

# Predict
test['lang_id'] = model.predict(X_vect)

In [117]:
submission = test[['index', 'lang_id']]
submission.to_csv('MultinomialNB005.csv',index=False)

In [118]:
submission

Unnamed: 0,index,lang_id
0,1,tsn
1,2,nbl
2,3,ven
3,4,ssw
4,5,afr
...,...,...
5677,5678,eng
5678,5679,nso
5679,5680,sot
5680,5681,sot


In [187]:
y_test = pd.DataFrame(y_test).reset_index()
y_pred = pd.DataFrame(y_pred).reset_index()

In [188]:
y_pred['lang_id'] = y_pred[0].rename('lang_id')

In [189]:
y_test

Unnamed: 0,index,lang_id
0,12144,sot
1,27820,tsn
2,1827,eng
3,8116,afr
4,26529,ven
...,...,...
1645,25668,eng
1646,16652,tso
1647,26602,eng
1648,22675,ven


In [190]:
y_pred['lang_id_pred'] = y_pred['lang_id']

In [191]:
pred = pd.concat([y_test, y_pred['lang_id_pred']], axis=1)

In [192]:
pred

Unnamed: 0,index,lang_id,lang_id_pred
0,12144,sot,sot
1,27820,tsn,tsn
2,1827,eng,eng
3,8116,afr,afr
4,26529,ven,ven
...,...,...,...
1645,25668,eng,eng
1646,16652,tso,tso
1647,26602,eng,eng
1648,22675,ven,ven


In [193]:
pred[pred['lang_id']!= pred['lang_id_pred']]

Unnamed: 0,index,lang_id,lang_id_pred
498,9828,sot,nso


In [194]:
train.iloc[9828]

lang_id                                                                                                                                                                                                                                                                                     sot
text          nico vermaak grain sa basie ntsimane boy mokoena karabo peele paul morule job metswamere thando lolwane jane mcpherson grain sa dr kobus laubscher grainsa moss malo lepati macaphasa jenny mathews modulasetulo wa lenaneo la ntshetsopele ya molemi sehlopha se sebetsang mmoho
text_clean    nico vermaak grain sa basie ntsimane boy mokoena karabo peele paul morule job metswamere thando lolwane jane mcpherson grain sa dr kobus laubscher grainsa moss malo lepati macaphasa jenny mathews modulasetulo wa lenaneo la ntshetsopele ya molemi sehlopha se sebetsang mmoho
Name: 9828, dtype: object