# Experimenting bag of words

In [1]:
import sys 
sys.path.append('../scripts')

In [2]:
import os
import pandas as pd
import numpy as np
from collections import Counter

from config import Config
from tqdm.auto import tqdm
import seaborn as sb
import matplotlib.pyplot as plt

from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.feature_extraction.text import HashingVectorizer # BOW
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import accuracy_score

import nltk
from nltk.tokenize import word_tokenize # word tokenizer

from utils import remove_repetitions, replace_accents, make_folds

import joblib
import gc



# Load dataset

In [3]:
train_df = pd.read_csv(os.path.join(Config.data_dir, 'cleaned_train.csv'))
test_df = pd.read_csv(os.path.join(Config.data_dir, 'Test.csv'))
sub_df = pd.read_csv(os.path.join(Config.submissions_dir, 'SampleSubmission.csv'))

In [4]:
train_df.head()

Unnamed: 0,ID,text,label
0,13P0QT0,3sbaa lek ou le seim riahi ou 3sbaa le ca,-1
1,SKCLXCJ,cha3eb fey9elkoum menghir ta7ayoul ou kressi,-1
2,V1TVXIJ,bereau degage nathef ya slim walahi ya7chiw fi...,-1
3,U0TTYY8,ak slouma,1
4,68DX797,entom titmanou lina a7na 3iid moubarik a7na ch...,-1


In [5]:
test_df.head()

Unnamed: 0,ID,text
0,2DDHQW9,barcha aaindou fiha hak w barcha teflim kadhalik
1,5HY6UEY,ye gernabou ye 9a7ba
2,ATNVUJX,saber w barra rabbi m3ak 5ouya
3,Q9XYVOQ,cha3ébbb ta7aaaaannnnnnnnnnn tfouuhh
4,TOAHLRH,rabi y5alihoulek w yfar7ek bih w inchallah itc...


# Preprocessing pipeline

* Remove neutral texts or add them to positive samples
* Tokenize input sentences


In [6]:
# use neutral as positime sample
label_map = {
    '-1' : 0,
    '0' : 1,
    '1' : 1
}
train_df['label'] = train_df['label'].apply(lambda label: label_map[str(label)])
train_df.head()

Unnamed: 0,ID,text,label
0,13P0QT0,3sbaa lek ou le seim riahi ou 3sbaa le ca,0
1,SKCLXCJ,cha3eb fey9elkoum menghir ta7ayoul ou kressi,0
2,V1TVXIJ,bereau degage nathef ya slim walahi ya7chiw fi...,0
3,U0TTYY8,ak slouma,1
4,68DX797,entom titmanou lina a7na 3iid moubarik a7na ch...,0


In [7]:
word_tokenize(text = train_df.iloc[0].text, language='french')

['3sbaa', 'lek', 'ou', 'le', 'seim', 'riahi', 'ou', '3sbaa', 'le', 'ca']

# define vectorizer
bow = CountVectorizer(
    tokenizer=word_tokenize, 
    token_pattern=None
)

bow = HashingVectorizer(
    tokenizer=word_tokenize, 
    token_pattern=None
)

%%time
# apply vectorization
transformed_dataset = bow.fit_transform(X=train_df.text)


# Training pipeline
* Using KFold cross-validation
* Split data into n folds

In [29]:
class args:
    n_folds = 10
    

dataset, n_folds = make_folds(
    data = train_df, 
    args = args, 
    target_col='label', 
    stratified= True
)

dataset.head()

Splitting:   0%|          | 0/10 [00:00<?, ?it/s]

Unnamed: 0,ID,text,label,fold
0,13P0QT0,3sbaa lek ou le seim riahi ou 3sbaa le ca,0,7
1,SKCLXCJ,cha3eb fey9elkoum menghir ta7ayoul ou kressi,0,9
2,V1TVXIJ,bereau degage nathef ya slim walahi ya7chiw fi...,0,7
3,U0TTYY8,ak slouma,1,6
4,68DX797,entom titmanou lina a7na 3iid moubarik a7na ch...,0,4


# Training

In [30]:
%%time

accs = []
for fold_num in range(n_folds):
    
    train = dataset[dataset.fold!=fold_num].reset_index(drop=True)
    val = dataset[dataset.fold==fold_num].reset_index(drop=True)
    
    # define vectorizer
    bow = HashingVectorizer(
        tokenizer=word_tokenize, 
        token_pattern=None, 
        alternate_sign=False
        )
    
    bow.fit(train.text)
    
    # train data
    xtrain = bow.transform(train.text)
    ytrain = train.label.values
    
    # test data 
    xval = bow.transform(val.text)
    yval = val.label.values
    
    # modeling 
    params = {'alpha' :1.0, 'fit_prior' :True, 'class_prior' : None}
    model = MultinomialNB(**params)
    
    # run training
    model.fit(X=xtrain, y=ytrain)
    
    # make predictions
    predictions = model.predict(X=xval)
    
    # compute accuracy
    acc = accuracy_score(y_true=yval, y_pred=predictions)
    
    print(f"[INFO] Fold : {fold_num} -> acc : {acc:.5f}")
    
    accs.append(acc)
    # save model
    _ = joblib.dump(
        value=model, 
        filename=os.path.join(Config.models_dir, f'Arabizi-NB-fold-{fold_num}.pkl')
    )
    
    del train
    del val
    del xtrain
    del xval
    del ytrain
    del yval
    del model
    del predictions
    del bow
    
    gc.collect()

avg_acc = np.array(accs).mean()

print(f'[INFO] Average accuracy : {avg_acc}')

[INFO] Fold : 0 -> acc : 0.79971
[INFO] Fold : 1 -> acc : 0.78329
[INFO] Fold : 2 -> acc : 0.79057
[INFO] Fold : 3 -> acc : 0.78971
[INFO] Fold : 4 -> acc : 0.78729
[INFO] Fold : 5 -> acc : 0.79286
[INFO] Fold : 6 -> acc : 0.78557
[INFO] Fold : 7 -> acc : 0.77943
[INFO] Fold : 8 -> acc : 0.78686
[INFO] Fold : 9 -> acc : 0.78700
[INFO] Average accuracy : 0.7882285714285714
CPU times: user 1min 5s, sys: 444 ms, total: 1min 5s
Wall time: 1min 7s


# Prediction & submission

In [32]:
# preprocess test dataset
test_df['text'] = test_df['text'].apply(
    lambda txt : replace_accents(remove_repetitions(sequence=txt, n_repetitions=2))
)

bow = HashingVectorizer(
    tokenizer=word_tokenize, 
    token_pattern=None,
    alternate_sign=False

    )

xtest = bow.transform(test_df.text)

# inference
preds = []

kf_models = [f for f in os.listdir(Config.models_dir) if 'NB' in f]
for fname in tqdm(kf_models):
    print(fname)
    # load model
    model = joblib.load(filename=os.path.join(Config.models_dir, fname))
    ypred = model.predict(xtest)
    
    # reformat predictions
    for idx, p in enumerate(ypred):
        if p == 0:
            ypred[idx] = -1
    
    preds.append(ypred)


  0%|          | 0/10 [00:00<?, ?it/s]

Arabizi-NB-fold-5.pkl
Arabizi-NB-fold-8.pkl
Arabizi-NB-fold-2.pkl
Arabizi-NB-fold-3.pkl
Arabizi-NB-fold-1.pkl
Arabizi-NB-fold-9.pkl
Arabizi-NB-fold-0.pkl
Arabizi-NB-fold-6.pkl
Arabizi-NB-fold-7.pkl
Arabizi-NB-fold-4.pkl


In [34]:
pred_df = pd.DataFrame(
    data = np.array(preds).transpose(),
    columns=[f'model-{p}' for p in range(n_folds)]
)

pred_df.head()

Unnamed: 0,model-0,model-1,model-2,model-3,model-4,model-5,model-6,model-7,model-8,model-9
0,1,1,1,1,1,1,1,1,1,1
1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
2,1,1,1,1,1,1,1,1,1,1
3,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
4,1,1,1,1,1,1,1,1,1,1


In [35]:
test_df['label'] = 0

for idx in tqdm(range(len(pred_df))):

    if pred_df.iloc[idx].mean() >= .6:

        test_df.at[idx, 'label'] = 1
    else:
        test_df.at[idx, 'label'] = -1

  0%|          | 0/30000 [00:00<?, ?it/s]

In [36]:
test_df.tail()

Unnamed: 0,ID,text,label
29995,NHXTL3R,me ihebekch raw,-1
29996,U1YWB2O,nchallah rabi m3ak w iwaf9ek mais just 7abit n...,1
29997,O3KYLM0,slim rabi m3ak w e5edem w 5alli l7ossed lemnay...,1
29998,W4C38TY,bara 5alis rouhik yizi mitbal3it jam3iya hlaki...,-1
29999,4NNX5QE,rabi m3aak ya khawlaaa n7ebouuk rana barsha w ...,1


In [37]:
test_df[['ID', 'label']]['label'].value_counts()

 1    19238
-1    10762
Name: label, dtype: int64

In [38]:
fn = f'NB-params-{params}-nfolds-{n_folds}.csv'

test_df[['ID', 'label']].to_csv(
    os.path.join(Config.submissions_dir,fn), 
    index=False
)

In [39]:
fn

"NB-params-{'alpha': 1.0, 'fit_prior': True, 'class_prior': None}-nfolds-10.csv"