In [2]:
import sys

# Make modules that live under ../scripts importable from this notebook.
# The membership check keeps the cell idempotent: re-running it no longer
# appends another duplicate entry to sys.path.
if '../scripts' not in sys.path:
    sys.path.append('../scripts')

In [3]:
import warnings
warnings.filterwarnings(action='ignore')

In [21]:
import os
import sys
import pandas as pd
import numpy as np

import torch as th
from torch.utils.data import DataLoader

import pytorch_lightning as pl

from config import Config
from dataset import DataSet

from utils import load_models, predict, remove_repetitions, replace_accents, save_experiment_conf
from tqdm.auto import tqdm

In [10]:
# Read the test split from the configured data directory.
test_csv_path = os.path.join(Config.data_dir, "Test.csv")
test_df = pd.read_csv(test_csv_path)

# Normalise every text: remove_repetitions(..., n_repetitions=2) runs first and
# its result is passed through replace_accents. Both helpers come from utils;
# their exact semantics are not visible here.
test_df["text"] = test_df["text"].apply(
    lambda raw_text: replace_accents(
        remove_repetitions(sequence=raw_text, n_repetitions=2)
    )
)

In [11]:
# Preview the first five rows of the preprocessed test frame.
test_df.head(n=5)

Unnamed: 0,ID,text
0,2DDHQW9,barcha aaindou fiha hak w barcha teflim kadhalik
1,5HY6UEY,ye gernabou ye 9a7ba
2,ATNVUJX,saber w barra rabbi m3ak 5ouya
3,Q9XYVOQ,cha3ebb ta7aaaaann tfouuhh
4,TOAHLRH,rabi y5alihoulek w yfar7ek bih w inchallah itc...


In [24]:
# Wrap the test frame in the project's DataSet in 'test' mode.
test_ds = DataSet(task='test', df=test_df)

# Serve it in batches of 128; shuffle=False keeps the batches in dataset
# order, so concatenated predictions follow the order of test_ds.
test_dl = DataLoader(test_ds, shuffle=False, batch_size=128)

In [18]:
# Decide which experiment version's checkpoints to load.
# `args` is not defined anywhere in the visible notebook, so here the lookup
# raises NameError and the fallback runs. The attribute access is kept so the
# cell still works when an object carrying `.version` is provided.
try:
    last_version = args.version
except (NameError, AttributeError):
    # Only "there is no usable args.version" is handled. The previous bare
    # `except:` also swallowed KeyboardInterrupt and any unrelated failure.
    # NOTE(review): save_experiment_conf() is assumed to return the version
    # to use (21 in the recorded run); by its name it may also write a config
    # file as a side effect. Confirm in utils.
    last_version = save_experiment_conf()

print(f'[INFO] Version : {last_version}')

# Load one model per fold for the selected version.
loaded_models = load_models(n_folds=Config.n_folds, version=last_version)

[INFO] Version : 21
[INFO] (5) Matching models found : 
 ['arabizi-sentiments-camembert-base-version-21-fold-0.bin', 'arabizi-sentiments-camembert-base-version-21-fold-1.bin', 'arabizi-sentiments-camembert-base-version-21-fold-2.bin', 'arabizi-sentiments-camembert-base-version-21-fold-3.bin', 'arabizi-sentiments-camembert-base-version-21-fold-4.bin']


Loding models:   0%|          | 0/5 [00:00<?, ?it/s]

In [36]:
# Run every fold's model over the whole test loader and collect its hard
# class predictions. `all_predictions` ends up with one 1-D array per fold.
all_predictions = []

# Use the GPU when one is available and fall back to the CPU otherwise,
# instead of crashing on a hard-coded .cuda() call.
device = th.device('cuda' if th.cuda.is_available() else 'cpu')

for num in range(Config.n_folds):
    model = loaded_models[num]
    print(f'Model from split {num}')

    model.eval()      # evaluation mode for inference
    model.to(device)  # same placement as .cuda() when a GPU is present

    # Fresh accumulator for this fold, so nothing leaks between folds.
    predictions = []

    with th.no_grad():  # inference only; no gradients are needed
        for data in tqdm(test_dl, desc='Predicting'):
            ids = data['ids']
            logits = model(ids.to(device))
            # Targets were shifted by +1 so the negative sentiment would not
            # be < 0; subtract 1 to return to the original label values.
            reformat_pred = logits.argmax(dim=1) - 1
            predictions.extend(reformat_pred.detach().cpu().numpy().tolist())

    all_predictions.append(np.array(predictions))


Model from split 0


Predicting:   0%|          | 0/235 [00:00<?, ?it/s]

Model from split 1


Predicting:   0%|          | 0/235 [00:00<?, ?it/s]

Model from split 2


Predicting:   0%|          | 0/235 [00:00<?, ?it/s]

Model from split 3


Predicting:   0%|          | 0/235 [00:00<?, ?it/s]

Model from split 4


Predicting:   0%|          | 0/235 [00:00<?, ?it/s]

In [38]:
# Sanity check: stack the per-fold arrays and show the resulting shape
# ((5, 30000) in the recorded run: one row per fold).
stacked_predictions = np.array(all_predictions)
stacked_predictions.shape

(5, 30000)

In [55]:
# Average the per-fold predicted labels for every test row.
# NOTE: despite its name, `probas` holds the mean of hard label votes across
# folds (e.g. -1., 0.2, 1. in the recorded output), not class probabilities.
fold_votes = np.array(all_predictions, dtype=int)
probas = fold_votes.mean(axis=0)
probas

array([ 0.2, -1. ,  1. , ...,  0.4, -1. ,  1. ])

In [64]:
# Turn the averaged fold votes into final labels: rows whose mean vote is
# strictly above 0.7 become 1, every other row becomes -1.
# The result is sized from `probas` itself. The previous hard-coded length of
# 30000 raised IndexError for a longer test set and silently left trailing 0
# labels for a shorter one.
# NOTE(review): no row is ever labelled 0 by this rule. Confirm that dropping
# that class from the final output is intended.
labels = np.where(probas > .7, 1, -1).astype(int)


In [65]:
# Display the final per-row labels (NumPy abbreviates long arrays with `...`).
labels

array([-1, -1,  1, ..., -1, -1,  1])