# FastText and Word Vector (TP n°3)

In [2]:
import os

import numpy as np

from datasets import load_dataset

from nltk import tokenize
from nltk.stem.snowball import SnowballStemmer

import spacy
import fasttext as fast
#import transformers

from typing import Dict
from typing import Callable
from typing import List
import re

# set a defined random generator, better for reproducible results.
random = np.random.default_rng(42)

## Take a look on IMDB dataset:

In [3]:
imdb = load_dataset('imdb')
print(imdb)

Reusing dataset imdb (/home/leherlemaxime/.cache/huggingface/datasets/imdb/plain_text/1.0.0/e3c66f1788a67a89c7058d97ff62b6c30531e05b549de56d3ab91891f0561f9a)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})


And we have the following number of entries:

In [4]:
print(f"train entries: {len(imdb['train'])}\ntest entries: {len(imdb['test'])}")

train entries: 25000
test entries: 25000


## Translate dataset for FastText API:

Generate a shuffle index list:

In [5]:
rand_idx = np.arange(len(imdb['train']))
np.random.shuffle(rand_idx)
print(rand_idx[:10])
rand_idy = np.arange(len(imdb['train']))
np.random.shuffle(rand_idy)
print(rand_idy[:10])

[10757  3394  5269 14177  2703 16747 24521  7483   580 19241]


Write IMDB dataset into file with FastText format:

In [6]:
%%time

if not os.path.exists("imdb_train.txt"):
    with open("imdb_train.txt", "wb") as f:
        for i in rand_idx:
            entry = imdb['train'][int(i)]
            s = f"__label__{entry['label']} {entry['text']}\n".encode("utf-8")
            f.write(s)
    
        f.close()
        
if not os.path.exists("imdb_test.txt"):
    with open("imdb_test.txt", "wb") as f:
        for i in rand_idy:
            entry = imdb['test'][int(i)]
            s = f"__label__{entry['label']} {entry['text']}\n".encode("utf-8")
            f.write(s)
    
        f.close()

CPU times: user 1.03 ms, sys: 550 µs, total: 1.58 ms
Wall time: 1.47 ms


Let's see the input format of an entry:

In [7]:
!head -n 1 imdb_train.txt

__label__0 Omen IV: The Awakening starts at the 'St. Frances Orphanage' where husband & wife Karen (Faye Grant) & Gene York (Michael Woods) are given a baby girl by Sister Yvonne (Megan Leitch) who they have adopted, they name her Delia. At first things go well but as the years pass & Delia (Asia Vieria) grows up Karen becomes suspicious of her as death & disaster follows her, Karen is convinced that she is evil itself. Karen then finds out that she is pregnant but discovers a sinister plot to use her as a surrogate mother for th next Antichrist & gets a shock when she finds out who Delia's real father was...<br /><br />Originally to be directed by Dominique Othenin-Girard who either quit or was sacked & was replaced by Jorge Montesi who completed the film although why he bothered is anyone's guess as Omen IV: The Awakening is absolutely terrible & a disgrace when compared to it illustrious predecessors. The script by Brian Taggert is hilariously bad, I'm not sure whether this nonsense

## First training with FastText model:

In [8]:
fast_model = fast.train_supervised('imdb_train.txt')

Read 5M words
Number of words:  281132
Number of labels: 2
Progress: 100.0% words/sec/thread: 1399773 lr:  0.000000 avg.loss:  0.425937 ETA:   0h 0m 0s


Let's see the train vocabulary:

In [9]:
print(f"the vocabulary size is: {len(fast_model.words)}\n\nThis is a slice of it:\n{fast_model.words[:20]}")

the vocabulary size is: 281132

This is a slice of it:
['the', 'a', 'and', 'of', 'to', 'is', 'in', 'I', 'that', 'this', 'it', '/><br', 'was', 'as', 'with', 'for', 'but', 'The', 'on', 'movie']


### Results of the model:

We respectfully copy and paste this print function from FastText documentation to see results:

In [10]:
def print_results(N : int, p : float, r : float) -> None:
    print("N\t" + str(N))
    print("P@{}\t{:.3f}".format(1, p))
    print("R@{}\t{:.3f}".format(1, r))

So let's compute precision at 1 (P@1) and the recall on the test dataset:

In [11]:
print_results(*fast_model.test('imdb_test.txt'))

N	25000
P@1	0.859
R@1	0.859


And we can compute these metrics for all labels separately:

In [12]:
def print_labels_results(l_scores : Dict[str, Dict[str, float]]) -> None:
    for label in l_scores:
        print(f"label '{label}':\n")
        print(f"\tprecision: {np.round(l_scores[label]['precision'], 3)}")
        print(f"\trecall: {np.round(l_scores[label]['recall'], 3)}")
        print(f"\tF1 score: {np.round(l_scores[label]['f1score'], 3)}\n")

In [13]:
print_labels_results(fast_model.test_label('imdb_test.txt'))

label '__label__1':

	precision: 0.867
	recall: nan
	F1 score: 1.733

label '__label__0':

	precision: 0.852
	recall: nan
	F1 score: 1.704



## Pre-processing on IMDB dataset:

### Clean the text

The text-format is not perfect, we have for exemple '\t' or '<br\>' that are formated text. So we will replace all special char by space. And we will also add space before and after '!' to make it a separated word.

In [14]:
def clean_the_text(text_array : str) -> str:
    '''
        This function return a list of all word and char in the text in parameters.

            Parameters:
                    text_array (str): The text in a string format.

            Returns:
                    result (str) : A list with all the word and char in the inpt text.
    '''
    
    specialChars = "()\\\''.,;:\"?-" 
    for specialChar in specialChars:
        text_array = text_array.replace(specialChar, ' ')
        
    text_array = text_array.replace("/>", ' ')
    text_array = text_array.replace("<br", ' ')
    
    ''' We add space before and after '!' for the split function '''
    text_array = text_array.replace("!", " ! ")
    
    return text_array.lower()

Now we can try the same model but with the clean text and see if this modification change the results.

In [15]:
%%time

if not os.path.exists("imdb_clean_train.txt"):
    with open("imdb_clean_train.txt", "wb") as f:
        for i in rand_idx:
            entry = imdb['train'][int(i)]
            s = f"__label__{entry['label']} {clean_the_text(entry['text'])}\n".encode("utf-8")
            f.write(s)
    
        f.close()
        
if not os.path.exists("imdb_clean_test.txt"):
    with open("imdb_clean_test.txt", "wb") as f:
        for i in rand_idy:
            entry = imdb['test'][int(i)]
            s = f"__label__{entry['label']} {clean_the_text(entry['text'])}\n".encode("utf-8")
            f.write(s)
    
        f.close()

CPU times: user 1.37 ms, sys: 0 ns, total: 1.37 ms
Wall time: 1.26 ms


In [16]:
!head -n 1 imdb_clean_train.txt

__label__0 what a time we live in when someone like this joe swan whatever the hell is considered a good filmmaker   or even a filmmaker at all !  where are the new crop of filmmakers with brains and talent    we need them bad  and to hell with mumblecore !       this movie is about nothing  just as the characters in the film stand for nothing  it s this horrible  so called gen y  that is full of bored idiots  some of which declare themselves filmmakers with out bothering to learn anything about the craft before shooting  well  orson welles was a filmmaker  john huston was a filmmaker  fellini was a filmmaker  dreyer was a filmmaker  etc  current films like these show just how stupid young  so called  filmmakers  can be when they believe going out with no script  no direction  no thought  no legit  camerawork   everything shot horribly on dv   no craft of editing  no nothing  stands for  rebellious  or  advanced  film making  nope  it s called ignorance and laziness or just pure mastur

In [17]:
fast_model_clean = fast.train_supervised('imdb_clean_train.txt')

Read 6M words
Number of words:  80799
Number of labels: 2
Progress: 100.0% words/sec/thread: 1528910 lr:  0.000000 avg.loss:  0.389570 ETA:   0h 0m 0s100.0% words/sec/thread: 1529060 lr: -0.000007 avg.loss:  0.389570 ETA:   0h 0m 0s


In [18]:
print(f"the vocabulary size is: {len(fast_model_clean.words)}\n\nThis is a slice of it:\n{fast_model_clean.words[:20]}")

the vocabulary size is: 80799

This is a slice of it:
['the', 'and', 'a', 'of', 'to', 'is', 'it', 'in', 'i', 'this', 'that', 's', 'was', 'as', 'for', 'with', 'movie', 'but', 'film', 'you']


#### Result of clean model

In [19]:
print_results(*fast_model_clean.test('imdb_clean_test.txt'))

N	25000
P@1	0.879
R@1	0.879


In [20]:
print_labels_results(fast_model_clean.test_label('imdb_clean_test.txt'))

label '__label__0':

	precision: 0.882
	recall: nan
	F1 score: 1.763

label '__label__1':

	precision: 0.876
	recall: nan
	F1 score: 1.752



So we can see that in average we have a upgrade of our result of 0.02. It's not a huge upgrade but it's still ok for a few more seconds of calculation.

### Clean text with stop word

We have see that the function to clean upgrade our result but why stop here ?

We can add a other clean step on the text, this step is to delete stop words. What is stop words ? Stop words are the non-discriminating words, like __the__, __a__, __an__, __this__ ....

So first of all we will create a list of all the stop word and after we will delete them from text.

In [21]:
list_of_stop_word = ["the", "and", "a", "of", "to", "is", "it", "in", "this", "that", "s", "was", "as", "for", "with", "but", "then", "an", "at", "who", "when", "than", "where", "which", "with", "on", "t", "are", "by", "so", "from", "have", "be", "or", "just", "about", ""]

Now we will create our extend clean text function.

In [22]:
def clean_the_text_extend(text : str, list_of_stop_word : List[str]) -> str:
    '''
        This function return a list of all word and char in the text in parameters.

            Parameters:
                    text (str): The text in a string format.
                    
                    list_of_stop_word: This is that list of our stop words to remove from the text

            Returns:
                    result (str) : A list with all the word and char in the inpt text.
    '''
    text = text.lower()
    
    specialChars = "()\\\''.,;:\"?-" 
    for specialChar in specialChars:
        text = text.replace(specialChar, ' ')
        
    text = text.replace("/>", ' ')
    text = text.replace("<br", ' ')
    
    text = text.replace("</s>", " ")
    
    ''' We add space before and after '!' for the split function '''
    text = text.replace("!", " ! ")
    
    for word in list_of_stop_word:
        ''' We add this to only remove the all word and not isolated letter in an other word'''
        word = " " + word + " "
        text = text.replace(word, " ")
    
    return text.lower()

Now try again with this new function

In [23]:
%%time

if not os.path.exists("imdb_clean_extend_train.txt"):
    with open("imdb_clean_extend_train.txt", "wb") as f:
        for i in rand_idx:
            entry = imdb['train'][int(i)]
            s = f"__label__{entry['label']} {clean_the_text_extend(entry['text'], list_of_stop_word)}\n".encode("utf-8")
            f.write(s)
    
        f.close()

if not os.path.exists("imdb_clean_extend_test.txt"):
    with open("imdb_clean_extend_test.txt", "wb") as f:
        for i in rand_idy:
            entry = imdb['test'][int(i)]
            s = f"__label__{entry['label']} {clean_the_text_extend(entry['text'], list_of_stop_word)}\n".encode("utf-8")
            f.write(s)
    
        f.close()

CPU times: user 1.39 ms, sys: 145 µs, total: 1.54 ms
Wall time: 1.44 ms


In [24]:
!head -n 1 imdb_clean_extend_train.txt

__label__0 this yawn titles credits  boring point tedium acting wooden stilted !  admittedly director richard jobson directing debut  earth green lit script poorly developed one  looks like another money down drain government project  scottish screen credited surprise  surprise   nearly fell asleep three times my review will unfortunately more restrained one  please  please mister jobson what ever you ve been doing prior directing sedative  go back ! 


In [25]:
fast_model_clean_extend = fast.train_supervised('imdb_clean_extend_train.txt')

Read 3M words
Number of words:  80799
Number of labels: 2
Progress: 100.0% words/sec/thread: 1472403 lr:  0.000000 avg.loss:  0.325895 ETA:   0h 0m 0s


In [26]:
print(f"the vocabulary size is: {len(fast_model_clean_extend.words)}\n\nThis is a slice of it:\n{fast_model_clean_extend.words[:20]}")

the vocabulary size is: 80799

This is a slice of it:
['you', 'not', 'one', '</s>', '!', 'all', 'they', 'like', 'there', 'or', 'just', 'about', 'out', 'if', 'has', 'what', 'some', 'good', 'can', 'more']


#### Result of clean model extend

In [27]:
print_results(*fast_model_clean_extend.test('imdb_clean_extend_test.txt'))

N	25000
P@1	0.886
R@1	0.886


In [28]:
print_labels_results(fast_model_clean_extend.test_label('imdb_clean_extend_test.txt'))

label '__label__1':

	precision: 0.891
	recall: nan
	F1 score: 1.781

label '__label__0':

	precision: 0.881
	recall: nan
	F1 score: 1.762



So with this result we can see that the result are abit better so we will now use the clean text expand instead of clean text classic.

### Stemming the data:

First of all we need to create a function that stemme a word.

In [29]:
re_word = re.compile(r"^\w+$")
stemmer = SnowballStemmer("english")

stem_word: Callable[[str], str] = lambda w : stemmer.stem(w.lower()) if re_word.match(w) else w

Now we have ti create a function that apply stemming to a whole text.

In [30]:
def stemming_text(text : str) -> str:
    '''
        This function steeming the text in parameter and return in
        
        Parameters :
                text (str) : the text to stemming
                
        Returns :
                return_text (str) : the text stemmed
    '''
    list_of_words = text.split(" ")
    
    list_of_words = [stem_word(word) for word in list_of_words]
    
    return_text = " ".join(list_of_words)
    
    return return_text

Now with this function we can create the new model where we use the stemming for all text before write them in the file.

In [31]:
%%time

if not os.path.exists("imdb_stemmed_train.txt"):
    with open("imdb_stemmed_train.txt", "wb") as f:
        for i in rand_idx:
            entry = imdb['train'][int(i)]
            s = f"__label__{entry['label']} {stemming_text(entry['text'])}\n".encode("utf-8")
            f.write(s)
    
        f.close()
        
if not os.path.exists("imdb_stemmed_test.txt"):
    with open("imdb_stemmed_test.txt", "wb") as f:
        for i in rand_idy:
            entry = imdb['test'][int(i)]
            s = f"__label__{entry['label']} {stemming_text(entry['text'])}\n".encode("utf-8")
            f.write(s)
    
        f.close()

CPU times: user 1.24 ms, sys: 110 µs, total: 1.35 ms
Wall time: 1.24 ms


In [32]:
!head -n 1 imdb_stemmed_train.txt

__label__1 i watch this last night after not have seen it for sever years. it realli is a fun littl film, with a bunch of face you didn't know were in it. arkin shine as always. check it out; you won't be dissappointed. by the way, it was just releas on dvd and contrari to it packaging, it is widescreen. the transfer is rather poor, but at least the whole movi is visible. ;-)


In [33]:
fast_model_stemmed = fast.train_supervised('imdb_stemmed_train.txt')

Read 5M words
Number of words:  245430
Number of labels: 2
Progress: 100.0% words/sec/thread: 1434439 lr:  0.000000 avg.loss:  0.423211 ETA:   0h 0m 0s


In [34]:
print(f"the vocabulary size is: {len(fast_model_stemmed.words)}\n\nThis is a slice of it:\n{fast_model_stemmed.words[:20]}")

the vocabulary size is: 245430

This is a slice of it:
['the', 'a', 'and', 'of', 'to', 'is', 'in', 'it', 'i', 'this', 'that', '/><br', 'was', 'as', 'for', 'with', 'but', 'movi', 'film', 'be']


#### Result of stemmed model

In [35]:
print_results(*fast_model_stemmed.test('imdb_stemmed_test.txt'))

N	25000
P@1	0.861
R@1	0.861


In [36]:
print_labels_results(fast_model_stemmed.test_label('imdb_stemmed_test.txt'))

label '__label__1':

	precision: 0.86
	recall: nan
	F1 score: 1.72

label '__label__0':

	precision: 0.863
	recall: nan
	F1 score: 1.726



The stemmed model don't change the result or juste of 0.002 in the label 1 so the result is not convincing.

### Lemming the data:

Firstly, we need to download the english model of Spacy lemmatization:

In [37]:
!python -m spacy download en_core_web_sm > output_dl.txt

In [38]:
# loading the small English model
nlp = spacy.load("en_core_web_sm")

In [39]:
%%time

if not os.path.exists("lemmed_imdb_train.txt"):
    with open("lemmed_imdb_train.txt", "wb") as f:
        for i in rand_idx:
            entry = imdb['train'][int(i)]
            
            # lemmatize before writting
            lemmed_text = ' '.join([token.lemma_ for token in nlp(entry['text'])])
            s = f"__label__{entry['label']} {lemmed_text}\n".encode("utf-8")
            f.write(s)
    
        f.close()

CPU times: user 2.26 ms, sys: 170 µs, total: 2.43 ms
Wall time: 1.87 ms


Do it in test dataset also:

In [40]:
if not os.path.exists("lemmed_imdb_test.txt"):
    with open("lemmed_imdb_test.txt", "wb") as f:
        for i in rand_idy:
            entry = imdb['test'][int(i)]
            
            # lemmatize before writting
            lemmed_text = ' '.join([token.lemma_ for token in nlp(entry['text'])])
            s = f"__label__{entry['label']} {lemmed_text}\n".encode("utf-8")
            f.write(s)
    
        f.close()

In [41]:
fast_model_lemming = fast.train_supervised('lemmed_imdb_train.txt')

Read 6M words
Number of words:  106199
Number of labels: 2
Progress: 100.0% words/sec/thread: 1797330 lr:  0.000000 avg.loss:  0.414804 ETA:   0h 0m 0s


In [42]:
print(f"the vocabulary size is: {len(fast_model_lemming.words)}\n\nThis is a slice of it:\n{fast_model_lemming.words[:20]}")

the vocabulary size is: 106199

This is a slice of it:
['the', 'be', ',', '.', 'and', 'a', 'of', 'to', 'it', 'I', 'in', 'this', 'that', '"', 'have', '-', '/><br', 'movie', 'film', 'as']


#### Results

In [43]:
print_results(*fast_model_lemming.test('lemmed_imdb_test.txt'))

N	25000
P@1	0.867
R@1	0.867


In [44]:
print_labels_results(fast_model_lemming.test_label('lemmed_imdb_test.txt'))

label '__label__0':

	precision: 0.868
	recall: nan
	F1 score: 1.736

label '__label__1':

	precision: 0.867
	recall: nan
	F1 score: 1.734



## Hyperparameters tunning:

We need to extract a validation set of our train dataset to avoid a tunning validation on test dataset:

In [45]:
# split command will copy and separate file into set of files of 20000 lines.
# Train file have 25.000 lines, so train will have 20.000 lines and validation 5.000 lines.
!split -l20000 "imdb_train.txt" tuning_

!ls -l ./tuning_*

-rw-r--r-- 1 leherlemaxime leherlemaxime 26719815 Oct  5 16:38 ./tuning_aa
-rw-r--r-- 1 leherlemaxime leherlemaxime  6713008 Oct  5 16:38 ./tuning_ab
-rw-r--r-- 1 leherlemaxime leherlemaxime 19853802 Oct  5 16:14 ./tuning_complet_aa
-rw-r--r-- 1 leherlemaxime leherlemaxime  4974531 Oct  5 16:14 ./tuning_complet_ab


### Try the default hyperparameter tunning of FastText:

In [46]:
tunning_fast_model = fast.train_supervised(input='tuning_aa', autotuneValidationFile='tuning_ab', autotuneMetric="f1:__label__0")

Progress: 100.0% Trials:    9 Best score:  0.885598 ETA:   0h 0m 0s
Training again with best arguments
Read 4M words
Number of words:  244423
Number of labels: 2
Progress: 100.0% words/sec/thread:  638637 lr:  0.000000 avg.loss:  0.047441 ETA:   0h 0m 0s


Let's compute global metrics:

In [47]:
print_results(*tunning_fast_model.test('imdb_test.txt'))

N	25000
P@1	0.883
R@1	0.883


It looks to give better results with default hyperparameter tunning. But how labels scores change ?

In [48]:
print_labels_results(tunning_fast_model.test_label('imdb_test.txt'))

label '__label__1':

	precision: 0.882
	recall: nan
	F1 score: 1.764

label '__label__0':

	precision: 0.885
	recall: nan
	F1 score: 1.769



Results are better with tunning and we highlight that optimize f1 result on __negative__ label induces better improvements on __positive__ label. The reason is because we juste have two labels and __negative__ label had less wrongly classification.

## Merge optimisation

First we will try to add the clean text extend optimisation to other optimisaion because we see that this optimisation clean the text and just keep this important part. We also see that lemming is much better that stemming.

It's the reason why we try a model with clean text extend and lemming.

In [49]:
%%time

if not os.path.exists("lemmed_clean_imdb_train.txt"):
    with open("lemmed_clean_imdb_train.txt", "wb") as f:
        for i in rand_idx:
            entry = imdb['train'][int(i)]
            
            # lemmatize before writting
            lemmed_text = ' '.join([token.lemma_ for token in nlp(clean_the_text_extend(entry['text'], list_of_stop_word))])
            s = f"__label__{entry['label']} {lemmed_text}\n".encode("utf-8")
            f.write(s)
    
        f.close()

if not os.path.exists("lemmed_clean_imdb_test.txt"):
    with open("lemmed_clean_imdb_test.txt", "wb") as f:
        for i in rand_idy:
            entry = imdb['test'][int(i)]
            
            # lemmatize before writting
            lemmed_text = ' '.join([token.lemma_ for token in nlp(clean_the_text_extend(entry['text'], list_of_stop_word))])
            s = f"__label__{entry['label']} {lemmed_text}\n".encode("utf-8")
            f.write(s)
    
        f.close()

CPU times: user 158 µs, sys: 10 ms, total: 10.2 ms
Wall time: 7.6 ms


In [50]:
fast_model_lemming_clean = fast.train_supervised('lemmed_clean_imdb_train.txt')

Read 3M words
Number of words:  64059
Number of labels: 2
Progress: 100.0% words/sec/thread: 1685913 lr:  0.000000 avg.loss:  0.327277 ETA:   0h 0m 0s


In [51]:
print(f"the vocabulary size is: {len(fast_model_lemming_clean.words)}\n\nThis is a slice of it:\n{fast_model_lemming_clean.words[:20]}")

the vocabulary size is: 64059

This is a slice of it:
['you', 'not', 'they', 'have', 'be', 'one', 'do', '</s>', '!', 'all', 'see', 'make', 'like', 'good', 'there', 'well', 'or', 'just', 'about', 'out']


#### Results of combine optimisations

In [52]:
print_results(*fast_model_lemming_clean.test('lemmed_clean_imdb_test.txt'))

N	25000
P@1	0.880
R@1	0.880


In [53]:
print_labels_results(fast_model_lemming_clean.test_label('lemmed_clean_imdb_test.txt'))

label '__label__0':

	precision: 0.875
	recall: nan
	F1 score: 1.75

label '__label__1':

	precision: 0.884
	recall: nan
	F1 score: 1.768



So with these 2 optimisation we have not a improvement compare to clean text extended only. OUr hypothesys is that we start to overfit on the train set. So a solution is to use this methode but with tunning hyperparametres.

## Our final model

In [54]:
# split command will copy and separate file into set of files of 20000 lines.
# Train file have 25.000 lines, so train will have 20.000 lines and validation 5.000 lines.
!split -l20000 "lemmed_clean_imdb_train.txt" tuning_complet_

!ls -l ./tuning_complet_*

-rw-r--r-- 1 leherlemaxime leherlemaxime 19853802 Oct  5 16:45 ./tuning_complet_aa
-rw-r--r-- 1 leherlemaxime leherlemaxime  4974531 Oct  5 16:45 ./tuning_complet_ab


We have manually work on the parameter epochs, rate and n-grams. But we don't succed in find a better resulat that juste use autometric so we juste use this to get our better result on this data set.

In [55]:
tunning_fast_model_complet = fast.train_supervised(input='tuning_complet_aa', autotuneValidationFile='tuning_complet_ab', autotuneMetric="f1:__label__0")

Progress: 100.0% Trials:    9 Best score:  0.893574 ETA:   0h 0m 0s
Training again with best arguments
Read 3M words
Number of words:  58103
Number of labels: 2
Progress: 100.0% words/sec/thread:  414091 lr:  0.000000 avg.loss:  0.034724 ETA:   0h 0m 0s


### Our final results

In [56]:
print_results(*tunning_fast_model_complet.test('lemmed_clean_imdb_test.txt'))

N	25000
P@1	0.894
R@1	0.894


In [57]:
print_labels_results(tunning_fast_model_complet.test_label('lemmed_clean_imdb_test.txt'))

label '__label__1':

	precision: 0.897
	recall: nan
	F1 score: 1.795

label '__label__0':

	precision: 0.891
	recall: nan
	F1 score: 1.782



## Conclusion: