# Utilisation de FastText pour la classification supervisée de texte

- Introduction
- Installation de Fasttext

In [2]:
import locale
import glob
import os.path
import requests

import sys
import codecs
import smart_open
import random
from string import digits, punctuation

import re

from nltk import RegexpTokenizer
from nltk.corpus import stopwords

if sys.version > '3':
    control_chars = [chr(0x85)]
else:
    control_chars = [unichr(0x85)]

dirname = 'txt_sentoken'

tokenizer = RegexpTokenizer(r'\w+')
stopword_set = set(stopwords.words('english'))

def clean_text(d):
    new_d = re.sub(r'[0-9]+', '', d)
    dlist = tokenizer.tokenize(new_d.lower())
    wlist = [token for token in dlist if token not in stopword_set]
    return " ".join(wlist)

# Convert text to lower-case and strip punctuation/symbols from words
def normalize_text(text):
    norm_text = text.lower()
    # Replace breaks with spaces
    norm_text = norm_text.replace('<br />', ' ')
    norm_text = norm_text.replace('</s>', '')
    norm_text = norm_text.replace('new york', 'newyork')
    # Remove punctuation
    for char in ['.', '"', ',', '(', ')', '!', '?', ';', ':', '*', '-', '\'', '[', ']', '`', '/', '<', '>']:
        norm_text = norm_text.replace(char, ' ')
    return norm_text

import time
start = time.clock()

# Concatenate and normalize test/train data
print("Cleaning up dataset...")
folders = ['pos', 'neg']
alldata = u''
for fol in folders:
    temp = u''
    # Is there a better pattern to use?
    txt_files = glob.glob(os.path.join(dirname, fol, '*.txt'))
    for txt in txt_files:
        with smart_open.smart_open(txt, "rb") as t:
            t_clean = t.read().decode("ascii")
            t_clean = t_clean.replace('\n', '')
            t_clean = clean_text(t_clean)
            for c in control_chars:
                t_clean = t_clean.replace(c, ' ')
                
            temp += '__label__{0} '.format(fol) + clean_text(t_clean)
        temp += "\n"
    temp_norm = normalize_text(temp)
    alldata += temp_norm
    
all_data_list = alldata.splitlines()
random.shuffle(all_data_list)
train = all_data_list[0:1600]
test  = all_data_list[1601:]
        
with smart_open.smart_open(os.path.join(dirname, 'train.txt'), 'wb') as f:
    for idx, line in enumerate(train):
        num_line = u"{0}\n".format(line)
        f.write(num_line.encode("utf-8"))
        
with smart_open.smart_open(os.path.join(dirname, 'test.txt'), 'wb') as f:
    for idx, line in enumerate(test):
        num_line = u"{0}\n".format(line)
        f.write(num_line.encode("utf-8"))

print("created train.txt and test.txt...")

end = time.clock()
print ("Total running time: ", end-start)

Cleaning up dataset...
created train.txt and test.txt...
('Total running time: ', 4.430346)


In [3]:
! /home/francois/Projets/fasttext/fastText-0.1.0/fasttext supervised -input txt_sentoken/train.txt -output ft_model -dim 100 -minCount 10 -lr 1.0 -epoch 50 -wordNgrams 2 -verbose 1

Read 0M words
Number of words:  7891
Number of labels: 2
Progress: 100.0%  words/sec/thread: 1197326  lr: 0.000000  loss: 0.058987  eta: 0h0m 


In [4]:
! /home/francois/Projets/fasttext/fastText-0.1.0/fasttext test ft_model.bin txt_sentoken/test.txt

N	399
P@1	0.86
R@1	0.86
Number of examples: 399


In [5]:
! /home/francois/Projets/fasttext/fastText-0.1.0/fasttext predict ft_model.bin txt_sentoken/test.txt > txt_sentoken/predictions.txt

In [6]:
import pandas as pd

predictions = pd.read_table('txt_sentoken/predictions.txt', header=None, names=['sent'])

true_label = []
for idx, line in enumerate(test):
    true_label.append(line.split(' ')[0])

true_label = pd.Series(true_label)

from sklearn.metrics import confusion_matrix, precision_score, recall_score

print(confusion_matrix(y_true=true_label, y_pred=predictions.sent))
print(precision_score(y_true=true_label, y_pred=predictions.sent, pos_label=u'__label__pos'))
print(recall_score(y_true=true_label, y_pred=predictions.sent, pos_label=u'__label__pos'))

[[169  30]
 [ 26 174]]
0.852941176471
0.87


### Récupération des coordonnées dans l'espace de représentation pour une phrase

In [7]:
! echo "this is movie a nice" | /home/francois/Projets/fasttext/fastText-0.1.0/fasttext print-sentence-vectors ft_model.bin 

0.0089917 -0.0648 0.011288 -0.048513 -0.042958 -0.020031 0.024691 -0.043624 0.07417 0.060329 -0.058447 0.011822 -0.033099 -0.069808 -0.041454 0.0083163 -0.073101 0.0016414 0.0033266 0.025806 0.0020999 0.069131 -0.03313 -0.0055453 -0.046971 0.041006 -0.017689 0.03461 -0.016446 -0.02283 -0.014837 0.032038 -0.048377 0.025368 -0.0080721 0.043123 -0.035591 0.027284 0.055823 0.056293 -0.05315 -0.022388 0.020224 0.0027797 -0.013748 0.040548 0.028991 -0.015892 0.017652 0.02129 -0.047755 -0.03256 -0.0099254 -0.043222 0.014896 0.010894 0.063251 0.0041321 0.0056546 0.006915 0.074451 0.021588 -0.035139 0.01356 0.025366 -0.012054 -0.05446 -0.084969 0.030065 -0.016906 0.08094 0.067511 0.022843 -0.0057636 -0.052231 -0.00010444 0.021866 -0.03161 0.0028517 -0.030354 0.047949 -0.090013 -0.067344 -0.045949 0.019621 -0.04837 -0.079511 0.067844 0.0099889 -0.013375 -0.0051241 0.056258 0.010435 0.011498 -0.027954 0.082473 -0.065607 -0.013554 0.042453 -0.025248 


### Prediction sur une phrase du jeu de test

In [27]:
! shuf -n 1 txt_sentoken/test.txt > demo_sentence
! cat demo_sentence
! cat demo_sentence| /home/francois/Projets/fasttext/fastText-0.1.0/fasttext predict-prob ft_model.bin -

__label__neg staring george clooney arnold schwarzenegger chris donnell uma thurman alicia silverstone well start like say couple things first miss michael keaton miss tim burton would much prefer think last two batman films like dallas dream sequence even first film batman really star though damn close first couple anyway batman forever batman robin almost pushed bit player really say clooney regardless better kilmer good batman given next nothing might well made much difference joel schumacher said refuses bend masses hated films cheered return buton wont make batman brooding dark fine mean granted batman returns awsome film bit dark sometimes yet bright neon campy style killing anything series meant usually easy films loved con air films critics slammed granted film really paper thin cliche ridden except one thing fun virtually fun film cared less thrill little ones theres violence really none everyone comes fine end like old tv shows ends everyone laughing schwarzenegger awful mean

###  Prédiction sur une nouvelle phrase

In [12]:
! echo "this is worst film ever" | /home/francois/Projets/fasttext/fastText-0.1.0/fasttext predict ft_model.bin -

__label__neg


In [13]:
! echo "this is the best film ever" | /home/francois/Projets/fasttext/fastText-0.1.0/fasttext predict ft_model.bin -

__label__pos


### Prédiction avec proba sur une nouvelle phrase

In [15]:
! echo "want like mike mike badly embarrassingly bad broke six year relationship six months ago move n l still result jokes fall flat tries impress comedian well unemployed comedian one hollywood little fish gotten bad asked application starbuck actually starbuck thing gets even worse worse fail like mike however played endearingly jon favreau swingers revolves around mike half hearted awkward efforts get back social swing things end enthusiastically assisted equally underachieving twentysomething actor friends trent vince vaughn film animated performance gift gab bordering disturbing side motivational speaking gun toting sue patrick van horn rob ron livingston played hamlet longs work goofy disneyland succeed lack theme park experience together talk women make eye contact involves avoiding call never appear desperate two days industry standard talking talk walk walk strutting like cool swingers aspire always laughable effect get mike funk friends persistently convince get outside whether quickie night trip seedy vegas casino cutthroat hollywood party beautiful people 50s swing lounge everywhere encourage mike look replacement honey keep vigil progress lack like dotty doting parents mike never leaves company without recipient confidence cheerleading money mike honeys know equipped eccentric dating philosophies analogy flirting bear bunny unwavering support friends like maybe mike badly favreau also doubled film screenwriter proves gift creating engaging characters witty banter goes beyond today bon mot reassuring honesty friendships even hits low points friends commiserate mike gets gumption look love join cheerleading section well swingers light unassuming fare sweet candy want bust gut laughing" | /home/francois/Projets/fasttext/fastText-0.1.0/fasttext predict-prob ft_model.bin -

__label__pos 0.996094


In [16]:
! echo "good film" | /home/francois/Projets/fasttext/fastText-0.1.0/fasttext print-sentence-vectors ft_model.bin

0.003392 -0.002939 0.0019247 -0.0028346 -0.0047066 -0.00016678 0.0071454 -0.0054693 0.0041421 0.0032568 -0.0038942 0.00316 0.0020336 -0.0031915 -0.0021873 -0.0012865 -0.0073408 0.0026556 0.0015705 0.0028794 0.0011919 0.0076576 -0.0027339 -0.00035268 -0.0036188 0.0069926 -0.0024973 0.0031581 -0.0034432 0.00045934 -0.0047447 -0.0018983 -0.0072065 -0.00043172 -0.0013393 0.0064714 -0.0028161 0.00093948 0.0024238 0.0060359 -0.00098259 0.0038408 0.0033328 0.0015892 -0.0010333 -3.2415e-05 0.0038734 -0.001281 -0.0013905 0.0051946 -0.0034965 0.00033689 0.00027116 -0.00072945 0.0033516 0.00067717 0.0035627 0.0027791 0.0057138 0.0019676 0.0024891 0.0041746 -0.0024664 -0.00050395 0.00019104 -0.00336 -0.0075722 -0.0066707 0.0011618 0.0057644 0.0022545 0.0042548 0.00086093 0.0015778 -0.0067326 0.0012711 0.001366 -0.0046021 -0.00069335 -0.0042968 0.0041013 -0.00421 -0.0028406 -0.0063894 0.0029337 -0.0062338 -0.0038646 0.0081928 0.005114 -0.00094606 -0.0022145 0.0072953 0.0028441 -0.0002434 -0.0029185

In [17]:
! echo "__label__neg good film" | /home/francois/Projets/fasttext/fastText-0.1.0/fasttext print-sentence-vectors ft_model.bin

0.003392 -0.002939 0.0019247 -0.0028346 -0.0047066 -0.00016678 0.0071454 -0.0054693 0.0041421 0.0032568 -0.0038942 0.00316 0.0020336 -0.0031915 -0.0021873 -0.0012865 -0.0073408 0.0026556 0.0015705 0.0028794 0.0011919 0.0076576 -0.0027339 -0.00035268 -0.0036188 0.0069926 -0.0024973 0.0031581 -0.0034432 0.00045934 -0.0047447 -0.0018983 -0.0072065 -0.00043172 -0.0013393 0.0064714 -0.0028161 0.00093948 0.0024238 0.0060359 -0.00098259 0.0038408 0.0033328 0.0015892 -0.0010333 -3.2415e-05 0.0038734 -0.001281 -0.0013905 0.0051946 -0.0034965 0.00033689 0.00027116 -0.00072945 0.0033516 0.00067717 0.0035627 0.0027791 0.0057138 0.0019676 0.0024891 0.0041746 -0.0024664 -0.00050395 0.00019104 -0.00336 -0.0075722 -0.0066707 0.0011618 0.0057644 0.0022545 0.0042548 0.00086093 0.0015778 -0.0067326 0.0012711 0.001366 -0.0046021 -0.00069335 -0.0042968 0.0041013 -0.00421 -0.0028406 -0.0063894 0.0029337 -0.0062338 -0.0038646 0.0081928 0.005114 -0.00094606 -0.0022145 0.0072953 0.0028441 -0.0002434 -0.0029185

In [18]:
! cat txt_sentoken/test.txt | /home/francois/Projets/fasttext/fastText-0.1.0/fasttext print-sentence-vectors ft_model.bin > test_embeddings.txt
! cat txt_sentoken/train.txt | /home/francois/Projets/fasttext/fastText-0.1.0/fasttext print-sentence-vectors ft_model.bin > train_embeddings.txt

In [19]:
train_embeddings = pd.read_table('train_embeddings.txt', header=None, sep = ' ', usecols=range(100))
test_embeddings  = pd.read_table('test_embeddings.txt', header=None, sep = ' ', usecols=range(100))

In [20]:
train_embeddings.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-0.001063,0.012428,-0.002059,0.008208,0.007955,0.003319,-0.004256,0.007651,-0.012737,-0.011058,...,0.000523,-0.010139,-0.00136,-0.002075,0.005754,-0.014911,0.011132,0.001784,-0.007941,0.004074
1,-0.001401,0.013775,-0.002551,0.009404,0.008693,0.004084,-0.004653,0.00862,-0.014729,-0.012456,...,0.001362,-0.011186,-0.002151,-0.001905,0.005954,-0.016754,0.012882,0.002448,-0.009022,0.004427
2,-0.001421,0.013207,-0.001691,0.008903,0.008507,0.003653,-0.004594,0.007651,-0.0133,-0.011135,...,0.000633,-0.010526,-0.001708,-0.001801,0.005987,-0.015382,0.012073,0.002372,-0.00791,0.004628
3,0.000999,-0.012914,0.001757,-0.009833,-0.008372,-0.003287,0.004825,-0.007997,0.014222,0.012257,...,-0.000974,0.010504,0.002103,0.002106,-0.005833,0.01575,-0.012353,-0.002224,0.008334,-0.004855
4,-0.001388,0.013559,-0.0023,0.009498,0.008577,0.003838,-0.004225,0.00873,-0.014297,-0.012252,...,0.000934,-0.011179,-0.001871,-0.002441,0.00596,-0.015975,0.01265,0.002226,-0.009122,0.004612


In [21]:
train_labels = []
for idx, line in enumerate(train):
    if line.split(' ')[0] == '__label__neg':
        train_labels.append(0)
    else:
        train_labels.append(1)
    
train_labels = pd.Series(train_labels)

# Utilisation en mode non supervisé

In [22]:
! cut -d " " -f2- < txt_sentoken/train.txt > txt_sentoken/new_train.txt
! cut -d " " -f2- < txt_sentoken/test.txt > txt_sentoken/new_test.txt
! cat txt_sentoken/new_train.txt txt_sentoken/new_test.txt > txt_sentoken/alltext.txt

In [23]:
! /home/francois/Projets/fasttext/fastText-0.1.0/fasttext skipgram -input txt_sentoken/alltext.txt -output ft_unsup -verbose 1 -epoch 10

Read 0M words
Number of words:  14445
Number of labels: 0
Progress: 100.0%  words/sec/thread: 40345  lr: 0.000000  loss: 2.307617  eta: 0h0m 


In [156]:
! echo "movie" | /home/francois/Projets/fasttext/fastText-0.1.0/fasttext print-word-vectors ft_unsup.bin

movie 0.14064 0.22668 -0.21277 0.11936 0.12501 0.10321 -0.2514 -0.29288 -0.29551 -0.35803 0.31854 0.055502 -0.21604 -0.15882 0.16071 0.047071 0.34528 0.28839 0.18431 0.039957 -0.049985 0.17658 0.053777 -0.098213 -0.060322 0.24786 0.1271 0.44754 -0.088739 0.12483 -0.19556 0.039624 0.04636 0.030464 -0.16416 0.054768 -0.1555 -0.21387 -0.05512 -0.2886 -0.1767 -0.03436 -0.037316 -0.081621 -0.086023 0.048882 0.19246 0.050984 0.18906 -0.067273 -0.285 -0.076275 -0.016961 -0.17284 0.1082 0.22225 0.20788 0.072995 0.094063 -0.32515 -0.048841 -0.18102 0.10388 0.20685 -0.058732 0.2537 0.13877 0.035383 -0.33446 0.14121 0.055363 -0.075047 -0.01501 0.2677 -0.17807 -0.22202 -0.023567 -0.20686 0.086915 -0.22158 -0.015467 0.21271 0.30773 0.16065 0.029145 0.040464 0.29892 0.2771 -0.02625 0.047435 0.0054103 -0.037923 -0.0018578 0.30257 0.2315 0.048956 0.078198 -0.066184 0.14997 -0.021119 


In [24]:
! echo 'god' | /home/francois/Projets/fasttext/fastText-0.1.0/fasttext nn ft_unsup.bin

Pre-computing word vectors... done.
Query word? kjv 0.4999
gods 0.499604
gosh 0.487426
rayden 0.481517
ohh 0.476493
prayer 0.468075
sebastian 0.463756
prophecy 0.462057
godfather 0.462025
forbid 0.459193
Query word? 

In [25]:
! echo 'I really loved this film' | /home/francois/Projets/fasttext/fastText-0.1.0/fasttext print-sentence-vectors ft_unsup.bin

I really loved this film -0.025931 -0.073626 -0.010985 0.020291 -0.043761 -0.036222 -0.0042842 0.074306 0.021662 0.060196 0.078136 -0.18185 0.0040928 -0.082199 -0.034199 -0.01356 0.031131 -0.01258 0.041837 0.18099 0.036069 -0.093139 -0.026138 0.000578 0.04385 -0.071343 -0.061684 0.086262 -0.01204 -0.054685 -0.079823 0.062614 -0.028148 0.034306 0.12456 0.024237 -0.042043 -0.087541 -0.0067077 0.070211 0.034819 0.050928 0.012006 -0.0030801 0.075727 0.028465 0.063046 0.010449 0.030095 0.038868 -0.10905 0.10011 -0.071911 0.10821 0.13392 -0.0045151 -0.027522 -0.035445 -0.036866 -0.078188 0.030399 -0.026785 0.072355 0.088656 -0.0015046 -0.099272 -0.10924 0.027447 -0.027323 -0.020139 0.029448 -0.028611 -0.01919 -0.12224 -0.025228 -0.018938 -0.058468 -0.1063 0.060376 0.052353 0.0093961 0.080436 0.05728 0.061197 0.10581 -0.027898 -0.037431 0.01361 0.029897 0.079944 0.098863 -0.063421 0.078235 -0.075797 -0.12327 -0.0076739 -0.065259 0.032764 0.026322 -0.060646 


# Utilisation d'un modèle pré-entrainé

In [117]:
! echo "Nous aimons la bière" | /home/francois/Projets/fasttext/fastText-0.1.0/fasttext print-sentence-vectors /home/francois/Projets/fasttext/wiki.fr.bin

Nous aimons la bière 0.0097061 0.027632 -0.016972 0.0024993 -0.059054 -0.049652 -0.038861 0.067473 -0.027538 0.04983 -0.101 0.051885 -0.060752 0.043942 0.00046987 -0.065984 0.032154 0.013981 -0.0010329 -0.027698 0.046101 -0.067757 -0.032105 0.048581 0.018488 -0.054423 -0.028525 -0.03788 -0.013619 -0.036317 0.052851 0.01124 0.0069892 0.03149 0.04434 -0.028495 -0.026408 -0.037046 0.040003 -0.025441 0.031822 -0.033723 0.037992 -0.0064203 0.050544 -0.01959 0.017874 0.058403 -0.01319 0.024226 0.012506 0.004137 -0.022506 0.072635 -0.02259 -0.050828 -0.073293 -0.03184 0.058128 -0.01114 -0.023823 0.083778 0.073318 0.036844 -0.036143 0.027283 -0.0095662 -0.0055503 0.031119 -0.069717 0.014842 0.019191 -0.098134 0.0060344 -0.092135 -0.0078774 0.071788 0.069902 -0.017464 0.02453 -0.038238 -0.015698 -0.028161 -0.036732 0.0016258 0.055754 0.047618 0.028653 -0.005505 -0.091129 -0.017169 0.0024231 -0.021791 -0.050251 -0.012768 0.015336 -0.046974 0.013884 0.011046 -0.081225 0.058553 -0.025834 -0.006233

In [161]:
! echo 'dieu' | /home/francois/Projets/fasttext/fastText-0.1.0/fasttext nn /home/francois/Projets/fasttext/wiki.fr.bin

Pre-computing word vectors... done.
Query word? dieu  0.712345
divinité 0.702067
dieu, 0.696018
dieux 0.689006
divin 0.689001
divine 0.659328
divine» 0.657204
dieu» 0.650595
dieux» 0.64837
démiurge 0.643392
Query word? 