In [1]:
import pandas as pd
import fasttext

## Download data

In [1]:
!wget https://dl.fbaipublicfiles.com/fasttext/data/cooking.stackexchange.tar.gz
!tar -xvzf cooking.stackexchange.tar.gz
!head cooking.stackexchange.txt

--2023-03-11 16:15:35--  https://dl.fbaipublicfiles.com/fasttext/data/cooking.stackexchange.tar.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 104.22.74.142, 104.22.75.142, 172.67.9.4, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|104.22.74.142|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 457609 (447K) [application/x-tar]
Saving to: ‘cooking.stackexchange.tar.gz’


2023-03-11 16:15:35 (10.8 MB/s) - ‘cooking.stackexchange.tar.gz’ saved [457609/457609]

cooking.stackexchange.id
cooking.stackexchange.txt
readme.txt
__label__sauce __label__cheese How much does potato starch affect a cheese sauce recipe?
__label__food-safety __label__acidity Dangerous pathogens capable of growing in acidic environments
__label__cast-iron __label__stove How do I cover up the white spots on my cast iron stove?
__label__restaurant Michelin Three Star Restaurant; but if the chef is not there
__label__knife-skills __label__dicing Without knife ski

## Split into train and eval

In [17]:
!head -12404 cooking.stackexchange.txt > cooking.train
!tail -3000 cooking.stackexchange.txt > cooking.test

## Train a classifier

In [18]:
# Baseline: supervised classifier on the raw training split, using the
# library's default hyperparameters. `model` is rebound by every later
# training cell, so evaluation cells always score the most recent model.
model = fasttext.train_supervised(input='cooking.train')

Read 0M words
Number of words:  14543
Number of labels: 735
Progress: 100.0% words/sec/thread:    9470 lr:  0.000000 avg.loss: 10.173771 ETA:   0h 0m 0s


## Evaluate

In [19]:
# Score the baseline on the held-out split; the result is (N, precision@1, recall@1).
model.test('cooking.test')

# N (n samples) = 3,000
# precision@1 = 0.157
# recall@1 = 0.068

(3000, 0.15666666666666668, 0.0677526308202393)

In [20]:
# Evaluate using the top five predicted labels per question (k=5),
# which trades precision for recall compared with k=1
model.test('cooking.test', k=5)

# N (n samples) = 3,000
# precision@5 = 0.069
# recall@5 = 0.148

(3000, 0.06866666666666667, 0.1484791696698861)

## Data preparation

In [24]:
# Text is not all in lowercase and there's punctuation
!head cooking.train -n 5

__label__sauce __label__cheese How much does potato starch affect a cheese sauce recipe?
__label__food-safety __label__acidity Dangerous pathogens capable of growing in acidic environments
__label__cast-iron __label__stove How do I cover up the white spots on my cast iron stove?
__label__restaurant Michelin Three Star Restaurant; but if the chef is not there
__label__knife-skills __label__dicing Without knife skills, how can I quickly and accurately dice vegetables?


In [25]:
# Lowercase and split punctuations as their own tokens
!cat cooking.stackexchange.txt | sed -e "s/\([.\!?,'/()]\)/ \1 /g" | tr "[:upper:]" "[:lower:]" > cooking.preprocessed.txt
!head -12404 cooking.preprocessed.txt > cooking.train
!tail -3000 cooking.preprocessed.txt > cooking.test

In [26]:
!head cooking.train -n 5

__label__sauce __label__cheese how much does potato starch affect a cheese sauce recipe ? 
__label__food-safety __label__acidity dangerous pathogens capable of growing in acidic environments
__label__cast-iron __label__stove how do i cover up the white spots on my cast iron stove ? 
__label__restaurant michelin three star restaurant; but if the chef is not there
__label__knife-skills __label__dicing without knife skills ,  how can i quickly and accurately dice vegetables ? 


## Train and evaluate again

In [27]:
# Retrain with default hyperparameters; cooking.train now holds the
# lowercased, punctuation-split text.
model = fasttext.train_supervised(input='cooking.train')

Read 0M words
Number of words:  8952
Number of labels: 735
Progress: 100.0% words/sec/thread:   10109 lr:  0.000000 avg.loss: 10.265862 ETA:   0h 0m 0s 49.9% words/sec/thread:   10513 lr:  0.050106 avg.loss: 11.161880 ETA:   0h 0m 2s


In [28]:
model.test('cooking.test')

# Some improvement over the unprocessed text (precision@1 was 0.157)
# N (n samples) = 3,000
# precision@1 = 0.176
# recall@1 = 0.076

(3000, 0.17633333333333334, 0.07625774830618423)

In [29]:
# More passes over the training data: epoch=25 overrides the library default.
model = fasttext.train_supervised(input='cooking.train', epoch=25)

Read 0M words
Number of words:  8952
Number of labels: 735
Progress: 100.0% words/sec/thread:   14934 lr:  0.000000 avg.loss:  7.219388 ETA:   0h 0m 0s 24.2% words/sec/thread:   11072 lr:  0.075794 avg.loss:  9.759532 ETA:   0h 0m18s 74.1% words/sec/thread:   12377 lr:  0.025857 avg.loss:  7.764671 ETA:   0h 0m 5s


In [30]:
model.test('cooking.test')

# Large improvement from training longer (precision@1 0.176 -> 0.519)
# N (n samples) = 3,000
# precision@1 = 0.519
# recall@1 = 0.225

(3000, 0.5193333333333333, 0.22459276344241028)

In [31]:
# Push the number of passes further (epoch=100) to see whether it keeps helping.
model = fasttext.train_supervised(input='cooking.train', epoch=100)

Read 0M words
Number of words:  8952
Number of labels: 735
Progress: 100.0% words/sec/thread:   28544 lr:  0.000000 avg.loss:  3.243346 ETA:   0h 0m 0s


In [32]:
model.test('cooking.test')

# Only a small further gain over epoch=25 (precision@1 0.519 -> 0.548)
# N (n samples) = 3,000
# precision@1 = 0.548
# recall@1 = 0.237

(3000, 0.5476666666666666, 0.23684589880351736)

In [33]:
# Back to 25 epochs, this time with a much higher learning rate (lr=1.0).
model = fasttext.train_supervised(input='cooking.train', lr=1.0, epoch=25)

Read 0M words
Number of words:  8952
Number of labels: 735
Progress: 100.0% words/sec/thread:   20789 lr:  0.000000 avg.loss:  4.640056 ETA:   0h 0m 0s


In [34]:
model.test('cooking.test')

# Improvement over the default learning rate at 25 epochs (precision@1 0.519 -> 0.581)
# N (n samples) = 3,000
# precision@1 = 0.581
# recall@1 = 0.251

(3000, 0.5806666666666667, 0.25111719763586565)

In [35]:
# Counter-experiment: a very low learning rate (lr=0.01) with the same 25 epochs.
model = fasttext.train_supervised(input='cooking.train', lr=0.01, epoch=25)

Read 0M words
Number of words:  8952
Number of labels: 735
Progress: 100.0% words/sec/thread:   22235 lr:  0.000000 avg.loss: 10.787449 ETA:   0h 0m 0s


In [36]:
model.test('cooking.test')

# Much worse, not an improvement: with lr=0.01 the model barely learns in
# 25 epochs (precision@1 drops from 0.581 at lr=1.0 to 0.096)
# N (n samples) = 3,000
# precision@1 = 0.096
# recall@1 = 0.042

(3000, 0.096, 0.04151650569410408)

In [37]:
# Keep the best settings so far (lr=1.0, epoch=25) and add word bigram
# features (wordNgrams=2).
model = fasttext.train_supervised(input='cooking.train', lr=1.0, wordNgrams=2, epoch=25)

Read 0M words
Number of words:  8952
Number of labels: 735
Progress: 100.0% words/sec/thread:   27886 lr:  0.000000 avg.loss:  3.221201 ETA:   0h 0m 0s


In [38]:
model.test('cooking.test')

# Best result so far: bigrams lift precision@1 from 0.581 to 0.607
# N (n samples) = 3,000
# precision@1 = 0.607
# recall@1 = 0.263

(3000, 0.607, 0.26250540579501225)

## Synonyms via word representations

In [11]:
# Learn skip-gram word vectors from the Wikipedia sample. From this cell on,
# `model` is the unsupervised embedding model, not the cooking classifier.
# NOTE(review): absolute, machine-specific path — adjust for your environment.
wiki_sample_path = '/workspace/search_with_machine_learning_course/data/wiki_sample.txt'
model = fasttext.train_unsupervised(
    input=wiki_sample_path,
    model='skipgram',
    maxn=0,       # maximum character n-gram length 0, i.e. whole-word vectors only
    minCount=20,  # drop words that occur fewer than 20 times
)

Read 0M words
Number of words:  3066
Number of labels: 0
Progress: 100.0% words/sec/thread:   61376 lr:  0.000000 avg.loss:  2.428927 ETA:   0h 0m 0s 14.2% words/sec/thread:   66956 lr:  0.042894 avg.loss:  2.910815 ETA:   0h 0m 3s


In [12]:
# Closest vocabulary words to "politics"; each result is a (score, word) pair,
# highest score first.
model.get_nearest_neighbors('politics')

[(0.7584359049797058, 'crown'),
 (0.7537130117416382, 'presidential'),
 (0.7474841475486755, 'finance'),
 (0.7364110350608826, 'foundation'),
 (0.7332940101623535, 'commerce'),
 (0.7316431999206543, 'cabinet'),
 (0.7286686897277832, 'agency'),
 (0.7266026735305786, 'office'),
 (0.7207195162773132, 'foreign'),
 (0.713577926158905, 'politicians')]

In [13]:
# Same query for "linux"; the neighbours are related computing terms rather
# than strict synonyms.
model.get_nearest_neighbors('linux')

[(0.9231120944023132, 'applications'),
 (0.914638340473175, 'computers'),
 (0.9082654118537903, 'implementation'),
 (0.9012420177459717, 'input'),
 (0.8922620415687561, 'component'),
 (0.8842714428901672, 'visual'),
 (0.8797218799591064, 'microsoft'),
 (0.8760251402854919, 'tools'),
 (0.8733764886856079, 'devices'),
 (0.8730340003967285, 'file')]