# Week 4 - Query Understanding

## Setup

In [None]:
import subprocess

In [10]:
# Create the week 4 working directory and switch into it: the training cells
# below refer to train.mq*.txt / test.mq*.txt / model.* by relative name.
%mkdir -p /workspace/datasets/week4
%cd /workspace/datasets/week4

/home/jupyter/.kaggle/datasets/week4


## Preprocess queries


To preprocess query text, we run the following transformations:
- lowercase
- remove punctuation
- tokenize
- stem using the Porter stemmer

As preprocessing text is slow, we do it in a separate step before running other experiments.


In [1]:
# Run the query preprocessing script as a separate process.
# (Plain string: the command has no placeholders, so no f-string is needed.)
cmd = "python /workspace/search_with_machine_learning_course/week4/preprocess_queries.py"
print(f"+ {cmd}")
# check=True raises CalledProcessError on a non-zero exit status, so a failed
# preprocessing run stops the notebook here instead of silently feeding
# missing/stale data into the later cells.
subprocess.run(cmd, shell=True, check=True)

+ python /workspace/search_with_machine_learning_course/week4/preprocess_queries.py
Reading query data from /workspace/datasets/train.csv
Preprocessing query data


100%|██████████| 1865269/1865269 [02:46<00:00, 11225.45it/s]


                   category  ...                          query
0              abcat0101001  ...  television panason 50 pulgada
1              abcat0101001  ...                          sharp
2        pcmcat193100050014  ...                           nook
3              abcat0101001  ...                            rca
4              abcat0101005  ...                            rca
...                     ...  ...                            ...
1865264  pcmcat247400050000  ...                            ttv
1865265  pcmcat218000050000  ...                          incas
1865266  pcmcat248500050020  ...                        ds game
1865267  pcmcat209000050008  ...                          archo
1865268  pcmcat182300050008  ...                   graphic card

[1865269 rows x 3 columns]
Saving preprocessed query data to /workspace/datasets/week4/query_df.pk


CompletedProcess(args='python /workspace/search_with_machine_learning_course/week4/preprocess_queries.py', returncode=0)

## Create training data

### Training data for min_queries = 100

This will generate `train.mq100.txt` and `test.mq100.txt`

In [2]:
# Build the labeled train/test files for the min_queries = 100 variant.
# (Plain string: the command has no placeholders, so no f-string is needed.)
cmd = "python /workspace/search_with_machine_learning_course/week4/create_labeled_queries.py --min_queries 100"
print(f"+ {cmd}")
# check=True raises CalledProcessError on a non-zero exit status so a failed
# run cannot go unnoticed before the training experiments.
subprocess.run(cmd, shell=True, check=True)

+ python /workspace/search_with_machine_learning_course/week4/create_labeled_queries.py --min_queries 100
df: len=1854998 columns=['category', 'raw_query', 'query']
             category                            raw_query                          query
0        abcat0101001  Televisiones Panasonic  50 pulgadas  television panason 50 pulgada
1        abcat0101001                                Sharp                          sharp
2  pcmcat193100050014                                 nook                           nook
3        abcat0101001                                  rca                            rca
4        abcat0101005                                  rca                            rca

No. of unique categories = 1486
No. of categories with #queries < 100 = 668
No. of affected rows = 19256

No. of unique categories = 1004
No. of categories with #queries < 100 = 140
No. of affected rows = 5858

No. of unique categories = 916
No. of categories with #queries < 100 = 36
No. of af

CompletedProcess(args='python /workspace/search_with_machine_learning_course/week4/create_labeled_queries.py --min_queries 100', returncode=0)

### Training data for min_queries = 1000

This will generate `train.mq1000.txt` and `test.mq1000.txt`

In [4]:
# Build the labeled train/test files for the min_queries = 1000 variant.
# (Plain string: the command has no placeholders, so no f-string is needed.)
cmd = "python /workspace/search_with_machine_learning_course/week4/create_labeled_queries.py --min_queries 1000"
print(f"+ {cmd}")
# check=True raises CalledProcessError on a non-zero exit status so a failed
# run cannot go unnoticed before the training experiments.
subprocess.run(cmd, shell=True, check=True)

+ python /workspace/search_with_machine_learning_course/week4/create_labeled_queries.py --min_queries 1000
df: len=1854998 columns=['category', 'raw_query', 'query']
             category                            raw_query                          query
0        abcat0101001  Televisiones Panasonic  50 pulgadas  television panason 50 pulgada
1        abcat0101001                                Sharp                          sharp
2  pcmcat193100050014                                 nook                           nook
3        abcat0101001                                  rca                            rca
4        abcat0101005                                  rca                            rca

No. of unique categories = 1486
No. of categories with #queries < 1000 = 1188
No. of affected rows = 224005

No. of unique categories = 605
No. of categories with #queries < 1000 = 236
No. of affected rows = 82358

No. of unique categories = 435
No. of categories with #queries < 1000 = 48
No.

CompletedProcess(args='python /workspace/search_with_machine_learning_course/week4/create_labeled_queries.py --min_queries 1000', returncode=0)

## Training experiments

In [15]:
def run_exp(epoch=5, lr=.1, wordNgrams=1, min_queries=("100", "1000"), ks=(1, 3, 5)):
    """Train and evaluate one fastText classifier per training-set variant.

    For every ``mq`` in ``min_queries`` this runs ``fasttext supervised`` on
    ``train.mq{mq}.txt`` with a fixed ``-seed 42``, writing the model to
    ``model.mq{mq}.e{epoch}.lr{lr}.ng{wordNgrams}``, and then runs
    ``fasttext test`` on ``test.mq{mq}.txt`` once per cutoff in ``ks``.
    All file names are relative to the current working directory.

    Args:
        epoch: Value passed to ``-epoch``.
        lr: Value passed to ``-lr``.
        wordNgrams: Value passed to ``-wordNgrams``.
        min_queries: Suffixes selecting the ``train.mq*.txt`` / ``test.mq*.txt``
            pairs to use. Defaults to the two variants generated above.
        ks: Cutoffs passed as the ``k`` argument of ``fasttext test``.

    Raises:
        subprocess.CalledProcessError: If any fasttext invocation exits with a
            non-zero status. Evaluation is never run against a model whose
            training failed.
        FileNotFoundError: If the ``fasttext`` executable cannot be found.
    """
    for mq in min_queries:
        model = f"model.mq{mq}.e{epoch}.lr{lr}.ng{wordNgrams}"
        # Argument lists (no shell) avoid any quoting issues in the interpolated values.
        train_cmd = [
            "fasttext", "supervised",
            "-input", f"train.mq{mq}.txt",
            "-output", model,
            "-epoch", str(epoch),
            "-lr", str(lr),
            "-wordNgrams", str(wordNgrams),
            "-seed", "42",
        ]
        print("+", " ".join(train_cmd))
        subprocess.run(train_cmd, check=True)
        # k is always passed explicitly; k=1 is what `fasttext test` uses when k is omitted.
        for k in ks:
            subprocess.run(
                ["fasttext", "test", f"{model}.bin", f"test.mq{mq}.txt", str(k)],
                check=True,
            )
        print()

### Default fasttext parameters

In [12]:
# Baseline run: uses run_exp's defaults (epoch=5, lr=.1, wordNgrams=1).
run_exp()

+ fasttext supervised -input train.mq100.txt -output model.mq100.e5.lr0.1.ng1 -epoch 5 -lr 0.1 -wordNgrams 1


Read 0M words
Number of words:  7640
Number of labels: 872
Progress: 100.0% words/sec/thread:    7381 lr:  0.000000 avg.loss:  5.311603 ETA:   0h 0m 0s


N	49986
P@1	0.466
R@1	0.466
N	49986
P@3	0.207
R@3	0.62
N	49986
P@5	0.136
R@5	0.678

+ fasttext supervised -input train.mq1000.txt -output model.mq1000.e5.lr0.1.ng1 -epoch 5 -lr 0.1 -wordNgrams 1


Read 0M words
Number of words:  7759
Number of labels: 387
Progress: 100.0% words/sec/thread:   15636 lr:  0.000000 avg.loss:  4.309403 ETA:   0h 0m 0s


N	50000
P@1	0.481
R@1	0.481
N	50000
P@3	0.214
R@3	0.643
N	50000
P@5	0.141
R@5	0.707



### Increasing learning rate

Increasing learning rate improves precision and recall

In [18]:
# Baseline settings with only the learning rate raised (0.1 -> 0.5).
run_exp(epoch=5, lr=0.5, wordNgrams=1)

+ fasttext supervised -input train.mq100.txt -output model.mq100.e5.lr0.5.ng1 -epoch 5 -lr 0.5 -wordNgrams 1 -seed 42


Read 0M words
Number of words:  7640
Number of labels: 872
Progress: 100.0% words/sec/thread:    7450 lr:  0.000000 avg.loss:  3.887329 ETA:   0h 0m 0s


N	49986
P@1	0.516
R@1	0.516
N	49986
P@3	0.232
R@3	0.697
N	49986
P@5	0.152
R@5	0.762

+ fasttext supervised -input train.mq1000.txt -output model.mq1000.e5.lr0.5.ng1 -epoch 5 -lr 0.5 -wordNgrams 1 -seed 42


Read 0M words
Number of words:  7759
Number of labels: 387
Progress: 100.0% words/sec/thread:   16268 lr:  0.000000 avg.loss:  3.607438 ETA:   0h 0m 0s


N	50000
P@1	0.525
R@1	0.525
N	50000
P@3	0.236
R@3	0.708
N	50000
P@5	0.154
R@5	0.772



### Increasing epochs

Similar improvement as with increased learning rate. Perhaps default parameters result in underfitting.

In [17]:
# Baseline settings with only the number of epochs raised (5 -> 25).
run_exp(epoch=25, lr=0.1, wordNgrams=1)

+ fasttext supervised -input train.mq100.txt -output model.mq100.e25.lr0.1.ng1 -epoch 25 -lr 0.1 -wordNgrams 1 -seed 42


Read 0M words
Number of words:  7640
Number of labels: 872
Progress: 100.0% words/sec/thread:    7312 lr:  0.000000 avg.loss:  2.566525 ETA:   0h 0m 0s


N	49986
P@1	0.518
R@1	0.518
N	49986
P@3	0.232
R@3	0.697
N	49986
P@5	0.152
R@5	0.761

+ fasttext supervised -input train.mq1000.txt -output model.mq1000.e25.lr0.1.ng1 -epoch 25 -lr 0.1 -wordNgrams 1 -seed 42


Read 0M words
Number of words:  7759
Number of labels: 387
Progress: 100.0% words/sec/thread:   16006 lr:  0.000000 avg.loss:  2.238065 ETA:   0h 0m 0s


N	50000
P@1	0.526
R@1	0.526
N	50000
P@3	0.236
R@3	0.708
N	50000
P@5	0.154
R@5	0.769



### Increasing both epochs and learning rate

Increasing both doesn't really change much.

In [19]:
# Both changes at once: 25 epochs and a learning rate of 0.5.
run_exp(epoch=25, lr=0.5, wordNgrams=1)

+ fasttext supervised -input train.mq100.txt -output model.mq100.e25.lr0.5.ng1 -epoch 25 -lr 0.5 -wordNgrams 1 -seed 42


Read 0M words
Number of words:  7640
Number of labels: 872
Progress: 100.0% words/sec/thread:    7474 lr:  0.000000 avg.loss:  3.262587 ETA:   0h 0m 0s


N	49986
P@1	0.514
R@1	0.514
N	49986
P@3	0.231
R@3	0.692
N	49986
P@5	0.151
R@5	0.753

+ fasttext supervised -input train.mq1000.txt -output model.mq1000.e25.lr0.5.ng1 -epoch 25 -lr 0.5 -wordNgrams 1 -seed 42


Read 0M words
Number of words:  7759
Number of labels: 387
Progress: 100.0% words/sec/thread:   16461 lr:  0.000000 avg.loss:  2.844285 ETA:   0h 0m 0s


N	50000
P@1	0.522
R@1	0.522
N	50000
P@3	0.235
R@3	0.705
N	50000
P@5	0.153
R@5	0.765



### Using bigrams

In [20]:
# Learning rate 0.5 with word bigrams enabled (wordNgrams 1 -> 2).
run_exp(epoch=5, lr=0.5, wordNgrams=2)

+ fasttext supervised -input train.mq100.txt -output model.mq100.e5.lr0.5.ng2 -epoch 5 -lr 0.5 -wordNgrams 2 -seed 42


Read 0M words
Number of words:  7640
Number of labels: 872
Progress: 100.0% words/sec/thread:    7382 lr:  0.000000 avg.loss:  3.655626 ETA:   0h 0m 0s


N	49986
P@1	0.514
R@1	0.514
N	49986
P@3	0.232
R@3	0.696
N	49986
P@5	0.152
R@5	0.762

+ fasttext supervised -input train.mq1000.txt -output model.mq1000.e5.lr0.5.ng2 -epoch 5 -lr 0.5 -wordNgrams 2 -seed 42


Read 0M words
Number of words:  7759
Number of labels: 387
Progress: 100.0% words/sec/thread:   15927 lr:  0.000000 avg.loss:  3.092229 ETA:   0h 0m 0s


N	50000
P@1	0.527
R@1	0.527
N	50000
P@3	0.237
R@3	0.712
N	50000
P@5	0.155
R@5	0.777

