# Week 4 - Query Understanding

## Setup

In [1]:
import subprocess

In [2]:
# Create the week 4 dataset directory if it does not exist and make it the
# working directory; run_exp further down refers to its train/test/model files
# by relative path, so they are resolved against this directory.
%mkdir -p /workspace/datasets/week4
%cd /workspace/datasets/week4

/home/jupyter/.kaggle/datasets/week4


## Preprocess queries


To preprocess query text, we run the following transformations:
- lowercase
- remove punctuation
- tokenize
- stem using the Porter stemmer

As preprocessing text is slow, we do it in a separate step before running other experiments.


In [3]:
# Run the query preprocessing script as a child process, echoing the command
# first. check=True turns a non-zero exit status into CalledProcessError so a
# "Run All" stops here instead of carrying on with missing or stale data.
cmd = "python /workspace/search_with_machine_learning_course/week4/preprocess_queries.py"
print(f"+ {cmd}")
subprocess.run(cmd, shell=True, check=True)

+ python /workspace/search_with_machine_learning_course/week4/preprocess_queries.py
Reading query data from /workspace/datasets/train.csv
Preprocessing query data


100%|██████████| 1865269/1865269 [02:48<00:00, 11079.23it/s]


                   category  ...                          query
0              abcat0101001  ...  television panason 50 pulgada
1              abcat0101001  ...                          sharp
2        pcmcat193100050014  ...                           nook
3              abcat0101001  ...                            rca
4              abcat0101005  ...                            rca
...                     ...  ...                            ...
1865264  pcmcat247400050000  ...                            ttv
1865265  pcmcat218000050000  ...                          incas
1865266  pcmcat248500050020  ...                        ds game
1865267  pcmcat209000050008  ...                          archo
1865268  pcmcat182300050008  ...                   graphic card

[1865269 rows x 3 columns]
Saving preprocessed query data to /workspace/datasets/week4/query_df.pk


CompletedProcess(args='python /workspace/search_with_machine_learning_course/week4/preprocess_queries.py', returncode=0)

## Create training data

### Training data for min_queries = 100

This will generate `train.mq100.txt` and `test.mq100.txt`

In [4]:
# Build the labeled query files for min_queries = 100, echoing the command
# first. check=True turns a non-zero exit status into CalledProcessError so a
# failed run is not silently followed by training on missing or stale files.
cmd = "python /workspace/search_with_machine_learning_course/week4/create_labeled_queries.py --min_queries 100"
print(f"+ {cmd}")
subprocess.run(cmd, shell=True, check=True)

+ python /workspace/search_with_machine_learning_course/week4/create_labeled_queries.py --min_queries 100
df: len=1854998 columns=['category', 'raw_query', 'query']
             category                            raw_query                          query
0        abcat0101001  Televisiones Panasonic  50 pulgadas  television panason 50 pulgada
1        abcat0101001                                Sharp                          sharp
2  pcmcat193100050014                                 nook                           nook
3        abcat0101001                                  rca                            rca
4        abcat0101005                                  rca                            rca

No. of unique categories = 1486
No. of categories with #queries < 100 = 630 (leaf) / 668 (all)
No. of affected rows = 18336

No. of unique categories = 994
No. of categories with #queries < 100 = 105 (leaf) / 126 (all)
No. of affected rows = 4569

No. of unique categories = 920
No. of categori

CompletedProcess(args='python /workspace/search_with_machine_learning_course/week4/create_labeled_queries.py --min_queries 100', returncode=0)

### Training data for min_queries = 1000

This will generate `train.mq1000.txt` and `test.mq1000.txt`

In [5]:
# Build the labeled query files for min_queries = 1000, echoing the command
# first. check=True turns a non-zero exit status into CalledProcessError so a
# failed run is not silently followed by training on missing or stale files.
cmd = "python /workspace/search_with_machine_learning_course/week4/create_labeled_queries.py --min_queries 1000"
print(f"+ {cmd}")
subprocess.run(cmd, shell=True, check=True)

+ python /workspace/search_with_machine_learning_course/week4/create_labeled_queries.py --min_queries 1000
df: len=1854998 columns=['category', 'raw_query', 'query']
             category                            raw_query                          query
0        abcat0101001  Televisiones Panasonic  50 pulgadas  television panason 50 pulgada
1        abcat0101001                                Sharp                          sharp
2  pcmcat193100050014                                 nook                           nook
3        abcat0101001                                  rca                            rca
4        abcat0101005                                  rca                            rca

No. of unique categories = 1486
No. of categories with #queries < 1000 = 1099 (leaf) / 1188 (all)
No. of affected rows = 207932

No. of unique categories = 589
No. of categories with #queries < 1000 = 179 (leaf) / 217 (all)
No. of affected rows = 66294

No. of unique categories = 442
No. of c

CompletedProcess(args='python /workspace/search_with_machine_learning_course/week4/create_labeled_queries.py --min_queries 1000', returncode=0)

## Training experiments

In [6]:
def run_exp(epoch=5, lr=.1, wordNgrams=1, min_queries=("100", "1000")):
    """Train and evaluate one fastText classifier per ``min_queries`` dataset.

    For every ``mq`` in ``min_queries`` this trains on ``train.mq{mq}.txt``,
    writes the model under the prefix
    ``model.mq{mq}.e{epoch}.lr{lr}.ng{wordNgrams}`` and then runs
    ``fasttext test`` on ``test.mq{mq}.txt`` three times: once without an
    explicit k, then with k=3 and k=5. All file names are relative to the
    current working directory, and the fasttext output goes straight to the
    notebook output.

    Args:
        epoch: Value passed to ``-epoch``.
        lr: Value passed to ``-lr``.
        wordNgrams: Value passed to ``-wordNgrams``.
        min_queries: Dataset suffixes to iterate over; each selects the
            ``train.mq{mq}.txt`` / ``test.mq{mq}.txt`` pair.

    Raises:
        subprocess.CalledProcessError: If any fasttext invocation exits with a
            non-zero status. Training failures therefore stop the run instead
            of being followed by an evaluation of a missing or stale model.
    """
    for mq in min_queries:
        model = f"model.mq{mq}.e{epoch}.lr{lr}.ng{wordNgrams}"
        # Argument lists (no shell) so parameter values are never re-parsed by
        # a shell; the echoed line is the same text the shell form printed.
        train_args = [
            "fasttext", "supervised",
            "-input", f"train.mq{mq}.txt",
            "-output", model,
            "-epoch", str(epoch),
            "-lr", str(lr),
            "-wordNgrams", str(wordNgrams),
            "-seed", "42",
        ]
        print("+", " ".join(train_args))
        subprocess.run(train_args, check=True)

        test_args = ["fasttext", "test", f"{model}.bin", f"test.mq{mq}.txt"]
        # None keeps the first evaluation free of an explicit k argument.
        for k in (None, "3", "5"):
            subprocess.run(test_args if k is None else test_args + [k], check=True)
        print()

### Default fasttext parameters

In [7]:
# Baseline: run_exp's own defaults (epoch=5, lr=.1, wordNgrams=1).
run_exp()

+ fasttext supervised -input train.mq100.txt -output model.mq100.e5.lr0.1.ng1 -epoch 5 -lr 0.1 -wordNgrams 1 -seed 42


Read 0M words
Number of words:  7726
Number of labels: 884
Progress: 100.0% words/sec/thread:    7204 lr:  0.000000 avg.loss:  5.318608 ETA:   0h 0m 0s


N	49991
P@1	0.467
R@1	0.467
N	49991
P@3	0.206
R@3	0.619
N	49991
P@5	0.136
R@5	0.678

+ fasttext supervised -input train.mq1000.txt -output model.mq1000.e5.lr0.1.ng1 -epoch 5 -lr 0.1 -wordNgrams 1 -seed 42


Read 0M words
Number of words:  7773
Number of labels: 403
Progress: 100.0% words/sec/thread:   15052 lr:  0.000000 avg.loss:  4.360879 ETA:   0h 0m 0s-0.000005 avg.loss:  4.360879 ETA:   0h 0m 0s


N	50000
P@1	0.472
R@1	0.472
N	50000
P@3	0.212
R@3	0.636
N	50000
P@5	0.14
R@5	0.7



### Increasing learning rate

Increasing the learning rate from 0.1 to 0.5 improves precision and recall (P@1 rises from about 0.47 to about 0.52 on both datasets).

In [8]:
# Only the learning rate changes; epoch and wordNgrams keep their defaults.
run_exp(lr=0.5)

+ fasttext supervised -input train.mq100.txt -output model.mq100.e5.lr0.5.ng1 -epoch 5 -lr 0.5 -wordNgrams 1 -seed 42


Read 0M words
Number of words:  7726
Number of labels: 884
Progress: 100.0% words/sec/thread:    7190 lr:  0.000000 avg.loss:  3.861906 ETA:   0h 0m 0s


N	49991
P@1	0.517
R@1	0.517
N	49991
P@3	0.232
R@3	0.697
N	49991
P@5	0.152
R@5	0.761

+ fasttext supervised -input train.mq1000.txt -output model.mq1000.e5.lr0.5.ng1 -epoch 5 -lr 0.5 -wordNgrams 1 -seed 42


Read 0M words
Number of words:  7773
Number of labels: 403
Progress: 100.0% words/sec/thread:   15343 lr:  0.000000 avg.loss:  3.649434 ETA:   0h 0m 0s


N	50000
P@1	0.524
R@1	0.524
N	50000
P@3	0.235
R@3	0.705
N	50000
P@5	0.153
R@5	0.766



### Increasing epochs

Increasing the number of epochs gives an improvement similar to increasing the learning rate, which suggests that the default parameters underfit.

In [9]:
# Only the epoch count changes; lr and wordNgrams keep their defaults.
run_exp(epoch=25)

+ fasttext supervised -input train.mq100.txt -output model.mq100.e25.lr0.1.ng1 -epoch 25 -lr 0.1 -wordNgrams 1 -seed 42


Read 0M words
Number of words:  7726
Number of labels: 884
Progress: 100.0% words/sec/thread:    7176 lr:  0.000000 avg.loss:  2.710290 ETA:   0h 0m 0s


N	49991
P@1	0.518
R@1	0.518
N	49991
P@3	0.233
R@3	0.699
N	49991
P@5	0.152
R@5	0.761

+ fasttext supervised -input train.mq1000.txt -output model.mq1000.e25.lr0.1.ng1 -epoch 25 -lr 0.1 -wordNgrams 1 -seed 42


Read 0M words
Number of words:  7773
Number of labels: 403
Progress: 100.0% words/sec/thread:   15339 lr:  0.000000 avg.loss:  2.299196 ETA:   0h 0m 0s


N	50000
P@1	0.522
R@1	0.522
N	50000
P@3	0.234
R@3	0.702
N	50000
P@5	0.153
R@5	0.765



### Increasing both epochs and learning rate

Increasing both together does not help further: the metrics are marginally lower than when increasing either one alone.

In [10]:
# Raise the epoch count and the learning rate together; wordNgrams stays at its default.
run_exp(lr=0.5, epoch=25)

+ fasttext supervised -input train.mq100.txt -output model.mq100.e25.lr0.5.ng1 -epoch 25 -lr 0.5 -wordNgrams 1 -seed 42


Read 0M words
Number of words:  7726
Number of labels: 884
Progress: 100.0% words/sec/thread:    7214 lr:  0.000000 avg.loss:  3.373950 ETA:   0h 0m 0s


N	49991
P@1	0.515
R@1	0.515
N	49991
P@3	0.231
R@3	0.693
N	49991
P@5	0.151
R@5	0.757

+ fasttext supervised -input train.mq1000.txt -output model.mq1000.e25.lr0.5.ng1 -epoch 25 -lr 0.5 -wordNgrams 1 -seed 42


Read 0M words
Number of words:  7773
Number of labels: 403
Progress: 100.0% words/sec/thread:   15228 lr:  0.000000 avg.loss:  3.306704 ETA:   0h 0m 0s


N	50000
P@1	0.516
R@1	0.516
N	50000
P@3	0.232
R@3	0.695
N	50000
P@5	0.152
R@5	0.759



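### Using bigrams

Adding bigrams on top of the higher learning rate gives essentially the same precision and recall as unigrams.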

In [11]:
# Word bigrams combined with the higher learning rate; epoch stays at its default.
run_exp(wordNgrams=2, lr=0.5)

+ fasttext supervised -input train.mq100.txt -output model.mq100.e5.lr0.5.ng2 -epoch 5 -lr 0.5 -wordNgrams 2 -seed 42


Read 0M words
Number of words:  7726
Number of labels: 884
Progress: 100.0% words/sec/thread:    7199 lr:  0.000000 avg.loss:  3.755955 ETA:   0h 0m 0s


N	49991
P@1	0.518
R@1	0.518
N	49991
P@3	0.233
R@3	0.698
N	49991
P@5	0.152
R@5	0.762

+ fasttext supervised -input train.mq1000.txt -output model.mq1000.e5.lr0.5.ng2 -epoch 5 -lr 0.5 -wordNgrams 2 -seed 42


Read 0M words
Number of words:  7773
Number of labels: 403
Progress: 100.0% words/sec/thread:   15226 lr:  0.000000 avg.loss:  3.079125 ETA:   0h 0m 0s


N	50000
P@1	0.522
R@1	0.522
N	50000
P@3	0.235
R@3	0.705
N	50000
P@5	0.154
R@5	0.769

