# Setting up

In [None]:
# Enable autoreload so edits to imported modules are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2
# NOTE(review): packages are installed unpinned, and `fasttext` (imported by a
# later cell) is not installed here — presumably it already exists in the
# environment; confirm.
%pip install nltk pandas
import nltk
# Download the NLTK 'punkt' resource. Nothing in the visible notebook uses it
# directly, so it is presumably required by the imported helper modules/scripts
# — TODO confirm.
nltk.download('punkt')

In [None]:
import pandas as pd
import json
import contextlib
from functions import display_rows

# Feature flags.
# NOTE(review): neither flag is read anywhere in the visible notebook.
ENABLE_CLASSIFIER_TRAINING=False
ENABLE_SYNONYM_TRAINING=False


# Week-3 project root. It is set twice on purpose: once as an environment
# variable (for `!` shell commands and child scripts) and once as a Python
# variable (for f-strings). The two values must be kept in sync by hand.
%env WEEK3 /workspace/search_with_machine_learning_course/week3
WEEK3='/workspace/search_with_machine_learning_course/week3'

# Maximum category depth kept when filtering the category depth map; also
# exported to the environment (presumably read by the helper scripts — confirm).
%env MAX_CATEGORY_DEPTH=4
MAX_CATEGORY_DEPTH=4

# Threshold on the number of products per label used when pruning sparse labels.
PRUNE_PRODUCT_IF_LESS_THAN = 5


# Training Classifier

In [None]:
# Dump the category depth map with a max depth of 10 (deep enough to dump
# everything), then preview the first 3 lines of the CSV that the following
# cell loads (presumably written by the script — confirm).
!python dump_category_depth_maps.py --max-depth 10
!head -n 3 $WEEK3/data/category_depth_maps.csv

In [None]:
# Load the category depth map keyed by category id.
category_depth_maps_path = f'{WEEK3}/data/category_depth_maps.csv'
category_depth_maps_df = pd.read_csv(category_depth_maps_path).set_index('id')

# Keep only the categories whose depth does not exceed MAX_CATEGORY_DEPTH.
within_max_depth = category_depth_maps_df['depth'].le(MAX_CATEGORY_DEPTH)
category_depth_maps_at_depth_df = category_depth_maps_df[within_max_depth]

display_rows(category_depth_maps_at_depth_df, 4)


In [None]:
# Disabled: regenerates the labelled training data at $WEEK3/data/output.fasttext.
# NOTE(review): the following cells read output.fasttext and output.fasttext.csv,
# so on a fresh checkout this command has to be run once (uncomment it).
# Presumably the script also writes the companion .csv — confirm.
#!python createContentTrainingData.py --output $WEEK3/data/output.fasttext

In [None]:
# Sanity check: show the last 3 lines of the fastText training file and of its
# CSV companion (the CSV is what the next cell loads into pandas).
!tail -n 3 $WEEK3/data/output.fasttext
!tail -n 3 $WEEK3/data/output.fasttext.csv

In [None]:
# Load every labelled product row; the 'label' column becomes the index.
fasttext_df = pd.read_csv(f'{WEEK3}/data/output.fasttext.csv').set_index('label')

# Shuffle the rows. A fixed seed keeps the order (and therefore the head/tail
# train/test split made further down) reproducible across kernel restarts.
SHUFFLE_SEED = 42
fasttext_df = fasttext_df.sample(frac=1, random_state=SHUFFLE_SEED)

# Group rows by label (the index level) and show how many rows each label has.
fasttext_df_group_by_label = fasttext_df.groupby('label')
print('fasttext_df_group_by_label:')
display_rows(fasttext_df_group_by_label.agg(['count']), 5)


In [None]:
# Drop sparse labels: keep only the rows whose label group has MORE than
# PRUNE_PRODUCT_IF_LESS_THAN rows.
# NOTE(review): the comparison is strict, so labels with exactly
# PRUNE_PRODUCT_IF_LESS_THAN rows are pruned as well — confirm this is intended.
def _has_enough_products(label_rows):
    """Return True when a label group is large enough to be kept."""
    return len(label_rows) > PRUNE_PRODUCT_IF_LESS_THAN


fasttext_df_labels_with_more_than_n_products = fasttext_df_group_by_label.filter(_has_enough_products)
print(f'fasttext_df_labels_with_more_than_{PRUNE_PRODUCT_IF_LESS_THAN}_products:')
display_rows(fasttext_df_labels_with_more_than_n_products, 5)

# Write the surviving rows in fastText format: "__label__<label> <stemmed name>".
pruned_by_count_path = f'{WEEK3}/data/output-prune-less-than-{PRUNE_PRODUCT_IF_LESS_THAN}-products.fasttext'
with open(pruned_by_count_path, 'w') as output:
    for label, name_stemmed in fasttext_df_labels_with_more_than_n_products['name_stemmed'].items():
        output.write("__label__%s %s\n" % (label, name_stemmed))


In [None]:

# Keep only the rows whose label appears in the depth-limited category table,
# i.e. labels at category depth <= MAX_CATEGORY_DEPTH.
# For example, with a depth of 2: Best Buy -> Category level 2.
is_within_depth = fasttext_df.index.isin(category_depth_maps_at_depth_df.index)
fasttext_df_at_depth = fasttext_df[is_within_depth]
fasttext_df_at_depth_first_cell_id = fasttext_df_at_depth.index.values[0]

print('fasttext_df_at_depth:')
display_rows(fasttext_df_at_depth, 5)

# Spot check: look the first surviving label up in the depth table.
print(f'The depth of {fasttext_df_at_depth_first_cell_id} in category_depth_maps_at_depth_df depth (for UAT):')
first_label_rows = category_depth_maps_at_depth_df.index == fasttext_df_at_depth_first_cell_id
display(category_depth_maps_at_depth_df[first_label_rows])

# Write the depth-pruned rows in fastText format: "__label__<label> <stemmed name>".
depth_pruned_path = f'{WEEK3}/data/output-prune-category-at-depth-{MAX_CATEGORY_DEPTH}.fasttext'
with open(depth_pruned_path, 'w') as output:
    for label, name_stemmed in fasttext_df_at_depth['name_stemmed'].items():
        output.write("__label__%s %s\n" % (label, name_stemmed))

print('')

In [None]:
# Combine both prunings: start from the rows that survived the product-count
# pruning and keep only those whose label is also within MAX_CATEGORY_DEPTH.
fasttext_df_more_than_n_products_and_at_depth = fasttext_df_labels_with_more_than_n_products[
    fasttext_df_labels_with_more_than_n_products.index.isin(category_depth_maps_at_depth_df.index)
]

print('')

# Write the doubly-pruned rows in fastText format. This must iterate over the
# combined frame computed above; iterating over fasttext_df_at_depth would
# silently skip the product-count pruning and duplicate the depth-only file.
with open(f'{WEEK3}/data/output-prune-less-than-{PRUNE_PRODUCT_IF_LESS_THAN}-prune-category-at-depth-{MAX_CATEGORY_DEPTH}.fasttext', 'w') as output:
    for index, row in fasttext_df_more_than_n_products_and_at_depth.iterrows():
        output.write("__label__%s %s\n" % (index, row['name_stemmed']))


In [None]:

%env DATA_SIZE=25000
%env CLASSIFICATION_TRAINING_FILE=$WEEK3/data/output-prune-less-than-5-prune-category-at-depth-4.fasttext
%env FASTTEXT_TRAINING_FILE=/workspace/search_with_machine_learning_course/week3/data/data.train
%env FASTTEXT_TEST_FILE=/workspace/search_with_machine_learning_course/week3/data/data.test


!mkdir -p /workspace/search_with_machine_learning_course/week3/data
print('Generating data.train...')
!head -n $DATA_SIZE $CLASSIFICATION_TRAINING_FILE > $FASTTEXT_TRAINING_FILE
print('Done')
print('Generating data.test...')
!tail -n $DATA_SIZE $CLASSIFICATION_TRAINING_FILE > $FASTTEXT_TEST_FILE
print('Done')

print('\n')
!echo Fist 5 lines in $FASTTEXT_TRAINING_FILE
!head -n 5 $FASTTEXT_TRAINING_FILE
print('\n')
!echo Fist 5 lines in $FASTTEXT_TEST_FILE
!head -n 5 $FASTTEXT_TEST_FILE



In [None]:
# Train the category classifier on the training split: 25 epochs, learning
# rate 1, hierarchical-softmax loss ("hs") and word bigrams. The model is
# written under the week3 data directory with the "model" prefix.
!~/fastText-0.9.2/fasttext supervised \
    -input $FASTTEXT_TRAINING_FILE \
    -output /workspace/search_with_machine_learning_course/week3/data/model \
    -epoch 25 \
    -lr 1 \
    -loss hs \
    -wordNgrams 2


# Evaluate the trained model (model.bin) against the held-out test split.
!~/fastText-0.9.2/fasttext test \
    /workspace/search_with_machine_learning_course/week3/data/model.bin \
    $FASTTEXT_TEST_FILE


In [None]:
# Smoke test: run the classification helper script on the query "apple" with
# -n 5 (presumably the number of predictions to return — confirm in f_classify.py).
!python f_classify.py -n 5 -i "apple"

# Training Synonyms

In [None]:
!~/fastText-0.9.2/fasttext skipgram \
    -input /workspace/datasets/fasttext/titles.txt \
    -output /workspace/search_with_machine_learning_course/week3/data/title_model_epoch_25

In [None]:
import fasttext
import functions
from hand_picked_tokens import get_tokens

# Model files (inside the week3 data directory) to query for nearest neighbours.
# NOTE(review): the skipgram cell in this notebook writes
# title_model_epoch_25.bin, not title_model.bin — confirm which model is meant.
models = [
    'title_model.bin'
]

# Load every model exactly once; loading inside the token loop re-reads the
# whole .bin file from disk for each token.
loaded_models = [
    fasttext.load_model(f'/workspace/search_with_machine_learning_course/week3/data/{model_name}')
    for model_name in models
]

# Build the report as a list of fragments and join once at the end.
# The loop variable is named `token` so the builtin `input` is not shadowed
# for the rest of the kernel session.
report_lines = []
for token in get_tokens():
    # The model is queried with the transformed token, the report shows the raw one.
    token_stemmed = functions.transform_name(token)
    for model in loaded_models:
        predictions = model.get_nearest_neighbors(token_stemmed, k=10)

        report_lines.append(f'Synonyms of [{token}]:\n')
        for (score, r) in predictions:
            report_lines.append(f'    {r}    ({score})\n')
        report_lines.append('----------------------------\n\n')

output = ''.join(report_lines)
print(output)

In [None]:
# Import here so the cell works on a fresh kernel: without it, `f_nn` is only
# defined if a cell further down the notebook has already been executed.
import f_nn

# Print the 10 nearest neighbours of a single token as "<word>    (<score>)".
token='iphone'
print(f'Synonyms for {token}:')
predictions = f_nn.predict(token, 10)
for (score, r) in predictions:
    print(f'    {r}    ({score})')
print('----------------------------')
print('')

In [None]:
import hand_picked_tokens
import f_nn

# For each hand-picked token, print its 10 nearest neighbours, one per line,
# followed by a separator and a blank line.
for token in hand_picked_tokens.get_tokens():
    print(f'Synonyms for {token}:')
    for score, r in f_nn.predict(token, 10):
        print(f'    {r}    ({score})')
    print('----------------------------')
    print('')

# Review rating to language correlation



In [None]:
# Environment for the review-rating experiment, consumed by the `!` shell
# commands in the following cells.
# Number of lines taken for each of the review training and test files.
%env REVIEW_DATA_SIZE=50000
# Labelled reviews as written by createReviewLabels.py.
%env RAW_FASTTEXT_REVIEW_LABEL_FILE=/workspace/search_with_machine_learning_course/week3/data/output-reviews.fasttext
# Same content after shuffling with `shuf`.
%env SHUFFLED_FASTTEXT_REVIEW_LABEL_FILE=/workspace/search_with_machine_learning_course/week3/data/output-reviews-shuffled.fasttext
# Train / test splits cut from the shuffled file.
%env FASTTEXT_REVIEW_TRAINING_FILE=/workspace/search_with_machine_learning_course/week3/data/data-review.train
%env FASTTEXT_REVIEW_TEST_FILE=/workspace/search_with_machine_learning_course/week3/data/data-review.test

In [None]:
# Generate the labelled review file at $RAW_FASTTEXT_REVIEW_LABEL_FILE.
# NOTE(review): --use-3-tie presumably selects a 3-tier rating label scheme —
# confirm in createReviewLabels.py.
!python createReviewLabels.py --use-3-tie --output $RAW_FASTTEXT_REVIEW_LABEL_FILE

In [None]:
# Report how many labelled reviews are available.
!wc -l $RAW_FASTTEXT_REVIEW_LABEL_FILE

# Shuffle the labelled reviews so the head/tail split below is not biased by
# the original file order.
# NOTE(review): `shuf` is unseeded here, so the split differs on every run.
!echo ""
!echo "Shuffle $RAW_FASTTEXT_REVIEW_LABEL_FILE into $SHUFFLED_FASTTEXT_REVIEW_LABEL_FILE"
!shuf $RAW_FASTTEXT_REVIEW_LABEL_FILE -o $SHUFFLED_FASTTEXT_REVIEW_LABEL_FILE

# Disabled previews of the raw and shuffled files (uncomment to inspect).
#!echo ""
#!echo "First first 3 rows of $RAW_FASTTEXT_REVIEW_LABEL_FILE"
#!head -n 3 $RAW_FASTTEXT_REVIEW_LABEL_FILE

#!echo ""
#!echo "First first 3 rows of $SHUFFLED_FASTTEXT_REVIEW_LABEL_FILE"
#!head -n 3 $SHUFFLED_FASTTEXT_REVIEW_LABEL_FILE



In [None]:

# Make sure the data directory exists before writing the split files.
!mkdir -p /workspace/search_with_machine_learning_course/week3/data

# Training set = first REVIEW_DATA_SIZE lines of the shuffled file.
!echo "Generating $FASTTEXT_REVIEW_TRAINING_FILE..."
!head -n $REVIEW_DATA_SIZE $SHUFFLED_FASTTEXT_REVIEW_LABEL_FILE > $FASTTEXT_REVIEW_TRAINING_FILE
print('Done')

# Test set = last REVIEW_DATA_SIZE lines of the shuffled file.
# NOTE(review): the two sets overlap if the shuffled file has fewer than
# 2 * REVIEW_DATA_SIZE lines — compare with the `wc -l` output.
!echo "Generating $FASTTEXT_REVIEW_TEST_FILE..."
!tail -n $REVIEW_DATA_SIZE $SHUFFLED_FASTTEXT_REVIEW_LABEL_FILE > $FASTTEXT_REVIEW_TEST_FILE
print('Done')


# Disabled previews of the generated files (uncomment to inspect).
#print('\n')
#!echo Fist 5 lines in $FASTTEXT_REVIEW_TRAINING_FILE
#!head -n 5 $FASTTEXT_REVIEW_TRAINING_FILE
#print('\n')
#!echo Fist 5 lines in $FASTTEXT_REVIEW_TEST_FILE
#!head -n 5 $FASTTEXT_REVIEW_TEST_FILE


In [None]:
# Always-true guard; presumably a manual on/off switch for this expensive cell.
if True:
    # Train the review classifier with fastText autotune, budgeted at 1200
    # (seconds, per fastText's autotune option), writing $WEEK3/data/review-model.
    # NOTE(review): the test file doubles as the autotune validation set, so
    # the test metrics reported below are optimistic.
    !~/fastText-0.9.2/fasttext supervised \
        -input $FASTTEXT_REVIEW_TRAINING_FILE \
        -output $WEEK3/data/review-model \
        -autotune-validation $FASTTEXT_REVIEW_TEST_FILE \
        -autotune-duration 1200


    # Evaluate the autotuned model on the review test split.
    !~/fastText-0.9.2/fasttext test \
        $WEEK3/data/review-model.bin \
        $FASTTEXT_REVIEW_TEST_FILE

In [None]:
# Always-true guard; presumably a manual on/off switch for this cell.
if True:
    # Train the review classifier with fixed hyper-parameters: 25 epochs,
    # learning rate 1.0 and word bigrams.
    # NOTE(review): this writes to the same $WEEK3/data/review-model prefix as
    # the autotune cell, so running both overwrites the autotuned model.
    !~/fastText-0.9.2/fasttext supervised \
        -input $FASTTEXT_REVIEW_TRAINING_FILE \
        -output $WEEK3/data/review-model \
        -epoch 25 \
        -lr 1.0 \
        -wordNgrams 2


    # Evaluate the model on the review test split.
    !~/fastText-0.9.2/fasttext test \
        $WEEK3/data/review-model.bin \
        $FASTTEXT_REVIEW_TEST_FILE