# Practical Deep Learning Workshop 2021
## Assignment 3 - Word Embeddings
 
### Authors:
 
1.   Chen Doytshman 205644941
2.   Naor Kolet 205533060


# 0. Imports

In [359]:
import pandas as pd
import numpy as np
 
# TensorFlow
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, LSTM, Concatenate
from tensorflow.keras.layers import Dropout, Dense, Lambda, Multiply, Subtract, Flatten
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Activation, Reshape
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau

from tensorflow.keras.models import Model, load_model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# Scikit-learn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# XGBoost
from xgboost import XGBRegressor

# Plots
import seaborn as sns
import matplotlib.pyplot as plt

# Misc.
import os
import joblib
import random
from tqdm import tqdm_notebook as tqdm

SEED = 42
%matplotlib inline

In [None]:
pd.set_option('display.max_colwidth', 100)
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [None]:
import zipfile
DATA_DIR = 'home-depot-product-search-relevance'

files = [f for f in os.listdir(DATA_DIR) if f.endswith('zip')]
for file in tqdm(files):
    with zipfile.ZipFile(f'{DATA_DIR}/{file}') as zf:
        zf.extractall(DATA_DIR)
        os.remove(f'{DATA_DIR}/{file}')

# 0.1 Data Preparation, EDA

Let's load the data:

In [None]:
train_df = pd.read_csv(f'{DATA_DIR}/train.csv', index_col='id', encoding='latin-1')
product_desc_df = pd.read_csv(f'{DATA_DIR}/product_descriptions.csv', index_col='product_uid')
attributes_df = pd.read_csv(f'{DATA_DIR}/attributes.csv', index_col=['product_uid'], dtype={'product_uid': 'Int64'})

In [None]:
test_path = f'{DATA_DIR}/test_labels.csv'
if os.path.exists(test_path):
    test_df = pd.read_csv(test_path)
else:
    test_df = pd.read_csv(f'{DATA_DIR}/test.csv', index_col='id', encoding='latin-1')
    sol = pd.read_csv(f'{DATA_DIR}/solution.csv', index_col='id', encoding='latin-1')
    test_df = test_df[sol.relevance != -1]
    test_df = test_df.join(sol.relevance, on='id')
    test_df.to_csv(test_path)

In [None]:
train_df.head()

In [None]:
plt.figure(figsize=(10,6))
ax = sns.countplot(data=train_df, x="relevance")

relevance_values = train_df["relevance"].value_counts().sort_index()

for p, label in zip(ax.patches, relevance_values):
    ax.annotate(label, (p.get_x(), p.get_height()+0.5))

In [None]:
product_desc_df.head()

In [None]:
product_desc_df.iloc[0, 0]

In [None]:
attributes_df[attributes_df.index.isin([100001])]

def concat_attr(record):
    name, value = record
    name = f'{name} ' if not name.startswith('Bullet') else '' # Delete name if it's a "Bullet.."
    return f'{name}{value} ;'

product_groups = attributes_df.groupby('product_uid')
for product_uid, df in product_groups:
    joined_attrs = ' '.join(map(concat_attr, df.values))
    product_desc_df.loc[product_uid, 'concat_desc'] = joined_attrs

product_desc_df

TODO: explain the connection between description and attributes

# Preprocessing

In [None]:
def tokenize_serie(serie):
    serie = pd.Series(map(lambda x: list(map(ord, list(x))), serie.values), index=serie.index)
    max_len = min(serie.apply(len).max(), 1500)
    return pad_sequences(serie, max_len, padding='post')

In [None]:
def join_desc(df):
    df = df.join(product_desc_df['product_description'], on='product_uid')
    df['product_description'] = df['product_title'] + ' : ' + df['product_description']
    df = df[['product_uid', 'search_term', 'product_description', 'relevance']]
    return df

train_df = join_desc(train_df)
test_df = join_desc(test_df)

def split_x_y(df):
    search_term = tokenize_serie(df['search_term'])
    description = tokenize_serie(df['product_description'])
    
    search_term, description = map(lambda x: np.expand_dims(x,axis=2), [search_term, description])
    return (search_term, description), df['relevance'].values

# 1. Character level LSTM

## Using character level processing to predict search relevance

TODO: explain the replacement of items' description column

In [None]:
from tensorflow.keras import backend as K

def cosine_distance(vests):
    x, y = vests
    x = K.l2_normalize(x, axis=-1)
    y = K.l2_normalize(y, axis=-1)
    return -K.mean(x * y, axis=-1, keepdims=True)

def cos_dist_output_shape(shapes):
    shape1, shape2 = shapes
    return (shape1[0],1)

def root_mean_squared_error(y_true, y_pred):
        return K.sqrt(K.mean(K.square(y_pred - y_true))) 


https://github.com/prabhnoor0212/Siamese-Network-Text-Similarity

In [None]:
from sklearn.metrics import roc_auc_score

def auroc(y_true, y_pred):
    return tf.py_function(roc_auc_score, (y_true, y_pred), tf.double)

In [None]:
sm = 0

def common_model(length):
    global sm
    inp = Input(shape=(length, 1))
    X = Conv1D(64, 10, activation='relu')(inp)
    X = MaxPooling1D()(X)
    X = Flatten()(X)
    X = Activation('relu')(X)
    X = Dense(128, activation='relu')(X)
    
    sm += 1
    return Model(inp, X, name=f'siamese_model_{sm}')

In [None]:
common_model(128).summary()

In [None]:
def init_siamese_model(search_term_len, product_description_len, output_shape=1):
    input_1 = Input(shape=(search_term_len, 1))
    input_2 = Input(shape=(product_description_len, 1))
    
    lstm_1 = LSTM(128)(input_1)
    lstm_2 = LSTM(128)(input_2)
    
    expand_layer = Lambda(lambda tensor: tensor[...,np.newaxis],name="expand_dim_layer")
    
    expended_1 = expand_layer(lstm_1)
    expended_2 = expand_layer(lstm_2)
    
    sm = common_model(128)

    vector_1 = sm(expended_1)
    
    vector_2 = sm(expended_2)
    
    x3 = Subtract()([vector_1, vector_2])
    x3 = Multiply()([x3, x3])

    x1_ = Multiply()([vector_1, vector_1])
    x2_ = Multiply()([vector_2, vector_2])
    x4 = Subtract()([x1_, x2_])
    
    x5 = Lambda(cosine_distance, output_shape=cos_dist_output_shape)([vector_1, vector_2])

    conc = Concatenate(axis=-1)([x5,x4, x3])

    x = Dense(100, activation="relu")(conc)
    x = Dropout(0.01)(x)
    out = Dense(output_shape, activation="relu", name = 'out')(x)

    model = Model([input_1, input_2], out)

    model.compile(loss='mse', optimizer=Adam())
    
    return model

In [None]:
init_siamese_model(train_st.shape[1],train_desc.shape[1]).summary()

In [None]:
def get_callbacks(model_name):
    acc = 'val_loss'
    acc_mode = 'min'
#     acc = 'val_loss' if not model_name.startswith('ss') else 'val_loss'
#     acc_mode = 'max' if not model_name.startswith('ss') else 'min'
    
    checkpoint = ModelCheckpoint(
                              fr'./models/{model_name}.h5', 
                              monitor=acc, 
#                               verbose=1, 
                              save_best_only=True, 
                              mode=acc_mode)
    earlystop = EarlyStopping(monitor=acc, mode=acc_mode, verbose=1, patience=4)
    reduceLR = ReduceLROnPlateau(monitor = 'val_loss', mode = 'min', patience = 3,
                            factor = 0.5, min_lr = 1e-6, verbose = 1)

    return [checkpoint, reduceLR, earlystop]

In [None]:
def train_model(model_gen, train_data, batch_size=128, use_saved=False):
    os.makedirs('./models', exist_ok=True)
    model_name = model_gen.__name__[5:]
        
    if use_saved:
        history = joblib.load(fr'./models/{model_name}_history.sav')
    else:
        callbacks = get_callbacks(model_name)
        
        (train_st, train_desc), train_rel = train_data
        model = model_gen(train_st.shape[1],train_desc.shape[1])
        history = model.fit(
                            x=[train_st, train_desc],
                            y=train_rel,
                            batch_size=batch_size,
                            epochs=20,
                            validation_split=0.2,
                            callbacks=callbacks
                            )
        
        history = history.history
        joblib.dump(history, fr'./models/{model_name}_history.sav')
    
    model = load_model(fr'./models/{model_name}.h5')
    
    return model, history

In [None]:
train_data = split_x_y(train_df)
char_model, _ = train_model(init_siamese_model, train_data, use_saved=True)

In [None]:
(test_st, test_desc), test_rel = split_x_y(test_df)
mse = char_model.evaluate([test_st, test_desc], test_rel)
print(f'MSE loss on test set: {mse:.4f}')

### c. Naïve model-based benchmark with ```CountVectorizer```

TODO: MARKDOWN

In [None]:
desc_series = train_df['product_description']
search_term_series = train_df['search_term']
desc_st = pd.concat([desc_series, search_term_series])
corpus = desc_st.values
vectorizer = CountVectorizer(analyzer='char')
vectorizer.fit(corpus)
print(vectorizer.get_feature_names())

In [None]:
counts_desc = vectorizer.transform(desc_series).toarray()
counts_st = vectorizer.transform(search_term_series).toarray()
assert counts_desc.shape == counts_st.shape

In [None]:
counts_desc.shape

In [None]:
def prepare_data_naive(df):
    desc_series = df['product_description']
    search_term_series = df['search_term']
    counts_desc = vectorizer.transform(desc_series).toarray()
    counts_st = vectorizer.transform(search_term_series).toarray()
    X = np.concatenate([counts_desc, counts_st], axis=1)
    Y = df['relevance'].values
    return X, Y

In [None]:
train_x, train_y = prepare_data_naive(train_df)

In [None]:
use_saved = False
rfr_path = 'models/rfr.sav'

if use_saved:
    rfr = joblib.load(rfr_path)
else:
    rfr = RandomForestRegressor(random_state=SEED, verbose=0, n_jobs=-1)
    rfr.fit(train_x, train_y)
    joblib.dump(rfr, rfr_path)

In [None]:
train_score = rfr.score(train_x, train_y)
print(f'Got score of {train_score:.4f} according to the random forest score function on the train')

In [None]:
train_y_pred = rfr.predict(train_x)
train_mse = mean_squared_error(train_y, train_y_pred)
print(f'Got MSE of {train_mse:.4f} on the train')

In [None]:
test_x, test_y = prepare_data_naive(test_df)
test_y_pred = rfr.predict(test_x)
test_mse = mean_squared_error(test_y, test_y_pred)
print(f'Got MSE of {test_mse:.4f} on the test')

### d. Using our model as a feature extractor

In [None]:
char_model, char_history = char_model

In [None]:
fe_char_model = Model(char_model.input, char_model.layers[-3].output)

In [None]:
train_x, _ = train_data

In [None]:
train_preds_fechar = fe_char_model.predict(train_x)

In [None]:
train_preds_fechar.shape

In [None]:
use_saved = True

fe_rfr_path = 'models/fe_char_rfr.sav'
fe_xgb_path = 'models/fe_char_xgb.sav'

if use_saved:
    rfr_model = joblib.load(fe_rfr_path)
    xgb_model = joblib.load(fe_xgb_path)
else:
    xgb_model = XGBRegressor(use_label_encoder=False, n_jobs=-1)
    rfr_model = RandomForestRegressor(random_state=SEED, verbose=0, n_jobs=-1)

    print('training xgb')
    xgb_model.fit(train_preds_fechar, train_y)
    print('training rfr')
    rfr_model.fit(train_preds_fechar, train_y)

    joblib.dump(rfr_model, fe_rfr_path)
    joblib.dump(xgb_model, fe_xgb_path)

In [None]:
test_features = fe_char_model.predict([test_st, test_desc])
test_y_pred = rfr_model.predict(test_features)
test_mse = mean_squared_error(test_y, test_y_pred)
print(f'Got MSE of {test_mse:.4f} on the test using RandomForest with feature extraction')

In [None]:
test_y_pred = xgb_model.predict(test_features)
test_mse = mean_squared_error(test_y, test_y_pred)
print(f'Got MSE of {test_mse:.4f} on the test using XGBoost with feature extraction')

# 2. Word embeddings and word level LSTM

## a. Preprocess the data to create tokens of words/character-combinations

In [None]:
sentence = 'Not only do angles make joints stronger, they also provide more consistent, straight corners. Simpson Strong-Tie offers a wide variety of angles in various sizes and thicknesses to handle light-duty jobs or projects where a structural connection is needed. Some can be bent (skewed) to match the project. For outdoor projects or those where moisture is present, use our ZMAX zinc-coated connectors, which provide extra resistance against corrosion (look for a "Z" at the end of the model number).Versatile connector for various 90 connections and home repair projectsStronger than angled nailing or screw fastening aloneHelp ensure joints are consistently straight and strongDimensions: 3 in. x 3 in. x 1-1/2 in.Made from 12-Gauge steelGalvanized for extra corrosion resistanceInstall with 10d common nails or #9 x 1-1/2 in. Strong-Drive SD screws'

In [None]:
import re
# TODO: seperate also on nubmer after chars, i.e. word100%
def camel_case_split(identifier):
    matches = re.finditer('.+?(?:(?<=[a-z,(,)])\.?\s?(?=[A-Z])|(?<=[A-Z,(,)])\.?\s?(?=[A-Z][a-z])|$)', identifier)
    return ' '.join([m.group(0).strip() for m in matches])

In [None]:
'BEHR Premium Textured DeckOver 1-gal. #SC-141 Tugboat Wood and Concrete Coating : BEHR Premium Textured DECKOVER is an innovative solid color coating. It will bring your old, weathered wood or concrete back to life. The advanced 100% acrylic resin formula creates a durable coating for your tired and worn out deck, rejuvenating to a whole new look.  For the best results, be sure to properly prepare the surface using other applicable BEHR products displayed above.California residents: see&nbsp;Proposition 65 informationRevives wood and composite decks, railings, porches and boat docks, also great for concrete pool decks, patios and sidewalks100% acrylic solid color coatingResists cracking and peeling and conceals splinters and cracks up to 1/4 in.Provides a durable, mildew resistant finishCovers up to 75 sq. ft. in 2 coats per gallonCreates a textured, slip-resistant finishFor best results, prepare with the appropriate BEHR product for your wood or concrete surfaceActual paint colors may vary from on-screen and printer representationsColors available to be tinted in most storesOnline Price includes Paint Care fee in the following states: CA, CO, CT, ME, MN, OR, RI, VT'

In [None]:
train_df['product_description'] = train_df['product_description'].apply(camel_case_split)

In [None]:
desc_series = train_df['product_description']
search_term_series = train_df['search_term']
desc_st = pd.concat([desc_series, search_term_series])

In [326]:
def tokenize_words(serie):
    serie = pd.Series(tokenizer.texts_to_sequences(serie.values), index=serie.index)
    max_len = min(serie.apply(len).max(), 1500)
    return pad_sequences(serie, max_len, padding='post')

In [343]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(desc_st)

In [328]:
desc_padded = tokenize_words(train_df['product_description'])
st_padded = tokenize_words(train_df['search_term'])

## b. Create embeddings

In [None]:
!wget -c "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"

In [None]:
!gzip -d GoogleNews-vectors-negative300.bin.gz

In [392]:
from gensim import models

EMBEDDING_FILE = './GoogleNews-vectors-negative300.bin'
embeddings_index = models.KeyedVectors.load_word2vec_format(EMBEDDING_FILE, binary=True)
embed_size = 300
word_index = tokenizer.word_index
max_features = len(word_index) + 1

nb_words = min(len(word_index), len(word_index))
embedding_matrix = (np.random.rand(nb_words+1, embed_size) - 0.5) / 5.0
for word, i in word_index.items():
    if i >= max_features: continue
    if word in embeddings_index:
        embedding_vector = embeddings_index.get_vector(word)
        embedding_matrix[i] = embedding_vector

## c. Constructing a Siamese network

In [396]:
def init_siamese_model_words(search_term_len, product_description_len, output_shape=1):
    input_1 = Input(shape=(search_term_len,))
    input_2 = Input(shape=(product_description_len,))
    
    embd1 = Embedding(max_features, 
                      embed_size, 
                      weights=[embedding_matrix],
                      input_length=search_term_len,
                      name='search_embd')(input_1)
    
    embd2 = Embedding(max_features,
                      embed_size,
                      weights=[embedding_matrix],
                      input_length=product_description_len,
                      name='desc_embd')(input_2)
    
    lstm_1 = LSTM(128)(embd1)
    lstm_2 = LSTM(128)(embd2)
    
    expand_layer = Lambda(lambda tensor: tensor[...,np.newaxis],name="expand_dim_layer")
    
    expended_1 = expand_layer(lstm_1)
    expended_2 = expand_layer(lstm_2)
    
    sm = common_model(128)

    vector_1 = sm(expended_1)
    
    vector_2 = sm(expended_2)
    
    x3 = Subtract()([vector_1, vector_2])
    x3 = Multiply()([x3, x3])

    x1_ = Multiply()([vector_1, vector_1])
    x2_ = Multiply()([vector_2, vector_2])
    x4 = Subtract()([x1_, x2_])
    
    x5 = Lambda(cosine_distance, output_shape=cos_dist_output_shape)([vector_1, vector_2])

    conc = Concatenate(axis=-1)([x5,x4, x3])

    x = Dense(100, activation="relu")(conc)
    x = Dropout(0.01)(x)
    out = Dense(output_shape, activation="relu", name = 'out')(x)

    model = Model([input_1, input_2], out)
    
    model.get_layer('search_embd').trainable = False
    model.get_layer('desc_embd').trainable = False

    model.compile(loss='mse', optimizer=Adam())
    
    return model

In [397]:
init_siamese_model_words(st_padded.shape[1],desc_padded.shape[1]).summary()

Model: "model_19"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_76 (InputLayer)           [(None, 17)]         0                                            
__________________________________________________________________________________________________
input_77 (InputLayer)           [(None, 1051)]       0                                            
__________________________________________________________________________________________________
search_embd (Embedding)         (None, 17, 300)      19937400    input_76[0][0]                   
__________________________________________________________________________________________________
desc_embd (Embedding)           (None, 1051, 300)    19937400    input_77[0][0]                   
___________________________________________________________________________________________

In [398]:
train_data = ((st_padded, desc_padded), train_df['relevance'].values)
word_model, _ = train_model(init_siamese_model_words, train_data, use_saved=False)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 00008: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 00012: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.
Epoch 13/20
Epoch 00013: early stopping


In [399]:
test_desc_padded = tokenize_words(test_df['product_description'])
test_st_padded = tokenize_words(test_df['search_term'])

In [402]:
test_st_padded.shape, test_desc_padded.shape

((112067, 17), (112067, 1051))

In [403]:
mse = word_model.evaluate([test_st_padded, test_desc_padded], test_rel)
print(f'MSE loss on test set: {mse:.4f}')

MSE loss on test set: 0.2881


## d. Using our model as a feature extractor

In [404]:
fe_word_model = Model(word_model.input, word_model.layers[-3].output)

In [405]:
train_x, _ = train_data

In [407]:
train_preds_feword = fe_word_model.predict(train_x)

In [408]:
train_preds_feword.shape

(74067, 100)

In [409]:
use_saved = False

fe_rfr_path = 'models/fe_word_rfr.sav'
fe_xgb_path = 'models/fe_word_xgb.sav'

if use_saved:
    rfr_model = joblib.load(fe_rfr_path)
    xgb_model = joblib.load(fe_xgb_path)
else:
    xgb_model = XGBRegressor(use_label_encoder=False, n_jobs=-1)
    rfr_model = RandomForestRegressor(random_state=SEED, verbose=0, n_jobs=-1)

    print('training xgb')
    xgb_model.fit(train_preds_feword, train_y)
    print('training rfr')
    rfr_model.fit(train_preds_feword, train_y)

    joblib.dump(rfr_model, fe_rfr_path)
    joblib.dump(xgb_model, fe_xgb_path)

training xgb
training rfr


In [413]:
test_features = fe_word_model.predict([test_desc_padded, test_st_padded])
test_y_pred = rfr_model.predict(test_features)
test_mse = mean_squared_error(test_y, test_y_pred)
print(f'Got MSE of {test_mse:.4f} on the test using RandomForest with feature extraction')

Got MSE of 0.2873 on the test using RandomForest with feature extraction


In [414]:
test_y_pred = xgb_model.predict(test_features)
test_mse = mean_squared_error(test_y, test_y_pred)
print(f'Got MSE of {test_mse:.4f} on the test using XGBoost with feature extraction')

Got MSE of 0.2882 on the test using XGBoost with feature extraction


# 3. Comparison of Results

# 4. Final Report