In [0]:
%matplotlib inline

In [0]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from bs4 import BeautifulSoup
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

In [0]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [11]:
# Seed NumPy's global RNG (1) and TensorFlow's RNG (2) before any data or model code runs.
from numpy.random import seed
seed(1)
# NOTE(review): a top-level `tensorflow.set_random_seed` looks like the TF 1.x API — confirm the pinned version.
from tensorflow import set_random_seed
set_random_seed(2)

In [None]:
# Base directory for the CSVs, the GloVe vectors and the saved model; paths are built as f'{data_dir}/<name>'.
data_dir = './'

In [13]:
# from google.colab import drive
# drive.mount('/content/gdrive')

# data_dir = '/content/gdrive/My Drive/ML colab datasets/crowdflower-search-relevance'

# !ls '{data_dir}'

# !cp -v '{data_dir}/kappa_intuition.py' ./
# !mkdir -p /root/.kaggle
# !cp -v '{data_dir}/kaggle.json' /root/.kaggle/

'/content/gdrive/My Drive/ML colab datasets/crowdflower-search-relevance/kappa_intuition.py' -> './kappa_intuition.py'
'/content/gdrive/My Drive/ML colab datasets/crowdflower-search-relevance/kaggle.json' -> '/root/.kaggle/kaggle.json'


In [0]:
from kappa_intuition import quadratic_weighted_kappa

In [12]:
# Read the labelled training rows; missing cells become empty strings so the
# text helpers below always receive a str.
train_file = f'{data_dir}/train.csv'
df = pd.read_csv(train_file).fillna('')
df.head()

Unnamed: 0,id,query,product_title,product_description,median_relevance,relevance_variance
0,1,bridal shower decorations,Accent Pillow with Heart Design - Red/Black,Red satin accent pillow embroidered with a hea...,1,0.0
1,2,led christmas lights,Set of 10 Battery Operated Multi LED Train Chr...,Set of 10 Battery Operated Train Christmas Lig...,4,0.0
2,4,projector,ViewSonic Pro8200 DLP Multimedia Projector,,4,0.471
3,5,wine rack,Concept Housewares WR-44526 Solid-Wood Ceiling...,"Like a silent and sturdy tree, the Southern En...",4,0.0
4,7,light bulb,Wintergreen Lighting Christmas LED Light Bulb ...,"WTGR1011\nFeatures\nNickel base, 60,000 averag...",2,0.471


In [15]:
import nltk
# Downloads NLTK's 'book' collection; the tokenizer and stopword data used later
# presumably come from it — TODO confirm, a narrower download may be enough.
nltk.download('book')
from nltk import word_tokenize

[nltk_data] Downloading collection 'book'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package brown to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/brown.zip.
[nltk_data]    | Downloading package chat80 to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/chat80.zip.
[nltk_data]    | Downloading package cmudict to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/cmudict.zip.
[nltk_data]    | Downloading package conll2000 to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/conll2000.zip.
[nltk_data]    | Downloading package conll2002 to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/conll2002.zip.
[nltk_data]    | Downloading package dependency_treebank to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping corpora/dependency_treebank.zip.
[nltk_data]    | Downloading package genesis to /root/nltk_data...
[nltk_data]    

In [0]:
from nltk.corpus import stopwords
import string
# Tokens discarded by preprocess(): English stopwords plus every single punctuation character.
ignore_set = set(stopwords.words('english')).union(string.punctuation)

In [0]:
def preprocess(text):
    """Normalise a raw text field and split it into filtered tokens.

    The two-character sequences backslash-n and backslash-t are each replaced
    by a single space (previously backslash-n was deleted outright, which
    fused the words on either side of it into one token). The text is then
    lower-cased, stripped and tokenised with ``word_tokenize``; tokens found
    in the module-level ``ignore_set`` are dropped.

    Args:
        text: The string to clean.

    Returns:
        list of str: the surviving tokens, in order.
    """
    # Use a space as the replacement so neighbouring words stay separate.
    text = text.replace('\\n', ' ').replace('\\t', ' ')
    text = text.lower().strip()
    return [token for token in word_tokenize(text) if token not in ignore_set]

In [0]:
def get_num_words(text):
    """Return how many tokens are left once `preprocess` has cleaned `text`."""
    kept_tokens = preprocess(text)
    return len(kept_tokens)

In [0]:
def get_lengths(df):
    """Return a list holding the preprocessed token count of each item in `df`."""
    return [get_num_words(entry) for entry in df]

In [0]:
# Token counts of every product title, followed by (median, mean, min, max).
title_lens = get_lengths(df['product_title'])
tuple(stat(title_lens) for stat in (np.median, np.mean, np.min, np.max))

In [22]:
# Token counts of every query, followed by (median, mean, min, max).
query_lens = get_lengths(df['query'])
tuple(stat(query_lens) for stat in (np.median, np.mean, np.min, np.max))

(2.0, 2.361783815711754, 1, 6)

In [0]:
# Token counts of every product description (summarised in the next cell).
desc_lens = get_lengths(df['product_description'])

In [25]:
# (median, mean, min, max) of the description token counts.
tuple(stat(desc_lens) for stat in (np.median, np.mean, np.min, np.max))

(30.0, 43.05237251427447, 0, 1853)

In [0]:
# Relevance label of every row in the full training frame.
outs = list(df['median_relevance'])
# plt.hist(outs)
from collections import Counter
# label -> row count; read as a global by super_sample() to size the oversampling.
label_counter = Counter(outs)

In [0]:
def loadGloveModel(glove_file):
    """Load word vectors from a whitespace-separated GloVe text file.

    Every non-blank line is read as a word followed by its float components.

    Args:
        glove_file: Path of the GloVe text file.

    Returns:
        dict mapping each word (str) to a 1-D numpy array of its components.

    Raises:
        ValueError: if a component cannot be parsed as a float.
    """
    print("Loading Glove Model")
    model = {}
    # Decode explicitly as UTF-8 so loading does not depend on the platform's
    # default text encoding.
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            fields = line.split()
            if not fields:
                # A blank line (e.g. at the end of the file) has no word to index.
                continue
            model[fields[0]] = np.array([float(val) for val in fields[1:]])
    print("Done.",len(model)," words loaded!")
    return model

In [28]:
glove_file = f'{data_dir}/glove.6B.100d.txt'
# Loading the vectors is slow, so reuse them when an earlier run of this cell
# already left a usable `glove_model` in the kernel. Only "name not defined" and
# "not dict-like" trigger a (re)load; anything else (including Ctrl-C) propagates.
try:
    glove_model.keys()
except (NameError, AttributeError):
    glove_model = loadGloveModel(glove_file)

Loading Glove Model
Done. 400000  words loaded!


In [0]:
def super_sample(df):
    """Oversample relevance class 1 with noised copies and append them to `df`.

    For label 1 only, the matching rows of `df` are duplicated
    ``num // v + 1`` times, where ``v`` is that label's count in the global
    ``label_counter`` and ``num = min(max_count - v, v)``. In every copy about
    half of the query tokens and half of the title tokens are replaced by
    random words drawn from the global ``glove_model`` vocabulary.

    NOTE(review): the counts come from the module-level ``label_counter``, not
    from `df` itself, and whole copies are appended (``times * rows``) rather
    than exactly ``num`` rows — confirm both are intended.
    NOTE(review): the stdlib ``random`` module used here is not seeded anywhere
    in the visible notebook, so the generated rows differ between runs.

    Args:
        df: DataFrame with at least ``query``, ``product_title`` and
            ``median_relevance`` columns.

    Returns:
        DataFrame: `df` followed by the generated rows (original index labels
        are kept; generated rows are numbered from 0). `df` itself is returned
        when nothing qualifies for oversampling.
    """
    import random

    all_words = list(glove_model.keys())

    def replace_tokens(text, prob):
        """Replace roughly a `prob` share of the tokens in `text` with random vocabulary words."""
        tokens = word_tokenize(text)
        num = int(len(tokens) * prob + 0.25)
        for i in random.sample(range(len(tokens)), num):
            tokens[i] = random.choice(all_words)
        return ' '.join(tokens)

    max_ = max(label_counter.values())

    # Resolve the text columns by name instead of relying on their position.
    query_col = df.columns.get_loc('query')
    title_col = df.columns.get_loc('product_title')

    new_data = []
    for k, v in label_counter.items():
        if k != 1:
            # Only the lowest relevance class is oversampled.
            continue
        prob = 0.5
        num = min(max_ - v, v)
        if num < 1:
            continue
        times = num // v + 1
        # The selection does not change between copies, so take it once.
        rows = df[df.median_relevance == k].values
        arr = np.concatenate([rows] * times, axis=0)
        # Noise all queries first, then all titles.
        for col in (query_col, title_col):
            for i in range(len(arr)):
                arr[i, col] = replace_tokens(arr[i, col], prob)
        new_data.append(arr)

    if not new_data:
        # np.concatenate rejects an empty list; with nothing to add, the input is the result.
        return df

    new_data = np.concatenate(new_data, axis=0)
    print(new_data.shape)
    new_df = pd.DataFrame(data=new_data, columns=df.columns)
    # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
    return pd.concat([df, new_df])

In [0]:
def text_to_tensor(text, max_len):
    """Turn `text` into a stack of `max_len` word vectors.

    The text is preprocessed and cut to `max_len` tokens. Known tokens use
    their `glove_model` vector, unknown ones a fresh uniform draw from
    [-0.05, 0.05); missing positions at the end are filled with zero vectors.
    """
    kept = preprocess(text)[:max_len]
    width = glove_model['the'].shape[0]
    rows = [
        glove_model[tok] if tok in glove_model
        else np.random.uniform(-0.05, 0.05, size=(width,))
        for tok in kept
    ]
    rows.extend(np.zeros((width,)) for _ in range(max_len - len(kept)))
    return np.array(rows)

In [0]:
def batch_text_to_tensor(texts, max_len):
    """Vectorise every string in `texts` with `text_to_tensor` and stack the results."""
    return np.array([text_to_tensor(raw, max_len) for raw in texts])

In [0]:
# Embedding width; used for the model's Input shapes and matching the 100-d GloVe file name.
dim = 100
# Token budgets per field: text_to_tensor truncates longer texts and zero-pads shorter ones.
query_max_len = 8
title_max_len = 20
# Only referenced from commented-out description code in the visible cells.
desc_max_len = 40
# Presumably the number of relevance classes — NOTE(review): not referenced by any visible cell.
output_dim = 4
# Rate passed to the per-head Dropout layers of the model.
dropout_prob = 0.
# Destination passed to model.save() at the end of the notebook.
model_file = f'{data_dir}/model_1.hdf5'

In [0]:
def get_vectorized_data(df):
    """Build the model inputs and targets from a labelled frame.

    Args:
        df: DataFrame with ``query``, ``product_title`` and
            ``median_relevance`` columns.

    Returns:
        tuple ``(Xq, Xt, Y)``: query tensors padded/truncated to
        ``query_max_len`` tokens, title tensors padded/truncated to
        ``title_max_len`` tokens, and the labels shifted down by one.
    """
    # Read the labels first so a missing column fails before the slow vectorisation.
    labels = list(df['median_relevance'])
    Xq = batch_text_to_tensor(list(df['query']), query_max_len)
    print(Xq.shape)
    Xt = batch_text_to_tensor(list(df['product_title']), title_max_len)
    print(Xt.shape)
    # Subtract element-wise in Python: the column may hold Python objects
    # (e.g. after oversampling), and this keeps Y a plain numeric array.
    Y = np.array([label - 1 for label in labels])
    print(Y.shape)
    return Xq, Xt, Y

In [0]:
# Hold out 15% of the labelled rows as valid_df; random_state=1 fixes the split.
train_df, valid_df = train_test_split(df, test_size=0.15, random_state=1)

In [115]:
print(len(train_df))
# Append noised copies of the class-1 rows, then inspect the size and label balance.
super_df = super_sample(train_df)
print(len(super_df))
print(Counter(super_df['median_relevance']))
# NOTE(review): rebinding train_df makes this cell non-idempotent — re-running it
# oversamples the already oversampled frame. shuffle() is called without random_state.
train_df = shuffle(super_df)

8634
(1308, 6)
9942
Counter({4: 5283, 1: 1962, 3: 1455, 2: 1242})


In [116]:
# Query tensors, title tensors and 0-based labels for the (oversampled, shuffled) training frame.
Xq, Xt, Y = get_vectorized_data(train_df)


(9942, 8, 100)
(9942, 20, 100)
(9942,)


In [117]:
# Same vectorisation for the held-out validation frame.
Xq_v, Xt_v, Y_v = get_vectorized_data(valid_df)

(1524, 8, 100)
(1524, 20, 100)
(1524,)


In [0]:
from keras.preprocessing import sequence
from keras.models import Sequential, Model
from keras.layers import Dense, Input, LSTM, Bidirectional, Concatenate, CuDNNLSTM, Embedding, Dropout, Dot
from keras import regularizers
from keras.backend import batch_dot
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.optimizers import Adam

In [0]:
# Two-input network: a query encoder and a title encoder whose outputs are
# compared through 8 projected dot products, then reduced to one score.
model = None
# Inputs are sequences of pre-computed word vectors (tokens x dim).
query_in = Input(shape=(query_max_len, dim), name='query_in')
title_in = Input(shape=(title_max_len, dim), name='title_in')

# Only r3/r4 (L2 on the recurrent layers' kernels) are active; r1/r2 are unused.
r1, r2, r3, r4 = None, None, None, None
# r1=regularizers.l2(0.001)
# r2=regularizers.l2(0.001)
r3=regularizers.l2(0.001)
r4=regularizers.l2(0.001)

# Query: one LSTM. Title: a bidirectional LSTM with both directions concatenated.
# NOTE(review): CuDNNLSTM presumably needs a GPU runtime — confirm before running on CPU.
q_hidden = CuDNNLSTM(units=100, kernel_regularizer=r3)(query_in)
t_hidden = Bidirectional(CuDNNLSTM(units=100, kernel_regularizer=r4), merge_mode='concat')(title_in)
arr = []
# Eight "heads": each projects query and title encodings to 100 units with its
# own pair of tanh Dense layers and takes their dot product.
for i in range(8):
    dense1 = Dense(100, activation='tanh', name=f'dense_{i}_1')
    # The same Dropout layer instance is applied to both projections of a head.
    dropout = Dropout(dropout_prob, name=f'dropout_{i}')
    dense2 = Dense(100, activation='tanh', name=f'dense_{i}_2')
    q_scaled = dropout(dense1(q_hidden))
    t_scaled = dropout(dense2(t_hidden))
    dot_prod = Dot(axes=-1, name=f'dot_{i}')([q_scaled, t_scaled])
    arr.append(dot_prod)

# Concatenate the head outputs and shrink them 8 -> 4 -> 1.
joint = Concatenate(axis=1)(arr)
joint = Dense(8, activation='relu')(joint)
joint = Dense(4, activation='relu')(joint)
# Single non-negative output, trained as a regression on the 0-based label.
score = Dense(1, activation='relu')(joint)

model = Model(inputs=[query_in, title_in], outputs=score)
# model.summary()

In [0]:
# Adam with learning rate 1e-4; both a norm clip (5.) and a value clip (3.) are configured.
optim = Adam(lr=1e-4, clipnorm=5., clipvalue=3.)

In [0]:
# The single score is fit with mean absolute error, used as both loss and reported metric.
model.compile(loss='mae', metrics=['mae'], optimizer=optim)

In [0]:
# Checkpointing is disabled; only early stopping is active.
# ckpt_clbk = ModelCheckpoint(model_file, monitor='val_loss', verbose=1, save_best_only=True, save_weights_only=False, mode='auto', period=1)
# Watches val_loss with min_delta=0.001 and patience=7, restoring the best weights when it stops.
early_clbk = EarlyStopping(monitor='val_loss', min_delta=0.001, patience=7, verbose=2, mode='auto', baseline=None, restore_best_weights=True)
callbacks = [early_clbk]

In [123]:
# Train for up to 50 epochs with early stopping; validation_split=0.15 makes Keras carve the
# monitored validation set out of (Xq, Xt, Y) itself.
# NOTE(review): these tensors come from the oversampled train_df, so noised copies of training
# rows can land in that validation slice, while the held-out (Xq_v, Xt_v, Y_v) is not used
# during fitting — consider validation_data=([Xq_v, Xt_v], Y_v).
model.fit(x=[Xq, Xt], y=Y, batch_size=32, epochs=50, validation_split=0.15, callbacks=callbacks)

Train on 8450 samples, validate on 1492 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7f2643157198>

In [0]:
def convert_output(y_pred):
    """Map a continuous model score to a class index in 0..3.

    Scores up to 0.5 give 0, up to 1.5 give 1, up to 2.5 give 2; everything
    else (including values that compare false against every bound) gives 3.
    """
    for label, upper_bound in enumerate((0.5, 1.5, 2.5)):
        if y_pred <= upper_bound:
            return label
    return 3

In [125]:
# Quadratic weighted kappa of the rounded predictions on the training tensors.
outputs = model.predict(x=[Xq, Xt], batch_size=32)
y_preds = [convert_output(raw_score) for raw_score in outputs]
quadratic_weighted_kappa(Y, y_preds)

0.8040029033652715

In [126]:
# Quadratic weighted kappa of the rounded predictions on the held-out validation tensors.
outputs = model.predict(x=[Xq_v, Xt_v], batch_size=32)
y_preds = [convert_output(raw_score) for raw_score in outputs]
quadratic_weighted_kappa(Y_v, y_preds)

0.5278523120432563

In [127]:
# Per-class precision/recall/F1 of the validation predictions (classes are 0-based).
print(classification_report(Y_v, y_preds))

              precision    recall  f1-score   support

           0       0.49      0.38      0.42       120
           1       0.50      0.23      0.31       234
           2       0.31      0.29      0.30       282
           3       0.74      0.88      0.80       888

    accuracy                           0.63      1524
   macro avg       0.51      0.44      0.46      1524
weighted avg       0.60      0.63      0.60      1524



In [128]:
# Confusion matrix of true validation labels (first argument) against predicted classes.
print(confusion_matrix(Y_v, y_preds))

[[ 45  21  27  27]
 [ 21  53  74  86]
 [ 17  19  83 163]
 [  9  14  85 780]]


In [0]:
# Read the unlabelled test rows; missing cells become empty strings, as for the training data.
test_file = f'{data_dir}/test.csv'
test_df = pd.read_csv(test_file).fillna('')

In [0]:
def get_vectorized_test_data(df):
    """Build the model inputs for an unlabelled frame.

    Args:
        df: DataFrame with ``id``, ``query`` and ``product_title`` columns.

    Returns:
        tuple ``(Xq, Xt, ids)``: query tensors padded/truncated to
        ``query_max_len`` tokens, title tensors padded/truncated to
        ``title_max_len`` tokens, and the list of row ids in the same order.
    """
    ids = list(df['id'])
    Xq = batch_text_to_tensor(list(df['query']), query_max_len)
    print(Xq.shape)
    Xt = batch_text_to_tensor(list(df['product_title']), title_max_len)
    print(Xt.shape)
    return Xq, Xt, ids

In [131]:
# Vectorise the test set.
# NOTE(review): this rebinds Xq and Xt, replacing the training tensors of the same name —
# re-running the earlier train-evaluation cell afterwards would pair test inputs with training labels.
Xq, Xt, ids = get_vectorized_test_data(test_df)

(22513, 8, 100)
(22513, 20, 100)


In [132]:
# Score the test tensors, round to classes, and check ids and predictions line up.
outputs = model.predict(x=[Xq, Xt], batch_size=32)
y_preds = [convert_output(raw_score) for raw_score in outputs]
len(ids), len(y_preds)

(22513, 22513)

In [0]:
# Predicted classes are 0-based; add 1 to undo the shift applied to the training labels.
# The shifted values get their own name so that re-running this cell does not
# add 1 to `y_preds` a second time.
submission_labels = [pred + 1 for pred in y_preds]
submission = pd.DataFrame({"id": ids, "prediction": submission_labels})
submission.to_csv("submission.csv", index=False)

In [89]:
!kaggle competitions submit -c crowdflower-search-relevance -f submission.csv -m "DL multi head dotproduct"

100% 168k/168k [00:02<00:00, 79.5kB/s]
Successfully submitted to Crowdflower Search Results Relevance

In [0]:
# Persist the trained model to model_file.
model.save(model_file)