In [None]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from tqdm import tqdm
import tensorflow as tf
from collections import defaultdict
from matplotlib import pyplot as plt
%matplotlib inline

# Cosmetic plotting defaults only: ggplot theme plus larger label, tick and title fonts.
plt.style.use('ggplot')
plt.rcParams.update({
    'axes.labelsize': 14,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
    'axes.titlesize': 30,
})

# Theoretical part

Let's quickly refresh the basic concepts of convolutional neural networks.
You need to answer these questions before going any further.
First, try to answer the questions below and write your answers in the cell below.
If you wish, you may draw illustrations, add them to the notebook, and send everything as an archive.

---

### Questions you need to answer first

0. What parameters are trainable in convolutional layers?
* How many trainable weights are there in a single convolutional layer with kernel size [3x3],
     where the input is an RGB image and the number of filters is 16?
* Is a convolutional NN stable to resizing of the input data? (It is also necessary to explain `why?`)
* What is `padding`?
* What is the difference between `"SAME"` & `"VALID"` padding?
* What does the `strides` parameter affect?
* What is a `pooling` layer in general? What is the difference between `max pooling` & `global max pooling`?
* How to implement 1D convolution using 2D convolution function?
* What is the `receptive field`?

### Your answers here:

~just a lot of space for you~

---
#### Checking tips:
Each correct answer is worth +1 point.

## Some naïve examples of 2D convolution

In [None]:
# Install Pillow into the environment (shell command run through IPython's `!` escape).
!pip install Pillow

In [None]:
def plot_image(image):
    """Draw `image` on the current matplotlib axes in grayscale and hide the axis."""
    render_options = {"cmap": "gray", "interpolation": "nearest"}
    plt.imshow(image, **render_options)
    plt.axis("off")

In [None]:
# Build two 7x7 filters with one input channel, plus the grayscale crop they will be applied to.

from sklearn.datasets import load_sample_image

china = load_sample_image("china.jpg")
flower = load_sample_image("flower.jpg")

# Crop rows 150..219 and columns 130..249, then average the colour channels
# into a single float32 plane.
image = china[150:220, 130:250]
height, width, channels = image.shape
image_grayscale = image.mean(axis=2).astype(np.float32)
# Add a leading batch axis and a trailing channel axis: (1, height, width, 1).
images = image_grayscale[np.newaxis, :, :, np.newaxis]
print("These are the filters that we will apply:")

# Filter tensor layout: (filter height, filter width, input channels, number of filters).
fmap = np.zeros(shape=(7, 7, 1, 2), dtype=np.float32)
fmap[:, 3, 0, 0] = 1  # filter 0: ones along column 3 (a vertical line)
fmap[3, :, 0, 1] = 1  # filter 1: ones along row 3 (a horizontal line)

# Show each filter as its own figure.
for filter_idx in range(2):
    plt.imshow(fmap[:, :, 0, filter_idx])
    plt.axis("off")
    plt.show()

In [None]:
# Apply both filters with a 2D convolution. Stride 1 with "SAME" padding is used
# so that the output keeps the height and width of the input image.
# (`tf` is already imported in the first cell of the notebook.)

X = tf.placeholder(tf.float32, shape=(None, height, width, 1))
feature_maps = tf.constant(fmap)
convolution = tf.nn.conv2d(X, feature_maps, strides=[1,1,1,1], padding="SAME", use_cudnn_on_gpu=False)

with tf.Session() as sess:
    output = convolution.eval(feed_dict={X: images})


# Visualisation: the input next to the response of each of the two filters.

fig = plt.figure(figsize=(36, 12))

panels = [
    ("china_original", images[0, :, :, 0]),
    ("china_vertical", output[0, :, :, 0]),
    ("china_horizontal", output[0, :, :, 1]),
]

for position, (title, picture) in enumerate(panels, start=1):
    ax = fig.add_subplot(1, 3, position)
    plot_image(picture)
    ax.set_title(title)
    # Pass a real boolean: the string 'off' is truthy, and matplotlib versions
    # that no longer translate 'on'/'off' strings would switch the grid ON.
    ax.grid(False)
    ax.axis('off')

plt.show()

**Try to explain why the images turned out this way after applying the horizontal and vertical filters.**

*Your answer here:*

~right here, yes~

---
#### Checking tips:
A correct answer is worth +2 points.


# Practical part

Download the train data from https://www.kaggle.com/c/quora-question-pairs/data

In [None]:
# Load the training data; expects `train.csv` (see the download link above) in the working directory.
df = pd.read_csv('train.csv')
# Preview the first 20 rows.
df.head(20)

### Create train/dev/test split

Proportions are: 0.8, 0.1, 0.1 for train, validation, & test respectively. 

In [None]:
from collections import Counter
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')

# Frequencies of lower-cased tokens over both question columns;
# the vocabulary is built from these counts.
counter = Counter()

# Iterate over the column values directly instead of looking each row up by
# label (`df.question1[n]`): that is much faster on a large frame and does not
# depend on the index labels being exactly 0..N-1.
for q1, q2 in tqdm(zip(df.question1, df.question2), total=len(df)):
    # str(): presumably guards against non-string cells (e.g. NaN for a
    # missing question) - verify against the data.
    counter.update(word_tokenize(str(q1).lower()))
    counter.update(word_tokenize(str(q2).lower()))

Let's build the dictionary:

In [None]:
n_most_common_to_keep = 20000

# Ids 1 .. n_most_common_to_keep - 1 go to the most frequent tokens, in order of
# decreasing frequency. Id 0 is reserved for every other (unknown) token.
# `most_common(k)` selects the top k directly instead of sorting the whole
# counter and slicing the result.
base_dict = {
    token: idx
    for idx, (token, _) in enumerate(counter.most_common(n_most_common_to_keep - 1), start=1)
}

# Dictionary with default value = 0
tok2idx = defaultdict(int)
tok2idx.update(base_dict)

def toks2idxs(tokens):
    """Map a sequence of tokens to vocabulary ids; unknown tokens map to 0."""
    # Use .get() rather than tok2idx[token]: indexing a defaultdict inserts every
    # missing key, so the vocabulary would silently grow with each unseen token.
    return [tok2idx.get(token, 0) for token in tokens]

In [None]:
# Reproducible random partition of the row positions into train / dev / test.
N = len(df)
np.random.seed(42)

# Shuffled positions 0..N-1
idxs = np.random.permutation(N)

# Boundaries inside the shuffled array: [0, 0.8*N) train, [0.8*N, 0.9*N) dev, the rest test.
split_points = [int(0.8 * N), int(0.9 * N)]
train_end, dev_end = split_points

train_idxs = idxs[:train_end]
dev_idxs = idxs[train_end:dev_end]
test_idxs = idxs[dev_end:]

In [None]:
def _pad_token_ids(token_lists):
    """Turn a list of tokenized questions into one zero-padded int32 id matrix.

    The result has shape [len(token_lists), length of the longest question];
    positions past the end of a shorter question keep the padding value 0.
    """
    max_len = max(len(tokens) for tokens in token_lists)
    padded = np.zeros([len(token_lists), max_len], dtype=np.int32)
    for row, tokens in enumerate(token_lists):
        padded[row, :len(tokens)] = toks2idxs(tokens)
    return padded


def batch_generator(batch_size,
                    data_type='train',
                    shuffle=True,
                    allow_smaller_last_batch=True):
    """Endlessly yield batches of tokenized, padded question pairs.

    Args:
        batch_size: number of samples per batch; must be positive.
        data_type: which split to read: 'train', 'dev' or 'test'.
        shuffle: if True, the split is re-permuted at the start of every pass.
        allow_smaller_last_batch: if True, the leftover samples that do not
            fill a whole batch are yielded as a final, smaller batch;
            otherwise they are dropped from the pass.

    Yields:
        A tuple ({'input_1': x_1, 'input_2': x_2}, {'output': y}) where x_1 and
        x_2 are int32 arrays of token ids padded with zeros to the longest
        question of the batch (each input is padded independently) and y holds
        the `is_duplicate` labels.

    Raises:
        ValueError: on an unknown `data_type`, a non-positive `batch_size`, or
            when the chosen split cannot produce a single batch. As with any
            generator, the error surfaces on the first `next()` call.
    """
    if data_type == 'train':
        idxs = train_idxs
    elif data_type == 'dev':
        idxs = dev_idxs
    elif data_type == 'test':
        idxs = test_idxs
    else:
        # Previously this fell through and failed later with an UnboundLocalError.
        raise ValueError(
            "data_type must be 'train', 'dev' or 'test', got {!r}".format(data_type))

    if batch_size <= 0:
        raise ValueError("batch_size must be positive, got {!r}".format(batch_size))

    n_samples = len(idxs)

    n_batches = n_samples // batch_size
    if allow_smaller_last_batch and n_samples % batch_size:
        n_batches += 1
    if n_batches == 0:
        # Without this guard the `while True` loop below would spin forever
        # without ever yielding.
        raise ValueError(
            "split {!r} with {} samples yields no batches of size {}".format(
                data_type, n_samples, batch_size))

    while True:
        if shuffle:
            idxs = np.random.permutation(idxs)

        for k in range(n_batches):
            # Slicing clips at the end of the array, which gives the smaller last batch.
            batch_idxs = idxs[k * batch_size: (k + 1) * batch_size]

            # The indices are row positions (a permutation of range(N)), so
            # address the rows positionally.
            questions_1 = [word_tokenize(str(df.question1.iloc[idx]).lower())
                           for idx in batch_idxs]
            questions_2 = [word_tokenize(str(df.question2.iloc[idx]).lower())
                           for idx in batch_idxs]
            y = np.array([df.is_duplicate.iloc[idx] for idx in batch_idxs])

            x_1 = _pad_token_ids(questions_1)
            x_2 = _pad_token_ids(questions_2)

            yield ({'input_1': x_1, 'input_2': x_2}, {'output': y})


### A quality metric in Quora competition

`Log loss` (also called `logistic loss` or `cross-entropy loss`) is used as the quality metric.

For a single sample with true label $y_t \in \{0, 1\}$ and estimated probability $y_p$ that $y_t = 1$, the `log loss` is

 $$log\_loss = -(y_t \log(y_p) + (1 - y_t) \log(1 - y_p))$$

So, the model minimizes `binary_crossentropy`.

----

The code below implements a model that **was intentionally corrupted**.

Try to improve the model.

**Some hints:**
* What type of activation is used?
* How to reduce `covariate shift`?
* MOAR LAYERS ~~~
* There're hipster activation functions. O'RLY?

In [None]:
# Two-input convolutional model: both questions pass through the SAME embedding
# and Conv1D layer objects (shared weights), are max-pooled over time,
# concatenated, and fed to a single sigmoid unit.
# NOTE(review): the accompanying text says this model is intentionally corrupted
# as an exercise, so the code below is deliberately left unchanged.
import keras
from keras.layers import Conv1D, Concatenate, Dense, Input, GlobalMaxPool1D, SimpleRNN
from keras.models import Model
from keras.optimizers import Adam
from keras.losses import binary_crossentropy

# Hyperparameters
embedding_dim = 64
n_filters = 16
kernel_width = 3
# NOTE(review): 1e-0 is a learning rate of 1.0
learning_rate = 1e-0
batch_size = 50

# One embedding table, sized to the vocabulary and shared by both inputs.
emb_layer = keras.layers.Embedding(n_most_common_to_keep, embedding_dim)

# The input names 'input_1' / 'input_2' match the dict keys yielded by batch_generator.
# NOTE(review): declared as float32, while batch_generator produces int32 token ids.
input_1 = Input([None], dtype=tf.float32, name='input_1')
units_1 = emb_layer(input_1)

input_2 = Input([None], dtype=tf.float32, name='input_2')
units_2 = emb_layer(input_2)

# A single Conv1D layer applied to both branches.
# NOTE(review): no `activation` argument is passed to Conv1D here.
conv_layer = Conv1D(n_filters, kernel_width, padding='same')
units_1 = conv_layer(units_1)
units_2 = conv_layer(units_2)
    
# Collapse the variable-length sequence axis with a global max over it.
units_1 = GlobalMaxPool1D()(units_1)
units_2 = GlobalMaxPool1D()(units_2)

# try to use matmul instead of concatenation just for experiment
units = Concatenate()([units_1, units_2])
# The layer name 'output' matches the target key yielded by batch_generator.
predictions = Dense(1, activation='sigmoid', name='output')(units)

model = Model(inputs=[input_1, input_2], outputs=predictions)
optimizer = Adam(learning_rate)
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

In [None]:
# Print a summary of the model built above.
model.summary()

## Model architecture visualization

To plot the Keras model graph, the `graphviz` and `pydot` packages must be installed:

In [None]:
# Install pydot (shell command run through IPython's `!` escape); it is needed for plotting the model graph.
!pip install pydot

Also `graphviz` should be installed;

If you are a _Linux_ user, it is better to do this via your package manager;

For _Ubuntu_ users the command should look like this:
        `sudo apt-get install graphviz`

In [None]:
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot

# Convert the model to a dot graph (layer names shown, shapes hidden),
# render it to SVG with the `dot` program, and display it inline.
model_graph = model_to_dot(model, show_layer_names=True, show_shapes=False)
svg_source = model_graph.create(prog='dot', format='svg')
SVG(svg_source)

# Train it

In [None]:
epoch_num = 1

# One validation pass should see every dev sample exactly once. Derive the
# number of batches from the dev split (ceil division keeps the smaller last
# batch) and read it in a fixed order, instead of a hard-coded step count
# over a reshuffled generator.
dev_steps = -(-len(dev_idxs) // batch_size)

hist = model.fit_generator(batch_generator(batch_size, data_type='train'),
                           steps_per_epoch=1000,
                           epochs=epoch_num,
                           validation_data=batch_generator(batch_size, data_type='dev', shuffle=False),
                           validation_steps=dev_steps)

In [None]:
# Plot loss and accuracy curves for the train and validation data on one figure.
plt.figure(figsize=(10,7))

# (key in hist.history, legend label), in drawing order
curves = [
    ('loss', 'train_loss'),
    ('val_loss', 'val_loss'),
    ('acc', 'train_acc'),
    ('val_acc', 'val_acc'),
]
for history_key, legend_label in curves:
    plt.plot(hist.history[history_key], label=legend_label)

plt.legend(fontsize=13)
plt.title("Train dynamic")
plt.xlabel("Epoch number")
plt.ylabel("Scores")
plt.show()

It is also easy to save the picture to disk:

In [None]:
from keras.utils import plot_model
# Write the model graph, including shapes, to the image file "lol.png".
plot_model(model=model, to_file="lol.png", show_shapes=True)

## Test model performance on the last part of train data

In [None]:
# it takes time to make a prediction
%time result = model.evaluate_generator(batch_generator(batch_size, data_type='test'),
                                        steps=int(0.1*N)//batch_size)

print('\nlogloss: ', result[0])
print('accuracy: ', result[1])

assert result[0]<=0.485, "It seems the result is worse then baseline"

# Task

The task is to improve the model and get the maximum number of points for the practical part.

* If your logloss on test is lower than 1, you get **5 points**
* If it is lower than 0.5, you get **7 points**
* If you reach a logloss lower than 0.3, you get **10 points**


! *If a person has used batchnorm __right after the activation__, she/he loses 1 point from the final mark!*



---












# Overall

For this homework, your mark is calculated simply:
$$0.5\cdot(theoretical\_part\_points + practical\_part\_points)$$