CNN for text classification
=============
<span style="color: lightsteelblue;">Deep Learning</span>

The goal of this notebook is to train convolutional neural network over text extracted from public articles at [Lenta.ru](https://lenta.ru) for text classification purposes.

**We will use**
- embeddings from scratch (we will train them as well)
- OR (as an excercise) pretrained embeddings from previous word2vec CBoW model
- 3 different filter sizes for convolutions
- relu activations
- max-pooling
- dropout
- multinomial logistic classifier

## Data preparation

In [1]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import os
import re
import numpy as np
import random
import string
import tensorflow as tf
import zipfile
from six.moves import range
from six.moves.urllib.request import urlretrieve

In [21]:
sequence_length = 35 # the length of a truncated (padded) sentence

def clean(string):
  string = re.sub(r"[^А-Яа-я0-9(),!?]", " ", string)
  string = re.sub(r",", " , ", string)
  string = re.sub(r"!", " ! ", string)
  string = re.sub(r"\(", " \( ", string)
  string = re.sub(r"\)", " \) ", string)
  string = re.sub(r"\?", " \? ", string)
  string = re.sub(r"\s{2,}", " ", string)
  return string.strip().lower()

def read_data(folder):
  data = {}
  data_files = os.listdir(folder)
  for data_file in data_files:
    with open(os.path.join(folder,data_file)) as f:
      articles = []
      for line in f:
        articles.append(line)
    data[data_file] = articles

  return data

data = read_data('articles')
num_classes = len(data)

total_articles_count = 0
labels = []
data_x = []
data_y = []

for label_index, label in enumerate(data):
  labels.append(label)

  for article in data[label]:
    sentences = re.split('[\.]', article)
    for sentence in sentences:
      words = clean(sentence).split(' ')

      if len(words) >= sequence_length:
        words = words[:sequence_length]
      else:
        padding = ['<PAD>'] * (sequence_length - len(words))
        words = words + padding

      data_x.append(words)
      data_y.append(label_index)

total_articles_count = len(data_x)

print('Total input size: %d articles\n' % total_articles_count)
x_train = np.array(data_x[:10])
y_train = np.array(data_y[:10])
x_valid = np.array(data_x[11:])
y_valid = np.array(data_x[11:])

print(x_train.shape)
print(x_train)

for label in data:
  excerpt = data[label][0][:175] + '...'
  print('Number of %s articles is %d. For example: \n%s' % (label, len(data[label]), excerpt))

Total input size: 372 articles

(10, 35)
[['индийский' 'слон' 'кофейный' 'рынок' 'лихорадит' '<PAD>' '<PAD>' '<PAD>'
  '<PAD>' '<PAD>' '<PAD>' '<PAD>' '<PAD>' '<PAD>' '<PAD>' '<PAD>' '<PAD>'
  '<PAD>' '<PAD>' '<PAD>' '<PAD>' '<PAD>' '<PAD>' '<PAD>' '<PAD>' '<PAD>'
  '<PAD>' '<PAD>' '<PAD>' '<PAD>' '<PAD>' '<PAD>' '<PAD>' '<PAD>' '<PAD>']
 ['к' 'плохим' 'новостям' 'из' 'южной' 'америки' ',' 'гуляющим' 'уже'
  'несколько' 'недель' ',' 'на' 'днях' 'добавились' 'тревожные' 'звонки'
  'из' 'индии' ',' 'которая' 'является' 'ведущим' 'мировым' 'поставщиком'
  'кофе' 'сорта' 'робуста' '<PAD>' '<PAD>' '<PAD>' '<PAD>' '<PAD>' '<PAD>'
  '<PAD>']
 ['так' ',' 'выяснилось' ',' 'что' 'выпуск' 'кофе' 'в' 'стране' 'в'
  'следующем' 'сезоне' 'может' 'упасть' 'до' 'минимума' 'за' 'последние'
  'восемь' 'лет' '<PAD>' '<PAD>' '<PAD>' '<PAD>' '<PAD>' '<PAD>' '<PAD>'
  '<PAD>' '<PAD>' '<PAD>' '<PAD>' '<PAD>' '<PAD>' '<PAD>' '<PAD>']
 ['причина' 'продолжительная' 'засуха' 'на' 'основных' 'территориях'
  'прои

## The model
Let's now prepare Convolutional Neural network. We will wrap it up in `ArticlesClassifier` class to make it reuseable:

In [41]:
class ArticlesClassifier(object):
  def __init__(self, sequence_length, num_labels, vocab_size,
               embedding_size, filter_sizes, filter_out_channels):
    # Placeholders for input, output and dropout
    self.input_x = tf.placeholder(tf.int32, [None, sequence_length], name="input_x")
    self.input_y = tf.placeholder(tf.float32, [None, num_classes], name="input_y")
    self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")

    # Embeddings layer
    with tf.device('/cpu:0'), tf.name_scope("embedding"):
      self.embeddings = tf.Variable(
          tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0), name="embeddings")
      self.embedded_chars = tf.nn.embedding_lookup(self.embeddings, self.input_x)
      self.embedded_chars_expanded = tf.expand_dims(self.embedded_chars, -1)

    # Convolution and pooling
    pooled = []
    for i, filter_size in enumerate(filter_sizes):
      with tf.name_scope("conv-maxpool-%s" % filter_size):
        # Convolution layer
        filter_shape = [filter_size, embedding_size, 1, filter_out_channels]
        strides = [1, 1, 1, 1]
        W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name='W')
        b = tf.Variable(tf.constant(0.1, [filter_out_channels]), name='b')
        conv = tf.nn.conv2d(self.embedded_chars_expanded, W, strides, 'VALID', name='conv')
        # Bias and non-linearity
        net = tf.nn.bias_add(conv, b)
        h = tf.nn.relu(net, name="h")
        # Pooling
        h_pooled = tf.nn.max_pool(h, [1, sequence_length - filter_size + 1, 1, 1], 'VALID', 'pool')
        pooled.append(h_pooled)

    total_channels = len(filter_sizes) * filter_out_channels
    self.h_pool = tf.concat(3, pooled)
    self.h_pool_flat = tf.reshape(self.h_pool, [-1, total_channels])
 
    # Dropout layer
    with tf.name_scope("dropout"):
      self.h_drop = tf.nn.dropout(self.h_pool_flat, dropout_keep_prob)

    # Output layer (Scores and predictions)
    W = tf.Variable(tf.truncated_normal([total_channels, num_classes], stddev=0.1), name='W')
    b = tf.Variable(tf.constant(0.1, [num_classes]), name='b')
    self.scores = tf.nn.xw_plus_b(self.h_drop, W, b, name="scores")
    self.predictions = tf.argmax(self.scores, 1, name="predictions")

    # Loss and accuracy
    with tf.name_scope("loss"):
      losses = tf.nn.softmax_cross_entropy_with_logits(logits=self.scores, labels=self.input_y)
      self.loss = tf.reduce_mean(losses, name='loss')
    with tf.name_scope('accuracy'):
      bool_predictions = tf.equal(self.predictions, tf.argmax(self.input_y, 1))
      self.accuracy = tf.reduce_mean(tf.cast(bool_predictions, 'float'), name='accuracy')

  def train(self, input_x, input_y):
    feed_dict = {
      input_x: input_x,
      input_y: input_y,
      dropout_keep_prob: self.dropout_keep_prob      
    }
    optimizer = tf.AdamOptimizer(1e-4)
    optimizer.train(self.loss, feed_dict=feed_dict)

  def predict(self, input_x):
    feed_dict = {
      input_x: input_x,
      dropout_keep_prob: 1.0
    }
    session.run([self.predictions], feed_dict=feed_dict)

## Training

In [42]:
with tf.Graph().as_default():
  session_conf = tf.ConfigProto(
    allow_soft_placement=True,
    log_device_placement=True)
  sess = tf.Session(config=session_conf)
  with sess.as_default():

    model = ArticlesClassifier(
      sequence_length = x_train.shape[1],
      num_classes     = num_classes,
      vocab_size      = len(vocabulary),
      embedding_size  = 128,
      filter_sizes    = [3, 4, 5],
      num_filters     = 2
    )

IndexError: tuple index out of range