In [1]:
import os
import sys
import keras
import numpy as np

Using TensorFlow backend.


In [2]:
from gensim.models import word2vec

from keras.models import Sequential
from keras.layers import Dropout
from keras.regularizers import l2
from keras.models import Model
from keras.engine import Input
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from gensim.models import keyedvectors
from collections import defaultdict
from keras.utils.np_utils import to_categorical
from keras.layers import Input, Dense, Flatten
from keras.layers import Conv1D, MaxPooling1D


import pandas as pd

In [3]:
# global variables

nb_filters = 1200  # number of filters
n_gram = 3  # n-gram, or window size of CNN/ConvNet
maxlen = 15  # maximum number of words in a sentence
vecsize = 300  # length of the embedded vectors in the model 
cnn_dropout = 0.0  # dropout rate for CNN/ConvNet
final_activation = 'softmax'  # activation function. Options: softplus, softsign, relu, tanh, sigmoid, hard_sigmoid, linear.
dense_wl2reg = 0.0  # dense_wl2reg: L2 regularization coefficient
dense_bl2reg = 0.0  # dense_bl2reg: L2 regularization coefficient for bias
optimizer = 'adam'  # optimizer for gradient descent. Options: sgd, rmsprop, adagrad, adadelta, adam, adamax, nadam

# utility functions

def retrieve_csvdata_as_dict(filepath):
    """
    Retrieve the training data in a CSV file, with the first column being the
    class labels, and second column the text data. It returns a dictionary with
    the class labels as keys, and a list of short texts as the value for each key.
    """
    df = pd.read_csv(filepath)
    category_col, descp_col = df.columns.values.tolist()
    shorttextdict = dict()
    for category, descp in zip(df[category_col], df[descp_col]):
        if type(descp) == str:
            shorttextdict.setdefault(category, []).append(descp)
    return shorttextdict

def subjectkeywords():
    """
    Return an example data set, with three subjects and corresponding keywords.
    This is in the format of the training input.
    """
    data_path = os.path.join(os.getcwd(), 'datasets/keras_classifier_training_data.csv')
    return retrieve_csvdata_as_dict(data_path)

def convert_trainingdata(classdict):
    """
    Convert the training data into format put into the neural networks.
    """
    classlabels = classdict.keys()
    lblidx_dict = dict(zip(classlabels, range(len(classlabels))))

    # tokenize the words, and determine the word length
    phrases = []
    indices = []
    for label in classlabels:
        for shorttext in classdict[label]:
            shorttext = shorttext if type(shorttext) == str else ''
            category_bucket = [0]*len(classlabels)
            category_bucket[lblidx_dict[label]] = 1
            indices.append(category_bucket)
            phrases.append(shorttext)

    return classlabels, phrases, indices

def process_text(text):
    """ 
    Process the input text by tokenizing and padding it.
    """
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(text)
    x_train = tokenizer.texts_to_sequences(text)

    x_train = pad_sequences(x_train, maxlen=maxlen)
    return x_train

In [4]:
# w2v_model_wv = keyedvectors.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)
# The dataset 'GoogleNews-vectors-negative300.bin.gz' can be downloaded from https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit

w2v_model_wv = keyedvectors.KeyedVectors.load_word2vec_format('datasets/ruscorpora_1_300_10.bin', binary=True)
# w2v_model_wv = keyedvectors.KeyedVectors.load_word2vec_format('datasets/wiki.ru.vec', binary=False)

In [9]:
w2v_model_wv.vocab['топология_NOUN']

<gensim.models.keyedvectors.Vocab at 0x12c20f668>

In [10]:
trainclassdict = subjectkeywords()

nb_labels = len(trainclassdict)  # number of class labels
print(trainclassdict)

{'mathematics': ['алгебра_NOUN', 'топология_NOUN', 'вычисление_NOUN', 'исчисление_NOUN', 'статистика_NOUN', 'вероятность_NOUN'], 'physics': ['электродинамика_NOUN', 'электрон_NOUN'], 'politics': ['президент_NOUN', 'бойкот_NOUN', 'референдум_NOUN'], 'biology': ['организм_NOUN', 'мутация_NOUN', 'растение_NOUN', 'жизнь_NOUN']}


In [11]:
# get embedding layer corresponding to our trained Word2Vec model
embedding_layer = w2v_model_wv.get_embedding_layer()

# create a convnet to solve our classification task
sequence_input = Input(shape=(maxlen,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
x = Conv1D(filters=nb_filters, kernel_size=n_gram, padding='valid', activation='relu', input_shape=(maxlen, vecsize))(embedded_sequences)
x = MaxPooling1D(pool_size=maxlen - n_gram + 1)(x)
x = Flatten()(x)
preds = Dense(nb_labels, activation=final_activation, kernel_regularizer=l2(dense_wl2reg), bias_regularizer=l2(dense_bl2reg))(x)

In [12]:
classlabels, x_train, y_train = convert_trainingdata(trainclassdict)

tokenizer = Tokenizer()
tokenizer.fit_on_texts(x_train)
x_train = tokenizer.texts_to_sequences(x_train)

x_train = pad_sequences(x_train, maxlen=maxlen)

model = Model(sequence_input, preds)
model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['acc'])
fit_ret_val = model.fit(x_train, y_train, epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [13]:
input_text = 'самый_DET интересный_ADJ вопрос_NOUN который_DET задавать_VERB ученый_NOUN исследователь_NOUN старение_NOUN сам_DET принимать_VERB увеличение_NOUN продолжительность_NOUN жизнь_NOUN узнавать_VERB это_PRON большой_ADJ конференция_NOUN старение_NOUN долголетие_NOUN сразу_ADV видно_ADV современный_ADJ состояние_NOUN дело_NOUN наука_NOUN геропротектор_NOUN услышать_VERB довольно_ADV разный_ADJ ответ_NOUN'

matrix = process_text(input_text)

predictions = model.predict(matrix)

# get the actual categories from output
scoredict = {}
for idx, classlabel in zip(range(len(classlabels)), classlabels):
    scoredict[classlabel] = predictions[0][idx]

print(scoredict)

{'mathematics': 0.065678559, 'physics': 0.79528475, 'politics': 0.064391196, 'biology': 0.074645475}
