Topic Modeling Amarigna

_Simple topic-classifying LSTM model to test whether it is possible to identify topics in Amharic text_

In [41]:
from sklearn.datasets import fetch_20newsgroups
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import keras, numpy as np
from keras.layers import Embedding, Dense, LSTM, GRU
from keras.models import Sequential
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit

_A small sample dataset to train and test the model_

In [42]:
# Pipe-delimited article sample. Column names are passed explicitly, so the
# first line of the file is read as data rather than as a header.
data_loc = "./data/articles_sample.csv"
data = pd.read_csv(
    data_loc,
    sep='|',
    engine='python',
    names=['article_id', 'body', 'topic'],
)
# NOTE(review): a five-column layout (article_id, url_fragment, first_published,
# body, topic) was previously left commented out here - presumably the schema of
# another export; confirm before reusing.
# Discard rows that have no article text.
data = data.dropna(subset=['body'])

In [43]:
# nb_words caps the Tokenizer vocabulary; max_seq_len is the length every
# article is padded/truncated to before it reaches the Embedding layer.
nb_words, max_seq_len = 100000, 2000
data.columns

Index(['article_id', 'body', 'topic'], dtype='object')

In [44]:
# Ordered (unshuffled) 80/20 cut: the first 80% of the rows form the train
# slice, the remainder the test slice.
train_size = int(len(data) * 0.8)

bodies, labels = data["body"], data["topic"]

train_x, test_x = bodies[:train_size], bodies[train_size:]
train_y, test_y = labels[:train_size], labels[train_size:]

In [45]:
# Shapes of the four slices, in train_x, train_y, test_x, test_y order.
tuple(part.shape for part in (train_x, train_y, test_x, test_y))

((1240,), (1240,), (311,), (311,))

In [46]:
# Full set of article texts (model input) and their topic labels (target).
X, y = data["body"], data["topic"]

In [47]:
# Give every distinct topic an integer class id, numbered in order of first
# appearance in `y`.
topics = list(y.unique())
# A dict lookup per row replaces the linear `topics.index(...)` scan, which was
# O(rows x classes); the resulting ids are identical.
topic_to_index = {topic: idx for idx, topic in enumerate(topics)}
y_encoded = [topic_to_index[topic] for topic in y]

n_classes = len(topics)
n_classes

632

Preparing the data for the model
* Tokenizing the text - identifying the unique words, building a word-index dictionary and counting word frequencies across all the documents (texts) in the dataset.
* One-hot encoding the labels (topics)
* Splitting the data into train and test (validation) sets

In [48]:
# Build the vocabulary from every article and convert each article into a
# sequence of integer word indices (at most nb_words distinct words are kept).
tokenizer = Tokenizer(num_words=nb_words)
tokenizer.fit_on_texts(X)
sequences = tokenizer.texts_to_sequences(X)
word_index = tokenizer.word_index

# One-hot encode the class ids; the width is pinned to n_classes so it always
# matches the model's output layer.
ydata = keras.utils.to_categorical(y_encoded, num_classes=n_classes)
# Pad/truncate every sequence to exactly max_seq_len indices.
input_data = pad_sequences(sequences, maxlen=max_seq_len)

# Fixed random_state so the train/validation split is the same on every run.
Xtrain, Xvalid, ytrain, yvalid = train_test_split(
    input_data, ydata, test_size=0.2, random_state=42
)

_Model definition and training_

In [49]:
# Embedding -> LSTM -> softmax classifier over the topic classes.
embedding_vector_length = 64
model = Sequential()
# Vocabulary size is len(word_index)+1 because Tokenizer indices start at 1 and
# index 0 is what pad_sequences fills with.
model.add(Embedding(len(word_index)+1, embedding_vector_length, input_length=max_seq_len, embeddings_initializer='glorot_normal', 
                    embeddings_regularizer=keras.regularizers.l2(0.01)))
model.add(LSTM(80))
# One output unit per topic; softmax pairs with the one-hot labels and the
# categorical cross-entropy loss below.
model.add(Dense(n_classes, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so it is not wrapped in
# print() (which would emit a trailing "None").
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 2000, 64)          2633600   
_________________________________________________________________
lstm_2 (LSTM)                (None, 80)                46400     
_________________________________________________________________
dense_2 (Dense)              (None, 632)               51192     
Total params: 2,731,192
Trainable params: 2,731,192
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
# `epochs` is the Keras 2 name of the deprecated `nb_epoch` argument.
# The returned History object is kept so the per-epoch metrics can be inspected.
history = model.fit(Xtrain, ytrain, validation_data=(Xvalid, yvalid), epochs=10, batch_size=16)



Train on 1240 samples, validate on 311 samples
Epoch 1/10

In [None]:
# predict() requires input data; score the held-out validation sequences.
# Each row of `preds` is the softmax output over the n_classes topics.
preds = model.predict(Xvalid)