### Sample program for LSTM (text input/class output)  

#### Import libraries  

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

#import logging
from gensim.models import word2vec
from gensim.parsing.preprocessing import preprocess_string

#### Parameters  

In [None]:
# Input locations: the tab-separated labelled text file and the saved
# Word2Vec model file.
csv_in = 'umich_si650_half.txt'
model_file = 'word2vec_text8-min20-s300-nosim.model'

# Raise pandas' display limits so wide/long frames are shown without "...".
for display_opt in ('max_columns', 'max_rows'):
    setattr(pd.options.display, display_opt, 999)

#### Read CSV file  

In [None]:
# Read the tab-separated input file. It is read without a header row, so
# the two columns are named explicitly: the class label and the text.
df = pd.read_csv(csv_in, delimiter='\t', header=None)
df.columns = ['label', 'text']
print(df.shape)
# DataFrame.info() writes its report itself and returns None, so it is
# called directly; wrapping it in print() appended a stray "None" line.
df.info()
display(df.head())

#### Check and preprocess data  

In [None]:
# Run gensim's preprocess_string over every text and store the result
# back into the 'text' column, then preview the first rows.
processed_text = df['text'].map(preprocess_string)
df['text'] = processed_text
display(df.head())

In [None]:
# Class balance: number of rows for each label value.
label_counts = df['label'].value_counts()
print(label_counts)

#### Prepare data  

In [None]:
# Features are the preprocessed texts; targets are the labels, one-hot encoded.
X = df['text']
y = to_categorical(df['label'])
# Width of the one-hot target (number of classes).
y_ndim = y.shape[1]
print(y_ndim)

In [None]:
# Hold out 20% of the rows for testing, stratified on the targets, with a
# fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=3)

# Show shapes and the first few entries of the train split, then the test split.
for X_part, y_part in ((X_train, y_train), (X_test, y_test)):
    print(X_part.shape, y_part.shape)
    print(X_part.head())
    print(y_part[:3])

#### Word2Vec  

In [None]:
# Load the previously saved Word2Vec model from `model_file`.
wv_model = word2vec.Word2Vec.load(model_file)

In [None]:
# Summarise the loaded model.
# NOTE(review): `wv.vocab` is the gensim < 4.0 attribute name.
w2v_vocab = wv_model.wv.vocab
print(wv_model.wv.vector_size)  # dimension of embedding
print(len(w2v_vocab))  # number of words
print(list(w2v_vocab)[:10])  # show first 10 words

#### Add text of train data to vocab  

In [None]:
# Training texts as a NumPy array (one entry per text).
X_train_v = X_train.values
print(X_train_v.shape)

# Total number of tokens summed over all training texts.
n_words = sum(len(tokens) for tokens in X_train_v)

print(n_words)

In [None]:
# Extend the model's existing vocabulary with the training texts
# (update=True), then continue training on those texts.
wv_model.build_vocab(X_train_v, update=True)
# `Word2Vec.iter` is a deprecated alias of `epochs` (removed in gensim 4.0);
# reading `epochs` gives the same value without the deprecation warning.
wv_model.train(X_train_v, total_examples=len(X_train_v),
               total_words=n_words, epochs=wv_model.epochs)

In [None]:
# Current vocabulary size of the Word2Vec model.
print(len(wv_model.wv.vocab))  # number of words

#### Assign a word index to each vocabulary word  

In [None]:
# Fit a Keras Tokenizer on one space-joined string containing every word of
# the Word2Vec vocabulary, so that each word is given an integer index.
# NOTE(review): the Tokenizer is created with its default settings; confirm
# that its default text filtering leaves every vocabulary word unchanged.
vocab_words = wv_model.wv.vocab.keys()
texts = ' '.join(vocab_words)
tokenizer = Tokenizer()
tokenizer.fit_on_texts([texts])

In [None]:
# Word -> integer index mapping built by the tokenizer.
word_index = tokenizer.word_index
# Dimensionality of each Word2Vec vector.
emb_size = wv_model.wv.vector_size
print(len(word_index))

#### Build the embedding matrix from the word2vec vectors  

In [None]:
# Build the embedding matrix: row i holds the Word2Vec vector of the word
# whose tokenizer index is i. The matrix has nw+1 rows; rows that are never
# assigned below keep their initial all-zero value.
nw = len(word_index)
emb = np.zeros((nw+1, emb_size))
for w, i in word_index.items():
    # Look the vector up through the KeyedVectors (`.wv`); indexing the
    # Word2Vec model object directly is deprecated and removed in gensim 4.0.
    emb[i] = wv_model.wv[w]

#### Padding for train data  

In [None]:
# Length of the longest training text; used as the padded sequence length.
max_len = X_train.map(len).max()

In [None]:
# Convert the training texts to index sequences, pad them to max_len and
# materialise the result as a NumPy array.
seq = tokenizer.texts_to_sequences(X_train_v)
seq_pad = pad_sequences(seq, maxlen=max_len)
seq_ar = np.array(seq_pad)

# Show the data at each stage of the conversion.
print(len(seq))
print(seq[:3])
print(X_train_v[:3])
print(len(seq_pad))
print(seq_pad[:3])
print(seq_ar.shape)

#### Build LSTM  

In [None]:
n_hidden = 20    # number of LSTM units
n_out = y_ndim   # one output per class

# Non-trainable embedding initialised from `emb` (zero index masked)
# -> LSTM returning only its final output -> softmax classifier.
model = Sequential([
    Embedding(nw+1, emb_size, weights=[emb],
              input_length=max_len, mask_zero=True,
              trainable=False),
    LSTM(n_hidden, return_sequences=False),
    Dense(n_out, activation='softmax'),
])
model.summary()

model.compile(optimizer='adam', loss='categorical_crossentropy',
              metrics=['accuracy'])

#### Training (learning)  

In [None]:
%%time

# Training settings: mini-batch size, number of epochs, and the fraction of
# the training data passed to Keras as the validation split.
batch_size = 32
n_epochs = 5
val_split = 0.2

fit_options = dict(epochs=n_epochs, batch_size=batch_size,
                   validation_split=val_split)
model.fit(seq_ar, y_train, **fit_options)

#### Prediction  

In [None]:
# Encode the test texts with the same tokenizer and pad them to the same
# max_len that was used for the training texts.
X_test_v = X_test.values
seq_test = tokenizer.texts_to_sequences(X_test_v)
seq_test_pad = pad_sequences(seq_test, maxlen=max_len)
seq_test_ar = np.array(seq_test_pad)
print(seq_test_ar.shape)

In [None]:
# Run the trained model on the padded test sequences; the softmax output
# layer yields one score per class for each test text.
y_pred = model.predict(seq_test_ar)

In [None]:
# Reduce each row of class scores / one-hot targets to the index of its
# largest entry, i.e. a single class index per test text.
y_pred1 = y_pred.argmax(axis=1)
y_test1 = y_test.argmax(axis=1)

print(y_pred.shape)
print(y_pred1.shape)
print(y_test.shape)
print(y_test1.shape)

#### Calculate accuracy  

In [None]:
# Confusion table: rows are predicted classes, columns are true classes.
# The axes are named so that this orientation is visible in the displayed
# table instead of the generic default axis names.
display(pd.crosstab(y_pred1, y_test1,
                    rownames=['predicted'], colnames=['actual']))
# Fraction of test texts whose predicted class equals the true class.
print(accuracy_score(y_test1, y_pred1))