Deep Learning Techniques for Text Classification
===

- https://medium.com/datadriveninvestor/deep-learning-techniques-for-text-classification-9392ca9492c7

In [3]:
from sklearn.datasets import fetch_20newsgroups
from keras.layers import  Dropout, Dense
from keras.models import Sequential
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn import metrics

Using TensorFlow backend.


In [4]:
#  convert text to TF-IDF:

def TFIDF(X_train, X_test,MAX_NB_WORDS=75000):
    vectorizer_x = TfidfVectorizer(max_features=MAX_NB_WORDS)
    X_train = vectorizer_x.fit_transform(X_train).toarray()
    X_test = vectorizer_x.transform(X_test).toarray()
    print("tf-idf with",str(np.array(X_train).shape[1]),"features")
    return (X_train,X_test)

In [5]:
#  Build a DNN Model for Text:

def Build_Model_DNN_Text(shape, nClasses, dropout=0.5):
    """
    buildModel_DNN_Tex(shape, nClasses,dropout)
    Build Deep neural networks Model for text classification
    Shape is input feature space
    nClasses is number of classes
    """
    model = Sequential()
    node = 512 # number of nodes
    nLayers = 4 # number of  hidden layer
    model.add(Dense(node,input_dim=shape,activation='relu'))
    model.add(Dropout(dropout))
    for i in range(0,nLayers):
        model.add(Dense(node,input_dim=node,activation='relu'))
        model.add(Dropout(dropout))
    model.add(Dense(nClasses, activation='softmax'))
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model

In [6]:
#  Load text dataset (20newsgroups):

newsgroups_train = fetch_20newsgroups(subset='train')
newsgroups_test = fetch_20newsgroups(subset='test')
X_train = newsgroups_train.data
X_test = newsgroups_test.data
y_train = newsgroups_train.target
y_test = newsgroups_test.target

In [7]:
# run DNN and see result:

X_train_tfidf,X_test_tfidf = TFIDF(X_train,X_test)
model_DNN = Build_Model_DNN_Text(X_train_tfidf.shape[1], 20)
model_DNN.fit(X_train_tfidf, y_train,
                              validation_data=(X_test_tfidf, y_test),
                              epochs=10,
                              batch_size=128,
                              verbose=2)
predicted = model_DNN.predict(X_test_tfidf)


tf-idf with 75000 features
Train on 11314 samples, validate on 7532 samples
Epoch 1/10
 - 113s - loss: 2.8520 - accuracy: 0.0911 - val_loss: 2.2282 - val_accuracy: 0.2309
Epoch 2/10
 - 118s - loss: 1.5232 - accuracy: 0.4513 - val_loss: 1.0817 - val_accuracy: 0.6329
Epoch 3/10
 - 123s - loss: 0.7422 - accuracy: 0.7189 - val_loss: 0.8733 - val_accuracy: 0.7404
Epoch 4/10
 - 116s - loss: 0.3764 - accuracy: 0.8710 - val_loss: 0.8519 - val_accuracy: 0.7823
Epoch 5/10
 - 116s - loss: 0.1858 - accuracy: 0.9396 - val_loss: 0.8923 - val_accuracy: 0.7973
Epoch 6/10
 - 124s - loss: 0.1165 - accuracy: 0.9649 - val_loss: 0.8957 - val_accuracy: 0.7978
Epoch 7/10
 - 124s - loss: 0.0783 - accuracy: 0.9788 - val_loss: 0.9068 - val_accuracy: 0.8087
Epoch 8/10
 - 114s - loss: 0.0572 - accuracy: 0.9853 - val_loss: 0.8996 - val_accuracy: 0.8163
Epoch 9/10
 - 114s - loss: 0.0466 - accuracy: 0.9875 - val_loss: 0.9404 - val_accuracy: 0.8121
Epoch 10/10
 - 117s - loss: 0.0374 - accuracy: 0.9894 - val_loss: 0.9

In [9]:
from sklearn.metrics import classification_report
    
print (classification_report(X_test_tfidf, predicted))

ValueError: continuous-multioutput is not supported

In [10]:
>>> from sklearn import metrics
>>> print(metrics.classification_report(y_test, predicted, target_names=X_test_tfidf))

ValueError: Classification metrics can't handle a mix of multiclass and continuous-multioutput targets

In [9]:
print(metrics.classification_report(X_test_tfidf, predicted))

ValueError: continuous-multioutput is not supported

## Long Short-Term Memory (LSTM)

In [13]:
# import packages:


from keras.layers import Dropout, Dense, GRU, Embedding
from keras.models import Sequential
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn import metrics
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.datasets import fetch_20newsgroups

In [14]:
# convert text to word embedding (Using GloVe):

def loadData_Tokenizer(X_train, X_test,MAX_NB_WORDS=75000,MAX_SEQUENCE_LENGTH=500):
    np.random.seed(7)
    text = np.concatenate((X_train, X_test), axis=0)
    text = np.array(text)
    tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts(text)
    sequences = tokenizer.texts_to_sequences(text)
    word_index = tokenizer.word_index
    text = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    print('Found %s unique tokens.' % len(word_index))
    indices = np.arange(text.shape[0])
    # np.random.shuffle(indices)
    text = text[indices]
    print(text.shape)
    X_train = text[0:len(X_train), ]
    X_test = text[len(X_train):, ]
    embeddings_index = {}
    f = open("C:\\Users\\swayam\\Documents\\Glove\\glove.6B.50d.txt", encoding="utf8")
    for line in f:
        values = line.split()
        word = values[0]
        try:
            coefs = np.asarray(values[1:], dtype='float32')
        except:
            pass
        embeddings_index[word] = coefs
    f.close()
    print('Total %s word vectors.' % len(embeddings_index))
    return (X_train, X_test, word_index,embeddings_index)

In [15]:
#  Build a RNN Model for Text:

def Build_Model_RNN_Text(word_index, embeddings_index, nclasses,  MAX_SEQUENCE_LENGTH=500, EMBEDDING_DIM=50, dropout=0.5):
    """
    def buildModel_RNN(word_index, embeddings_index, nclasses,  MAX_SEQUENCE_LENGTH=500, EMBEDDING_DIM=50, dropout=0.5):
    word_index in word index ,
    embeddings_index is embeddings index, look at data_helper.py
    nClasses is number of classes,
    MAX_SEQUENCE_LENGTH is maximum lenght of text sequences
    """
    model = Sequential()
    hidden_layer = 3
    gru_node = 32
    embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # words not found in embedding index will be all-zeros.
            if len(embedding_matrix[i]) != len(embedding_vector):
                print("could not broadcast input array from shape", str(len(embedding_matrix[i])),
                      "into shape", str(len(embedding_vector)), " Please make sure your"
                                                                " EMBEDDING_DIM is equal to embedding_vector file ,GloVe,")
                exit(1)
            embedding_matrix[i] = embedding_vector
    model.add(Embedding(len(word_index) + 1,
                                EMBEDDING_DIM,
                                weights=[embedding_matrix],
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=True))
    print(gru_node)
    for i in range(0,hidden_layer):
        model.add(GRU(gru_node,return_sequences=True, recurrent_dropout=0.2))
        model.add(Dropout(dropout))
    model.add(GRU(gru_node, recurrent_dropout=0.2))
    model.add(Dropout(dropout))
    model.add(Dense(256, activation='relu'))
    model.add(Dense(nclasses, activation='softmax'))
    model.compile(loss='sparse_categorical_crossentropy',
                      optimizer='adam',
                      metrics=['accuracy'])
    return model

In [16]:
# run RNN and see result:

newsgroups_train = fetch_20newsgroups(subset='train')
newsgroups_test = fetch_20newsgroups(subset='test')
X_train = newsgroups_train.data
X_test = newsgroups_test.data
y_train = newsgroups_train.target
y_test = newsgroups_test.target
X_train_Glove,X_test_Glove, word_index,embeddings_index = loadData_Tokenizer(X_train,X_test)
model_RNN = Build_Model_RNN_Text(word_index,embeddings_index, 20)
model_RNN.fit(X_train_Glove, y_train,
                              validation_data=(X_test_Glove, y_test),
                              epochs=10,
                              batch_size=128,
                              verbose=2)
predicted = Build_Model_RNN_Text.predict_classes(X_test_Glove)
print(metrics.classification_report(y_test, predicted))

MemoryError: Unable to allocate 11.3 GiB for an array with shape (18846,) and data type <U160616