### Creating Word Embeddings for a Given Corpus

In [10]:
import nltk

# Download the NLTK resources this notebook relies on: the English stop-word
# list used to build SentenceWord2Vec.STOP_WORDS, and the Punkt sentence
# tokenizer that conv_txt2_sentences loads from 'tokenizers/punkt'.
# `nltk` is imported here because this cell runs before the imports cell.
nltk.download("stopwords")
nltk.download("punkt")

[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [2]:
#Importing packages

import io
import os
import re

import nltk
from gensim.models import Word2Vec
from nltk.corpus import stopwords


In [None]:
## Using the T8 corpus available from Word2Vec in gensim to create word embeddings.
## Before training Word2Vec, the data is pre-processed:
## the NLTK tokenizer splits the text from each file into sentences, and each sentence is then split into words.

In [2]:
def conv_txt2_sentences(text):
    """
    Split text into sentences.

    Loads NLTK's pre-trained English Punkt tokenizer
    ('tokenizers/punkt/english.pickle') and lazily yields every sentence the
    tokenizer returns for ``text``.

    Parameters
    ----------
    text : str
        The text to split.

    Yields
    ------
    str
        One sentence per iteration, as produced by the tokenizer.
    """

    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

    # Tokenize the `text` argument; the previous body read an undefined
    # name `txt`, which raised NameError as soon as the generator was iterated.
    for sentence in tokenizer.tokenize(text):
        yield sentence

In [3]:
def conv_txt2_words(text, lower=True, remove_stop_words=True):
    """
    Split text into a list of words.

    The text is stripped, optionally lower-cased, and split on runs of
    whitespace using ``SentenceWord2Vec.RE_WIHTE_SPACES``. Stop words listed
    in ``SentenceWord2Vec.STOP_WORDS`` are optionally removed.

    Parameters
    ----------
    text : str
        The text (typically one sentence) to split.
    lower : bool, default True
        When True, lower-case the text before splitting.
    remove_stop_words : bool, default True
        When True, drop every token found in ``SentenceWord2Vec.STOP_WORDS``.
        The membership test is case-sensitive, so it is only fully effective
        together with ``lower=True`` if the stop-word set is lower-case.

    Returns
    -------
    list of str
        The tokens in their original order; empty for blank input.
    """

    # Operate on the `text` argument; the previous body read an undefined
    # name `txt` and raised NameError on every call.
    if lower:
        text = text.lower()

    # Lower-casing is not repeated here, so lower=False is honoured.
    # Empty tokens (only produced by blank input) are discarded.
    words = [w for w in SentenceWord2Vec.RE_WIHTE_SPACES.split(text.strip()) if w]

    if remove_stop_words:
        # remove stop words from text
        words = [w for w in words if w not in SentenceWord2Vec.STOP_WORDS]
    return words

In [4]:
class SentenceWord2Vec(object):
    """
    Stream tokenized sentences from the text files in a directory.

    Iterating over an instance walks every regular file in ``dirname``,
    reads it line by line as UTF-8, splits each line into sentences with
    ``conv_txt2_sentences`` and yields each sentence as a list of
    lower-cased words from ``conv_txt2_words`` (stop words are kept).
    Each call to ``__iter__`` re-reads the files from disk, so the object can
    be iterated more than once.

    Parameters
    ----------
    dirname : str
        Path of the directory that holds the corpus text files.
    """
    # Splits on runs of whitespace. Raw string so "\s" reaches the regex
    # engine instead of being parsed as a (invalid) string escape.
    RE_WIHTE_SPACES = re.compile(r"\s+")
    # English stop words from NLTK, held in a set for fast membership tests.
    STOP_WORDS = set(stopwords.words("english"))

    def __init__(self, dirname):

        self.dirname = dirname

    def __iter__(self):

        """
        Iterate through the directory and yield
        each sentence as a list of words.
        """
        # Sorted so the files are visited in the same order on every pass.
        for fname in sorted(os.listdir(self.dirname)):
            path = os.path.join(self.dirname, fname)
            # Skip sub-directories and other non-file entries.
            if not os.path.isfile(path):
                continue
            # io.open decodes UTF-8 on both Python 2 and 3 and, used as a
            # context manager, closes the handle even if iteration stops
            # early. Lines are still read lazily, one at a time.
            with io.open(path, "r", encoding="utf-8") as handle:
                for line in handle:
                    # split the read line into sentences using NLTK
                    for s in conv_txt2_sentences(line):
                        # split the sentence into words using regex
                        word = conv_txt2_words(s, lower=True, remove_stop_words=False)

                        yield word

In [5]:
# Root data directory; the Word2Vec training corpus is read from its "txt" sub-directory.
PATH_DIR = "data"
sentences = SentenceWord2Vec("{}/txt".format(PATH_DIR))
# Display the resolved corpus directory as the cell output.
sentences.dirname

'data/txt'

In [None]:
# Train Word2Vec on the streamed sentences: 300-dimensional vectors,
# words seen fewer than 40 times are ignored, 4 worker threads.
model = Word2Vec(
    sentences,
    size=300,
    min_count=40,
    workers=4,
)

In [None]:
## I tried training Word2Vec on this corpus, but it was taking a lot of time, so I had to stop it.

In [None]:
## Once the model is trained, we have the word embeddings for the corpus.

## Part 2 Intent Classification

## Steps

- Assuming we have the labelled data available for intent classification.

- Use the Word2Vec model to map each word into its corresponding vector.

- Calculate the average vector of all the word vectors.

- We will use the calculated average vector as input to our classification algorithms.

- For the base model we can use classification algorithms such as Logistic Regression, SVM, Naive Bayes, etc., and compare the results.

- For an advanced model we can use a multilayer perceptron, or an RNN / LSTM.

In [None]:
# Pseudo code (Keras) for an MLP model for multiclass classification.
# X_train, y_train, X_test and y_test are not defined in the cells shown
# here; they stand for the labelled intent data.

from keras.layers import Activation, Dense, Dropout
from keras.models import Sequential
from keras.optimizers import SGD

# Two fully-connected hidden layers of 64 units (tanh, 50% dropout each)
# followed by a 10-way softmax output. The first layer must declare the
# expected input shape: here, 20-dimensional vectors.
# NOTE(review): the Word2Vec model above is trained with size=300, so
# averaged word vectors would need input_dim=300 -- confirm before use.
model = Sequential([
    Dense(64, input_dim=20, init='uniform'),
    Activation('tanh'),
    Dropout(0.5),
    Dense(64, init='uniform'),
    Activation('tanh'),
    Dropout(0.5),
    Dense(10, init='uniform'),
    Activation('softmax'),
])

# Stochastic gradient descent with Nesterov momentum and learning-rate decay.
sgd = SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=True)
model.compile(loss='categorical_crossentropy', optimizer=sgd, metrics=['accuracy'])

# Train for 20 epochs, then evaluate on the held-out split.
model.fit(X_train, y_train, nb_epoch=20, batch_size=16)
score = model.evaluate(X_test, y_test, batch_size=16)
