# Text Classification with Muse Emebedding (Vector Average)
## Contents:
1. Load Muse Embiddings
2. Convert tokenized documents to the embedding vectors calculted by averaging
3. Build Simple Keras Model
4. Train and evaluate
5. Predict

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from keras.models import Model
from keras.models import load_model
from keras.layers import Dense, Input, Dropout, LSTM, Activation
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.initializers import glorot_uniform
import os
import tensorflow as tf
#disable warnings
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 
np.random.seed(0)

Using TensorFlow backend.


In [2]:
EN_VEC = "./muse_embeddings/wiki.multi.en.vec"
RU_VEC = "./muse_embeddings/wiki.multi.ru.vec"
CLEAN_TRAIN_DATA = "clean_train_data.csv"
CLEAN_TEST_DATA = "clean_test_data.csv"

In [3]:
def read_muse_vecs(muse_file):
    """ Reads the muse embedding vector file
    
        Args:
            muse_file - embedding file name
        
        Returns:
            words_to_index - words to index map 
            index_to_words - index to words map
            word_to_vec_map - word to vector map
    """
    with open(muse_file, 'r',  errors='ignore', encoding="utf-8") as f:
        words = set()
        word_to_vec_map = {}
        for line in f:
            line = line.strip().split()
            curr_word_list = line[0: len(line) - 300]
            curr_word = ""
            for t in curr_word_list:
                curr_word = curr_word + str(t) + " "
            curr_word = curr_word.strip()
            words.add(curr_word)
            try:
                word_to_vec_map[curr_word] = np.array(line[-300:], dtype=np.float64)
            except:
                print(line, len(line))

        i = 1
        words_to_index = {}
        index_to_words = {}

        words.add("nokey")
        word_to_vec_map["nokey"] = np.zeros((300,), dtype=np.float64)

        for w in sorted(words):
            words_to_index[w] = i
            index_to_words[i] = w
            i = i + 1
    return words_to_index, index_to_words, word_to_vec_map


In [4]:
def docs_to_vector(docs, vec_map):
    """ Convert tokenized docs to vector embeddings by averaging
    
        Args:
            docs - array of tokenized texts
            vec_map - word to vector map
            
        Returns:
            array of average vectors for every text
            
    """
    vectors = []
    
    for doc in docs:
        vector = np.zeros((300,), dtype=np.float64)
        for token in doc:
            if token.lower() in vec_map:
                vector += vec_map[token.lower()]
            else:
                vector += vec_map["nokey"]
        vector /= len(doc)
        vectors.append(vector)
    return np.array(vectors)

In [5]:
def convert_to_one_hot(y, C):
    """ Convert lables to one-hot vectors
    """
    Y = np.eye(C)[y.reshape(-1)]
    return Y

### Read train data

In [6]:
df = pd.read_csv(CLEAN_TRAIN_DATA)
df.info()
df.sample(3)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3983203 entries, 0 to 3983202
Data columns (total 4 columns):
 #   Column      Dtype 
---  ------      ----- 
 0   Unnamed: 0  int64 
 1   label       int64 
 2   ru_tocks    object
 3   eng_tocks   object
dtypes: int64(2), object(2)
memory usage: 121.6+ MB


Unnamed: 0.1,Unnamed: 0,label,ru_tocks,eng_tocks
256257,256257,0,"['зао', 'европа', 'ойл', 'пайп', 'сэпплай']","['pozitive', 'design', 'liability', 'limited',..."
1015512,1015512,0,"['акционерное', 'общество', 'транснед', 'групп']","['limited', 'liability', 'company', 'tv', 'com..."
1998474,1998474,0,"['гуп', 'краснодарского', 'края', 'кубаньпортс...","['dinara', 'jsc']"


### Split all train data into train and validation set

In [7]:
train, validation = train_test_split(df, test_size=0.2)
len(train), len(validation)

(3186562, 796641)

In [8]:
train.iloc[3]

Unnamed: 0                                               645976
label                                                         0
ru_tocks      ['общество', 'с', 'ограниченной', 'ответственн...
eng_tocks        ['limited', 'liability', 'company', 'frigate']
Name: 645976, dtype: object

In [9]:
def create_vectors(df, word_to_vec_map_eng, word_to_vec_map_ru):
    """ Create vectors
    
        Args:
            df - data frame with tokenized data set
            word_to_vec_map_eng - English word to vector map 
            word_to_vec_map_ru - Russian word to vector map
            
        Returns:
            vectors - concatenated average English and Russian vectors
        
    """
    eng_vectors = docs_to_vector(df['eng_tocks'].values, word_to_vec_map_eng)
    ru_vectors = docs_to_vector(df['ru_tocks'].values, word_to_vec_map_ru)
    vectors = np.concatenate((eng_vectors, ru_vectors), axis=1)
    return vectors

### Create vectors for train and validation sets

In [10]:
# Load Muse  English Embeddings
word_to_index_eng, index_to_words_eng, word_to_vec_map_eng = read_muse_vecs(
    './muse_embeddings/wiki.multi.en.vec')
# Load Muse Russian Embeddings
word_to_index_ru, index_to_words_ru, word_to_vec_map_ru = read_muse_vecs(
    './muse_embeddings/wiki.multi.ru.vec')

In [11]:
X_train_vectors = create_vectors(train, word_to_vec_map_eng, word_to_vec_map_ru)

In [12]:
X_validation_vectors = create_vectors(validation, word_to_vec_map_eng, word_to_vec_map_ru)

In [13]:
print(X_train_vectors.shape, X_validation_vectors.shape)

(3186562, 600) (796641, 600)


### Create one-hot label encoddings 

In [14]:
y_train = train['label'].values
y_train_oh = convert_to_one_hot(y_train, 2)
y_validation = validation['label'].values
y_validation_oh = convert_to_one_hot(y_validation, 2)
print(y_train_oh.shape, y_validation_oh.shape)

(3186562, 2) (796641, 2)


### Simple Keras model

In [15]:
def my_model():
    input_layer = Input(shape=(600,))
    X = Dense(128)(input_layer)
    X = Activation("relu")(X)
    X = Dense(2)(X)
    X = Activation("softmax")(X)
    return Model(input=input_layer, output=X)

In [16]:
model = my_model()
model.summary()
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 600)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 128)               76928     
_________________________________________________________________
activation_1 (Activation)    (None, 128)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 258       
_________________________________________________________________
activation_2 (Activation)    (None, 2)                 0         
Total params: 77,186
Trainable params: 77,186
Non-trainable params: 0
_________________________________________________________________


  import sys


### Train model and validate

In [17]:
%%time
model.fit(X_train_vectors, y_train_oh, epochs = 4, batch_size = 32, shuffle=True, 
          validation_data=(X_validation_vectors, y_validation_oh))

Train on 3186562 samples, validate on 796641 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
CPU times: user 35min 18s, sys: 52min 37s, total: 1h 27min 56s
Wall time: 3h 16min 59s


<keras.callbacks.callbacks.History at 0x1a58c96bd0>

In [18]:
# Save model
# model.save('model.h5')  # creates a HDF5 file 'my_model.h5'
#del model  # deletes the existing model


### Load test set

In [19]:
test = pd.read_csv(CLEAN_TEST_DATA, index_col=0)
test.info()
test.sample(3)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 996052 entries, 0 to 996051
Data columns (total 2 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   ru_tocks   996052 non-null  object
 1   eng_tocks  996052 non-null  object
dtypes: object(2)
memory usage: 22.8+ MB


Unnamed: 0,ru_tocks,eng_tocks
106271,"['общество', 'с', 'ограниченной', 'ответственн...","['international', 'corporation', 'dzhun', 'kha..."
7387,"['ооо', 'золотая', 'формулакаспий']","['duala', 'ooo']"
227468,"['общество', 'с', 'ограниченной', 'ответственн...","['limite', 'liability', 'company', 'belongs', ..."


### Create test vectors

In [20]:
X_test_vectors = create_vectors(test, word_to_vec_map_eng, word_to_vec_map_ru)

In [21]:
# Load model
# model = load_model('model.h5')

### Predict classes and save predictions to 'result.tsv'

In [22]:
y_prob = model.predict(X_test_vectors) 
y_classes = y_prob.argmax(axis=-1)
y_answers = [y > 0 for y in y_classes]

In [23]:
result = pd.DataFrame(y_answers)
result.columns = ['answer']
result.info()
result.sample(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 996052 entries, 0 to 996051
Data columns (total 1 columns):
 #   Column  Non-Null Count   Dtype
---  ------  --------------   -----
 0   answer  996052 non-null  bool 
dtypes: bool(1)
memory usage: 972.8 KB


Unnamed: 0,answer
26397,False
545236,False
199878,False
473507,False
942847,False


In [24]:
result['answer'].value_counts()

False    989640
True       6412
Name: answer, dtype: int64

In [26]:
result.to_csv('result.tsv', sep = '\t')