# 1. Connect to database

Creating connection to Microsoft SQL Server and two databases with positive and negative sets. We should use generators for batch processing.

In [51]:
import pyodbc
import numpy as np

from pprint import pprint

batch_size = 32

In [52]:
def data_generator(constring, query):
    cnxn = pyodbc.connect(constring)
    cursor = cnxn.cursor()
    cursor.execute(query)
    
    y, t = [], []
    for row in cursor:
        r_text = row.ttext
        r_type = row.ttype
        y.append(r_text)
        t.append(r_type)
        
        if len(y) == batch_size:
            npx = np.array(y)
            npy = np.array(t)
            yield npx, npy
            y, t = [], []
    pyodbc.Connection.close(cnxn)

# 2. Tokenization
We should encode words as their indexes (computed by overall frequency in the dataset).
Using russian http://www.ruscorpora.ru/en/
#### Step 1. Clear dataset. 
#### Step 2. Select meaningful words.
#### Step 3. Calculate frequency of each word
#### Step 4. Replace words by indexes

In [53]:
import nltk
import string
from nltk.corpus import stopwords
from nltk import pos_tag, word_tokenize

hash_map = {}
max_features = 56000

#### Step 1. Clear dataset. 
Select words one by one. Symbols are meaningful because of smiles and emotions

In [54]:
def tokenize(file_text):
    if file_text is not None:
        #firstly let's apply nltk tokenization
        tokens = nltk.word_tokenize(file_text)

        #let's delete punctuation symbols
        stop_words = ([',','\\','/','*','','-','http',';',':','@',',','.'])
        tokens = [i for i in tokens if ( i not in stop_words )]

        #deleting stop_words
        #stop_words = stopwords.words('russian')
        #stop_words.extend(['что', 'это', 'так', 'вот', 'быть', 'в', '—', 'к', 'на', 'http'])
       
       # tokens = [i for i in tokens if ( i not in stop_words )]

        #cleaning words
        #tokens = [i.replace("«", "").replace("»", "") for i in tokens]

        return tokens
    else:
        return None

We will create a HashMap by using a Python dictionary to store the word frequencies of a book.
A dictionary is an associative array (also known as hashes).
Any key of the dictionary is associated, or mapped, to a value.
The values of a dictionary can be any Python data type, so dictionaries are unordered key-value-pairs.

By creating the dictionary, we will store the words as the keys and the value will represent the count. By doing this, we can retrieve any word without having to recount every single word.

In [55]:

def map_words(tokens):
    
    if tokens is not None:
        for word in tokens:
            word = word.lower()
            # Word Exist?
            if word in hash_map:
                hash_map[word] = hash_map[word] + 1
            else:
                hash_map[word] = 1

        return hash_map
    else:
        return None
    

In [56]:
cnstr_positive = 'Trusted_Connection=yes;DRIVER={SQL Server};SERVER=DESKTOP-1RHDOBR\GORDASQL;DATABASE=positive;UID=pyuser;PWD=pypypy'

cnstr_negative = 'Trusted_Connection=yes;DRIVER={SQL Server};SERVER=DESKTOP-1RHDOBR\GORDASQL;DATABASE=negative;UID=pyuser;PWD=pypypy'

In [57]:
def fill_frequency():
    hash_map.clear()
    
    query_positive = "SELECT [ttext], [ttype] FROM [dbo].[sortpos]"
    pdg = data_generator(cnstr_positive, query_positive)
    
    query_negative = "SELECT [ttext], [ttype] FROM [dbo].[sortneg]"
    ndg = data_generator(cnstr_negative, query_negative)
    
    for current_positive_set in pdg:
        for sentence in current_positive_set[0]:
            words = tokenize(sentence)
            map = map_words(words)
            
    for current_negative_set in ndg:
        for sentence in current_negative_set[0]:
            words = tokenize(sentence)
            map = map_words(words)
    
    min_frequency = 2#max(map.values()) - max_features if max(map.values()) > max_features else 2
    map = {key: value for key, value in map.items() if value > min_frequency}
    return map

In [17]:
map = fill_frequency()

Let's check few words

In [58]:
word_list = ['привет',',','дела',')','(']

for word in word_list:
    print('Word: [' + word + '] Frequency: ' + str(map.get(word,0)))
len(map)

Word: [привет] Frequency: 969
Word: [,] Frequency: 0
Word: [дела] Frequency: 564
Word: [)] Frequency: 151314
Word: [(] Frequency: 181929


56166

In [59]:
from __future__ import print_function

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding
from keras.layers import LSTM
from keras.datasets import imdb

maxlen = 128  # cut texts after this number of words (among top max_features most common words)
batch_size = 32

In [63]:
def train_model():
    max_frequency = max(map.values())
    
    train_cnt = 30000
    #cnxn = pyodbc.connect(cnstr_positive)
    #query = "SELECT count(*) as cnt FROM [dbo].[mixedmessages]"
    #cursor = cnxn.cursor()
    #cursor.execute(query)
    #for row in cursor:
    #    train_cnt = row.cnt/2
    #pyodbc.Connection.close(cnxn)
    
    x_train, y_train, x_test, y_test = ([] for i in range(4))
    
    query = "SELECT top 60000 [ttext], [ttype] FROM [dbo].[mixedmessages] order by newid()"
    mdg = data_generator(cnstr_positive, query)
    
    for current_set in mdg:
        for sentence in current_set[0]:
            words = tokenize(sentence)
            w = []
            #print(words)
            for word in words:
                #print(word, map.get(word,0))
                w.append(map.get(word,0))
            if len(x_train)<train_cnt:
                x_train.append(w)
            else:
                x_test.append(w)
                
        for value in current_set[1]:
            if len(y_train)<train_cnt:
                y_train.append(value)
            else:
                y_test.append(value)
                
                       
    x_train = np.array(x_train)
    x_test = np.array(x_test)
    y_train = np.array(y_train)
    y_test = np.array(y_test)
    #print(x_train)
    return x_train, y_train, x_test, y_test

In [64]:
x_train, y_train, x_test, y_test = train_model()
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)

In [65]:
print('Build model...')
model = Sequential()
model.add(Embedding(max_features, 128))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

Build model...


In [66]:
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

print('Train...')
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=15,
          validation_data=(x_test, y_test))


Train...
Train on 30000 samples, validate on 30000 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x22b74652cc0>

In [67]:
score, acc = model.evaluate(x_test, y_test,
                            batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

Test score: 0.71510816261
Test accuracy: 0.665666666667


In [69]:
from keras.models import model_from_json

#serialize model to JSON
model_json = model.to_json()
with open("model.json", "w") as json_file:
    json_file.write(model_json)
#serialize weights to HDF5
model.save_weights("model.h5")

In [70]:
#load json and create model
json_file = open('model.json','r')
loaded_model_json = json_file.read()
json_file.close()
loaded_model = model_from_json(loaded_model_json)
#load weights into new model
loaded_model.load_weights("model.h5")
loaded_model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [33]:
def tokenize_message(d):
    words = tokenize(d)
    res = []
    for word in words:
        res.append(map.get(word,0))
    ntm = np.array([res])
    
    nres = sequence.pad_sequences(ntm, maxlen=maxlen)
        #res.append([word_2_ind[w] for w in sent.split(' ')])
    return nres#np.pad(ntm, (maxlen-len(ntm),0),'constant', constant_values=(0))

In [95]:
def getRating(value):
    return {
         value < 0.2: 'Крайне негативная оценка',
         0.2 <= value < 0.4: 'Негативная оценка',
         0.4 <= value < 0.6: 'Нейтральная оценка',
         0.6 <= value < 0.8: 'Положительная оценка',
         0.8 <= value:       'Крайне положительная оценка'
    }[True]

In [99]:
def assess(text):
    inp = np.array(tokenize_message(text), dtype=np.int32)
    p = model.predict(inp)
    return getRating(p[0][0]), p[0][0]

In [100]:
message1 = 'Мне нравится фильм. Отличная игра актеров и интересный сюжет'
assess(message1)

('Крайне положительная оценка', 0.95532614)

In [107]:
message2 = 'Мне не нравится фильм. Хотя актеры сыграли отлично'
assess(message2)

('Нейтрально', 0.40854117)

In [102]:
message2 = 'Мне не нравится фильм. Сюжет совсем неинтересный и актеры сыграли плохо'
assess(message2)

('Негативная оценка', 0.24573153)

In [94]:

inp = np.array(tokenize_message(data), dtype=np.int32)
inp = np.array(tokenize_message(data), dtype=np.int32)
p = model.predict(inp)
p[0][0]

0.30967087

In [118]:
#import pymorphy2
#morph = pymorphy2.MorphAnalyzer()