# 1. Connect to database

Creating connection to Microsoft SQL Server and two databases with positive and negative sets. We should use generators for batch processing.

In [1]:
import pyodbc
import numpy as np

from pprint import pprint

batch_size = 32

In [2]:
def data_generator(constring, query):
    cnxn = pyodbc.connect(constring)
    cursor = cnxn.cursor()
    cursor.execute(query)
    
    y, t = [], []
    for row in cursor:
        r_text = row.ttext
        r_type = row.ttype
        y.append(r_text)
        t.append(r_type)
        
        if len(y) == batch_size:
            npx = np.array(y)
            npy = np.array(t)
            yield npx, npy
            y, t = [], []
    pyodbc.Connection.close(cnxn)

# 2. Tokenization
We should encode words as their indexes (computed by overall frequency in the dataset).
Using russian http://www.ruscorpora.ru/en/
#### Step 1. Clear dataset. 
#### Step 2. Select meaningful words.
#### Step 3. Calculate frequency of each word
#### Step 4. Replace words by indexes

In [3]:
import nltk
import string
from nltk.corpus import stopwords
from nltk import pos_tag, word_tokenize

hash_map = {}
max_features = 20000

#### Step 1. Clear dataset. 
Select words one by one. Symbols are meaningful because of smiles and emotions

In [4]:
def tokenize(file_text):
    if file_text is not None:
        #firstly let's apply nltk tokenization
        tokens = nltk.word_tokenize(file_text)

        #let's delete punctuation symbols
        stop_words = ([',','\\','/','*','','-','http',';',':','@'])
        tokens = [i for i in tokens if ( i not in stop_words )]

        #deleting stop_words
        #stop_words = stopwords.words('russian')
        #stop_words.extend(['что', 'это', 'так', 'вот', 'быть', 'в', '—', 'к', 'на', 'http'])
       
       # tokens = [i for i in tokens if ( i not in stop_words )]

        #cleaning words
        #tokens = [i.replace("«", "").replace("»", "") for i in tokens]

        return tokens
    else:
        return None

We will create a HashMap by using a Python dictionary to store the word frequencies of a book.
A dictionary is an associative array (also known as hashes).
Any key of the dictionary is associated, or mapped, to a value.
The values of a dictionary can be any Python data type, so dictionaries are unordered key-value-pairs.

By creating the dictionary, we will store the words as the keys and the value will represent the count. By doing this, we can retrieve any word without having to recount every single word.

In [5]:

def map_words(tokens):
    
    if tokens is not None:
        for element in tokens:
            # Remove Punctuation
            word = element.replace(",","")
            word = word.replace(".","")
            word = word.lower()
            # Word Exist?
            if word in hash_map:
                hash_map[word] = hash_map[word] + 1
            else:
                hash_map[word] = 1

        return hash_map
    else:
        return None
    

In [6]:
cnstr_positive = 'Trusted_Connection=yes;DRIVER={SQL Server};SERVER=DESKTOP-1RHDOBR\GORDASQL;DATABASE=positive;UID=pyuser;PWD=pypypy'

cnstr_negative = 'Trusted_Connection=yes;DRIVER={SQL Server};SERVER=DESKTOP-1RHDOBR\GORDASQL;DATABASE=negative;UID=pyuser;PWD=pypypy'

In [7]:
def fill_frequency():
    hash_map.clear()
    
    query_positive = "SELECT top 10000 [ttext], [ttype] FROM [dbo].[sortpos]"
    pdg = data_generator(cnstr_positive, query_positive)
    
    query_negative = "SELECT top 10000 [ttext], [ttype] FROM [dbo].[sortneg]"
    ndg = data_generator(cnstr_negative, query_negative)
    
    for current_positive_set in pdg:
        for sentence in current_positive_set[0]:
            words = tokenize(sentence)
            map = map_words(words)
            
    for current_negative_set in ndg:
        for sentence in current_negative_set[0]:
            words = tokenize(sentence)
            map = map_words(words)
    
    min_frequency = 2#max(map.values()) - max_features if max(map.values()) > max_features else 2
    map = {key: value for key, value in map.items() if value > min_frequency}
    return map

In [8]:
map = fill_frequency()

Let's check few words

In [9]:
word_list = ['привет',',','дела',')','(']

for word in word_list:
    print('Word: [' + word + '] Frequency: ' + str(map.get(word,0)))

Word: [привет] Frequency: 129
Word: [,] Frequency: 0
Word: [дела] Frequency: 60
Word: [)] Frequency: 17125
Word: [(] Frequency: 18035


In [10]:
from __future__ import print_function

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding
from keras.layers import LSTM
from keras.datasets import imdb

maxlen = 32  # cut texts after this number of words (among top max_features most common words)
batch_size = 32

Using TensorFlow backend.


In [11]:
def train_data_generator():
    max_frequency = max(map.values())
    
    #train_cnt = 500
    #cnxn = pyodbc.connect(cnstr_positive)
    #query = "SELECT count(*) as cnt FROM [dbo].[mixedmessages]"
    #cursor = cnxn.cursor()
    #cursor.execute(query)
    #for row in cursor:
    #    train_cnt = row.cnt/2
    #pyodbc.Connection.close(cnxn)
    
    x_train, y_train = ([] for i in range(2))
    
    query = "SELECT top 100 [ttext], [ttype] FROM [dbo].[train]"
    while 1:
        mdg = data_generator(cnstr_positive, query)

        for current_set in mdg:
            for sentence in current_set[0]:
                words = tokenize(sentence)
                w = []
                #print(words)
                for word in words:
                    #print(word, map.get(word,0))
                    w.append(max_frequency-map.get(word,0))
                x_train.append(w)

            for value in current_set[1]:
                y_train.append(value)

            if len(x_train) == batch_size:
                x_train = np.array(x_train)
                #y_train = np.array(y_train)
                x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
                x_train = x_train.tolist()
                yield x_train, y_train  
                        
    #print(x_train)
    #return x_train, y_train, x_test, y_test

In [12]:
train_dg = train_data_generator()

In [13]:
next(train_dg)

([[0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   18035,
   18035,
   18035,
   18035,
   16674,
   17970,
   17837,
   18027,
   0,
   18035],
  [0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   18035,
   18035,
   12933,
   18010,
   15130,
   17697,
   17588,
   17925,
   0],
  [0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   18035,
   18035,
   17917,
   18035,
   17862,
   18024,
   17937,
   12873,
   18028,
   18035,
   18035,
   18035,
   910],
  [0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   18035,
   18035,
   17908,
   18035,
   18035,
   12933,
   18035,
   18029,
   17079,
   18035,
   18035,
   18035,
   16957,
   12873,
   18016,
   18035,
   18035,
   12229],
  [0,
   0,
   0,
   0,
   0,
   0,
  

In [14]:
def test_data_generator():
    max_frequency = max(map.values())
    
    #train_cnt = 500
    #cnxn = pyodbc.connect(cnstr_positive)
    #query = "SELECT count(*) as cnt FROM [dbo].[mixedmessages]"
    #cursor = cnxn.cursor()
    #cursor.execute(query)
    #for row in cursor:
    #    train_cnt = row.cnt/2
    #pyodbc.Connection.close(cnxn)
    
    x_test, y_test = ([] for i in range(2))
    
    query = "SELECT top 100 [ttext], [ttype] FROM [dbo].[test]"
    while 1:
        mdg = data_generator(cnstr_positive, query)

        for current_set in mdg:
            for sentence in current_set[0]:
                words = tokenize(sentence)
                w = []
                #print(words)
                for word in words:
                    #print(word, map.get(word,0))
                    w.append(max_frequency-map.get(word,0))
                x_test.append(w)

            for value in current_set[1]:
                y_test.append(value)

            if len(x_train) == batch_size:
                #x_test = np.array(x_test)
                #y_test = np.array(y_test)
                x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
                x_test = x_test.tolist()
                yield x_test, y_test    


In [None]:
test_dg = test_data_generator()
next(te)

In [15]:
print('Build model...')
model = Sequential()
model.add(Embedding(max_features, 128))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

Build model...


In [None]:
# try using different optimizers and different optimizer configs
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

print('Train...')
#model.fit(x_train, y_train,
#          batch_size=batch_size,
#          epochs=15,
#          validation_data=(x_test, y_test))
train_dg = train_data_generator()
test_dg = test_data_generator()
model.fit_generator(train_dg,steps_per_epoch = 1, epochs=1, validation_data=test_dg, validation_steps=1)
        

Train...
Epoch 1/1


In [None]:
score, acc = model.evaluate(x_test, y_test,
                            batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

In [None]:
my_example = np.array([[23, 75, 43, 225, 322]])
my_example = sequence.pad_sequences(my_example, maxlen=maxlen)

In [None]:
model.predict_classes(my_example)

In [None]:
model.predict(my_example)

In [None]:
#import pymorphy2
#morph = pymorphy2.MorphAnalyzer()