# Imports

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import tflearn
import re

from collections import Counter
from sklearn.model_selection import train_test_split
from tflearn.data_utils import to_categorical
from nltk.stem.snowball import RussianStemmer
from nltk.tokenize import TweetTokenizer

# Constants

In [1]:
# Upper bound on the number of stems kept in the vocabulary.
VOCAB_SIZE = 5000

# Corpus file names, one per sentiment class.
NEGATIVE_TWEETS_CSV = 'negative.csv'
POSITIVE_TWEETS_CSV = 'positive.csv'

# Load data

In [2]:
# With header=None pandas labels columns 0..n-1, so this is the label of the
# column that is kept from each corpus file.
tweets_col_number = 3


def load_tweets(csv_path, text_col=tweets_col_number):
    """Read one corpus file and keep only the tweet-text column.

    Args:
        csv_path: path of a ';'-delimited CSV file without a header row.
        text_col: integer label of the column to keep.

    Returns:
        A single-column DataFrame whose only column is labelled ``text_col``.
    """
    return pd.read_csv(csv_path, header=None, delimiter=';')[[text_col]]


# Use the file-name constants instead of repeating the literals.
negative_tweets = load_tweets(NEGATIVE_TWEETS_CSV)
positive_tweets = load_tweets(POSITIVE_TWEETS_CSV)

NameError: name 'pd' is not defined

# Stemmer

In [3]:
# Memo of already computed stems, filled in by get_stem().
stem_cache = dict()

# Matches every character that is neither a space nor in the а-я / А-Я ranges.
# NOTE(review): 'ё' and 'Ё' lie outside both ranges, so they are matched (and
# therefore stripped by get_stem) too — confirm this is intended.
regex = re.compile('[^а-яА-Я ]')

# Shared stemmer instance used for every token.
stemer = RussianStemmer()

def get_stem(token):
    """Return the stem of ``token``, memoised in ``stem_cache``.

    The token is stripped of every character matched by ``regex``,
    lower-cased, and passed to ``stemer.stem``.

    The cache is keyed by the *raw* token, i.e. the same value that is used
    for the lookup. Previously the result was stored under the cleaned token,
    so any token that needed cleaning (upper case, punctuation, Latin
    letters, ...) was re-stemmed on every call. Membership is tested
    explicitly so that an empty stem is a valid cached value.

    Args:
        token: raw token string.

    Returns:
        The stem as a string; empty when nothing is left after cleaning and
        the stemmer returns an empty string for it.
    """
    try:
        return stem_cache[token]
    except KeyError:
        pass

    cleaned = regex.sub('', token).lower()
    stem = stemer.stem(cleaned)
    stem_cache[token] = stem
    return stem

NameError: name 'RussianStemmer' is not defined

# Vocabulary creation

In [4]:
# Tokenizer shared by the vocabulary pass and by tweet_to_vector().
tokenizer = TweetTokenizer()

# Frequency of every stem seen in the corpus; reset whenever this cell runs.
stem_count = Counter()

def count_unique_tokens_in_tweets(tweets):
    """Add the stem frequencies of every tweet in ``tweets`` to ``stem_count``.

    Args:
        tweets: DataFrame whose column labelled ``tweets_col_number`` holds
            the tweet texts.

    Returns:
        None. The module-level ``stem_count`` Counter is updated in place.
    """
    # Walk the text column directly; iterrows() would build a Series per row
    # only to read a single cell from it.
    for tweet in tweets[tweets_col_number]:
        stem_count.update(
            get_stem(token) for token in tokenizer.tokenize(tweet))

# Accumulate stem counts over both corpora, negatives first.
for _frame in (negative_tweets, positive_tweets):
    count_unique_tokens_in_tweets(_frame)

NameError: name 'Counter' is not defined

In [6]:
# Number of distinct stems counted so far (one Counter key per stem).
print("Total unique stems found: ", len(stem_count))

Total unique stems found:  91780


In [5]:
# Keep the VOCAB_SIZE most frequent stems, most frequent first.
vocab = [stem for stem, _ in stem_count.most_common(VOCAB_SIZE)]
print(vocab[:100])

NameError: name 'stem_count' is not defined

In [10]:
# Spot-check: show the stem at one vocabulary position together with its count.
idx = 2
print("stem: {}, count: {}"
      .format(vocab[idx], stem_count.get(vocab[idx])))

stem: я, count: 66045


In [11]:
# Map each vocabulary stem to its position in `vocab`.
# Enumerating `vocab` itself (rather than range(VOCAB_SIZE)) avoids an
# IndexError when the corpus yields fewer than VOCAB_SIZE distinct stems.
token_2_idx = {stem: position for position, stem in enumerate(vocab)}
len(token_2_idx)

5000

In [13]:
# Spot-check: vocabulary index of one stem (raises KeyError if it is absent).
token_2_idx['сказа']

99

In [14]:
def tweet_to_vector(tweet, show_unknowns=False):
    """Encode ``tweet`` as a binary presence vector over the vocabulary.

    Args:
        tweet: tweet text.
        show_unknowns: when true, print every token whose stem is not a key
            of ``token_2_idx``.

    Returns:
        A 1-D ``np.int_`` array of length ``VOCAB_SIZE`` holding 1 at the
        index of each vocabulary stem found in the tweet and 0 elsewhere.
    """
    bag = np.zeros(VOCAB_SIZE, dtype=np.int_)
    for raw_token in tokenizer.tokenize(tweet):
        position = token_2_idx.get(get_stem(raw_token))
        if position is None:
            # Out-of-vocabulary stem: optionally report it, never encode it.
            if show_unknowns:
                print("Unknown token: {}".format(raw_token))
            continue
        bag[position] = 1
    return bag

In [21]:
# Demo: encode the second negative tweet and show the first ten vector
# entries, plus the stem at vocabulary position 5 for comparison.
tweet = negative_tweets[3].iloc[1]
print("tweet: {}".format(tweet))
print("vector: {}".format(tweet_to_vector(tweet)[:10]))
print(vocab[5])

tweet: Коллеги сидят рубятся в Urban terror, а я из-за долбанной винды не могу :(
vector: [1 1 1 0 1 0 1 0 0 0]
на


# Converting Tweets to vectors

In [22]:
# All tweet texts in one list: negatives first, positives after them. The
# label array built below relies on this order.
tweets = (list(negative_tweets[tweets_col_number])
          + list(positive_tweets[tweets_col_number]))

# One presence vector per tweet, in the same order as `tweets`.
# NOTE(review): np.int_ is presumably 64-bit here, which makes this matrix
# large for a full corpus; a narrower dtype would cut memory — confirm that
# downstream code accepts it before changing.
tweet_vectors = np.zeros((len(tweets), VOCAB_SIZE), dtype=np.int_)
for row, text in enumerate(tweets):
    tweet_vectors[row] = tweet_to_vector(text)

# Preparing labels

In [23]:
# Class ids aligned with the rows of tweet_vectors:
# 0 for every negative tweet, followed by 1 for every positive tweet.
negative_ids = np.zeros(len(negative_tweets), dtype=np.int_)
positive_ids = np.ones(len(positive_tweets), dtype=np.int_)
labels = np.concatenate((negative_ids, positive_ids))

In [24]:
# First ten labels (the negative block comes first).
labels[:10]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [25]:
# Last ten labels (the positive block comes last).
labels[-10:]

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

# Preparing the data for the training

In [26]:
# Fixed seed so the train/test partition (and every metric computed from it)
# is the same on each run of the notebook.
RANDOM_SEED = 42

X = tweet_vectors
# One-hot encode the two classes: column 0 = negative, column 1 = positive.
y = to_categorical(labels, 2)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=RANDOM_SEED)

In [27]:
# Peek at the first ten one-hot encoded test labels.
print(y_test[:10])

[[ 1.  0.]
 [ 1.  0.]
 [ 1.  0.]
 [ 0.  1.]
 [ 1.  0.]
 [ 0.  1.]
 [ 0.  1.]
 [ 0.  1.]
 [ 1.  0.]
 [ 0.  1.]]


# Building the NN

In [28]:
def build_model(learning_rate=0.1):
    """Build the feed-forward sentiment classifier.

    The default TensorFlow graph is reset first. The network takes inputs of
    width ``VOCAB_SIZE``, passes them through two fully connected 'ReLU'
    layers (125 and 25 units) and ends in a 2-unit softmax layer. Training is
    configured with SGD and categorical cross-entropy.

    Args:
        learning_rate: learning rate handed to the SGD optimizer.

    Returns:
        A ``tflearn.DNN`` wrapping the softmax output layer.
    """
    tf.reset_default_graph()

    layer = tflearn.input_data([None, VOCAB_SIZE])
    for units in (125, 25):
        layer = tflearn.fully_connected(layer, units, activation='ReLU')
    softmax_out = tflearn.fully_connected(layer, 2, activation='softmax')

    # The return value is intentionally not kept: the DNN below is built from
    # the softmax layer itself.
    tflearn.regression(
        softmax_out,
        optimizer='sgd',
        learning_rate=learning_rate,
        loss='categorical_crossentropy')

    return tflearn.DNN(softmax_out)

In [29]:
# Instantiate the classifier with a learning rate of 0.75 instead of the 0.1 default.
model = build_model(learning_rate=0.75)

In [30]:
# Train for 30 epochs in mini-batches of 128 samples.
# NOTE(review): validation_set=0.1 presumably holds out 10% of the training
# data for validation — confirm against the tflearn documentation.
model.fit(
    X_train, 
    y_train, 
    validation_set=0.1, 
    show_metric=True, 
    batch_size=128, 
    n_epoch=30)

Training Step: 33509  | total loss: [1m[32m0.27954[0m[0m | time: 16.567s
| SGD | epoch: 030 | loss: 0.27954 - acc: 0.9419 -- iter: 142848/142904
Training Step: 33510  | total loss: [1m[32m0.26963[0m[0m | time: 17.970s
| SGD | epoch: 030 | loss: 0.26963 - acc: 0.9399 | val_loss: 1.13499 - val_acc: 0.6951 -- iter: 142904/142904
--


# Testing

In [31]:
# Column 0 of both the model output and the one-hot labels is the negative
# class, so accuracy is measured on the "is negative" indicator.
negative_probs = np.array(model.predict(X_test))[:, 0]
predictions = (negative_probs >= 0.5).astype(np.int_)
accuracy = np.mean(predictions == y_test[:, 0], axis=0)
print("Accuracy: ", accuracy)

Accuracy:  0.691848760488


In [33]:
def test_tweet(tweet):
    tweet_vector = tweet_to_vector(tweet, True)
    positive_prob = model.predict([tweet_vector])[0][1]
    print('Original tweet: {}'.format(tweet))
    print('P(positive) = {:.5f}. Result: '.format(positive_prob), 
          'Positive' if positive_prob > 0.5 else 'Negative')

In [34]:
def test_tweet_number(idx):
    test_tweet(tweets[idx])

In [36]:
# Classify the corpus tweet stored at index 120705 of `tweets`.
test_tweet_number(120705)

Unknown token: обладает
Unknown token: извлечь
Unknown token: выгоду
Original tweet: Он, якобы, обладает информацией, и может извлечь из нее выгоду. ::-) #RU_FF #FF_RU
P(positive) = 0.63527. Result:  Positive


# Real life testing

In [47]:
# Hand-written examples that are not taken from the corpus.
tweets_for_testing = ["меня оштрафовали по дороге домой"]

# Classify each example and print a separator line after it.
for tweet in tweets_for_testing:
    test_tweet(tweet)
    print("---------")

Unknown token: оштрафовали
Original tweet: меня оштрафовали по дороге домой
P(positive) = 0.00599. Result:  Negative
---------


# Links
* [Скачать корпус твитов](http://study.mokoron.com);
* [Ю. В. Рубцова. Построение корпуса текстов для настройки тонового классификатора // Программные продукты и системы, 2015, №1(109), –С.72-78](http://www.swsys.ru/index.php?page=article&id=3962&lang=);