# Dependencies
Let's download and import all the needed libraries.
- **NumPy** is needed for intermediate computations,
- **Pandas** is needed to store data,
- **Python's RE** is needed for text preprocessing,
- **lxml** is needed to parse data files stored in XML format,
- **Scikit-learn** is needed to evaluate F1 score,
- **MXNet** is chosen as the deep learning framework,
- **google.colab.drive** is needed to mount Google Drive with all the data.

In [0]:
%%capture
# Install MXNet into the runtime; %%capture above hides pip's output.
# NOTE(review): the package version is not pinned, so a re-run may pull a
# different MXNet release than the one that produced the recorded outputs.
!pip install mxnet-cu100

In [0]:
import numpy as np
import pandas as pd

import re
import lxml.etree
from sklearn.metrics import f1_score

import mxnet as mx
import mxnet.ndarray as nd
import mxnet.gluon as gluon
import mxnet.autograd as autograd

In [3]:
# Mount Google Drive into the Colab filesystem so the data files can be read.
from google.colab import drive
drive.mount('/content/gdrive')
# Base folder (on the mounted drive) that the dataset paths below are built from.
DRIVE_PATH = '/content/gdrive/My Drive/NLP Data/'

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


# Data Loading
The data (tweets and their sentiments) has to be extracted from XML files.
Then, before any algorithm can be applied, the data has to be normalized.

**Tweets normalization**
- delete URLs
- delete user tags
- delete every character that is not a Russian letter, a digit, one of `()!?-` or a space

**Sentiments normalization**

In the provided data, sentiment is given per company, so the per-company sentiments have to be combined into a single sentiment for the whole tweet. When a tweet carries several sentiments, they are summed and the sign of the sum (-1, 0 or 1) is used as the sentiment of the tweet.

In [0]:
def normalize_tweet(string):
    """Strip URLs, user tags and unsupported characters from a tweet.

    The substitutions are applied in order:
      1. anything starting with ``http`` up to the next whitespace
         (the whitespace that follows it is removed as well),
      2. anything starting with ``@`` up to the next whitespace
         (again together with the whitespace that follows),
      3. every character that is not a Russian letter, a digit,
         one of ``()!?-`` or a space.

    Returns the cleaned string.
    """
    removal_patterns = (
        r'(?:http[^\s]+)($|\s)',
        r'(?:@[^\s]+)($|\s)',
        r'[^абвгдеёжзийклмнопрстуфхцчшщъыьэюяАБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ0123456789()!?\- ]',
    )
    cleaned = string
    for pattern in removal_patterns:
        cleaned = re.sub(pattern, '', cleaned)
    return cleaned

def normalize_sentiment(raw_sentiments_on_companies):
    """Collapse per-company sentiment entries into one tweet-level sentiment.

    Each entry must expose a ``text`` attribute. Entries whose text is the
    literal ``'NULL'`` are ignored; every other text is parsed with ``int``.
    The parsed values are summed and the sign of the sum is returned
    (-1, 0 or 1), so an empty or all-``'NULL'`` input yields 0.
    """
    total = sum(
        int(entry.text)
        for entry in raw_sentiments_on_companies
        if entry.text != 'NULL'
    )
    return np.sign(total)

def load_dataframe(filename, n_companies):
    """Read an XML dump and return a dataframe of normalized tweets and sentiments.

    Every element matched by the XPath ``database/table`` is one sample:
    the text of its child at position 3 is the tweet, and the ``n_companies``
    children starting at position 4 hold the per-company sentiments.

    Parameters:
        filename: path of the XML file to parse.
        n_companies: how many per-company sentiment children to read per sample.

    Returns:
        ``pd.DataFrame`` with columns ``tweet`` (cleaned text) and ``sent``
        (sign of the summed sentiments).
    """
    samples = lxml.etree.parse(filename).xpath('database/table')
    first_sentiment, last_sentiment = 4, 4 + n_companies

    cleaned_tweets = [normalize_tweet(row[3].text) for row in samples]
    tweet_sentiments = [
        normalize_sentiment(row[first_sentiment:last_sentiment])
        for row in samples
    ]
    return pd.DataFrame({'tweet': cleaned_tweets, 'sent': tweet_sentiments})

In [0]:
# Bank tweets: 8 per-company sentiment fields are read for every sample.
bank_train, bank_test = (
    load_dataframe(DRIVE_PATH + xml_file, n_companies=8)
    for xml_file in ('SentiRuEval/bank_train_2016.xml',
                     'SentiRuEval/bank_test_etalon.xml')
)

# Telecom tweets: 7 per-company sentiment fields are read for every sample.
comm_train, comm_test = (
    load_dataframe(DRIVE_PATH + xml_file, n_companies=7)
    for xml_file in ('SentiRuEval/tkk_train_2016.xml',
                     'SentiRuEval/tkk_test_etalon.xml')
)

Let's see the data we are working with.

In [6]:
# Preview the first rows of the normalized bank training set.
bank_train.head(20)

Unnamed: 0,tweet,sent
0,Взять кредит тюмень альфа банк,0
1,Мнение о кредитной карте втб 24,0
2,Райффайзенбанк Снижение ключевой ставки ЦБ на ...,0
3,Современное состояние кредитного поведения в р...,0
4,Главное чтоб банки СБЕР и ВТБ!!!,1
5,Оформить краткосрочный кредит оао банк москвы,0
6,Самый выгодный автокредит в втб 24,1
7,Кредит иногородним в москве сбербанк,0
8,Кредитный калькулятор россельхозбанк чита,0
9,Легко можно получить денежный кредит ы втб 24 ...,1


# Character CNN

### Text quantization

In [0]:
# Every tweet is cut or padded to exactly this many characters.
TWEET_MAX_LENGTH = 140

# Alphabet of the one-hot encoding, stored as an array of single characters:
# Russian letters in both cases, digits, a few punctuation marks and the space.
CHARS = np.array(list(
    'абвгдеёжзийклмнопрстуфхцчшщъыьэюяАБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ0123456789()!?- '
))

def one_hot_by_letters(tweet, max_length=None, alphabet=None):
    """One-hot encode a tweet character by character.

    The tweet is truncated to ``max_length`` characters and, if shorter,
    right-padded with ``'~'``. The padding character is expected not to be
    part of the alphabet, so padded positions (like any other character that
    is missing from the alphabet) become all-zero columns.

    Parameters:
        tweet: the string to encode.
        max_length: number of character positions in the output;
            defaults to the module-level ``TWEET_MAX_LENGTH``.
        alphabet: string or sequence of single characters defining the rows
            of the encoding; defaults to the module-level ``CHARS``.

    Returns:
        ``np.float32`` array of shape ``(len(alphabet), max_length)`` where
        element ``[r, c]`` is 1.0 iff character ``c`` of the padded tweet
        equals alphabet entry ``r``.
    """
    if max_length is None:
        max_length = TWEET_MAX_LENGTH
    if alphabet is None:
        alphabet = CHARS
    alphabet = np.array(list(alphabet))

    # Slice first, then pad: this never computes a negative padding length,
    # so no clamping of ``max_length - len(tweet)`` is needed.
    padded_tweet = tweet[:max_length].ljust(max_length, '~')
    tweet_as_array = np.array(list(padded_tweet))

    # Broadcast (n_chars, 1) against (1, max_length) to get the match grid.
    return (alphabet[:, np.newaxis] == tweet_as_array[np.newaxis, :]).astype(np.float32)

def one_hot_label(sentiment):
    """One-hot encode a sentiment from {-1, 0, 1} as a length-3 float32 vector.

    The sentiment is shifted by one to obtain the class index, so -1 maps to
    position 0, 0 to position 1 and 1 to position 2.
    """
    class_index = sentiment + 1
    is_class = np.equal(np.arange(3), class_index)
    return is_class.astype(np.float32)

### Preprocess data for character-level CNN
Here we can choose dataset for our investigation:
1. Tweets about banks,
2. Tweets about telecommunication companies.

In [0]:
# Choose the [train, test] pair of dataframes to work with; swap which of the
# two lines is commented out to switch between the bank and telecom datasets.
dataframes = [bank_train, bank_test]
#dataframes = [comm_train, comm_test]

In [0]:
# Element 0 of `dataframes` is the training set, element 1 is the test set.
_train_frame, _test_frame = dataframes

# Features: (n_samples, alphabet size, tweet length); labels: (n_samples, 3).
X_train = nd.zeros((len(_train_frame), len(CHARS), TWEET_MAX_LENGTH))
y_train = nd.zeros((len(_train_frame), 3))

X_test = nd.zeros((len(_test_frame), len(CHARS), TWEET_MAX_LENGTH))
y_test = nd.zeros((len(_test_frame), 3))

# Fill the tensors from the *selected* dataframes. The row count comes from
# the frame being encoded (not from a specific dataset), so switching
# `dataframes` to another dataset keeps tensors and loops in sync.
# Rows are walked positionally, so the dataframe index labels do not matter.
for _frame, _features, _labels in ((_train_frame, X_train, y_train),
                                   (_test_frame, X_test, y_test)):
    for _row, (_tweet, _sentiment) in enumerate(zip(_frame.tweet, _frame.sent)):
        _features[_row] = one_hot_by_letters(_tweet)
        _labels[_row] = one_hot_label(_sentiment)

In [0]:
# Number of samples per mini-batch, shared by both loaders.
BATCH_SIZE = 1024

# Pair every feature tensor with its label tensor.
train_dataset = gluon.data.ArrayDataset(X_train, y_train)
test_dataset = gluon.data.ArrayDataset(X_test, y_test)

# NOTE(review): the training loader does not shuffle, so every epoch sees the
# samples in the same order — confirm this is intended.
train_dataloader = gluon.data.DataLoader(
    train_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_dataloader = gluon.data.DataLoader(
    test_dataset, batch_size=BATCH_SIZE, shuffle=False)

### Network configuration

In [0]:
net = gluon.nn.HybridSequential()
with net.name_scope():
    # Two wide convolutions (kernel 7), each followed by max-pooling.
    for _ in range(2):
        net.add(gluon.nn.Conv1D(channels=256, kernel_size=7, activation='relu'),
                gluon.nn.MaxPool1D(pool_size=3))

    # Four narrow convolutions (kernel 3), then one more pooling and flatten.
    for _ in range(4):
        net.add(gluon.nn.Conv1D(channels=256, kernel_size=3, activation='relu'))
    net.add(gluon.nn.MaxPool1D(pool_size=3),
            gluon.nn.Flatten())

    # Two fully connected layers, each regularized with dropout.
    for _ in range(2):
        net.add(gluon.nn.Dense(units=1024, activation='relu'),
                gluon.nn.Dropout(0.5))

    # Output layer: one unit per sentiment class.
    net.add(gluon.nn.Dense(units=3))

net.hybridize()

# Labels are one-hot vectors rather than class indices, hence sparse_label=False.
softmax = gluon.loss.SoftmaxCrossEntropyLoss(sparse_label=False)

### Training

In [0]:
# (Re)initialize all weights on the GPU; force_reinit makes re-running this
# cell start training from scratch instead of keeping existing parameters.
weight_init = mx.init.Normal(sigma=0.05)
net.initialize(weight_init, ctx=mx.gpu(), force_reinit=True)

# Adam optimizer over every parameter of the network.
optimizer_settings = {'learning_rate': 1e-4}
trainer = gluon.Trainer(net.collect_params(), 'adam', optimizer_settings)

In [13]:
train_ctx = mx.gpu()
n_epochs, report_every = 200, 10

# Make sure the parameters live on the GPU (the evaluation cell moves them to CPU).
net.collect_params().reset_ctx(train_ctx)

for epoch_number in range(1, n_epochs + 1):
    epoch_loss = 0
    for batch_features, batch_labels in train_dataloader:
        batch_features = batch_features.as_in_context(train_ctx)
        batch_labels = batch_labels.as_in_context(train_ctx)

        with autograd.record(train_mode=True):
            batch_loss = softmax(net(batch_features), batch_labels)
        batch_loss.backward()

        epoch_loss += nd.mean(batch_loss).asscalar()
        # The actual batch length is passed because the last batch may be smaller.
        trainer.step(batch_size=len(batch_labels))

    # Report the mean of the per-batch losses every `report_every` epochs.
    if epoch_number % report_every == 0:
        print('epoch', epoch_number, '-- train loss', epoch_loss / len(train_dataloader))

epoch 10 -- train loss 0.5998935788869858
epoch 20 -- train loss 0.40318554937839507
epoch 30 -- train loss 0.19931847974658012
epoch 40 -- train loss 0.09152067825198174
epoch 50 -- train loss 0.06047657029703259
epoch 60 -- train loss 0.21127912104129792
epoch 70 -- train loss 0.013271485595032573
epoch 80 -- train loss 0.008066454855725168
epoch 90 -- train loss 0.0054689147509634495
epoch 100 -- train loss 0.004935975081752986
epoch 110 -- train loss 0.004122918611392379
epoch 120 -- train loss 0.0028939835727214815
epoch 130 -- train loss 0.003715270553948358
epoch 140 -- train loss 0.004006528505124151
epoch 150 -- train loss 0.0023377157951472325
epoch 160 -- train loss 0.003948430751916021
epoch 170 -- train loss 0.0032697880611522122
epoch 180 -- train loss 0.0033823663368821146
epoch 190 -- train loss 0.003247474985255394
epoch 200 -- train loss 0.0028242381958989426


### Testing
For testing we will use only positive and negative tweets ignoring neutral ones.

In [14]:
# Move the parameters to the CPU and predict the whole test set in one pass.
net.collect_params().reset_ctx(mx.cpu())
y_pred = nd.argmax(net(X_test), axis=1).asnumpy()
y_true = nd.argmax(y_test, axis=1).asnumpy()

# Keep only tweets whose true class is 0 (negative) or 2 (positive);
# class 1 (neutral) is excluded from the evaluation.
mask = np.isin(y_true, (0, 2))
true_kept, pred_kept = y_true[mask], y_pred[mask]

accuracy = np.mean(true_kept == pred_kept)
f1_macro = f1_score(true_kept, pred_kept, average='macro', labels=(0,2))
f1_micro = f1_score(true_kept, pred_kept, average='micro', labels=(0,2))

for metric_title, metric_value in (('Accuracy:', accuracy),
                                   ('F1-macro:', f1_macro),
                                   ('F1-micro:', f1_micro)):
    print(metric_title, metric_value)

Accuracy: 0.31115276476101217
F1-macro: 0.3620841136369086
F1-micro: 0.4328552803129074
