In [1]:
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np

In [2]:
class XML2DataFrame:
    def __init__(self, xml_path):
        xml_data = open(xml_path)
        self.root = ET.XML(xml_data.read())[1]

    def parse_root(self, root):
        return [self.parse_element(child) for child in iter(root)]

    def parse_element(self, element, parsed=None):
        if parsed is None:
            parsed = dict()
        for key in element.keys():
            parsed[key] = element.attrib.get(key)
        if element.text:
            parsed[element.attrib["name"]] = None if element.text == "NULL" else element.text
        for child in list(element):
            self.parse_element(child, parsed)
        return parsed

    def process_data(self):
        structure_data = self.parse_root(self.root)
        return pd.DataFrame(structure_data)

In [3]:
train_tkk = XML2DataFrame("tkk_train_2016.xml").process_data().fillna(0)

In [4]:
train_tkk_texts = train_tkk["text"].values
train_tkk_labels = train_tkk[["beeline", "komstar", "mts", "rostelecom", "skylink", "tele2"]].values

In [5]:
train_tkk_texts

array(['@mkomov Максим, Вашем письмо мы получили. Наши сотрудники свяжутся с Вами завтра и направят запрос инженерам для проверки. #билайн',
       '«Мегафон» стал владельцем 50% акций «Евросети»',
       'RT @fuckkiev: “@EvaKobb: МТС Россия прислала жителям Херсонщины сообщения, в которых обозвала украинцев фашистами? http://t.co/RbSesXlOUZ” …',
       ...,
       'RT @Olympialeigh: У МТС проблемы со связью из-за замены оборудования',
       'RT @ManisaMerkez45: У МТС проблемы со связью из-за замены оборудования',
       'RT @MertKaraoban: У МТС проблемы со связью из-за замены оборудования'],
      dtype=object)

In [6]:
test_tkk = XML2DataFrame("tkk_test_etalon.xml").process_data().fillna(0)

In [7]:
test_tkk_texts = test_tkk["text"].values
test_tkk_labels = test_tkk[["beeline", "komstar", "mts", "rostelecom", "skylink", "tele2"]].values

In [8]:
new_labels = []
for string in train_tkk_labels:
    new_labels.append(sum([int(val) for val in string]))
    if new_labels[-1]<-1:
        new_labels[-1]=-1
    elif new_labels[-1]>1:
        new_labels[-1]=1
train_tkk_labels = new_labels

new_labels = []
for string in test_tkk_labels:
    new_labels.append(sum([int(val) for val in string]))
    if new_labels[-1]<-1:
        new_labels[-1]=-1
    elif new_labels[-1]>1:
        new_labels[-1]=1
test_tkk_labels=new_labels

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(min_df=1)

In [11]:
tfidf = vectorizer.fit(train_tkk_texts)

In [12]:
from sklearn.linear_model import LogisticRegression

In [13]:
lr = LogisticRegression(n_jobs=7,random_state=42,verbose=1,warm_start=True)

In [16]:
lr.fit(tfidf.transform(train_tkk_texts), train_tkk_labels)

  " = {}.".format(effective_n_jobs(self.n_jobs)))


[LibLinear]

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn', n_jobs=7,
          penalty='l2', random_state=42, solver='warn', tol=0.0001,
          verbose=1, warm_start=True)

In [18]:
pred = list(lr.predict(tfidf.transform(test_tkk_texts)))

In [19]:
from sklearn.metrics import f1_score

In [21]:
# With no regularization except L2
print(f1_score(test_tkk_labels,pred, average='macro'))
print(f1_score(test_tkk_labels,pred, average='micro'))

0.4542176544883651
0.654650645304851


In [23]:
train_tkk_tfidf = tfidf.transform(train_tkk_texts).toarray()
test_tkk_tfidf = tfidf.transform(test_tkk_texts).toarray()

In [24]:
train_tkk_tfidf_reg = np.array([(string-string.mean())/string.std() for string in train_tkk_tfidf])

In [25]:
test_tkk_tfidf_reg = np.array([(string-string.mean())/string.std() for string in test_tkk_tfidf])

In [29]:
lr = LogisticRegression(random_state=42,verbose=1,warm_start=True,max_iter=50)

In [30]:
lr.fit(train_tkk_tfidf_reg, train_tkk_labels)



[LibLinear]

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=50, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=42, solver='warn',
          tol=0.0001, verbose=1, warm_start=True)

In [32]:
pred_reg = list(lr.predict(test_tkk_tfidf_reg))

In [34]:
print(f1_score(test_tkk_labels,pred_reg,average='macro'))
print(f1_score(test_tkk_labels,pred_reg,average='micro'))

0.5266087980476643
0.6444147752558967


Best results at SentiRueval2016: <br>
F1_macro = 0.5493 <br>
F1_micro = 0.6822

# Banks analysis

In [35]:
train_banks = XML2DataFrame('bank_train_2016.xml').process_data().fillna(0)

In [36]:
test_banks = XML2DataFrame('banks_test_etalon.xml').process_data().fillna(0)

In [37]:
train_banks_texts = train_banks['text'].values
train_banks_labels = train_banks[[
    "sberbank", "vtb", "gazprom", "alfabank", "bankmoskvy", "raiffeisen",
    "uralsib", "rshb"
]].values
new_labels = []
for string in train_banks_labels:
    new_labels.append(sum([int(val) for val in string]))
    if new_labels[-1]<-1:
        new_labels[-1]=-1
    elif new_labels[-1]>1:
        new_labels[-1]=1
train_banks_labels = new_labels

In [38]:
test_banks_texts = test_banks['text'].values
test_banks_labels = test_banks[[
    "sberbank", "vtb", "gazprom", "alfabank", "bankmoskvy", "raiffeisen",
    "uralsib", "rshb"
]].values
new_labels = []
for string in test_banks_labels:
    new_labels.append(sum([int(val) for val in string]))
    if new_labels[-1]<-1:
        new_labels[-1]=-1
    elif new_labels[-1]>1:
        new_labels[-1]=1
test_banks_labels = new_labels

In [39]:
lr = LogisticRegression(n_jobs=3,random_state=42,verbose=1,warm_start=True,max_iter=50)

In [40]:
tfidf_banks = vectorizer.fit(train_banks_texts)

In [41]:
train_banks_tfidf = tfidf_banks.transform(train_banks_texts).toarray()
test_banks_tfidf = tfidf_banks.transform(test_banks_texts).toarray()

In [42]:
lr.fit(train_banks_tfidf,train_banks_labels)

  " = {}.".format(effective_n_jobs(self.n_jobs)))


[LibLinear]

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=50, multi_class='warn', n_jobs=3,
          penalty='l2', random_state=42, solver='warn', tol=0.0001,
          verbose=1, warm_start=True)

### Banks results with no regularization

In [43]:
banks_pred = list(lr.predict(test_banks_tfidf))

In [44]:
print(f1_score(banks_pred,test_banks_labels,average='macro'))
print(f1_score(banks_pred,test_banks_labels,average='micro'))

0.4979622045828058
0.73407787503773


Best results at SentiRueval2016: <br>
F1_macro = 0.5252 <br>
F1_micro = 0.5881

### Regularization for banks tf/idf

In [42]:
train_banks_tfidf_reg = np.array([(string-string.mean())/string.std() for string in train_banks_tfidf])
test_banks_tfidf_reg = np.array([(string-string.mean())/string.std() for string in test_banks_tfidf])

In [43]:
lr = LogisticRegression(n_jobs=3,random_state=42,verbose=1,warm_start=True,max_iter=50)

In [44]:
lr.fit(train_banks_tfidf_reg,train_banks_labels)

  " = {}.".format(self.n_jobs))


[LibLinear]

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=50, multi_class='ovr', n_jobs=3,
          penalty='l2', random_state=42, solver='liblinear', tol=0.0001,
          verbose=1, warm_start=True)

### Prediction for banks with regularization

In [45]:
banks_pred_reg = list(lr.predict(test_banks_tfidf_reg))

In [49]:
print(f1_score(banks_pred_reg,test_banks_labels,average='macro'))
print(f1_score(banks_pred_reg,test_banks_labels,average='micro'))

0.29355505573876134
0.7214005433142169


  'precision', 'predicted', average, warn_for)


Best results at SentiRueval2016: <br>
F1_macro = 0.5252 <br>
F1_micro = 0.5881

In [50]:
lr = LogisticRegression(n_jobs=3,random_state=42,verbose=1,warm_start=True,max_iter=50)

# Using all train tweets for both banks and tkk with logreg and no regularization

In [45]:
total_train_labels = train_banks_labels
total_train_labels.extend(train_tkk_labels)

In [46]:
total_train_texts = []
total_train_texts.extend(train_banks_texts)
total_train_texts.extend(train_tkk_texts)

In [47]:
total_tfidf = vectorizer.fit(total_train_texts)
total_train_tfidf = total_tfidf.transform(total_train_texts)

In [48]:
lr.fit(total_train_tfidf,total_train_labels)

  " = {}.".format(effective_n_jobs(self.n_jobs)))


[LibLinear]

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=50, multi_class='warn', n_jobs=3,
          penalty='l2', random_state=42, solver='warn', tol=0.0001,
          verbose=1, warm_start=True)

### Results for test_banks

In [49]:
tot_pred = list(lr.predict(total_tfidf.transform(test_banks_texts)))

In [50]:
print(f1_score(tot_pred,test_banks_labels,average='macro'))
print(f1_score(tot_pred,test_banks_labels,average='micro'))

0.5382323874148384
0.7524901901599759


Best results at SentiRueval2016: <br>
F1_macro = 0.5252 <br>
F1_micro = 0.5881

### Results for test_tkk

In [52]:
tot_pred = list(lr.predict(total_tfidf.transform(test_tkk_texts)))

In [54]:
print(f1_score(tot_pred,test_tkk_labels,average='macro'))
print(f1_score(tot_pred,test_tkk_labels,average='micro'))

0.4802356659930958
0.6653315531820204


Best results at SentiRueval2016: <br>
F1_macro = 0.5493 <br>
F1_micro = 0.6813

# CNN aproach

In [59]:
import os
os.environ['KERAS_BACKEND'] = 'theano'
from keras.models import Sequential
from keras.layers import Conv1D, Embedding, Dropout, Dense, GlobalAveragePooling1D, MaxPooling1D, Softmax

Using Theano backend.


In [60]:
from gensim.models import FastText

In [75]:
from keras.preprocessing.sequence import pad_sequences
wvs = np.array([lang_model.wv[sent] for sent in prepared_for_embed])
size= 50
padded = pad_sequences(wvs,padding='post', value=np.full(size,0.0))

from pandas import get_dummies

In [71]:
test_banks_wvs = []
for sent in test_banks_texts:
    prep_sent = []
    for word in sent.split():
        try:
            prep_sent.append(lang_model.wv[word])
        except KeyError:
            pass
    test_banks_wvs.append(np.array(prep_sent))
test_banks_wvs = np.array(test_banks_wvs)

test_banks_padded = pad_sequences(test_banks_wvs,padding='post', maxlen=30, value=np.full(size,0.0))

test_tkk_wvs = []
for sent in test_tkk_texts:
    prep_sent = []
    for word in sent.split():
        try:
            prep_sent.append(lang_model.wv[word])
        except KeyError:
            pass
    test_tkk_wvs.append(np.array(prep_sent))
test_tkk_wvs = np.array(test_tkk_wvs)

test_tkk_padded = pad_sequences(test_tkk_wvs,padding='post', maxlen=30, value=np.full(size,0.0))

In [72]:
def f1_macro_from_dummies(pred,gt, average='macro'):
    gt_val = [np.argmax(string)-1 for string in gt]
    pred_val = [np.argmax(string)-1 for string in pred]
    return mean(np.array([f1_score(gt_val, pred_val, average=average)]))
def f1_micro_from_dummies(pred,gt, average='micro'):
    gt_val = [np.argmax(string)-1 for string in gt]
    pred_val = [np.argmax(string)-1 for string in pred]
    return mean(np.array([f1_score(gt_val, pred_val, average=average)]))

In [73]:
size= 50
lang_model = FastText(min_count=1,window=5,workers=7,size=size)
prepared_for_embed = [sent.split() for sent in total_train_texts]
lang_model.build_vocab(prepared_for_embed)
lang_model.train(prepared_for_embed, total_examples=lang_model.corpus_count, epochs=10)

In [64]:
model = Sequential()
seq_length=30
model.add(Conv1D(64, 10, activation='relu',input_shape=(seq_length, size)))
model.add(MaxPooling1D(10))
model.add(GlobalAveragePooling1D())
model.add(Dropout(0.5))
model.add(Dense(3, activation='sigmoid'))

model.compile(loss='categorical_crossentropy',
              optimizer='adagrad',
              metrics=['accuracy'])

In [79]:
model.fit(padded, get_dummies(total_train_labels).values,
          validation_data=(test_banks_padded,
                           get_dummies(test_banks_labels).values),
          batch_size=128, epochs=20)

Train on 18035 samples, validate on 3313 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
 4096/18035 [=====>........................] - ETA: 15s - loss: 0.5855 - acc: 0.7385

KeyboardInterrupt: 

In [80]:
cnn_pred = model.predict(test_banks_padded)
cnn_pred_val = [np.argmax(string)-1 for string in cnn_pred]

INFO (theano.gof.compilelock): Refreshing lock /home/zubkov/.theano/compiledir_Linux-4.4--Microsoft-x86_64-with-Ubuntu-16.04-xenial-x86_64-3.7.0-64/lock_dir/lock


In [81]:
print(f1_score(cnn_pred_val,test_banks_labels, average='macro'))
print(f1_score(cnn_pred_val,test_banks_labels, average='micro'))

0.3866006424419824
0.5759130697253245


  'recall', 'true', average, warn_for)


In [84]:
cnn_pred = model.predict(test_tkk_padded)
cnn_pred_val = [np.argmax(string)-1 for string in cnn_pred]

In [85]:
print(f1_score(cnn_pred_val,test_tkk_labels, average='macro'))
print(f1_score(cnn_pred_val,test_tkk_labels, average='micro'))

0.3588088335999437
0.5331553182020472


In [94]:
from keras import backend as K

In [15]:
def f1(y_true, y_pred):
    def recall(y_true, y_pred):
        """Recall metric.

        Only computes a batch-wise average of recall.

        Computes the recall, a metric for multi-label classification of
        how many relevant items are selected.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        recall = true_positives / (possible_positives + K.epsilon())
        return recall

    def precision(y_true, y_pred):
        """Precision metric.

        Only computes a batch-wise average of precision.

        Computes the precision, a metric for multi-label classification of
        how many selected items are relevant.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        precision = true_positives / (predicted_positives + K.epsilon())
        return precision
    precision = precision(y_true, y_pred)
    recall = recall(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

In [14]:
def f1_loss(y_true,y_pred):
    def recall(y_true, y_pred):
        """Recall metric.

        Only computes a batch-wise average of recall.

        Computes the recall, a metric for multi-label classification of
        how many relevant items are selected.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        recall = true_positives / (possible_positives + K.epsilon())
        return recall

    def precision(y_true, y_pred):
        """Precision metric.

        Only computes a batch-wise average of precision.

        Computes the precision, a metric for multi-label classification of
        how many selected items are relevant.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        precision = true_positives / (predicted_positives + K.epsilon())
        return precision
    precision = precision(y_true, y_pred)
    recall = recall(y_true, y_pred)
    return (precision+recall+K.epsilon())/(2*((precision*recall)))

In [86]:
from gensim.models import FastText

In [95]:
pretrained = FastText.load('araneum_none_fasttextskipgram_300_5_2018/araneum_none_fasttextskipgram_300_5_2018.model')

In [122]:
model = Sequential()
seq_length=30
model.add(Conv1D(64, 10, activation='relu',input_shape=(seq_length, pretrained.vector_size)))
model.add(MaxPooling1D(10))
model.add(GlobalAveragePooling1D())
model.add(Dropout(0.5))
model.add(Dense(3, activation='sigmoid'))

model.compile(loss='categorical_crossentropy',
              optimizer='adagrad',
              metrics=['accuracy'])

In [117]:
from keras.preprocessing.sequence import pad_sequences
wvs = []
for sent in prepared_for_embed:
    sent_vec = []
    for word in sent:
        try:
            sent_vec.append(pretrained.wv[word])
        except KeyError:
            pass
    wvs.append(np.array(sent_vec))
wvs = np.array(wvs)

In [None]:
padded = pad_sequences(wvs, padding='post', maxlen=30, value=np.full(pretrained.vector_size,0.0))

In [124]:
lang_model = pretrained
test_banks_wvs = []
for sent in test_banks_texts:
    prep_sent = []
    for word in sent.split():
        try:
            prep_sent.append(lang_model.wv[word])
        except KeyError:
            pass
    test_banks_wvs.append(np.array(prep_sent))
test_banks_wvs = np.array(test_banks_wvs)

test_banks_padded = pad_sequences(test_banks_wvs,padding='post', maxlen=30, value=np.full(lang_model.vector_size,0.0))

test_tkk_wvs = []
for sent in test_tkk_texts:
    prep_sent = []
    for word in sent.split():
        try:
            prep_sent.append(lang_model.wv[word])
        except KeyError:
            pass
    test_tkk_wvs.append(np.array(prep_sent))
test_tkk_wvs = np.array(test_tkk_wvs)

test_tkk_padded = pad_sequences(test_tkk_wvs,padding='post', maxlen=30, value=np.full(lang_model.vector_size,0.0))

In [125]:
model.fit(padded, get_dummies(total_train_labels).values,
          validation_data=(test_banks_padded,
                           get_dummies(test_banks_labels).values),
          batch_size=128, epochs=20)

Train on 18035 samples, validate on 3313 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20

KeyboardInterrupt: 

(18035, 30, 300)