In [2]:
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np

In [3]:
class XML2DataFrame:
    def __init__(self, xml_path):
        xml_data = open(xml_path)
        self.root = ET.XML(xml_data.read())[1]

    def parse_root(self, root):
        return [self.parse_element(child) for child in iter(root)]

    def parse_element(self, element, parsed=None):
        if parsed is None:
            parsed = dict()
        for key in element.keys():
            parsed[key] = element.attrib.get(key)
        if element.text:
            parsed[element.attrib["name"]] = None if element.text == "NULL" else element.text
        for child in list(element):
            self.parse_element(child, parsed)
        return parsed

    def process_data(self):
        structure_data = self.parse_root(self.root)
        return pd.DataFrame(structure_data)

In [4]:
train = XML2DataFrame("tkk_train_2016.xml").process_data().fillna(0)

In [5]:
train_texts = train["text"].values
train_labels = train[["beeline", "komstar", "mts", "rostelecom", "skylink", "tele2"]].values

In [5]:
train_texts

array(['@mkomov Максим, Вашем письмо мы получили. Наши сотрудники свяжутся с Вами завтра и направят запрос инженерам для проверки. #билайн',
       '«Мегафон» стал владельцем 50% акций «Евросети»',
       'RT @fuckkiev: “@EvaKobb: МТС Россия прислала жителям Херсонщины сообщения, в которых обозвала украинцев фашистами? http://t.co/RbSesXlOUZ” …',
       ...,
       'RT @Olympialeigh: У МТС проблемы со связью из-за замены оборудования',
       'RT @ManisaMerkez45: У МТС проблемы со связью из-за замены оборудования',
       'RT @MertKaraoban: У МТС проблемы со связью из-за замены оборудования'],
      dtype=object)

In [6]:
test = XML2DataFrame("tkk_test_etalon.xml").process_data().fillna(0)

In [7]:
test_texts = test["text"].values
test_labels = test[["beeline", "komstar", "mts", "rostelecom", "skylink", "tele2"]].values

In [8]:
new_labels = []
for string in train_labels:
    new_labels.append(sum([int(val) for val in string]))
    if new_labels[-1]<-1:
        new_labels[-1]=-1
    elif new_labels[-1]>1:
        new_labels[-1]=1
train_labels = new_labels

new_labels = []
for string in test_labels:
    new_labels.append(sum([int(val) for val in string]))
    if new_labels[-1]<-1:
        new_labels[-1]=-1
    elif new_labels[-1]>1:
        new_labels[-1]=1
test_labels=new_labels

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(min_df=1)

In [10]:
tfidf = vectorizer.fit(train_texts)

In [11]:
from sklearn.linear_model import LogisticRegression

In [12]:
lr = LogisticRegression(n_jobs=7,random_state=42,verbose=1,warm_start=True)

In [13]:
lr.fit(tfidf.transform(train_texts), train_labels)

[LibLinear]

  " = {}.".format(self.n_jobs))


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=7,
          penalty='l2', random_state=42, solver='liblinear', tol=0.0001,
          verbose=1, warm_start=True)

In [14]:
pred = list(lr.predict(tfidf.transform(test_texts)))

In [15]:
from sklearn.metrics import f1_score

In [16]:
# With no regularization except L2
print(f1_score(test_labels,pred, average='macro'))
print(f1_score(test_labels,pred, average='micro'))

0.4542176544883651
0.654650645304851


In [17]:
train_tfidf = tfidf.transform(train_texts).toarray()
test_tfidf = tfidf.transform(test_texts).toarray()

In [18]:
train_tfidf_reg = np.array([(string-string.mean())/string.std() for string in train_tfidf])

In [19]:
test_tfidf_reg = np.array([(string-string.mean())/string.std() for string in test_tfidf])

In [20]:
lr = LogisticRegression(n_jobs=3,random_state=42,verbose=1,warm_start=True,max_iter=50)

In [21]:
lr.fit(train_tfidf_reg, train_labels)

  " = {}.".format(self.n_jobs))


[LibLinear]

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=50, multi_class='ovr', n_jobs=3,
          penalty='l2', random_state=42, solver='liblinear', tol=0.0001,
          verbose=1, warm_start=True)

In [22]:
pred_reg = list(lr.predict(test_tfidf_reg))

In [23]:
print(f1_score(test_labels,pred_reg,average='macro'))
print(f1_score(test_labels,pred_reg,average='micro'))

0.5266087980476643
0.6444147752558967


Best results at SentiRueval2016: <br>
F1_macro = 0.5493 <br>
F1_micro = 0.6822

# Banks analysis

In [24]:
train_banks = XML2DataFrame('bank_train_2016.xml').process_data().fillna(0)

In [25]:
test_banks = XML2DataFrame('banks_test_etalon.xml').process_data().fillna(0)

In [26]:
train_banks_texts = train_banks['text'].values
train_banks_labels = train_banks[[
    "sberbank", "vtb", "gazprom", "alfabank", "bankmoskvy", "raiffeisen",
    "uralsib", "rshb"
]].values
new_labels = []
for string in train_banks_labels:
    new_labels.append(sum([int(val) for val in string]))
    if new_labels[-1]<-1:
        new_labels[-1]=-1
    elif new_labels[-1]>1:
        new_labels[-1]=1
train_banks_labels = new_labels

In [27]:
test_banks_texts = test_banks['text'].values
test_banks_labels = test_banks[[
    "sberbank", "vtb", "gazprom", "alfabank", "bankmoskvy", "raiffeisen",
    "uralsib", "rshb"
]].values
new_labels = []
for string in test_banks_labels:
    new_labels.append(sum([int(val) for val in string]))
    if new_labels[-1]<-1:
        new_labels[-1]=-1
    elif new_labels[-1]>1:
        new_labels[-1]=1
test_banks_labels = new_labels

In [28]:
lr = LogisticRegression(n_jobs=3,random_state=42,verbose=1,warm_start=True,max_iter=50)

In [29]:
tfidf_banks = vectorizer.fit(train_banks_texts)

In [30]:
train_banks_tfidf = tfidf_banks.transform(train_banks_texts).toarray()
test_banks_tfidf = tfidf_banks.transform(test_banks_texts).toarray()

In [31]:
lr.fit(train_banks_tfidf,train_banks_labels)

  " = {}.".format(self.n_jobs))


[LibLinear]

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=50, multi_class='ovr', n_jobs=3,
          penalty='l2', random_state=42, solver='liblinear', tol=0.0001,
          verbose=1, warm_start=True)

### Banks results with no regularization

In [32]:
banks_pred = list(lr.predict(test_banks_tfidf))

In [33]:
print(f1_score(banks_pred,test_banks_labels,average='macro'))
print(f1_score(banks_pred,test_banks_labels,average='micro'))

0.4979622045828058
0.73407787503773


### Regularization for banks tf/idf

In [42]:
train_banks_tfidf_reg = np.array([(string-string.mean())/string.std() for string in train_banks_tfidf])
test_banks_tfidf_reg = np.array([(string-string.mean())/string.std() for string in test_banks_tfidf])

In [43]:
lr = LogisticRegression(n_jobs=3,random_state=42,verbose=1,warm_start=True,max_iter=50)

In [44]:
lr.fit(train_banks_tfidf_reg,train_banks_labels)

  " = {}.".format(self.n_jobs))


[LibLinear]

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=50, multi_class='ovr', n_jobs=3,
          penalty='l2', random_state=42, solver='liblinear', tol=0.0001,
          verbose=1, warm_start=True)

### Prediction for banks with regularization

In [45]:
banks_pred_reg = list(lr.predict(test_banks_tfidf_reg))

In [49]:
print(f1_score(banks_pred_reg,test_banks_labels,average='macro'))
print(f1_score(banks_pred_reg,test_banks_labels,average='micro'))

0.29355505573876134
0.7214005433142169


  'precision', 'predicted', average, warn_for)


In [50]:
lr = LogisticRegression(n_jobs=3,random_state=42,verbose=1,warm_start=True,max_iter=50)

# Using all train tweets for both banks and tkk with logreg and no regularization

In [34]:
total_train_labels = train_banks_labels
total_train_labels.extend(train_labels)

In [35]:
total_train_texts = []
total_train_texts.extend(train_banks_texts)
total_train_texts.extend(train_texts)

In [36]:
total_tfidf = vectorizer.fit(total_train_texts)
total_train_tfidf = total_tfidf.transform(total_train_texts)

In [37]:
lr.fit(total_train_tfidf,total_train_labels)

[LibLinear]

  " = {}.".format(self.n_jobs))


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=50, multi_class='ovr', n_jobs=3,
          penalty='l2', random_state=42, solver='liblinear', tol=0.0001,
          verbose=1, warm_start=True)

### Results for test_banks

In [38]:
tot_pred = list(lr.predict(total_tfidf.transform(test_banks_texts)))

In [39]:
print(f1_score(tot_pred,test_banks_labels,average='macro'))
print(f1_score(tot_pred,test_banks_labels,average='micro'))

0.5382323874148384
0.7524901901599759


Best results at SentiRueval2016: <br>
F1_macro = 0.5252 <br>
F1_micro = 0.5881

### Results for test_tkk

In [40]:
tot_pred = list(lr.predict(total_tfidf.transform(test_texts)))

In [41]:
print(f1_score(tot_pred,test_labels,average='macro'))
print(f1_score(tot_pred,test_labels,average='micro'))

0.4802356659930958
0.6653315531820204


Best results at SentiRueval2016: <br>
F1_macro = 0.5493 <br>
F1_micro = 0.6813

# CNN aproach

In [42]:
from keras.models import Sequential
from keras.layers import Conv1D, Embedding, Dropout, Dense, GlobalAveragePooling1D, MaxPooling1D, Softmax

Using TensorFlow backend.


In [43]:
from gensim.models import FastText

In [44]:
size= 50
lang_model = FastText(min_count=1,window=5,workers=7,size=size)
prepared_for_embed = [sent.split() for sent in total_train_texts]
lang_model.build_vocab(prepared_for_embed)
lang_model.train(prepared_for_embed, total_examples=lang_model.corpus_count, epochs=10)

In [122]:
model = Sequential()
seq_length=30
model.add(Conv1D(64, 10, activation='relu',input_shape=(seq_length, size)))
model.add(MaxPooling1D(10))
model.add(GlobalAveragePooling1D())
model.add(Dropout(0.5))
model.add(Dense(3, activation='sigmoid'))

model.compile(loss='categorical_crossentropy',
              optimizer='adagrad',
              metrics=['accuracy',f1])

In [123]:
model.fit(padded, get_dummies(total_train_labels).values, validation_data=(test_banks_padded, get_dummies(test_banks_labels).values), batch_size=128, epochs=10)

Train on 18035 samples, validate on 3313 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f84f8322278>

In [108]:
cnn_pred = model.predict(test_banks_padded)
cnn_pred_val = [np.argmax(string)-1 for string in cnn_pred]

In [124]:
print(f1_score(cnn_pred_val,test_banks_labels, average='macro'))
print(f1_score(cnn_pred_val,test_banks_labels, average='micro'))

0.42798596045027865
0.7060066405070933


In [128]:
cnn_pred = model.predict(test_tkk_padded)
cnn_pred_val = [np.argmax(string)-1 for string in cnn_pred]

In [129]:
print(f1_score(cnn_pred_val,test_labels, average='macro'))
print(f1_score(cnn_pred_val,test_labels, average='micro'))

0.35845707364251805
0.5571873609256787


  'recall', 'true', average, warn_for)


In [None]:
from keras.preprocessing.sequence import pad_sequences

In [48]:
wvs = np.array([lang_model.wv[sent] for sent in prepared_for_embed])

In [49]:
size= 50
padded = pad_sequences(wvs,padding='post', value=np.full(size,0.0))

In [50]:
from pandas import get_dummies

In [53]:
test_banks_wvs = []
for sent in test_banks_texts:
    prep_sent = []
    for word in sent.split():
        try:
            prep_sent.append(lang_model.wv[word])
        except KeyError:
            pass
    test_banks_wvs.append(np.array(prep_sent))
test_banks_wvs = np.array(test_banks_wvs)

test_banks_padded = pad_sequences(test_banks_wvs,padding='post', maxlen=30, value=np.full(size,0.0))

In [90]:
def f1_macro_from_dummies(pred,gt, average='macro'):
    gt_val = [np.argmax(string)-1 for string in gt]
    pred_val = [np.argmax(string)-1 for string in pred]
    return mean(np.array([f1_score(gt_val, pred_val, average=average)]))
def f1_micro_from_dummies(pred,gt, average='micro'):
    gt_val = [np.argmax(string)-1 for string in gt]
    pred_val = [np.argmax(string)-1 for string in pred]
    return mean(np.array([f1_score(gt_val, pred_val, average=average)]))

In [127]:
test_tkk_wvs = []
for sent in test_texts:
    prep_sent = []
    for word in sent.split():
        try:
            prep_sent.append(lang_model.wv[word])
        except KeyError:
            pass
    test_tkk_wvs.append(np.array(prep_sent))
test_tkk_wvs = np.array(test_tkk_wvs)

test_tkk_padded = pad_sequences(test_tkk_wvs,padding='post', maxlen=30, value=np.full(size,0.0))

0.42798596045027865

0.7060066405070933

AttributeError: 'numpy.dtype' object has no attribute 'base_dtype'

In [94]:
from keras import backend as K

In [95]:
def f1(y_true, y_pred):
    def recall(y_true, y_pred):
        """Recall metric.

        Only computes a batch-wise average of recall.

        Computes the recall, a metric for multi-label classification of
        how many relevant items are selected.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        recall = true_positives / (possible_positives + K.epsilon())
        return recall

    def precision(y_true, y_pred):
        """Precision metric.

        Only computes a batch-wise average of precision.

        Computes the precision, a metric for multi-label classification of
        how many selected items are relevant.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        precision = true_positives / (predicted_positives + K.epsilon())
        return precision
    precision = precision(y_true, y_pred)
    recall = recall(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

In [98]:
def f1_loss(y_true,y_pred):
    def recall(y_true, y_pred):
        """Recall metric.

        Only computes a batch-wise average of recall.

        Computes the recall, a metric for multi-label classification of
        how many relevant items are selected.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        recall = true_positives / (possible_positives + K.epsilon())
        return recall

    def precision(y_true, y_pred):
        """Precision metric.

        Only computes a batch-wise average of precision.

        Computes the precision, a metric for multi-label classification of
        how many selected items are relevant.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        precision = true_positives / (predicted_positives + K.epsilon())
        return precision
    precision = precision(y_true, y_pred)
    recall = recall(y_true, y_pred)
    return (precision+recall+K.epsilon())/(2*((precision*recall)))