In [1]:
# Directory holding the RuSentiment CSV files; joined with each file name when the data is read.
data_dir = "../rusentiment/Dataset/"
# Path handed to load_embeddings(); judging by the file name these are 300-d fastText vectors
# trained on VK posts -- TODO confirm against the embedding source.
embeddings_filename = "../fasttext.min_count_100.vk_posts_all_443550246.300d.vec"

In [2]:
import os
import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import TweetTokenizer

from keras.models import Sequential
from keras.layers import Embedding, Dense, Flatten
from keras.utils.np_utils import to_categorical

from model_runtime.transformers import TextToFreqRankTransformer, SequencePaddingTransformer
from utils import load_embeddings

# One seed reused for NumPy's global RNG, the StratifiedKFold splitter and the random forest.
# NOTE(review): no Keras/TensorFlow seed is set anywhere in this notebook, so the neural-network
# results are presumably not exactly reproducible between runs -- confirm.
SEED = 291018
np.random.seed(SEED)


Using TensorFlow backend.


In [3]:
# The training set is the union of the two annotated subsets; the test split is kept apart.
train_part_files = ("rusentiment_preselected_posts.csv", "rusentiment_random_posts.csv")
preselected_posts, random_posts = (
    pd.read_csv(os.path.join(data_dir, csv_name)) for csv_name in train_part_files
)

# only 12 posts are in both sets, so the parts are simply stacked and re-indexed from 0
raw_train = pd.concat([preselected_posts, random_posts], ignore_index=True)
raw_test = pd.read_csv(os.path.join(data_dir, "rusentiment_test.csv"))

In [4]:
# Number of distinct labels present in the training data.
num_classes = raw_train.label.nunique()

# Map string labels to integer ids; the encoder is fitted on the training labels only.
label_encoder = LabelEncoder().fit(raw_train.label)

y_train = label_encoder.transform(raw_train.label)
y_test = label_encoder.transform(raw_test.label)

# Pin the one-hot width explicitly: without num_classes, to_categorical infers it from the
# largest id in each array, so train and test matrices could end up with different widths.
y_train_cat = to_categorical(y_train, num_classes=num_classes)
y_test_cat = to_categorical(y_test, num_classes=num_classes)

In [5]:
# Shared 5-fold stratified splitter, used both by the grid search and the manual NN loop.
folds = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=SEED,
)

Первая модель, которую попробую, — random forest на мешке слов.

In [6]:
def tokenization_and_stemming(text, preserve_case=True):
    """Split `text` into tokens and reduce each token to its Russian Snowball stem.

    Args:
        text: the string to tokenize.
        preserve_case: forwarded to NLTK's TweetTokenizer.

    Returns:
        A NumPy array with one stemmed token per token produced by the tokenizer.
    """
    tokenizer = TweetTokenizer(preserve_case=preserve_case)
    russian_stemmer = SnowballStemmer("russian")
    stems = list(map(russian_stemmer.stem, tokenizer.tokenize(text)))
    return np.array(stems)

In [7]:
# Bag-of-words counts over stemmed tokens instead of CountVectorizer's default tokenization.
bag_of_words = CountVectorizer(
    tokenizer=tokenization_and_stemming,
)

In [8]:
rf = RandomForestClassifier(random_state=SEED, n_jobs=1)

# The vectorizer lives inside the pipeline so that it is refitted on every training fold:
# top N feature selection should not be affected by the validation fold.
pipeline_steps = [("bow", bag_of_words), ("rf", rf)]
random_forest_pipeline = Pipeline(pipeline_steps)

In [9]:
# Grid over the vocabulary size of the bag-of-words step and the number of trees in the forest.
rf_param_grid = {
    "bow__max_features": [500, 1000, 10000],
    "rf__n_estimators": [50, 100],
}

# Candidates are compared by weighted F1 on the shared stratified folds.
baseline_rf = GridSearchCV(
    random_forest_pipeline,
    rf_param_grid,
    scoring="f1_weighted",
    n_jobs=-1,
    verbose=0,
    cv=folds,
)

In [10]:
# Run the grid search on the raw training texts and their encoded labels.
baseline_rf_cv = baseline_rf.fit(X=raw_train.text, y=y_train)

In [11]:
# Mean cross-validated weighted F1, one value per parameter combination of the grid.
baseline_rf_cv.cv_results_["mean_test_score"]

array([0.53285365, 0.53607148, 0.54349038, 0.54474539, 0.54901617,
       0.55037176])

Вторая модель - нейронная сеть с предобученными эмбеддингами и одним полносвязным слоем

In [12]:
# load_embeddings returns a 3-tuple; the first element is not needed here.
# `embedding_dim` is later used as the vector width of the embedding matrix and `embedding`
# is queried with `.get(token)` -- presumably a token -> vector mapping; verify in utils.
_, embedding_dim, embedding = load_embeddings(embeddings_filename)

In [13]:
# TODO: finetuning
# Passed to TextToFreqRankTransformer; the embedding layer gets dictionary_size + 1 rows.
dictionary_size = 10000
# Passed to SequencePaddingTransformer and used as the input length of the embedding layer.
padding_size = 200

In [14]:
def prepare_mebedding_matrix(embedding_layer_size, text_transformer):
    """Build the initial weight matrix for the embedding layer.

    Relies on the notebook-level `embedding` lookup and `embedding_dim` width.

    Args:
        embedding_layer_size: number of rows of the resulting matrix.
        text_transformer: object exposing a `token_rank` mapping of token -> row index.

    Returns:
        A zero-initialised array of shape (embedding_layer_size, embedding_dim) in which the
        row of every token that has a pretrained vector is filled with that vector; rows of
        tokens without a vector stay all-zero.
    """
    weights = np.zeros((embedding_layer_size, embedding_dim))
    for word, row in text_transformer.token_rank.items():
        vector = embedding.get(word)
        if vector is None:
            # No pretrained vector for this token: keep its row at zero.
            continue
        weights[row] = vector
    return weights

In [22]:
def train_nn_model(x_train, y_train, x_val, y_val, **kwargs):
    """Train the frozen-embedding dense classifier and score it on a validation set.

    The text preprocessing is fitted on `x_train` only and then applied to both sets.
    The network is Embedding (pretrained, not trainable) -> Flatten -> Dense(32, relu)
    -> Dense(num_classes, softmax). It relies on the notebook-level `num_classes`,
    `embedding_dim` and `prepare_mebedding_matrix`.

    Args:
        x_train: training texts.
        y_train: integer class ids for `x_train`.
        x_val: validation texts.
        y_val: integer class ids for `x_val`.
        **kwargs:
            dictionary_size (required): vocabulary size for TextToFreqRankTransformer.
            padding_size (required): sequence length for SequencePaddingTransformer.
            epochs (optional, default 5): number of training epochs.
            batch_size (optional, default 32): mini-batch size.

    Returns:
        (history, f1): the Keras History object returned by `fit` and the weighted F1
        score on the validation set.

    Raises:
        KeyError: if `dictionary_size` or `padding_size` is not supplied.
    """
    dictionary_size = kwargs["dictionary_size"]
    padding_size = kwargs["padding_size"]
    # Defaults reproduce the values that used to be hard-coded in the fit call.
    epochs = kwargs.get("epochs", 5)
    batch_size = kwargs.get("batch_size", 32)

    # Both transformers are fitted on the training texts only, so nothing leaks from validation.
    text_transformer = TextToFreqRankTransformer(dictionary_size).fit(x_train)
    sequence_padder = SequencePaddingTransformer(padding_size).fit(x_train)

    preprocessing_pipeline = Pipeline([
        ("text_to_freq_rank_sequences", text_transformer),
        ("pad_sequences", sequence_padder)
    ])

    x_train_seq = preprocessing_pipeline.transform(x_train)
    x_val_seq = preprocessing_pipeline.transform(x_val)

    # One extra row on top of the dictionary -- presumably index 0 is reserved for
    # padding/unknown tokens; verify against TextToFreqRankTransformer.
    embedding_layer_size = dictionary_size + 1
    embedding_matrix = prepare_mebedding_matrix(embedding_layer_size, text_transformer)

    model = Sequential()
    model.add(Embedding(embedding_layer_size, embedding_dim, input_length=padding_size))
    model.add(Flatten())
    model.add(Dense(32, activation="relu"))
    model.add(Dense(num_classes, activation="softmax"))
    # Load the pretrained vectors and freeze them before compiling.
    model.layers[0].set_weights([embedding_matrix])
    model.layers[0].trainable = False

    model.compile(optimizer="adam",
                  loss="categorical_crossentropy",
                  metrics=["acc"])

    # Fix the one-hot width to num_classes so the targets always match the softmax layer,
    # even if the highest class id happens to be absent from one of the label arrays.
    y_train_onehot = to_categorical(y_train, num_classes=num_classes)
    y_val_onehot = to_categorical(y_val, num_classes=num_classes)

    history = model.fit(x_train_seq, y_train_onehot,
                        epochs=epochs,
                        batch_size=batch_size,
                        validation_data=(x_val_seq, y_val_onehot))

    # Argmax over the softmax output replaces Sequential.predict_classes, which is
    # deprecated and no longer available in recent TensorFlow/Keras releases.
    predicted_classes = np.argmax(model.predict(x_val_seq), axis=-1)
    f1 = f1_score(y_val, predicted_classes, average="weighted")

    return history, f1

In [23]:
# Weighted F1 of the neural network on each validation fold.
nn_cv_results = []

for train_index, val_index in folds.split(raw_train, y_train):
    # split() yields positional row numbers, so select with .iloc: plain Series[...] is
    # label-based and only works while the index happens to be 0..n-1.
    x_train_fold = raw_train.text.iloc[train_index]
    x_val_fold = raw_train.text.iloc[val_index]
    y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]

    history, f1_fold = train_nn_model(x_train_fold, y_train_fold,
                                       x_val_fold, y_val_fold,
                                       dictionary_size=dictionary_size,
                                       padding_size=padding_size)

    nn_cv_results.append(f1_fold)

Train on 22574 samples, validate on 5644 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Train on 22574 samples, validate on 5644 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Train on 22574 samples, validate on 5644 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Train on 22574 samples, validate on 5644 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Train on 22576 samples, validate on 5642 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [28]:
# Average of the per-fold weighted F1 scores collected for the neural network.
np.mean(nn_cv_results)

0.5706784837986254

In [30]:
# Final run: fit on the complete training set and use the test split as the validation data,
# so nn_f1 is the weighted F1 on the test set.
final_fit_settings = {"dictionary_size": dictionary_size, "padding_size": padding_size}
nn_history_all_data, nn_f1 = train_nn_model(
    raw_train.text, y_train,
    raw_test.text, y_test,
    **final_fit_settings
)

Train on 28218 samples, validate on 2967 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


Val loss растёт, в то время как train loss падает, то есть модель переобучается; можно попробовать использовать модель попроще.

Сравнение двух моделей

In [31]:
# Weighted F1 on the test split for the best pipeline found by the grid search.
rf_test_predictions = baseline_rf_cv.best_estimator_.predict(raw_test.text)
best_rf_f1_test_score = f1_score(y_test, rf_test_predictions, average="weighted")

In [32]:
# Side-by-side comparison of both models: mean cross-validation F1 and test F1.
# The per-fold NN scores live in nn_cv_results; the previous reference to `cv_results`
# pointed at a name that is never defined in this notebook.
pd.DataFrame({"mean cv": {"rf": baseline_rf_cv.best_score_, "nn": np.mean(nn_cv_results)},
              "test": {"rf": best_rf_f1_test_score, "nn": nn_f1}})

Unnamed: 0,mean cv,test
nn,0.570678,0.632176
rf,0.550372,0.631939
