In [2]:
import torch
import numpy as np
import pandas as pd
import torch.nn as nn
import os
from transformers import BertTokenizer, BertModel, BertConfig, BertPreTrainedModel, BertForPreTraining, BertForMaskedLM
from tqdm import tqdm, tqdm_notebook
RUBERT_PATH = 'C:\\Users\\User\\Downloads\\nlp\\ru_conversational_cased_L-12_H-768_A-12_pt'
modelpath = os.path.join(RUBERT_PATH,'pytorch_model.bin')

In [3]:
os.path.isfile(os.path.join(RUBERT_PATH,'pytorch_model.bin'))

True

In [4]:
# tokenizer = BertTokenizer.from_pretrained(os.path.join(RUBERT_PATH,'vocab.txt'))
tokenizer = BertTokenizer.from_pretrained(RUBERT_PATH, do_lower_case=False)
config = BertConfig.from_json_file(os.path.join(RUBERT_PATH,'bert_config.json'))
model = BertForPreTraining.from_pretrained(modelpath, config=config)
max_input_length = tokenizer.max_model_input_sizes['bert-base-uncased']
# model.eval()
# model = BertForMaskedLM.from_pretrained(modelpath, config=config)
model.eval()

# model = BertForPreTraining.from_pretrained(RUBERT_PATH)


BertForPreTraining(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affin

In [5]:
dataset = pd.read_csv('texts_train.txt', sep="\t", header=None)
dataset.columns = ["text"]
dataset.head(10)

Unnamed: 0,text
0,"Сериал очень люблю, но Академия и Земля вызыва..."
1,"думал, что будет лучше идея очень интересна - ..."
2,с творчеством Головачева я познакомился посред...
3,"то-то я и в большое неудовольствие прочитал ""А..."
4,как мне показалось местами сильно смахивает на...
5,от первой части книги просто оторваться не мог...
6,"читается, конечно, легко.. но уж очень ощущени..."
7,прочитал на одном дыхании! очень понравилось! ...
8,"Дочитав, я ещё несколько дней не могла в себя ..."
9,Сериал впечатлил! Интересная идея Слов Силы!


In [6]:
def calc_means(sentence):
    tokenized_text = tokenizer.tokenize(sentence)
    tokenized_text = tokenized_text[:max_input_length-2]
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    
    segments_ids = [1] * len(tokenized_text)    
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])
    predictions = model(tokens_tensor, token_type_ids=segments_tensors)
    sr, sd, td = predictions[0].shape
    
    finalVector = []
    
    for i in range(sd):
        currentArr = predictions[0][0][i].detach().numpy()
        if len(finalVector) == 0:
            finalVector = currentArr
        else:
            finalVector = np.add(finalVector, currentArr)
    return np.mean(finalVector)

In [7]:
calc_means(dataset['text'][0])

-290.20065

In [8]:
    df = pd.read_csv('scores_train.txt', sep="\t", header=None, dtype='float64')
    df.columns = ["tonality"]
    vector_means = [calc_means(sentence) for sentence in dataset["text"].tolist()]
    df['vector_means'] = vector_means

    df.head(10)

Unnamed: 0,tonality,vector_means
0,6.0,-290.200653
1,7.0,-183.830276
2,10.0,-440.67691
3,5.0,-231.999893
4,6.0,-362.513855
5,9.0,-283.887817
6,8.0,-124.312889
7,9.0,-493.314056
8,10.0,-621.127014
9,9.0,-31.138296


In [10]:
df.to_csv('output.csv', index = False, header=True)

In [45]:
X = np.array(df['tonality']).reshape(-1, 1)
y = np.array(df['vector_means']).reshape(-1, 1)

In [48]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix 
from sklearn.metrics import accuracy_score 
from sklearn.metrics import classification_report
from sklearn import metrics
X_train, X_test, y_train, y_test  = train_test_split(y, X, test_size = 0.3)

In [None]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=2000, random_state=0)
model = rfc.fit(X_train, y_train)
y_predict = rfc.predict(X_test)

In [53]:
results = confusion_matrix(y_test, y_predict) 
print('Confusion Matrix :')
print(results)
print('Report : ')
print(classification_report(y_test, y_predict))

Confusion Matrix :
[[  2   2   3   5   6   7  10  16  14  31]
 [  1   1   3   4   5   3  10  12  28  25]
 [  3   2   5   5   7   6   9  27  48  59]
 [  3   2  13   8  23  11  22  44  57  64]
 [  8   7  17  19  19  17  43  82  97  97]
 [  4   6  12  13  22  17  32  55  70  74]
 [  5  13  24  20  49  33  56  90 138 133]
 [ 17  18  27  37  60  63 100 194 266 282]
 [ 20  27  46  55 100  81 125 227 367 382]
 [ 32  30  60  50  94  78 148 285 408 443]]
Report : 
              precision    recall  f1-score   support

         1.0       0.02      0.02      0.02        96
         2.0       0.01      0.01      0.01        92
         3.0       0.02      0.03      0.03       171
         4.0       0.04      0.03      0.03       247
         5.0       0.05      0.05      0.05       406
         6.0       0.05      0.06      0.05       305
         7.0       0.10      0.10      0.10       561
         8.0       0.19      0.18      0.19      1064
         9.0       0.25      0.26      0.25      1430