In [3]:
import pandas as pd
import numpy as np


from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_extraction import DictVectorizer
from scipy.sparse import hstack

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score, cross_validate

from sklearn.preprocessing import LabelEncoder

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report, f1_score

from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
import pymorphy2
import string
import re

In [4]:
data = pd.read_csv('rusent.csv')
#data_2 = pd.read_csv('rusentiment_preselected_posts.csv')
#data_1 = pd.read_csv('rusentiment_random_posts.csv')
data.head()

Unnamed: 0,label,text
0,negative,"А попа подозревала давно,что ты с кавказа..пер..."
1,speech,З прошедшим Днем Ангела))))))))
2,skip,Два дня до отлёта с острова!!!!!!!
3,negative,"Блин, почему эта жизнь столь не справедлива (((("
4,skip,где еще встречать свой день рождения как не на...


In [5]:
# Избавляемся от таких классов как skip и speech
data = data[(data.label!='skip') & (data.label != 'speech') & (data.label != 'neutral')]

In [6]:
len(data)

9764

# Предобработка текстовых данных

In [7]:
stopwords = set(stopwords.words('russian'))
morph = pymorphy2.MorphAnalyzer()

def text_processing(data):
    t_arr = TweetTokenizer().tokenize(data)
    cleaned_data = []
    for word in t_arr:
        if word.lower() not in stopwords:
            if word.isalnum() and word != 'RT':
                lemma = morph.parse(word.lower())[0].normal_form
                cleaned_data.append(lemma)
        else:
            pass
        fin_str = ' '.join(cleaned_data)
    return fin_str

def hands_re(text):
    punctuation = string.punctuation + '\u2014\u2013\u2012\u2010\u2212' + '«»‹›‘’“”„`'
    word_tokenize = re.compile(r"([^\w_\u2019\u2010\u002F-]|[+])")
    tokenized_words = []
    for token in word_tokenize.split(text):
        if token and not token.isspace() and not token in punctuation: # если слово не попадает в те которые мы исключаем
            tokenized_words.append(token.lower())        # добавляем слово в список
    return tokenized_words                     # возвращаем список слов

In [8]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dns\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
clean_data = []

for text in data['text']:
    new_text = text_processing(text)
    clean_data.append(new_text)

In [10]:
# Записываем фактические значения текста в X, фактические значения меток в y:
X = clean_data
y = data['label'].values

In [11]:
import gensim 

model = gensim.models.Word2Vec(
        X,
        size=20,
        window=10,
        min_count=2,
        workers=10)
model.train(X, total_examples=len(X), epochs=10)



(1168897, 5651030)

In [12]:
from gensim.models import KeyedVectors, doc2vec, FastText
w2v = KeyedVectors.load_word2vec_format('model.bin', binary=True, encoding='utf-8')

In [13]:
w2v.most_similar(positive=['королева_NOUN'],negative=['женщина_NOUN'])

[('королева_PROPN', 0.5260607004165649),
 ('король_NOUN', 0.5213080644607544),
 ('король_PROPN', 0.46887069940567017),
 ('высочество_PROPN', 0.41855698823928833),
 ('королевский_ADJ', 0.4164617657661438),
 ('величество_PROPN', 0.41055727005004883),
 ('королева_ADV', 0.4056137800216675),
 ('принцесса_NOUN', 0.3994409441947937),
 ('герцог_NOUN', 0.3904263377189636),
 ('величество_NOUN', 0.39014965295791626)]

In [14]:
y

array(['negative', 'negative', 'positive', ..., 'positive', 'positive',
       'positive'], dtype=object)

In [15]:
len(X)

9764

Некоторые классификаторы не способны справляться с нечисловыми классами, в таком случае решением является преобразовать к числовому формату.

Это делается посредством класса LabelEncoder:

In [16]:
classEncoder = LabelEncoder() # Создаем объект класса
labelled_y = classEncoder.fit_transform(y) 
# Здесь LabelEncoder смотрит на весь вектор, и ставит в соответствие каждому символьному классу в соответствие число

In [17]:
labelled_y

array([0, 0, 1, ..., 1, 1, 1], dtype=int64)

In [18]:
classEncoder.classes_

array(['negative', 'positive'], dtype=object)

In [63]:
f = open('X.txt', 'w')
f.write('/n'.join(X))


UnicodeEncodeError: 'charmap' codec can't encode character '\u0295' in position 30552: character maps to <undefined>

In [230]:
positive = ['ахи',
            'крутой',
           'прекрасный',
           'обожать',
           'понравиться',
           'отличный',
           'ура',
           'классный',
           'хороший',
           'круто',
           'шикарный',
           'приятно',
           'любимый',
           'ахахахах',
           'супер',
           'красивый',
           'любить',
           'счастие',
           'замечательный',
           'офигенный',
           'красиво',
           'молодец',
           'прикольный',
           'нравиться',
            'красавчик',
           'идеальный',
           'охуенно',
           'еее',
           'красавец',
           'наслаждаться',
           'тащиться',
            'улыбнуться',
           'счастливый',
           'отлично',
           'классно',
           'милый',
           'ржать',
           'слава',
            'чемпион',
           'подруга',
           'здорово',
           'удача',
           'чудесный',
           'влюбиться',
           'родной',
            'гордиться',
           'трек',
           'правда',
           'красота',
           'довольный',
           'танцевать']
negative = ['скучно',
           'пиздец',
           'говно', 
           'ненавидеть', 
           'жалко',
            'заболеть',
           'грустно',
           'блять', 
           'умереть', 
           'лох',
            'болеть',
           'жаль',
           'плохо', 
           'послать', 
           'убивать',
            'надоесть',
           'ужасный',
           'бред', 
           'обидный', 
           'ппц',
            'достать',
           'ужас',
           'хватить', 
           'хреновый', 
           'дура',
            'тормоз',
           'тупой',
           'смерть', 
           'скорбеть',
            'дебил',
           'идиот',
           'хер', 
           'подонок', 
           'хулить',
            'одиночество',
           'кошмар',
           'забыть', 
           'бедный', 
           'нахрен',
            'олень',
           'докатиться',
           'тварь', 
           'больно', 
           'печаль',
            'бесить',
           'зло',
           'брр', 
           'грустить', 
           'мудак']

In [212]:
positive_distributions = []
words = []
positive_dict = {}
for word in positive:
#word = 'танцевать'
    for text in X:
        stroka = text.split(' ')
        for ind, token in enumerate(stroka):
        #print(token)
            if word == token:
                if token != stroka[-1]:
                    words.append(stroka[ind+1])
    
    counts = Counter(words)
    positive_dict[word] = counts.most_common(1000)
    words = []
#print(positive_dict)

In [231]:
negative_distributions = []
words = []
negative_dict = {}
special_texts = []
for word in negative:
        #print(word)
        for text in X:
            stroka = text.split(' ')
            for ind, token in enumerate(stroka):
                #print(token)
                if word == token:
                    if token != stroka[-1]:
                        words.append(stroka[ind+1])
    
        counts = Counter(words)
        negative_dict[word] = counts.most_common(1000)
        words = []
#print(positive_dict)

In [232]:
negative_dict


{'бедный': [('бедный', 2),
  ('неразумный', 1),
  ('врач', 1),
  ('бабулька', 1),
  ('мой', 1),
  ('сосед', 1),
  ('богатый', 1),
  ('девица', 1),
  ('неимущий', 1),
  ('коренной', 1),
  ('реал', 1),
  ('влад', 1),
  ('губка', 1),
  ('печень', 1),
  ('збежать', 1),
  ('крестьянин', 1),
  ('мишка', 1)],
 'бесить': [('чувство', 1),
  ('мама', 1),
  ('ностальгировать', 1),
  ('полярный', 1),
  ('судить', 1),
  ('весь', 1),
  ('скучать', 1),
  ('друг', 1),
  ('смотреть', 1),
  ('проблема', 1),
  ('особенно', 1),
  ('жлоб', 1),
  ('мудоеб', 1),
  ('болеть', 1),
  ('человек', 1),
  ('начинать', 1),
  ('россия', 1),
  ('тот', 1),
  ('наивный', 1),
  ('худой', 1),
  ('писать', 1),
  ('стоять', 1),
  ('ты', 1)],
 'блять': [('ебать', 2),
  ('парень', 1),
  ('сегодня', 1),
  ('завтра', 1),
  ('ахах', 1),
  ('сук', 1),
  ('нынче', 1),
  ('каво', 1),
  ('играть', 1),
  ('гоу', 1),
  ('ваш', 1),
  ('видать', 1),
  ('идти', 1),
  ('перестать', 1),
  ('фу', 1),
  ('идиотииизм', 1),
  ('совет', 1),
  (

In [242]:
for i in negative_dict:
    print('СЛОВО ' + i + ', Количество дистрибуций: ' + str(len(negative_dict[i])))

СЛОВО скучно, Количество дистрибуций: 13
СЛОВО пиздец, Количество дистрибуций: 44
СЛОВО говно, Количество дистрибуций: 17
СЛОВО ненавидеть, Количество дистрибуций: 39
СЛОВО жалко, Количество дистрибуций: 18
СЛОВО заболеть, Количество дистрибуций: 10
СЛОВО грустно, Количество дистрибуций: 22
СЛОВО блять, Количество дистрибуций: 53
СЛОВО умереть, Количество дистрибуций: 33
СЛОВО лох, Количество дистрибуций: 10
СЛОВО болеть, Количество дистрибуций: 32
СЛОВО жаль, Количество дистрибуций: 49
СЛОВО плохо, Количество дистрибуций: 41
СЛОВО послать, Количество дистрибуций: 21
СЛОВО убивать, Количество дистрибуций: 11
СЛОВО надоесть, Количество дистрибуций: 23
СЛОВО ужасный, Количество дистрибуций: 16
СЛОВО бред, Количество дистрибуций: 17
СЛОВО обидный, Количество дистрибуций: 19
СЛОВО ппц, Количество дистрибуций: 14
СЛОВО достать, Количество дистрибуций: 16
СЛОВО ужас, Количество дистрибуций: 10
СЛОВО хватить, Количество дистрибуций: 37
СЛОВО хреновый, Количество дистрибуций: 8
СЛОВО дура, Кол

In [243]:
for i in positive_dict:
    print('СЛОВО ' + i + ', Количество дистрибуций: ' + str(len(positive_dict[i])))

СЛОВО ахи, Количество дистрибуций: 24
СЛОВО крутой, Количество дистрибуций: 56
СЛОВО прекрасный, Количество дистрибуций: 56
СЛОВО обожать, Количество дистрибуций: 36
СЛОВО понравиться, Количество дистрибуций: 36
СЛОВО отличный, Количество дистрибуций: 52
СЛОВО ура, Количество дистрибуций: 40
СЛОВО классный, Количество дистрибуций: 44
СЛОВО хороший, Количество дистрибуций: 247
СЛОВО круто, Количество дистрибуций: 34
СЛОВО шикарный, Количество дистрибуций: 31
СЛОВО приятно, Количество дистрибуций: 36
СЛОВО любимый, Количество дистрибуций: 169
СЛОВО ахахахах, Количество дистрибуций: 9
СЛОВО супер, Количество дистрибуций: 30
СЛОВО красивый, Количество дистрибуций: 79
СЛОВО любить, Количество дистрибуций: 299
СЛОВО счастие, Количество дистрибуций: 88
СЛОВО замечательный, Количество дистрибуций: 34
СЛОВО офигенный, Количество дистрибуций: 9
СЛОВО красиво, Количество дистрибуций: 23
СЛОВО молодец, Количество дистрибуций: 25
СЛОВО прикольный, Количество дистрибуций: 15
СЛОВО нравиться, Количес

# Логистическая регрессия

## Репрезентация TF-IDF



In [None]:
# Создаем объект класса TF-IDF Vectorizer, который преобразует корпус текстов в TF-IDF
TFIDFVectorizer = TfidfVectorizer(max_df=0.9, min_df=0.01, sublinear_tf=True)
X_TF = TFIDFVectorizer.fit_transform(X)

# Тогда разбить на тренировочную\тестовую выборки:
X_train, X_test, y_train, y_test = train_test_split(X_TF, labelled_y, test_size=0.2)

In [24]:
# Создаем объект класса логистической регрессии
logistic_model = LogisticRegression()  
logistic_model.fit(X_train, y_train)                     # Обучаем на тренировочной выборке
predicted = logistic_model.predict(X_test)         # Тестируем на тестовой выборке

# Указываем micro\macro, поскольку у нас три класса, т.е. мультиклассовая классификация
print('Accuracy: {:0.4f}'.format(accuracy_score(predicted, y_test)))
print('Precision: {:0.4f}'.format(precision_score(predicted, y_test, average = 'micro')))
print('Recall: {:0.4f}'.format(recall_score(predicted, y_test, average = 'micro')))

print(classification_report(y_test, predicted))


Accuracy: 0.6559
Precision: 0.6559
Recall: 0.6559
             precision    recall  f1-score   support

          0       0.66      0.24      0.35       761
          1       0.65      0.92      0.77      1192

avg / total       0.66      0.66      0.60      1953



In [25]:
# Рассмотрим более консервативную оценку с помощью кросс-валидации
# Реализуем сначала самую простую оценку

accuracy_list = cross_val_score(logistic_model, X_TF, labelled_y, cv=5)   # Возвращает список score (в нашем случае это accuracy)
print('Accuracy list: {}'.format(accuracy_list))
print('Mean of accuracy list: {:.4f} (+/- {:.4f})'.format(np.mean(accuracy_list), np.std(accuracy_list)))

Accuracy list: [0.66513057 0.67434716 0.66461854 0.65540195 0.66752049]
Mean of accuracy list: 0.6654 (+/- 0.0061)


In [26]:
# Теперь реализуем посредством того, что мы сами будем разбивать
kf = KFold(n_splits=5)
iter_number = 0

accuracy_list = []
precision_list = []
recall_list = []
f1_measure_list = []

for train_index, test_index in kf.split(X_TF):
    iter_number += 1
    X_train, X_test = X_TF[train_index], X_TF[test_index]
    y_train, y_test = labelled_y[train_index], labelled_y[test_index]
    
    logistic_model.fit(X_train, y_train)                     # Обучаем на тренировочной выборке
    predicted = logistic_model.predict(X_test)         # Тестируем на тестовой выборке

    # Указываем micro\macro, поскольку у нас три класса, т.е. мультиклассовая классификация
    
    print('Iteration #{} | Accuracy: {:0.4f} | Precision: {:0.4f} | Recall: {:0.4f} | F1-score: {:0.4f} '.format(
        iter_number,
        accuracy_score(predicted, y_test),
        precision_score(predicted, y_test, average = 'micro'),
        recall_score(predicted, y_test, average = 'micro'),
        f1_score(predicted, y_test, average = 'micro')
    ))
    accuracy_list.append(accuracy_score(predicted, y_test))
    precision_list.append(precision_score(predicted, y_test, average = 'micro'))
    recall_list.append(recall_score(predicted, y_test, average = 'micro'))
    f1_measure_list.append(f1_score(predicted, y_test, average = 'micro'))

print('Mean of accuracy list: {:.4f} (+/- {:.4f})'.format(np.mean(accuracy_list), np.std(accuracy_list)))
print('Mean of precision list: {:.4f} (+/- {:.4f})'.format(np.mean(precision_list), np.std(precision_list)))
print('Mean of recall list: {:.4f} (+/- {:.4f})'.format(np.mean(recall_list), np.std(recall_list)))
print('Mean of f1 score list: {:.4f} (+/- {:.4f})'.format(np.mean(f1_measure_list), np.std(f1_measure_list)))
    

Iteration #1 | Accuracy: 0.6733 | Precision: 0.6733 | Recall: 0.6733 | F1-score: 0.6733 
Iteration #2 | Accuracy: 0.7066 | Precision: 0.7066 | Recall: 0.7066 | F1-score: 0.7066 
Iteration #3 | Accuracy: 0.7025 | Precision: 0.7025 | Recall: 0.7025 | F1-score: 0.7025 
Iteration #4 | Accuracy: 0.5581 | Precision: 0.5581 | Recall: 0.5581 | F1-score: 0.5581 
Iteration #5 | Accuracy: 0.6773 | Precision: 0.6773 | Recall: 0.6773 | F1-score: 0.6773 
Mean of accuracy list: 0.6636 (+/- 0.0544)
Mean of precision list: 0.6636 (+/- 0.0544)
Mean of recall list: 0.6636 (+/- 0.0544)
Mean of f1 score list: 0.6636 (+/- 0.0544)


## Объяснение предсказаний ELI5

In [34]:
from sklearn.linear_model import LogisticRegressionCV
from sklearn.pipeline import make_pipeline


X_train, X_test, y_train, y_test = train_test_split(X, labelled_y, test_size=0.2)

vec = TfidfVectorizer(max_df=0.9, min_df=0.01, sublinear_tf=True, ngram_range=(1,2))


clf = LogisticRegressionCV()   

pipe = make_pipeline(vec, clf)
pipe.fit(X_train, y_train);

In [35]:

from sklearn import metrics

def print_report(pipe):
    
    y_pred = pipe.predict(X_test)
    report = metrics.classification_report(y_test, y_pred,
        #target_names=twenty_test.target_names
                                          )
    print(report)
    print("accuracy: {:0.3f}".format(metrics.accuracy_score(y_test, y_pred)))

print_report(pipe)


             precision    recall  f1-score   support

          0       0.64      0.28      0.39       707
          1       0.69      0.91      0.79      1246

avg / total       0.67      0.68      0.64      1953

accuracy: 0.684


In [36]:
import eli5
df = eli5.explain_weights_df(clf, vec=vec,
                 target_names=classEncoder.classes_
                 )

  from numpy.core.umath_tests import inner1d


In [37]:
df.to_csv('bayes_linregr_tfidf_pred.txt', sep='\t', encoding='utf-8') 

## Репрезентация CountVectorizer

In [38]:

# Создаем объект класса CountVectorizer, который преобразует корпус текстов в one-hot представление
CVectorizer = CountVectorizer()
X_Count = CVectorizer.fit_transform(X)

# Тогда разбить на тренировочную\тестовую выборки:
X_train, X_test, y_train, y_test = train_test_split(X_Count, labelled_y, test_size=0.2)

In [39]:
# Создаем объект класса логистической регрессии
logistic_model = LogisticRegression()  
logistic_model.fit(X_train, y_train)                     # Обучаем на тренировочной выборке
predicted = logistic_model.predict(X_test)         # Тестируем на тестовой выборке

# Указываем micro\macro, поскольку у нас три класса, т.е. мультиклассовая классификация
print('Accuracy: {:0.4f}'.format(accuracy_score(predicted, y_test)))
print('Precision: {:0.4f}'.format(precision_score(predicted, y_test, average = 'micro')))
print('Recall: {:0.4f}'.format(recall_score(predicted, y_test, average = 'micro')))

print(classification_report(y_test, predicted))


Accuracy: 0.8095
Precision: 0.8095
Recall: 0.8095
             precision    recall  f1-score   support

          0       0.84      0.61      0.71       735
          1       0.80      0.93      0.86      1218

avg / total       0.81      0.81      0.80      1953



In [40]:
# Рассмотрим более консервативную оценку с помощью кросс-валидации
# Реализуем сначала самую простую оценку

accuracy_list = cross_val_score(logistic_model, X_Count, labelled_y, cv=5)   # Возвращает список score (в нашем случае это accuracy)
print('Accuracy list: {}'.format(accuracy_list))
print('Mean of accuracy list: {:.4f} (+/- {:.4f})'.format(np.mean(accuracy_list), np.std(accuracy_list)))

Accuracy list: [0.79313876 0.80030722 0.80286738 0.79774706 0.79815574]
Mean of accuracy list: 0.7984 (+/- 0.0032)


In [41]:
# Теперь реализуем посредством того, что мы сами будем разбивать
kf = KFold(n_splits=5)
iter_number = 0

accuracy_list = []
precision_list = []
recall_list = []
f1_measure_list = []

for train_index, test_index in kf.split(X_Count):
    iter_number += 1
    X_train, X_test = X_Count[train_index], X_Count[test_index]
    y_train, y_test = labelled_y[train_index], labelled_y[test_index]
    
    logistic_model.fit(X_train, y_train)                     # Обучаем на тренировочной выборке
    predicted = logistic_model.predict(X_test)         # Тестируем на тестовой выборке

    # Указываем micro\macro, поскольку у нас три класса, т.е. мультиклассовая классификация
    
    print('Iteration #{} | Accuracy: {:0.4f} | Precision: {:0.4f} | Recall: {:0.4f} | F1-score: {:0.4f} '.format(
        iter_number,
        accuracy_score(predicted, y_test),
        precision_score(predicted, y_test, average = 'micro'),
        recall_score(predicted, y_test, average = 'micro'),
        f1_score(predicted, y_test, average = 'micro')
    ))
    accuracy_list.append(accuracy_score(predicted, y_test))
    precision_list.append(precision_score(predicted, y_test, average = 'micro'))
    recall_list.append(recall_score(predicted, y_test, average = 'micro'))
    f1_measure_list.append(f1_score(predicted, y_test, average = 'micro'))

print('Mean of accuracy list: {:.4f} (+/- {:.4f})'.format(np.mean(accuracy_list), np.std(accuracy_list)))
print('Mean of precision list: {:.4f} (+/- {:.4f})'.format(np.mean(precision_list), np.std(precision_list)))
print('Mean of recall list: {:.4f} (+/- {:.4f})'.format(np.mean(recall_list), np.std(recall_list)))
print('Mean of f1 score list: {:.4f} (+/- {:.4f})'.format(np.mean(f1_measure_list), np.std(f1_measure_list)))
    

Iteration #1 | Accuracy: 0.7962 | Precision: 0.7962 | Recall: 0.7962 | F1-score: 0.7962 
Iteration #2 | Accuracy: 0.8187 | Precision: 0.8187 | Recall: 0.8187 | F1-score: 0.8187 
Iteration #3 | Accuracy: 0.8305 | Precision: 0.8305 | Recall: 0.8305 | F1-score: 0.8305 
Iteration #4 | Accuracy: 0.7465 | Precision: 0.7465 | Recall: 0.7465 | F1-score: 0.7465 
Iteration #5 | Accuracy: 0.8028 | Precision: 0.8028 | Recall: 0.8028 | F1-score: 0.8028 
Mean of accuracy list: 0.7990 (+/- 0.0288)
Mean of precision list: 0.7990 (+/- 0.0288)
Mean of recall list: 0.7990 (+/- 0.0288)
Mean of f1 score list: 0.7990 (+/- 0.0288)


## Объяснение с ELI5

In [48]:
from sklearn.linear_model import LogisticRegressionCV
from sklearn.pipeline import make_pipeline


X_train, X_test, y_train, y_test = train_test_split(X, labelled_y, test_size=0.2)


vec = CountVectorizer(ngram_range = (1,2))

clf = LogisticRegressionCV()   

pipe = make_pipeline(vec, clf)
pipe.fit(X_train, y_train);

In [49]:
from sklearn import metrics

def print_report(pipe):
    
    y_pred = pipe.predict(X_test)
    report = metrics.classification_report(y_test, y_pred,
        #target_names=twenty_test.target_names
                                          )
    print(report)
    print("accuracy: {:0.3f}".format(metrics.accuracy_score(y_test, y_pred)))

print_report(pipe)


             precision    recall  f1-score   support

          0       0.83      0.61      0.71       735
          1       0.80      0.93      0.86      1218

avg / total       0.81      0.81      0.80      1953

accuracy: 0.808


In [50]:
import eli5
df = eli5.explain_weights_df(clf, vec=vec,
                 target_names=classEncoder.classes_
                 )

In [51]:
df.to_csv('bayes_linregr_countvec_pred.txt', sep='\t', encoding='utf-8') 

# Support Vector Machine

## Репрезентация TF-IDF

In [52]:
# Создаем объект класса TF-IDF Vectorizer, который преобразует корпус текстов в TF-IDF
TFIDFVectorizer = TfidfVectorizer(max_df=0.9, min_df=0.01, sublinear_tf=True)
X_TF = TFIDFVectorizer.fit_transform(X)

# Тогда разбить на тренировочную\тестовую выборки:
X_train, X_test, y_train, y_test = train_test_split(X_TF, labelled_y, test_size=0.2)

In [53]:

svm_model = LinearSVC()  
svm_model.fit(X_train, y_train)                     # Обучаем на тренировочной выборке
predicted = svm_model.predict(X_test)         # Тестируем на тестовой выборке
print('Accuracy: {:0.4f}'.format(accuracy_score(predicted, y_test)))
print('Precision: {:0.4f}'.format(precision_score(predicted, y_test, average = 'micro')))
print('Recall: {:0.4f}'.format(recall_score(predicted, y_test, average = 'micro')))

print(classification_report(y_test, predicted))


Accuracy: 0.6575
Precision: 0.6575
Recall: 0.6575
             precision    recall  f1-score   support

          0       0.67      0.26      0.38       773
          1       0.65      0.92      0.76      1180

avg / total       0.66      0.66      0.61      1953



In [54]:
accuracy_list = cross_val_score(svm_model, X_TF, labelled_y, cv=5)   
print('Accuracy list: {}'.format(accuracy_list))
print('Mean of accuracy list: {:.4f} (+/- {:.4f})'.format(np.mean(accuracy_list), np.std(accuracy_list)))

Accuracy list: [0.6656426  0.67332309 0.6641065  0.65847414 0.66752049]
Mean of accuracy list: 0.6658 (+/- 0.0048)


In [55]:
kf = KFold(n_splits=10)
iter_number = 0

accuracy_list = []
precision_list = []
recall_list = []
f1_measure_list = []

for train_index, test_index in kf.split(X_TF):
    iter_number += 1
    X_train, X_test = X_TF[train_index], X_TF[test_index]
    y_train, y_test = labelled_y[train_index], labelled_y[test_index]
    
    svm_model.fit(X_train, y_train)                     
    predicted = svm_model.predict(X_test)         

    
    
    print('Iteration #{} | Accuracy: {:0.4f} | Precision: {:0.4f} | Recall: {:0.4f} | F1-score: {:0.4f} '.format(
        iter_number,
        accuracy_score(predicted, y_test),
        precision_score(predicted, y_test, average = 'micro'),
        recall_score(predicted, y_test, average = 'micro'),
        f1_score(predicted, y_test, average = 'micro')
    ))
    accuracy_list.append(accuracy_score(predicted, y_test))
    precision_list.append(precision_score(predicted, y_test, average = 'micro'))
    recall_list.append(recall_score(predicted, y_test, average = 'micro'))
    f1_measure_list.append(f1_score(predicted, y_test, average = 'micro'))

print('Mean of accuracy list: {:.4f} (+/- {:.4f})'.format(np.mean(accuracy_list), np.std(accuracy_list)))
print('Mean of precision list: {:.4f} (+/- {:.4f})'.format(np.mean(precision_list), np.std(precision_list)))
print('Mean of recall list: {:.4f} (+/- {:.4f})'.format(np.mean(recall_list), np.std(recall_list)))
print('Mean of f1 score list: {:.4f} (+/- {:.4f})'.format(np.mean(f1_measure_list), np.std(f1_measure_list)))
    

Iteration #1 | Accuracy: 0.6489 | Precision: 0.6489 | Recall: 0.6489 | F1-score: 0.6489 
Iteration #2 | Accuracy: 0.6981 | Precision: 0.6981 | Recall: 0.6981 | F1-score: 0.6981 
Iteration #3 | Accuracy: 0.7124 | Precision: 0.7124 | Recall: 0.7124 | F1-score: 0.7124 
Iteration #4 | Accuracy: 0.6888 | Precision: 0.6888 | Recall: 0.6888 | F1-score: 0.6888 
Iteration #5 | Accuracy: 0.6998 | Precision: 0.6998 | Recall: 0.6998 | F1-score: 0.6998 
Iteration #6 | Accuracy: 0.7059 | Precision: 0.7059 | Recall: 0.7059 | F1-score: 0.7059 
Iteration #7 | Accuracy: 0.6977 | Precision: 0.6977 | Recall: 0.6977 | F1-score: 0.6977 
Iteration #8 | Accuracy: 0.4119 | Precision: 0.4119 | Recall: 0.4119 | F1-score: 0.4119 
Iteration #9 | Accuracy: 0.4744 | Precision: 0.4744 | Recall: 0.4744 | F1-score: 0.4744 
Iteration #10 | Accuracy: 0.8504 | Precision: 0.8504 | Recall: 0.8504 | F1-score: 0.8504 
Mean of accuracy list: 0.6588 (+/- 0.1194)
Mean of precision list: 0.6588 (+/- 0.1194)
Mean of recall list: 0

## Объяснение с ELI5

In [64]:
from sklearn.linear_model import LogisticRegressionCV
from sklearn.pipeline import make_pipeline


X_train, X_test, y_train, y_test = train_test_split(X, labelled_y, test_size=0.2)

vec = TfidfVectorizer(max_df=0.9, min_df=0.01, sublinear_tf=True, ngram_range=(1,2))


clf = LinearSVC()   

pipe = make_pipeline(vec, clf)
pipe.fit(X_train, y_train);

In [65]:
from sklearn import metrics

def print_report(pipe):
    
    y_pred = pipe.predict(X_test)
    report = metrics.classification_report(y_test, y_pred,
        #target_names=twenty_test.target_names
                                          )
    print(report)
    print("accuracy: {:0.3f}".format(metrics.accuracy_score(y_test, y_pred)))

print_report(pipe)


             precision    recall  f1-score   support

          0       0.62      0.25      0.36       748
          1       0.66      0.90      0.76      1205

avg / total       0.64      0.65      0.61      1953

accuracy: 0.653


In [66]:

df = eli5.explain_weights_df(clf, vec=vec,
                 target_names=classEncoder.classes_
                 )

In [67]:
df.to_csv('bayes_svm_tfidf_pred.txt', sep='\t', encoding='utf-8') 

## Репрезентация с помощью CountVectorizer

In [68]:

# Создаем объект класса CountVectorizer, который преобразует корпус текстов в one-hot представление
CVectorizer = CountVectorizer()
X_Count = CVectorizer.fit_transform(X)

# Тогда разбить на тренировочную\тестовую выборки:
X_train, X_test, y_train, y_test = train_test_split(X_Count, labelled_y, test_size=0.2)

In [69]:

svm_model = LinearSVC()  
svm_model.fit(X_train, y_train)                     # Обучаем на тренировочной выборке
predicted = svm_model.predict(X_test)         # Тестируем на тестовой выборке
print('Accuracy: {:0.4f}'.format(accuracy_score(predicted, y_test)))
print('Precision: {:0.4f}'.format(precision_score(predicted, y_test, average = 'micro')))
print('Recall: {:0.4f}'.format(recall_score(predicted, y_test, average = 'micro')))

print(classification_report(y_test, predicted))


Accuracy: 0.8008
Precision: 0.8008
Recall: 0.8008
             precision    recall  f1-score   support

          0       0.78      0.63      0.70       711
          1       0.81      0.90      0.85      1242

avg / total       0.80      0.80      0.80      1953



In [70]:
accuracy_list = cross_val_score(svm_model, X_Count, labelled_y, cv=5)   
print('Accuracy list: {}'.format(accuracy_list))
print('Mean of accuracy list: {:.4f} (+/- {:.4f})'.format(np.mean(accuracy_list), np.std(accuracy_list)))

Accuracy list: [0.7921147  0.80286738 0.80337942 0.7921147  0.79661885]
Mean of accuracy list: 0.7974 (+/- 0.0049)


In [71]:
kf = KFold(n_splits=10)
iter_number = 0

accuracy_list = []
precision_list = []
recall_list = []
f1_measure_list = []

for train_index, test_index in kf.split(X_Count):
    iter_number += 1
    X_train, X_test = X_Count[train_index], X_Count[test_index]
    y_train, y_test = labelled_y[train_index], labelled_y[test_index]
    
    svm_model.fit(X_train, y_train)                     
    predicted = svm_model.predict(X_test)         

    
    
    print('Iteration #{} | Accuracy: {:0.4f} | Precision: {:0.4f} | Recall: {:0.4f} | F1-score: {:0.4f} '.format(
        iter_number,
        accuracy_score(predicted, y_test),
        precision_score(predicted, y_test, average = 'micro'),
        recall_score(predicted, y_test, average = 'micro'),
        f1_score(predicted, y_test, average = 'micro')
    ))
    accuracy_list.append(accuracy_score(predicted, y_test))
    precision_list.append(precision_score(predicted, y_test, average = 'micro'))
    recall_list.append(recall_score(predicted, y_test, average = 'micro'))
    f1_measure_list.append(f1_score(predicted, y_test, average = 'micro'))

print('Mean of accuracy list: {:.4f} (+/- {:.4f})'.format(np.mean(accuracy_list), np.std(accuracy_list)))
print('Mean of precision list: {:.4f} (+/- {:.4f})'.format(np.mean(precision_list), np.std(precision_list)))
print('Mean of recall list: {:.4f} (+/- {:.4f})'.format(np.mean(recall_list), np.std(recall_list)))
print('Mean of f1 score list: {:.4f} (+/- {:.4f})'.format(np.mean(f1_measure_list), np.std(f1_measure_list)))
    

Iteration #1 | Accuracy: 0.7769 | Precision: 0.7769 | Recall: 0.7769 | F1-score: 0.7769 
Iteration #2 | Accuracy: 0.8137 | Precision: 0.8137 | Recall: 0.8137 | F1-score: 0.8137 
Iteration #3 | Accuracy: 0.8332 | Precision: 0.8332 | Recall: 0.8332 | F1-score: 0.8332 
Iteration #4 | Accuracy: 0.8147 | Precision: 0.8147 | Recall: 0.8147 | F1-score: 0.8147 
Iteration #5 | Accuracy: 0.8135 | Precision: 0.8135 | Recall: 0.8135 | F1-score: 0.8135 
Iteration #6 | Accuracy: 0.8391 | Precision: 0.8391 | Recall: 0.8391 | F1-score: 0.8391 
Iteration #7 | Accuracy: 0.8186 | Precision: 0.8186 | Recall: 0.8186 | F1-score: 0.8186 
Iteration #8 | Accuracy: 0.6988 | Precision: 0.6988 | Recall: 0.6988 | F1-score: 0.6988 
Iteration #9 | Accuracy: 0.7162 | Precision: 0.7162 | Recall: 0.7162 | F1-score: 0.7162 
Iteration #10 | Accuracy: 0.8914 | Precision: 0.8914 | Recall: 0.8914 | F1-score: 0.8914 
Mean of accuracy list: 0.8016 (+/- 0.0545)
Mean of precision list: 0.8016 (+/- 0.0545)
Mean of recall list: 0

## Объяснение с ELI5

In [74]:
from sklearn.pipeline import make_pipeline


X_train, X_test, y_train, y_test = train_test_split(X, labelled_y, test_size=0.2)


vec = CountVectorizer(ngram_range = (1,2))

clf = LinearSVC()   

pipe = make_pipeline(vec, clf)
pipe.fit(X_train, y_train);

In [75]:
from sklearn import metrics

def print_report(pipe):
    
    y_pred = pipe.predict(X_test)
    report = metrics.classification_report(y_test, y_pred,
        #target_names=twenty_test.target_names
                                          )
    print(report)
    print("accuracy: {:0.3f}".format(metrics.accuracy_score(y_test, y_pred)))

print_report(pipe)

             precision    recall  f1-score   support

          0       0.81      0.60      0.68       729
          1       0.79      0.91      0.85      1224

avg / total       0.80      0.80      0.79      1953

accuracy: 0.795


In [76]:
import eli5
df = eli5.explain_weights_df(clf, vec=vec,
                 target_names=classEncoder.classes_
                 )

In [77]:
df.to_csv('bayes_svm_countvec_pred.txt', sep='\t', encoding='utf-8') 

# Multinomial Naive Bayes

## Репрезентация TF-IDF

In [78]:
# Создаем объект класса TF-IDF Vectorizer, который преобразует корпус текстов в TF-IDF
TFIDFVectorizer = TfidfVectorizer(max_df=0.9, min_df=0.01, sublinear_tf=True)
X_TF = TFIDFVectorizer.fit_transform(X)

# Тогда разбить на тренировочную\тестовую выборки:
X_train, X_test, y_train, y_test = train_test_split(X_TF, labelled_y, test_size=0.2)

In [79]:
from sklearn.naive_bayes import MultinomialNB
multnb_model = MultinomialNB()  
multnb_model.fit(X_train, y_train)                     # Обучаем на тренировочной выборке
predicted = multnb_model.predict(X_test)         # Тестируем на тестовой выборке

# Указываем micro\macro, поскольку у нас три класса, т.е. мультиклассовая классификация
print('Accuracy: {:0.4f}'.format(accuracy_score(predicted, y_test)))
print('Precision: {:0.4f}'.format(precision_score(predicted, y_test, average = 'micro')))
print('Recall: {:0.4f}'.format(recall_score(predicted, y_test, average = 'micro')))

print(classification_report(y_test, predicted))


Accuracy: 0.6723
Precision: 0.6723
Recall: 0.6723
             precision    recall  f1-score   support

          0       0.66      0.24      0.35       725
          1       0.67      0.93      0.78      1228

avg / total       0.67      0.67      0.62      1953



In [80]:
accuracy_list = cross_val_score(multnb_model, X_TF, labelled_y, cv=5)   
print('Accuracy list: {}'.format(accuracy_list))
print('Mean of accuracy list: {:.4f} (+/- {:.4f})'.format(np.mean(accuracy_list), np.std(accuracy_list)))

Accuracy list: [0.65847414 0.67127496 0.67178699 0.65540195 0.67059426]
Mean of accuracy list: 0.6655 (+/- 0.0071)


In [81]:
kf = KFold(n_splits=10)
iter_number = 0

accuracy_list = []
precision_list = []
recall_list = []
f1_measure_list = []

for train_index, test_index in kf.split(X_TF):
    iter_number += 1
    X_train, X_test = X_TF[train_index], X_TF[test_index]
    y_train, y_test = labelled_y[train_index], labelled_y[test_index]
    
    multnb_model.fit(X_train, y_train)                     
    predicted = multnb_model.predict(X_test)         

    
    
    print('Iteration #{} | Accuracy: {:0.4f} | Precision: {:0.4f} | Recall: {:0.4f} | F1-score: {:0.4f} '.format(
        iter_number,
        accuracy_score(predicted, y_test),
        precision_score(predicted, y_test, average = 'micro'),
        recall_score(predicted, y_test, average = 'micro'),
        f1_score(predicted, y_test, average = 'micro')
    ))
    accuracy_list.append(accuracy_score(predicted, y_test))
    precision_list.append(precision_score(predicted, y_test, average = 'micro'))
    recall_list.append(recall_score(predicted, y_test, average = 'micro'))
    f1_measure_list.append(f1_score(predicted, y_test, average = 'micro'))

print('Mean of accuracy list: {:.4f} (+/- {:.4f})'.format(np.mean(accuracy_list), np.std(accuracy_list)))
print('Mean of precision list: {:.4f} (+/- {:.4f})'.format(np.mean(precision_list), np.std(precision_list)))
print('Mean of recall list: {:.4f} (+/- {:.4f})'.format(np.mean(recall_list), np.std(recall_list)))
print('Mean of f1 score list: {:.4f} (+/- {:.4f})'.format(np.mean(f1_measure_list), np.std(f1_measure_list)))
    

Iteration #1 | Accuracy: 0.6479 | Precision: 0.6479 | Recall: 0.6479 | F1-score: 0.6479 
Iteration #2 | Accuracy: 0.6970 | Precision: 0.6970 | Recall: 0.6970 | F1-score: 0.6970 
Iteration #3 | Accuracy: 0.7236 | Precision: 0.7236 | Recall: 0.7236 | F1-score: 0.7236 
Iteration #4 | Accuracy: 0.6899 | Precision: 0.6899 | Recall: 0.6899 | F1-score: 0.6899 
Iteration #5 | Accuracy: 0.7111 | Precision: 0.7111 | Recall: 0.7111 | F1-score: 0.7111 
Iteration #6 | Accuracy: 0.7141 | Precision: 0.7141 | Recall: 0.7141 | F1-score: 0.7141 
Iteration #7 | Accuracy: 0.6988 | Precision: 0.6988 | Recall: 0.6988 | F1-score: 0.6988 
Iteration #8 | Accuracy: 0.3811 | Precision: 0.3811 | Recall: 0.3811 | F1-score: 0.3811 
Iteration #9 | Accuracy: 0.4631 | Precision: 0.4631 | Recall: 0.4631 | F1-score: 0.4631 
Iteration #10 | Accuracy: 0.8750 | Precision: 0.8750 | Recall: 0.8750 | F1-score: 0.8750 
Mean of accuracy list: 0.6602 (+/- 0.1328)
Mean of precision list: 0.6602 (+/- 0.1328)
Mean of recall list: 0

## Объяснение с ELI5

In [86]:
from sklearn.linear_model import LogisticRegressionCV
from sklearn.pipeline import make_pipeline


X_train, X_test, y_train, y_test = train_test_split(X, labelled_y, test_size=0.2)

vec = TfidfVectorizer(max_df=0.9, min_df=0.01, sublinear_tf=True, ngram_range=(1,3))


clf = MultinomialNB()   

pipe = make_pipeline(vec, clf)
pipe.fit(X_train, y_train);

In [87]:
from sklearn import metrics

def print_report(pipe):
    
    y_pred = pipe.predict(X_test)
    report = metrics.classification_report(y_test, y_pred,
        #target_names=twenty_test.target_names
                                          )
    print(report)
    print("accuracy: {:0.3f}".format(metrics.accuracy_score(y_test, y_pred)))

print_report(pipe)

             precision    recall  f1-score   support

          0       0.63      0.22      0.33       708
          1       0.68      0.93      0.78      1245

avg / total       0.66      0.67      0.62      1953

accuracy: 0.670


In [88]:
df = eli5.explain_weights_df(clf, vec=vec,
                 target_names=classEncoder.classes_
                 )

In [89]:
df.to_csv('bayes_multNB_tfidf_pred.txt', sep='\t', encoding='utf-8') 

AttributeError: 'NoneType' object has no attribute 'to_csv'

## Репрезентация с CountVectorizer

In [90]:
# Создаем объект класса CountVectorizer, который преобразует корпус текстов в one-hot представление
CVectorizer = CountVectorizer()
X_Count = CVectorizer.fit_transform(X)

# Тогда разбить на тренировочную\тестовую выборки:
X_train, X_test, y_train, y_test = train_test_split(X_Count, labelled_y, test_size=0.2)

In [91]:
from sklearn.naive_bayes import MultinomialNB
multnb_model = MultinomialNB()  
multnb_model.fit(X_train, y_train)                     # Обучаем на тренировочной выборке
predicted = multnb_model.predict(X_test)         # Тестируем на тестовой выборке

# Указываем micro\macro, поскольку у нас три класса, т.е. мультиклассовая классификация
print('Accuracy: {:0.4f}'.format(accuracy_score(predicted, y_test)))
print('Precision: {:0.4f}'.format(precision_score(predicted, y_test, average = 'micro')))
print('Recall: {:0.4f}'.format(recall_score(predicted, y_test, average = 'micro')))

print(classification_report(y_test, predicted))

Accuracy: 0.8228
Precision: 0.8228
Recall: 0.8228
             precision    recall  f1-score   support

          0       0.82      0.67      0.73       719
          1       0.82      0.91      0.87      1234

avg / total       0.82      0.82      0.82      1953



In [92]:
accuracy_list = cross_val_score(multnb_model, X_Count, labelled_y, cv=5)   
print('Accuracy list: {}'.format(accuracy_list))
print('Mean of accuracy list: {:.4f} (+/- {:.4f})'.format(np.mean(accuracy_list), np.std(accuracy_list)))

Accuracy list: [0.80798771 0.8202765  0.81618024 0.82488479 0.81762295]
Mean of accuracy list: 0.8174 (+/- 0.0056)


In [93]:
kf = KFold(n_splits=10)
iter_number = 0

accuracy_list = []
precision_list = []
recall_list = []
f1_measure_list = []

for train_index, test_index in kf.split(X_Count):
    iter_number += 1
    X_train, X_test = X_Count[train_index], X_Count[test_index]
    y_train, y_test = labelled_y[train_index], labelled_y[test_index]
    
    multnb_model.fit(X_train, y_train)                     
    predicted = multnb_model.predict(X_test)         

    
    
    print('Iteration #{} | Accuracy: {:0.4f} | Precision: {:0.4f} | Recall: {:0.4f} | F1-score: {:0.4f} '.format(
        iter_number,
        accuracy_score(predicted, y_test),
        precision_score(predicted, y_test, average = 'micro'),
        recall_score(predicted, y_test, average = 'micro'),
        f1_score(predicted, y_test, average = 'micro')
    ))
    accuracy_list.append(accuracy_score(predicted, y_test))
    precision_list.append(precision_score(predicted, y_test, average = 'micro'))
    recall_list.append(recall_score(predicted, y_test, average = 'micro'))
    f1_measure_list.append(f1_score(predicted, y_test, average = 'micro'))

print('Mean of accuracy list: {:.4f} (+/- {:.4f})'.format(np.mean(accuracy_list), np.std(accuracy_list)))
print('Mean of precision list: {:.4f} (+/- {:.4f})'.format(np.mean(precision_list), np.std(precision_list)))
print('Mean of recall list: {:.4f} (+/- {:.4f})'.format(np.mean(recall_list), np.std(recall_list)))
print('Mean of f1 score list: {:.4f} (+/- {:.4f})'.format(np.mean(f1_measure_list), np.std(f1_measure_list)))
    

Iteration #1 | Accuracy: 0.8055 | Precision: 0.8055 | Recall: 0.8055 | F1-score: 0.8055 
Iteration #2 | Accuracy: 0.8199 | Precision: 0.8199 | Recall: 0.8199 | F1-score: 0.8199 
Iteration #3 | Accuracy: 0.8495 | Precision: 0.8495 | Recall: 0.8495 | F1-score: 0.8495 
Iteration #4 | Accuracy: 0.8393 | Precision: 0.8393 | Recall: 0.8393 | F1-score: 0.8393 
Iteration #5 | Accuracy: 0.8484 | Precision: 0.8484 | Recall: 0.8484 | F1-score: 0.8484 
Iteration #6 | Accuracy: 0.8207 | Precision: 0.8207 | Recall: 0.8207 | F1-score: 0.8207 
Iteration #7 | Accuracy: 0.8248 | Precision: 0.8248 | Recall: 0.8248 | F1-score: 0.8248 
Iteration #8 | Accuracy: 0.7264 | Precision: 0.7264 | Recall: 0.7264 | F1-score: 0.7264 
Iteration #9 | Accuracy: 0.7254 | Precision: 0.7254 | Recall: 0.7254 | F1-score: 0.7254 
Iteration #10 | Accuracy: 0.9037 | Precision: 0.9037 | Recall: 0.9037 | F1-score: 0.9037 
Mean of accuracy list: 0.8164 (+/- 0.0518)
Mean of precision list: 0.8164 (+/- 0.0518)
Mean of recall list: 0

## Объяснение с ELI5

In [94]:
from sklearn.pipeline import make_pipeline


X_train, X_test, y_train, y_test = train_test_split(X, labelled_y, test_size=0.2)


vec = CountVectorizer(ngram_range = (1,3))

clf = MultinomialNB()   

pipe = make_pipeline(vec, clf)
pipe.fit(X_train, y_train);

In [95]:
from sklearn import metrics

def print_report(pipe):
    
    y_pred = pipe.predict(X_test)
    report = metrics.classification_report(y_test, y_pred,
        #target_names=twenty_test.target_names
                                          )
    print(report)
    print("accuracy: {:0.3f}".format(metrics.accuracy_score(y_test, y_pred)))

print_report(pipe)

             precision    recall  f1-score   support

          0       0.83      0.64      0.72       692
          1       0.82      0.93      0.87      1261

avg / total       0.83      0.83      0.82      1953

accuracy: 0.826


In [96]:
import eli5
df = eli5.explain_weights_df(clf, vec=vec,
                 target_names=classEncoder.classes_
                 )

In [97]:
df.to_csv('bayes_multNB_countvec_pred.txt', sep='\t', encoding='utf-8') 

AttributeError: 'NoneType' object has no attribute 'to_csv'

# Random Forest

## Репрезентация TF-IDF

In [98]:
from sklearn.ensemble import RandomForestClassifier

# Создаем объект класса TF-IDF Vectorizer, который преобразует корпус текстов в TF-IDF
TFIDFVectorizer = TfidfVectorizer(max_df=0.9, min_df=0.01, sublinear_tf=True)
X_TF = TFIDFVectorizer.fit_transform(X)

# Тогда разбить на тренировочную\тестовую выборки:
X_train, X_test, y_train, y_test = train_test_split(X_TF, labelled_y, test_size=0.2)

In [99]:

rand_model = RandomForestClassifier(n_estimators=1000, max_depth=4,
                             random_state=2)  
rand_model.fit(X_train, y_train)                     # Обучаем на тренировочной выборке
predicted = rand_model.predict(X_test)         # Тестируем на тестовой выборке

# Указываем micro\macro, поскольку у нас три класса, т.е. мультиклассовая классификация
print('Accuracy: {:0.4f}'.format(accuracy_score(predicted, y_test)))
print('Precision: {:0.4f}'.format(precision_score(predicted, y_test, average = 'micro')))
print('Recall: {:0.4f}'.format(recall_score(predicted, y_test, average = 'micro')))

print(classification_report(y_test, predicted))

Accuracy: 0.6318
Precision: 0.6318
Recall: 0.6318
             precision    recall  f1-score   support

          0       0.86      0.03      0.07       740
          1       0.63      1.00      0.77      1213

avg / total       0.72      0.63      0.50      1953



In [100]:
accuracy_list = cross_val_score(rand_model, X_TF, labelled_y, cv=5)   
print('Accuracy list: {}'.format(accuracy_list))
print('Mean of accuracy list: {:.4f} (+/- {:.4f})'.format(np.mean(accuracy_list), np.std(accuracy_list)))

Accuracy list: [0.63236047 0.63338454 0.63850486 0.6359447  0.63114754]
Mean of accuracy list: 0.6343 (+/- 0.0026)


In [101]:
kf = KFold(n_splits=10)
iter_number = 0

accuracy_list = []
precision_list = []
recall_list = []
f1_measure_list = []

for train_index, test_index in kf.split(X_TF):
    iter_number += 1
    X_train, X_test = X_TF[train_index], X_TF[test_index]
    y_train, y_test = labelled_y[train_index], labelled_y[test_index]
    
    rand_model.fit(X_train, y_train)                     
    predicted = rand_model.predict(X_test)         

    
    
    print('Iteration #{} | Accuracy: {:0.4f} | Precision: {:0.4f} | Recall: {:0.4f} | F1-score: {:0.4f} '.format(
        iter_number,
        accuracy_score(predicted, y_test),
        precision_score(predicted, y_test, average = 'micro'),
        recall_score(predicted, y_test, average = 'micro'),
        f1_score(predicted, y_test, average = 'micro')
    ))
    accuracy_list.append(accuracy_score(predicted, y_test))
    precision_list.append(precision_score(predicted, y_test, average = 'micro'))
    recall_list.append(recall_score(predicted, y_test, average = 'micro'))
    f1_measure_list.append(f1_score(predicted, y_test, average = 'micro'))

print('Mean of accuracy list: {:.4f} (+/- {:.4f})'.format(np.mean(accuracy_list), np.std(accuracy_list)))
print('Mean of precision list: {:.4f} (+/- {:.4f})'.format(np.mean(precision_list), np.std(precision_list)))
print('Mean of recall list: {:.4f} (+/- {:.4f})'.format(np.mean(recall_list), np.std(recall_list)))
print('Mean of f1 score list: {:.4f} (+/- {:.4f})'.format(np.mean(f1_measure_list), np.std(f1_measure_list)))
    

Iteration #1 | Accuracy: 0.6182 | Precision: 0.6182 | Recall: 0.6182 | F1-score: 0.6182 
Iteration #2 | Accuracy: 0.6673 | Precision: 0.6673 | Recall: 0.6673 | F1-score: 0.6673 
Iteration #3 | Accuracy: 0.7032 | Precision: 0.7032 | Recall: 0.7032 | F1-score: 0.7032 
Iteration #4 | Accuracy: 0.6714 | Precision: 0.6714 | Recall: 0.6714 | F1-score: 0.6714 
Iteration #5 | Accuracy: 0.7008 | Precision: 0.7008 | Recall: 0.7008 | F1-score: 0.7008 
Iteration #6 | Accuracy: 0.6906 | Precision: 0.6906 | Recall: 0.6906 | F1-score: 0.6906 
Iteration #7 | Accuracy: 0.6855 | Precision: 0.6855 | Recall: 0.6855 | F1-score: 0.6855 
Iteration #8 | Accuracy: 0.2910 | Precision: 0.2910 | Recall: 0.2910 | F1-score: 0.2910 
Iteration #9 | Accuracy: 0.3801 | Precision: 0.3801 | Recall: 0.3801 | F1-score: 0.3801 
Iteration #10 | Accuracy: 0.9221 | Precision: 0.9221 | Recall: 0.9221 | F1-score: 0.9221 
Mean of accuracy list: 0.6330 (+/- 0.1682)
Mean of precision list: 0.6330 (+/- 0.1682)
Mean of recall list: 0

## Объяснение с ELI5

In [102]:
from sklearn.linear_model import LogisticRegressionCV
from sklearn.pipeline import make_pipeline


X_train, X_test, y_train, y_test = train_test_split(X, labelled_y, test_size=0.2)

vec = TfidfVectorizer(max_df=0.9, min_df=0.01, sublinear_tf=True, ngram_range=(1,2))


clf = RandomForestClassifier(n_estimators=1000, max_depth=4,
                             random_state=2)   

pipe = make_pipeline(vec, clf)
pipe.fit(X_train, y_train);

In [103]:
from sklearn import metrics

def print_report(pipe):
    
    y_pred = pipe.predict(X_test)
    report = metrics.classification_report(y_test, y_pred,
        #target_names=twenty_test.target_names
                                          )
    print(report)
    print("accuracy: {:0.3f}".format(metrics.accuracy_score(y_test, y_pred)))

print_report(pipe)

             precision    recall  f1-score   support

          0       1.00      0.02      0.04       732
          1       0.63      1.00      0.77      1221

avg / total       0.77      0.63      0.50      1953

accuracy: 0.633


In [104]:
df = eli5.explain_weights_df(clf, vec=vec,
                 target_names=classEncoder.classes_
                 )

In [105]:
df.to_csv('bayes_rand_tfidf_pred.txt', sep='\t', encoding='utf-8') 

## Репрезентация CountVectorizer

In [106]:
# Создаем объект класса CountVectorizer, который преобразует корпус текстов в one-hot представление
CVectorizer = CountVectorizer()
X_Count = CVectorizer.fit_transform(X)

# Тогда разбить на тренировочную\тестовую выборки:
X_train, X_test, y_train, y_test = train_test_split(X_Count, labelled_y, test_size=0.2)

In [107]:
rand_model = RandomForestClassifier(n_estimators=1000, max_depth=4,
                             random_state=2)  
rand_model.fit(X_train, y_train)                     # Обучаем на тренировочной выборке
predicted = rand_model.predict(X_test)         # Тестируем на тестовой выборке

# Указываем micro\macro, поскольку у нас три класса, т.е. мультиклассовая классификация
print('Accuracy: {:0.4f}'.format(accuracy_score(predicted, y_test)))
print('Precision: {:0.4f}'.format(precision_score(predicted, y_test, average = 'micro')))
print('Recall: {:0.4f}'.format(recall_score(predicted, y_test, average = 'micro')))

print(classification_report(y_test, predicted))

Accuracy: 0.6272
Precision: 0.6272
Recall: 0.6272
             precision    recall  f1-score   support

          0       0.00      0.00      0.00       728
          1       0.63      1.00      0.77      1225

avg / total       0.39      0.63      0.48      1953



  'precision', 'predicted', average, warn_for)


In [108]:
accuracy_list = cross_val_score(rand_model, X_Count, labelled_y, cv=5)   
print('Accuracy list: {}'.format(accuracy_list))
print('Mean of accuracy list: {:.4f} (+/- {:.4f})'.format(np.mean(accuracy_list), np.std(accuracy_list)))

Accuracy list: [0.62570405 0.62570405 0.62570405 0.62570405 0.62602459]
Mean of accuracy list: 0.6258 (+/- 0.0001)


In [109]:
kf = KFold(n_splits=10)
iter_number = 0

accuracy_list = []
precision_list = []
recall_list = []
f1_measure_list = []

for train_index, test_index in kf.split(X_Count):
    iter_number += 1
    X_train, X_test = X_Count[train_index], X_Count[test_index]
    y_train, y_test = labelled_y[train_index], labelled_y[test_index]
    
    rand_model.fit(X_train, y_train)                     
    predicted = rand_model.predict(X_test)         

    
    
    print('Iteration #{} | Accuracy: {:0.4f} | Precision: {:0.4f} | Recall: {:0.4f} | F1-score: {:0.4f} '.format(
        iter_number,
        accuracy_score(predicted, y_test),
        precision_score(predicted, y_test, average = 'micro'),
        recall_score(predicted, y_test, average = 'micro'),
        f1_score(predicted, y_test, average = 'micro')
    ))
    accuracy_list.append(accuracy_score(predicted, y_test))
    precision_list.append(precision_score(predicted, y_test, average = 'micro'))
    recall_list.append(recall_score(predicted, y_test, average = 'micro'))
    f1_measure_list.append(f1_score(predicted, y_test, average = 'micro'))

print('Mean of accuracy list: {:.4f} (+/- {:.4f})'.format(np.mean(accuracy_list), np.std(accuracy_list)))
print('Mean of precision list: {:.4f} (+/- {:.4f})'.format(np.mean(precision_list), np.std(precision_list)))
print('Mean of recall list: {:.4f} (+/- {:.4f})'.format(np.mean(recall_list), np.std(recall_list)))
print('Mean of f1 score list: {:.4f} (+/- {:.4f})'.format(np.mean(f1_measure_list), np.std(f1_measure_list)))
    

Iteration #1 | Accuracy: 0.6141 | Precision: 0.6141 | Recall: 0.6141 | F1-score: 0.6141 
Iteration #2 | Accuracy: 0.6612 | Precision: 0.6612 | Recall: 0.6612 | F1-score: 0.6612 
Iteration #3 | Accuracy: 0.6919 | Precision: 0.6919 | Recall: 0.6919 | F1-score: 0.6919 
Iteration #4 | Accuracy: 0.6633 | Precision: 0.6633 | Recall: 0.6633 | F1-score: 0.6633 
Iteration #5 | Accuracy: 0.6875 | Precision: 0.6875 | Recall: 0.6875 | F1-score: 0.6875 
Iteration #6 | Accuracy: 0.6814 | Precision: 0.6814 | Recall: 0.6814 | F1-score: 0.6814 
Iteration #7 | Accuracy: 0.6752 | Precision: 0.6752 | Recall: 0.6752 | F1-score: 0.6752 
Iteration #8 | Accuracy: 0.2838 | Precision: 0.2838 | Recall: 0.2838 | F1-score: 0.2838 
Iteration #9 | Accuracy: 0.3760 | Precision: 0.3760 | Recall: 0.3760 | F1-score: 0.3760 
Iteration #10 | Accuracy: 0.9232 | Precision: 0.9232 | Recall: 0.9232 | F1-score: 0.9232 
Mean of accuracy list: 0.6258 (+/- 0.1686)
Mean of precision list: 0.6258 (+/- 0.1686)
Mean of recall list: 0

## Объяснение с ELI5

In [110]:
from sklearn.pipeline import make_pipeline


X_train, X_test, y_train, y_test = train_test_split(X, labelled_y, test_size=0.2)


vec = CountVectorizer(ngram_range = (1,2))

clf = RandomForestClassifier(n_estimators=1000, max_depth=4,
                             random_state=2) 
pipe = make_pipeline(vec, clf)
pipe.fit(X_train, y_train);

In [111]:
from sklearn import metrics

def print_report(pipe):
    
    y_pred = pipe.predict(X_test)
    report = metrics.classification_report(y_test, y_pred,
        #target_names=twenty_test.target_names
                                          )
    print(report)
    print("accuracy: {:0.3f}".format(metrics.accuracy_score(y_test, y_pred)))

print_report(pipe)

             precision    recall  f1-score   support

          0       0.00      0.00      0.00       734
          1       0.62      1.00      0.77      1219

avg / total       0.39      0.62      0.48      1953

accuracy: 0.624


  'precision', 'predicted', average, warn_for)


In [112]:
df = eli5.explain_weights_df(clf, vec=vec,
                 target_names=classEncoder.classes_
                 )

In [113]:
df.to_csv('bayes_rand_countvec_pred.txt', sep='\t', encoding='utf-8') 