In [23]:
import re
import string
from collections import Counter,defaultdict

import gensim
import numpy as np
import pandas as pd
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, ExtraTreesClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import *
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB 
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Read the tab-separated complaints dataset directly from its GitHub URL.
data_url = 'https://raw.githubusercontent.com/TatianaShavrina/hse_ml_m1/master/ensembles/complaints.csv'
data = pd.read_csv(data_url, sep='\t')
# Target is the "PRODUCT_ID" column; the model input is the "cleaned_text" column.
y, X = data["PRODUCT_ID"], data["cleaned_text"]

In [3]:
def normalize(data):
  """Strip ASCII punctuation and drop short words from every text.

  Args:
    data: iterable of texts. Items that are not `str` (for example the NaN
      pandas produces for an empty CSV cell) are treated as empty text so
      the result stays aligned with the input.

  Returns:
    list of str with one entry per input item, in the same order. Each
    entry holds the whitespace-separated words longer than 3 characters,
    joined by single spaces.
  """
  # Built once: the translation table is identical for every item.
  strip_punct = str.maketrans('', '', string.punctuation)
  res = []
  for item in data:
    if not isinstance(item, str):
      # Keep a placeholder instead of crashing on a missing value.
      res.append('')
      continue
    words = item.translate(strip_punct).split()
    res.append(' '.join(w for w in words if len(w) > 3))
  return res

In [4]:
# Clean every complaint text with normalize(); X_norm is a plain list of strings.
X_norm = normalize(X)

In [5]:
# Hold out 10% of the normalized texts for evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X_norm,
    y,
    test_size=0.1,
    random_state=42,
)

## Попробуем **Voting**

In [6]:
# Five heterogeneous base classifiers that vote on every prediction.
clf1 = LogisticRegression(multi_class='multinomial', solver='lbfgs', random_state=8, max_iter=300)
clf2 = RandomForestClassifier(n_estimators=50, random_state=1)
clf3 = GaussianNB()

clf4 = MultinomialNB(alpha=0.1, fit_prior=True)

clf5 = KNeighborsClassifier(n_neighbors=2) 

# voting='hard': the ensemble predicts the majority label of the base models.
# NOTE(review): the 'etc' key labels a RandomForestClassifier, not ExtraTrees.
eclf = VotingClassifier(
        estimators=[('lr', clf1), 
                    ('etc', clf2), 
                    ('gnb', clf3), 
                    ('mnb', clf4),
                    ('knc', clf5)], 
        voting='hard')

# Text -> top-500 word counts -> sublinear TF-IDF -> dense array -> ensemble.
voting = Pipeline([
    ('vect', CountVectorizer(analyzer='word', max_features=500)),
    ('tfidf', TfidfTransformer(sublinear_tf=True)),
    # GaussianNB cannot consume sparse input, so densify first. toarray()
    # returns a plain ndarray; todense() returns np.matrix, which recent
    # scikit-learn releases refuse as estimator input.
    ('to_dense', FunctionTransformer(lambda x: x.toarray(), accept_sparse=True)),
    ('clf', eclf),
    ])
voting = voting.fit(X_train, y_train)
predictions = voting.predict(X_test)
# Macro averaging weights every class equally regardless of its frequency.
print("Precision: {0:6.2f}".format(precision_score(y_test, predictions, average='macro')))
print("Recall: {0:6.2f}".format(recall_score(y_test, predictions, average='macro')))
print("F1-measure: {0:6.2f}".format(f1_score(y_test, predictions, average='macro')))
print("Accuracy: {0:6.2f}".format(accuracy_score(y_test, predictions)))

Precision:   0.70
Recall:   0.69
F1-measure:   0.69
Accuracy:   0.68


In [7]:
# Load pretrained word vectors saved in the binary word2vec format (binary=True).
# NOTE(review): the path is hardcoded and presumably points at a Colab Google
# Drive mount; it only resolves where that folder is available.
model = gensim.models.KeyedVectors.load_word2vec_format('/content/drive/My Drive/NN_INTRO_HW/taiga_w2v_model/model.bin', binary=True)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [8]:
# Tag conversion table read from a two-column, whitespace-separated file:
# the first column becomes the key, the second column the value.
# normalize_mystem() uses it to translate the analyzer's part-of-speech tag.
mapping = {}

# The context manager closes the file handle even if a line is malformed.
with open('/content/drive/My Drive/NN_INTRO_HW/ru-rnc.map.txt', encoding='utf-8') as map_file:
    for line in map_file:
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        ms, ud = line.split()
        mapping[ms] = ud

In [28]:
from pymystem3 import Mystem
# Single shared analyzer instance; normalize_mystem() calls m.analyze() on it.
m = Mystem()

In [10]:
def normalize_mystem(text):
    """Turn `text` into a list of "lemma_TAG" tokens.

    The text is analyzed with the module-level analyzer `m`. Entries that
    carry no 'analysis' key are skipped. For every remaining entry the
    lemma and the part-of-speech tag of the first analysis are used; an
    entry whose analysis list is empty keeps its surface text and gets the
    tag 'UNKN'. The tag is then translated through the module-level
    `mapping` dict.

    Raises:
        KeyError: if a tag has no entry in `mapping`.
    """
    tagged = []
    for entry in m.analyze(text):
        if 'analysis' in entry:
            if len(entry['analysis']) == 0:
                # The analyzer returned no parse for this word.
                lemma, tag = entry['text'], 'UNKN'
            else:
                lemma = entry["analysis"][0]["lex"].lower().strip()
                # The tag is whatever precedes the first ',' and the first '='
                # of the grammar string.
                head = entry["analysis"][0]["gr"].split(',')[0]
                tag = head.split('=')[0].strip()
            converted = mapping[tag]
            tagged.append(lemma + '_' + converted)

    return tagged

In [11]:
def get_embedding(text, model, dim):
    """Build one fixed-size vector for `text` from per-word embeddings.

    Each distinct whitespace-separated word is converted with
    normalize_mystem() and looked up in `model`; its vector is scaled by the
    word's relative frequency in the text. Words are skipped (their row
    stays zero) when normalize_mystem() does not yield exactly one token,
    when the tag lookup inside normalize_mystem() fails, or when the token
    is missing from `model`.

    Args:
        text: whitespace-tokenizable string.
        model: object supporting `model[token]` that returns a vector and
            raises KeyError for unknown tokens.
        dim: expected length of every vector returned by `model`.

    Returns:
        np.ndarray of shape (dim,). All zeros when no word was found.

    Raises:
        ValueError: if `model` returns a vector whose shape is not (dim,).
            This used to be swallowed, silently producing all-zero features.
    """
    split_text = text.split()
    # Count each word once, then weight its vector by frequency instead of
    # looking the same word up repeatedly.
    words = Counter(split_text)
    total = len(split_text)
    vectors = np.zeros((len(words), dim))

    for i, (word, count) in enumerate(words.items()):
        try:
            tokens = normalize_mystem(word)
            if len(tokens) != 1:
                # Only words that map to a single token are embedded.
                continue
            vector = np.asarray(model[tokens[0]])
        except KeyError:
            # Unknown part-of-speech tag or out-of-vocabulary token.
            continue

        if vector.shape != (dim,):
            raise ValueError(
                "embedding has shape {}, expected ({},)".format(vector.shape, dim)
            )
        vectors[i] = vector * (count / total)

    if vectors.any():
        # NOTE(review): rows are already frequency-weighted, so averaging
        # divides by the number of distinct words a second time; a plain sum
        # would give the weighted mean. Left unchanged to keep feature scale.
        return np.average(vectors, axis=0)
    return np.zeros((dim))

In [None]:
# Length of each document vector.
# NOTE(review): presumably must equal the vector size of `model` - confirm.
dim = 100
# One row per document, filled in below.
X_w2v = np.zeros((len(X_norm), dim))

for i, text in enumerate(X_norm):
    X_w2v[i] = get_embedding(text, model, dim)

# Split the embedding matrix built above. It is X_w2v; no X_ft is defined
# anywhere in this notebook.
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(X_w2v, y, test_size=0.1, random_state=8)

In [None]:
# Same five voters as the TF-IDF experiment, now fed document embeddings.
clf1 = LogisticRegression(multi_class='multinomial', solver='lbfgs', random_state=8, max_iter=300)
clf2 = RandomForestClassifier(n_estimators=50, random_state=1)
clf3 = GaussianNB()

# MultinomialNB refuses negative feature values when fitting, and embedding
# components can be negative, so this voter gets its features rescaled to
# the [0, 1] range learned from the training data.
clf4 = Pipeline([
    ('scale', MinMaxScaler()),
    ('mnb', MultinomialNB(alpha=0.1, fit_prior=True)),
    ])

clf5 = KNeighborsClassifier(n_neighbors=2) 

eclf = VotingClassifier(
        estimators=[('lr', clf1), 
                    ('etc', clf2), 
                    ('gnb', clf3), 
                    ('mnb', clf4),
                    ('knc', clf5)], 
        voting='hard')

# X_train_2 is already a dense numeric matrix, so the text-only steps
# (CountVectorizer, TfidfTransformer, densifying) are not applied here.
voting = Pipeline([
    ('clf', eclf),
    ])
voting = voting.fit(X_train_2, y_train_2)
predictions = voting.predict(X_test_2)
print("Precision: {0:6.2f}".format(precision_score(y_test_2, predictions, average='macro')))
print("Recall: {0:6.2f}".format(recall_score(y_test_2, predictions, average='macro')))
print("F1-measure: {0:6.2f}".format(f1_score(y_test_2, predictions, average='macro')))
print("Accuracy: {0:6.2f}".format(accuracy_score(y_test_2, predictions)))

## **BAGGING**

In [21]:
def build_model(clf_type):  
  """Bag `clf_type` over TF-IDF text features and print test metrics.

  The pipeline is fitted on the notebook-level X_train / y_train and
  evaluated on X_test / y_test.

  Args:
    clf_type: unfitted scikit-learn classifier used as the base estimator.

  Returns:
    None. Macro precision, recall, F1 and accuracy are printed.
  """
  # 25 estimators, each trained on 75% of the rows and 80% of the features,
  # both drawn without replacement.
  # NOTE(review): newer scikit-learn releases rename `base_estimator` to
  # `estimator`; kept as-is for the version this notebook was run with.
  clf = BaggingClassifier(base_estimator=clf_type, n_estimators=25, max_samples=0.75, max_features=0.8, bootstrap=False, bootstrap_features=False)

  pipeline = Pipeline([
      ('vect', CountVectorizer(analyzer='word', max_features=500)),
      ('tfidf', TfidfTransformer(sublinear_tf=True)),
      # toarray() gives a plain ndarray; todense() gives np.matrix, which
      # recent scikit-learn releases refuse as estimator input.
      ('to_dense', FunctionTransformer(lambda x: x.toarray(), accept_sparse=True)),
      ('clf', clf),
      ])
  pipeline.fit(X_train, y_train)
  predictions = pipeline.predict(X_test)
  print("Precision: {0:6.2f}".format(precision_score(y_test, predictions, average='macro')))
  print("Recall: {0:6.2f}".format(recall_score(y_test, predictions, average='macro')))
  print("F1-measure: {0:6.2f}".format(f1_score(y_test, predictions, average='macro')))
  print("Accuracy: {0:6.2f}".format(accuracy_score(y_test, predictions)))

In [24]:
# Base estimators to evaluate, one build_model() run per entry.
clfs = [
    LogisticRegression(multi_class='multinomial', solver='lbfgs', random_state=8, max_iter=300),
    RandomForestClassifier(n_estimators=65, random_state=1),
    GaussianNB(),
    MultinomialNB(alpha=0.1, fit_prior=True),
    KNeighborsClassifier(n_neighbors=3),
]

for clf in clfs:
  build_model(clf)
  print()  # blank line between the metric blocks


Precision:   0.67
Recall:   0.67
F1-measure:   0.67
Accuracy:   0.67

Precision:   0.70
Recall:   0.69
F1-measure:   0.69
Accuracy:   0.69

Precision:   0.61
Recall:   0.60
F1-measure:   0.60
Accuracy:   0.60

Precision:   0.66
Recall:   0.65
F1-measure:   0.65
Accuracy:   0.65

Precision:   0.68
Recall:   0.64
F1-measure:   0.64
Accuracy:   0.64



## **BOOSTING**

In [25]:
def build_model(clf):
  """Boost `clf` with AdaBoost over TF-IDF text features and print test metrics.

  The pipeline is fitted on the notebook-level X_train / y_train and
  evaluated on X_test / y_test. This definition reuses the name of the
  bagging helper defined earlier in the notebook, so once this cell runs,
  build_model() refers to the boosting variant.

  Args:
    clf: unfitted scikit-learn classifier used as the base estimator.

  Returns:
    None. Macro precision, recall, F1 and accuracy are printed.
  """
  # NOTE(review): newer scikit-learn releases rename `base_estimator` to
  # `estimator`; kept as-is for the version this notebook was run with.
  booster = AdaBoostClassifier(base_estimator=clf, n_estimators=20)

  pipeline = Pipeline([
      ('vect', CountVectorizer(analyzer='word', max_features=550)),
      ('tfidf', TfidfTransformer(sublinear_tf=True)),
      # toarray() gives a plain ndarray; todense() gives np.matrix, which
      # recent scikit-learn releases refuse as estimator input.
      ('to_dense', FunctionTransformer(lambda x: x.toarray(), accept_sparse=True)),
      ('clf', booster),
      ])
  pipeline.fit(X_train, y_train)
  predictions = pipeline.predict(X_test)
  print("Precision: {0:6.2f}".format(precision_score(y_test, predictions, average='macro')))
  print("Recall: {0:6.2f}".format(recall_score(y_test, predictions, average='macro')))
  print("F1-measure: {0:6.2f}".format(f1_score(y_test, predictions, average='macro')))
  print("Accuracy: {0:6.2f}".format(accuracy_score(y_test, predictions)))

In [27]:
# Base estimators to evaluate, one build_model() run per entry.
# The last entry is a depth-1 decision tree split on entropy.
clfs = [
    LogisticRegression(multi_class='multinomial', solver='lbfgs', random_state=8, max_iter=300),
    RandomForestClassifier(n_estimators=65, random_state=1),
    GaussianNB(),
    MultinomialNB(alpha=0.1, fit_prior=True),
    DecisionTreeClassifier(criterion='entropy', max_depth=1),
]

for clf in clfs:
  build_model(clf)
  print()  # blank line between the metric blocks

Precision:   0.75
Recall:   0.50
F1-measure:   0.44
Accuracy:   0.51

Precision:   0.72
Recall:   0.71
F1-measure:   0.71
Accuracy:   0.71

Precision:   0.44
Recall:   0.41
F1-measure:   0.41
Accuracy:   0.40

Precision:   0.67
Recall:   0.64
F1-measure:   0.65
Accuracy:   0.64

Precision:   0.56
Recall:   0.56
F1-measure:   0.55
Accuracy:   0.56

