In [1]:
import re
import pandas as pd
import numpy as np

import nltk
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.metrics.distance import jaccard_distance
from nltk.corpus import stopwords
from nltk.corpus import words

import gensim
import gensim.downloader
from gensim.models import Word2Vec
from autocorrect import Speller

from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [2]:
speller = Speller(lang='en')

In [3]:
nltk.download('omw-1.4')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('words')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/aleksandrmorozov/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/aleksandrmorozov/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/aleksandrmorozov/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/aleksandrmorozov/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     /Users/aleksandrmorozov/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

## Введение в NLP

Одной из наиболее распространенных задач обработки естественного языка является классификация текста в соответствии с его смыслом. В данном случае необходимо правильно распознать тональность текста: негативную, нейтральную и позитивную.

### Предобработка данных

Для решения поставленной задачи необходимо правильно обработать данные для того, чтобы их можно было использовать для обучения.
Сперва проводится очистка данных. Этот шаг может включать в себя:
* Токенизацию текста - разбивку предложений на отдельные слова
* Обработку ошибок - исправление синтаксических ошибок в словах
* Удаление символов, не являющихся буквенными/цифренными
* Удаление стоп-слов - удаление слов, не носящих смысловой нагрузки (предлоги, местоимения, междометия и т.п.)
* Лемматизацию - преведение различных форм слова к словарной форме
* Стемминг - сокращение слов до грамматических основ

In [4]:
class Preprocessing:
    
    
    def __init__(self, data_path='data', handle_stopwords=False, handle_misspell=False):
        
        tweets = pd.DataFrame()
        
        files = {
            'processedNegative.csv': -1,
            'processedNeutral.csv': 0,
            'processedPositive.csv': 1
        }
        
        for file, val in files.items():
            with open(f'{data_path}/{file}') as f:
                data = f.read()
            data = re.sub(r',(?=[^ ])', '--split-here--', data)
            tmp = pd.DataFrame(data.split('--split-here--'), columns=['tweet'])
            tmp['target'] = val
            tmp.drop_duplicates(inplace=True)
            tweets = pd.concat([tweets, tmp])
        
        tweets.tweet = tweets.tweet.str.replace(r'[^a-z A-Z]+', '', regex=True)
        self.tweets = tweets.dropna().reset_index(drop=True)
        self.stopwords = stopwords.words('english')
        self.handle_stopwords = handle_stopwords
        self.handle_misspell = handle_misspell
        self.__tokenize()

    
    def reduce_length(self, word):
        
        pattern = re.compile(r"(.)\1{2,}")
        return pattern.sub(r"\1\1", word)
    
    
    def misspell_handler(self, token_arr):
        
        speller = Speller(lang='en')
        corrected = []
        
        for sent in token_arr:
            tmp = []
            for word in sent:
                word = self.reduce_length(word)
                word = speller(word)
                tmp.append(word)

            corrected.append(tmp)
        
        if self.handle_stopwords:
            clear_tokens = [[w for w in sent if w not in self.stopwords] 
                                for sent in corrected]
        
        return corrected

    
    def __tokenize(self):
        
        token_arr = [word_tokenize(tweet) for tweet in self.tweets.tweet]
        token_arr = [[w.lower() for w in tokens] for tokens in token_arr]
        
        if self.handle_stopwords:
            token_arr = [[w for w in sent if w not in self.stopwords] for sent in token_arr]
            
        self.tweets['tokens'] = token_arr 
        
        if self.handle_misspell:
            clear_arr = self.misspell_handler(token_arr)
            self.tweets['tokens_misspell'] = clear_arr
        

    def stemming(self):
        
        ps = PorterStemmer()
        stem = [[ps.stem(w) for w in sent] for sent in self.tweets.tokens] 
        self.tweets['stemming'] = stem
        
        if self.handle_misspell:
            stem_miss = [[ps.stem(w) for w in sent] for sent in self.tweets.tokens_misspell]
            self.tweets['stemming_misspell'] = stem_miss

        
    def lemmatization(self):
        
        wnl = WordNetLemmatizer()
        lemm = [[wnl.lemmatize(w) for w in sent] for sent in self.tweets.tokens]
        self.tweets['lemmatization'] = lemm
        
        if self.handle_misspell:
            lemm = [[wnl.lemmatize(w) for w in sent] for sent in self.tweets.tokens_misspell]
            self.tweets['lemmatization_misspell'] = lemm

        
    def transform(self):
        
        self.stemming()
        self.lemmatization()
        
        empty_rows = self.tweets[(self.tweets.tokens.str.len() == 0)]
        self.tweets.drop(empty_rows.index, inplace=True)
        self.tweets.reset_index(drop=True, inplace=True)
        

In [5]:
processed_data = Preprocessing(handle_stopwords=False, handle_misspell=True)
processed_data.transform()
df = processed_data.tweets
df.head()

Unnamed: 0,tweet,target,tokens,tokens_misspell,stemming,stemming_misspell,lemmatization,lemmatization_misspell
0,How unhappy some dogs like it though,-1,"[how, unhappy, some, dogs, like, it, though]","[how, unhappy, some, dogs, like, it, though]","[how, unhappi, some, dog, like, it, though]","[how, unhappi, some, dog, like, it, though]","[how, unhappy, some, dog, like, it, though]","[how, unhappy, some, dog, like, it, though]"
1,talking to my over driver about where Im going...,-1,"[talking, to, my, over, driver, about, where, ...","[talking, to, my, over, driver, about, where, ...","[talk, to, my, over, driver, about, where, im,...","[talk, to, my, over, driver, about, where, im,...","[talking, to, my, over, driver, about, where, ...","[talking, to, my, over, driver, about, where, ..."
2,Does anybody know if the Rands likely to fall ...,-1,"[does, anybody, know, if, the, rands, likely, ...","[does, anybody, know, if, the, hands, likely, ...","[doe, anybodi, know, if, the, rand, like, to, ...","[doe, anybodi, know, if, the, hand, like, to, ...","[doe, anybody, know, if, the, rand, likely, to...","[doe, anybody, know, if, the, hand, likely, to..."
3,I miss going to gigs in Liverpool unhappy,-1,"[i, miss, going, to, gigs, in, liverpool, unha...","[i, miss, going, to, gigs, in, liverpool, unha...","[i, miss, go, to, gig, in, liverpool, unhappi]","[i, miss, go, to, gig, in, liverpool, unhappi]","[i, miss, going, to, gig, in, liverpool, unhappy]","[i, miss, going, to, gig, in, liverpool, unhappy]"
4,There isnt a new Riverdale tonight unhappy,-1,"[there, isnt, a, new, riverdale, tonight, unha...","[there, isnt, a, new, riverdale, tonight, unha...","[there, isnt, a, new, riverdal, tonight, unhappi]","[there, isnt, a, new, riverdal, tonight, unhappi]","[there, isnt, a, new, riverdale, tonight, unha...","[there, isnt, a, new, riverdale, tonight, unha..."


### Представление текста

Теперь необходиомо привести текст к формату, который можно использовать для обучения модели. Основные способы способы представления текста:


**Bag of words (мешок слов)** - в данном представлении текста составляется словарь, где каждому уникальному слову в данных соответствует свой индекс. Отдельное взятое предложение может быть представлено в виде списка, длина которого равна количеству уникальных слов. Если слово встречается в предложении, то в список по индексу этого слова записывается 1, если нет - 0. Так же вместо 1 можно записывать сколько раз слово встречается в предложении.


**TF-IDF (Term Frequency, Inverse Document Frequency)** - статистическая мера для оценки важности слов. TF-IDF взвешивает слова в зависимости от частоты их употребления, понижая вес тех слов, которые встречаются слишком часто, что позволяет избавиться от лишних шумов.


**Word2vec** - модель, которая обучается на прочтении огромного количества текста с последующим запоминанием того, какое слово возникает в схожих контекстах. После обучения на достаточном количестве данных, Word2Vec генерирует вектор из n измерений для каждого слова в словаре, в котором слова со схожим значением располагаются ближе друг к другу. Так как word2vec генерирует вектор на каждое слово, то для того, чтобы представить предложение в виде вектора, можно посчитать среднее значение векторов, взвешенных по TFIDF

In [6]:
class transform:
    
    
    def __init__(self, processed_data):
        
        self.tweets = processed_data
        self.vect_df = pd.DataFrame()
        self.vect_df['target'] = processed_data.target
        self.vectorizers = {}


    def to_bin_vector(self):
        '''
        Реализация bag of words с бинарным представлением слов
        '''
        
        for col in self.tweets.columns[2:]:
            cv = CountVectorizer()
            X = cv.fit_transform([' '.join(w) for w in self.tweets[col]])
            bin_vec = [[1 if i > 0 else 0 for i in row] for row in X.toarray()]
            self.vect_df[f'bin_vect_{col}'] = bin_vec
            self.vectorizers[f'bin_vect_{col}'] = cv

            
    def to_count_vector(self):
        '''
        Реализация bag of words с учетом количетсва вхождений слов в предложении
        '''
        
        for col in self.tweets.columns[2:]:
            cv = CountVectorizer()
            X = cv.fit_transform([' '.join(w) for w in self.tweets[col]])
            self.vect_df[f'count_vect_{col}'] = list(X.toarray())
            self.vectorizers[f'count_vect_{col}'] = cv
    
    
    def to_tfidf(self):
        
        for col in self.tweets.columns[2:]:
            tv = TfidfVectorizer()
            X = tv.fit_transform([' '.join(w) for w in self.tweets[col]])
            self.vect_df[f'tfidf_vect_{col}'] = list(X.toarray())
            self.vectorizers[f'tfidf_vect_{col}'] = tv
       
    
    def word2vec(self, n_dim=50):
        
        for col in self.tweets.columns[2:]:
            res = []
            tv = TfidfVectorizer()
            tv.fit([' '.join(w) for w in self.tweets[col]])
            tf_idf = dict(zip(tv.get_feature_names_out(), list(tv.idf_)))
            tf_idf_vocab = tf_idf.keys()
            
            w2v_model = Word2Vec(self.tweets[col], 
                                 min_count=1,
                                 sg=1,
                                 vector_size=n_dim,
                                 seed=42)
            
            mean = np.mean(w2v_model.wv.vectors, 0)
            std = np.std(w2v_model.wv.vectors, 0)
            for sent in self.tweets[col]:
                sent_vec = np.zeros(n_dim)
                weight_sum = 0
                for word in sent:
                    if word in tf_idf_vocab:
                        vec = (w2v_model.wv[word] - mean)/std  # нормализуем данные
                        weight = tf_idf[word]*(sent.count(word)/len(sent))  # взвешиваем слова по TFIDF
                        sent_vec += (vec * weight)
                        weight_sum += weight

                if weight_sum != 0:
                    sent_vec /= weight_sum
                res.append(sent_vec)
            
            self.vect_df[f'w2v_{col}'] = res
      
    
    def glove(self, n_dim=25):
        '''
        Использование предобученной модели word2vec с расширенным словарем
        '''
        
        glove = gensim.downloader.load(f'glove-twitter-{n_dim}')
        
        mean = np.mean(glove.vectors, 0)
        std = np.std(glove.vectors, 0)
        for col in self.tweets.columns[2:]:
            res = []
            tv = TfidfVectorizer()
            tv.fit([' '.join(w) for w in self.tweets[col]])
            
            tf_idf = dict(zip(tv.get_feature_names_out(), list(tv.idf_)))
            glove_vocab = glove.key_to_index.keys()
            tf_idf_vocab = tf_idf.keys()
            
            for sent in self.tweets[col]:
                sent_vec = np.zeros(n_dim)
                weight_sum = 0
                for word in sent:
                    if word in tf_idf_vocab and word in glove_vocab:
                        vec = (glove[word] - mean)/std
                        weight = tf_idf[word]*(sent.count(word)/len(sent))
                        sent_vec += (vec * weight)
                        weight_sum += weight

                if weight_sum != 0:
                    sent_vec /= weight_sum
                res.append(sent_vec)
            
            self.vect_df[f'glove_{col}'] = res
       
    
    def apply_all(self):
        
        self.to_bin_vector()
        self.to_count_vector()
        self.to_tfidf()
        self.word2vec(75)
        self.glove(100)
        

In [7]:
vectors = transform(df)
vectors.apply_all()
vectors.vect_df.head(5)

Unnamed: 0,target,bin_vect_tokens,bin_vect_tokens_misspell,bin_vect_stemming,bin_vect_stemming_misspell,bin_vect_lemmatization,bin_vect_lemmatization_misspell,count_vect_tokens,count_vect_tokens_misspell,count_vect_stemming,...,w2v_stemming,w2v_stemming_misspell,w2v_lemmatization,w2v_lemmatization_misspell,glove_tokens,glove_tokens_misspell,glove_stemming,glove_stemming_misspell,glove_lemmatization,glove_lemmatization_misspell
0,-1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",...,"[2.3242857185004158, -2.4990170738934268, -0.3...","[2.2724655148883124, -1.6505452402067795, -2.6...","[2.4805453225521954, -2.9529819167202866, -3.6...","[2.9564793775604996, -2.2646960576672654, -2.5...","[0.6480318300869508, -0.28423400745306243, 0.4...","[0.6477079265056543, -0.28443119544072054, 0.4...","[0.668538349271827, -0.37048814203797475, 0.37...","[0.668538349271827, -0.37048814203797475, 0.37...","[0.6597960065209195, -0.39388759992503386, 0.4...","[0.6594731282160216, -0.39418283079931055, 0.4..."
1,-1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",...,"[1.994291325033083, -2.112010639718187, 0.4588...","[1.943228230612658, -1.5144923048767946, -0.76...","[2.112622632700753, -2.60275328570693, -2.9924...","[2.3683792207979226, -2.025272705950721, -2.20...","[0.7212703938964621, 0.33395460218286865, 0.46...","[0.6983579820038477, 0.3414635418031642, 0.484...","[0.5486683360131586, 0.3100470660555184, 0.319...","[0.49713198004630604, 0.355012953494765, 0.358...","[0.7252669653610138, 0.355449039481583, 0.4733...","[0.7019244034324438, 0.362309880220061, 0.4928..."
2,-1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",...,"[1.8003574356036465, -1.9610387400586475, 0.43...","[1.8454155314364025, -1.3068454754857273, -0.6...","[1.7157942045225962, -2.1822666145892367, -2.5...","[2.102266354520098, -1.7090745285088509, -1.58...","[0.28574667106518564, 0.27927008990920615, 0.2...","[0.35918648362999817, 0.2442847161534092, 0.24...","[0.12973172580630823, 0.30124781782919535, 0.2...","[0.17398689734716866, 0.2175394253294878, 0.14...","[0.29739961492881417, 0.3633682726187642, 0.19...","[0.3386214671475524, 0.2891211149288993, 0.095..."
3,-1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",...,"[1.280970317055139, -0.9460966515757131, 0.902...","[1.2241550658179137, -0.6973038940718105, -1.3...","[1.5110283494642982, -1.6273415476315318, -1.5...","[1.6960533388038481, -1.2555043362274299, -0.7...","[0.22231244385910687, 0.18492170189069537, 0.7...","[0.22237786522955177, 0.1846918876149743, 0.75...","[0.09449381668260298, 0.7801656517226432, 0.68...","[0.09453320168555919, 0.7801679991210608, 0.68...","[0.14475583485912283, 0.633365361609054, 0.797...","[0.14469404397955868, 0.633871107173953, 0.797..."
4,-1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",...,"[1.750638663588911, -2.024681282886285, 0.1327...","[1.5228035621864235, -1.0602074804298587, -1.0...","[1.9366248529719352, -2.2393622787898337, -2.6...","[2.0471627606583387, -1.9188825043107416, -1.8...","[0.18369200287731746, 0.38293295545078954, 0.3...","[0.18461196721889186, 0.3803651248519063, 0.32...","[0.1961412042471419, 0.49526036788130484, 0.07...","[0.19557643709193806, 0.4947385287572735, 0.07...","[0.18457289447553116, 0.3798479381446972, 0.32...","[0.18405008378445853, 0.37950905332718216, 0.3..."


In [8]:
# def train_models(model_arr, vect_df):
    
#     result = pd.DataFrame(index=vect_df.columns[1:])
    
#     for model in model_arr:
#         scores = []
#         for col in vectors[1:]:
#             X = vectors[col].to_list()
#             y = vectors.target.to_list()

#             X_train, X_test, y_train, y_test = train_test_split(X, y,
#                                                                train_size=0.8,
#                                                                random_state=42,
#                                                                shuffle=True)

#             model.fit(X_train, y_train)
#             y_pred = model.predict(X_test)
#             accuracy = accuracy_score(y_pred, y_test)
#             scores.append(accuracy)

#         result[f'{model}_accuracy'] = scores
        

## Обучение моделей

Теперь будем обучать модели с помощью различных подходов к обработке и представлению данных и сравним результаты.

In [9]:
class Train():
    
    
    def __init__(self, vect_df):
        
        self.vect_df = vect_df
        self.models = {}
        
        
    def LogRegress(self):
        
        lr = LogisticRegression(multi_class='multinomial',
                                solver='lbfgs', max_iter=1000)
        self.models['LogRegress'] = lr
       
    
    def SVC(self):
        
        svc = svm.SVC()
        self.models['SVC'] = svc
    
    
    def GBM(self):
        
        gbm = GradientBoostingClassifier()
        self.models['GBM'] = gbm
     
    
    def test(self):
        
        self.LogRegress()
        self.SVC()
        self.GBM()
        
        result = pd.DataFrame(index=self.vect_df.columns[1:])
    
        for name, model in self.models.items():
            scores = []
            for col in self.vect_df.columns[1:]:
                X = self.vect_df[col].to_list()
                y = self.vect_df.target.to_list()

                X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                                   train_size=0.8,
                                                                   random_state=42,
                                                                   shuffle=True)

                model.fit(X_train, y_train)
                y_pred = model.predict(X_test)
                accuracy = accuracy_score(y_pred, y_test)
                scores.append(accuracy)

            result[f'{name}_accuracy'] = scores
            
        return result
    

In [10]:
vect_df = vectors.vect_df
models = Train(vect_df)
final_res = models.test()
final_res

Unnamed: 0,LogRegress_accuracy,SVC_accuracy,GBM_accuracy
bin_vect_tokens,0.922509,0.926199,0.916974
bin_vect_tokens_misspell,0.920664,0.928044,0.916974
bin_vect_stemming,0.931734,0.926199,0.916974
bin_vect_stemming_misspell,0.931734,0.924354,0.916974
bin_vect_lemmatization,0.926199,0.928044,0.915129
bin_vect_lemmatization_misspell,0.924354,0.929889,0.915129
count_vect_tokens,0.918819,0.922509,0.913284
count_vect_tokens_misspell,0.916974,0.924354,0.918819
count_vect_stemming,0.926199,0.928044,0.913284
count_vect_stemming_misspell,0.916974,0.926199,0.916974


In [11]:
final_res.to_csv('results.csv')

In [12]:
final_res.mean()

LogRegress_accuracy    0.880074
SVC_accuracy           0.876814
GBM_accuracy           0.867958
dtype: float64

In [13]:
final_res.max()

LogRegress_accuracy    0.931734
SVC_accuracy           0.929889
GBM_accuracy           0.924354
dtype: float64

In [14]:
final_res.idxmax()

LogRegress_accuracy                    bin_vect_stemming
SVC_accuracy             bin_vect_lemmatization_misspell
GBM_accuracy           tfidf_vect_lemmatization_misspell
dtype: object

## Выводы

Линейные модели дали лучший результат в поставленой задаче. Высокая точность была достигнута с использованием обычного мешка слов и с сохранением стоп слов. 

Среди подходов к приведению слов к общей форме лучше всего себя продемонстрировала лемматизация. При этом точность word2vec при использовании лемматизации значительно растет, т.к. word2vec пытается определить именно смысловое значение слов.

Также, стоит пояснить один важный момент: в данной задаче необходимо правильно выявить не только негативные/позитивные твиты, но и нейтральные. Таким образом, сохранив стоп-слова, мы наблюдаем повышение точности модели, т.к. нейтральные твиты содержат в себе много "шумовых" слов. 

## Using the best model

In [15]:
class TextClassifier:
    
    
    def __init__(self, vect_df, vectors):
        
        lr = LogisticRegression(
            multi_class='multinomial',
            solver='lbfgs',
            max_iter=1000
        )
        
        self.vectorizer = vectors.vectorizers['bin_vect_stemming']
        
        X = vect_df['bin_vect_stemming'].to_list()
        y = vect_df['target'].to_list()
        
        lr.fit(X, y)
        self.lr = lr

        
    def classify(self, text):
        
        ps = PorterStemmer()
        tokens = [word_tokenize(sent) for sent in text]
        processed_text = [[ps.stem(w) for w in sent] for sent in tokens]
        processed_text = [' '.join(w) for w in processed_text]
        
        vectors = self.vectorizer.transform(processed_text)
        vectors = list(vectors.toarray())
        pred = self.lr.predict(vectors)
        
        res = pd.DataFrame()
        res['tweets'] = text
        res['prediction'] = pred
        
        return res
        

In [16]:
text = [
    'my dear friend',
    'so happy for you',
    'love you baby',
    'jesus christ',
    'such a shame',
    'boring',
    'i feel very lonely and unhappy',
    'so sad'
]

tc = TextClassifier(vect_df, vectors)
tc.classify(text)

Unnamed: 0,tweets,prediction
0,my dear friend,1
1,so happy for you,1
2,love you baby,1
3,jesus christ,0
4,such a shame,0
5,boring,0
6,i feel very lonely and unhappy,-1
7,so sad,-1
