In [32]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
stopwords_list = stopwords.words("english")

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import recall_score, accuracy_score, f1_score, precision_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

In [33]:
# Load the positive and negative review sets. In both files the review text
# lives in a column literally named '0'; rename it to 'text' for readability.
pos = pd.read_csv('pos.csv').rename(columns={'0': 'text'})
neg = pd.read_csv('neg.csv').rename(columns={'0': 'text'})

# ignore_index=True gives the combined frame a fresh, unique 0..n-1 index.
# Without it each half keeps its own row labels, so the labels of the two
# files overlap and a label-based lookup such as df.loc[0] returns several rows.
df = pd.concat([pos, neg], ignore_index=True)
df.head()

Unnamed: 0,text,sentiment
0,the rock is destined to be the 21st century's ...,1
1,"the gorgeously elaborate continuation of "" the...",1
2,effective but too-tepid biopic,1
3,if you sometimes like to go to the movies to h...,1
4,"emerges as something rare , an issue movie tha...",1


### Характеристики датасета

In [34]:
# Number of whitespace-separated tokens in each document.
doc_lenghts = df['text'].str.split().map(len)
# One row per document, so the row count is the corpus size.
doc_count = len(df)

print(f'Количество слов в корпусе {doc_lenghts.sum()}')
print(f'Количество документов {doc_count}')
print(f'Медианное количестов слов в документе {doc_lenghts.median()}')

Количество слов в корпусе 224067
Количество документов 10662
Медианное количестов слов в документе 20.0


In [35]:
# Disabled preprocessing experiment, kept for reference. When enabled it would:
# tokenise each review, keep only alphabetic tokens longer than two characters
# (lowercased), lemmatise them, and finally drop English stop words.
# NOTE(review): `WordNetLemmatizer` and `re_tokenizer` are not imported or
# defined in the visible cells - restore them before re-enabling this block.
# wnl = WordNetLemmatizer()
# df['text'] = df.apply(lambda sent: re_tokenizer.tokenize(sent['text']), axis=1)
# df['text'] = df.apply(lambda sent: [token.lower() for token in sent['text'] if len(token) > 2 and token.isalpha()], axis=1)
# df['text'] = df.apply(lambda sent: [wnl.lemmatize(word) for word in sent['text']], axis=1)
# df['text'] = df.apply(lambda sent: [word for word in sent['text'] if word not in stopwords_list], axis=1)

In [50]:
# Hold out 25% of the documents for validation; the fixed seed keeps the
# split reproducible between runs.
xtrain, xvalid, ytrain, yvalid = train_test_split(
    df['text'], df['sentiment'], test_size=0.25, random_state=17)

# A word token is a run of ASCII letters, apostrophes and hyphens.
# The range has to be spelled `A-Za-z`: the shorthand `A-z` also covers the six
# ASCII characters that sit between 'Z' and 'a' ([ \ ] ^ _ `), which lets
# punctuation leak into the tokens.
TOKEN_PATTERN = r"[A-Za-z'-]+"

# Settings shared by every word-level vectorizer below.
WORD_LEVEL = dict(stop_words=stopwords_list, token_pattern=TOKEN_PATTERN)


def _naive_bayes_pipeline(step_name, vectorizer):
    """Chain `vectorizer` (registered under `step_name`) with a MultinomialNB classifier."""
    return Pipeline([(step_name, vectorizer), ('bayes', MultinomialNB())])


# The first step's name is read back by get_pred_score to label the results
# table, so the names below must stay as they are.
pipelines = [
    # Word n-grams of varying length.
    _naive_bayes_pipeline('ngrams_vectorizer', CountVectorizer(ngram_range=(1, 2), **WORD_LEVEL)),
    _naive_bayes_pipeline('ngrams_vectorizer', CountVectorizer(ngram_range=(1, 3), **WORD_LEVEL)),
    _naive_bayes_pipeline('ngrams_vectorizer', CountVectorizer(ngram_range=(2, 4), **WORD_LEVEL)),
    # Character n-grams of varying length.
    _naive_bayes_pipeline('char_ngrams_vectorizer', CountVectorizer(analyzer='char', ngram_range=(3, 8))),
    _naive_bayes_pipeline('char_ngrams_vectorizer', CountVectorizer(analyzer='char', ngram_range=(2, 5))),
    _naive_bayes_pipeline('char_ngrams_vectorizer', CountVectorizer(analyzer='char', ngram_range=(4, 6))),
    # Plain bag of words (unigram counts).
    _naive_bayes_pipeline('bag_of_words', CountVectorizer(**WORD_LEVEL)),
    # TF-IDF with progressively stricter document-frequency cut-offs.
    _naive_bayes_pipeline('tfidf_params', TfidfVectorizer(max_df=0.95, min_df=2, **WORD_LEVEL)),
    _naive_bayes_pipeline('tfidf_params', TfidfVectorizer(max_df=0.96, min_df=3, **WORD_LEVEL)),
    _naive_bayes_pipeline('tfidf_params', TfidfVectorizer(max_df=0.97, min_df=4, **WORD_LEVEL)),
    # TF-IDF with the default document-frequency settings.
    _naive_bayes_pipeline('tfidf_plain', TfidfVectorizer(**WORD_LEVEL)),
]

# Empty results table: one column per validation metric, no rows yet.
metric_names = ['accuracy', 'precision', 'f1_score', 'recall']
score = pd.DataFrame(columns=metric_names)

def get_pred_score(pipe,
                   xtrain=xtrain, ytrain=ytrain,
                   xvalid=xvalid, yvalid=yvalid):
    """Fit ``pipe`` on the training split and score it on the validation split.

    Parameters
    ----------
    pipe : sklearn.pipeline.Pipeline
        Pipeline to evaluate. It is fitted in place. The name of its first
        step selects which hyperparameters are added to the result label.
    xtrain, ytrain
        Training documents and labels. The defaults are the module-level
        split objects as they exist when this ``def`` is executed (default
        values are bound at definition time), so re-run this definition
        after changing the split.
    xvalid, yvalid
        Validation documents and labels, with the same default-binding caveat.

    Returns
    -------
    pandas.Series
        ``accuracy``, ``precision``, ``f1_score`` and ``recall`` on the
        validation split. The Series name is a readable label: the first
        step's name followed, where applicable, by its ``ngram_range`` or
        its ``min_df`` / ``max_df`` settings.
    """
    pipe.fit(xtrain, ytrain)
    predictions = pipe.predict(xvalid)

    # Build the label from parts and join once, so that pipelines without a
    # hyperparameter suffix (e.g. 'bag_of_words') do not end in a stray space.
    step_name, vectorizer = pipe.steps[0]
    label_parts = [step_name]
    if step_name in ('ngrams_vectorizer', 'char_ngrams_vectorizer'):
        label_parts.append(f'ngram_range {vectorizer.ngram_range}')
    elif step_name in ('tfidf_params', 'tfidf_plain'):
        label_parts.append(f'min_df {vectorizer.min_df} max_df {vectorizer.max_df}')
    pipe_name = ' '.join(label_parts)

    row = pd.Series({'accuracy': accuracy_score(yvalid, predictions),
                     'precision': precision_score(yvalid, predictions),
                     'f1_score': f1_score(yvalid, predictions),
                     'recall': recall_score(yvalid, predictions)},
                    name=pipe_name)

    return row

# Evaluate every pipeline and assemble the metrics table in a single step.
# DataFrame.append was removed in pandas 2.0 and copied the whole table on
# every iteration; collecting the Series first and constructing the frame once
# works on all pandas versions. Each Series' name becomes its row label.
score = pd.DataFrame([get_pred_score(pipe) for pipe in pipelines],
                     columns=['accuracy', 'precision', 'f1_score', 'recall'])

### Разные гиперпараметры не дают существенного прироста эффективности модели

In [52]:
# Show the per-pipeline validation metrics as this cell's rich output.
score

Unnamed: 0,accuracy,precision,f1_score,recall
"ngrams_vectorizer ngram_range (1, 2)",0.767442,0.767737,0.768311,0.768886
"ngrams_vectorizer ngram_range (1, 3)",0.768567,0.768657,0.769518,0.770381
"ngrams_vectorizer ngram_range (2, 4)",0.609527,0.684539,0.513324,0.410621
"char_ngrams_vectorizer ngram_range (3, 8)",0.776444,0.764454,0.782323,0.801047
"char_ngrams_vectorizer ngram_range (2, 5)",0.764441,0.76047,0.767235,0.774121
"char_ngrams_vectorizer ngram_range (4, 6)",0.77982,0.773723,0.783155,0.79282
bag_of_words,0.769692,0.7716,0.769865,0.768138
tfidf_params min_df 2 max_df 0.95,0.765191,0.770342,0.763952,0.757666
tfidf_params min_df 3 max_df 0.96,0.767817,0.771558,0.767206,0.762902
tfidf_params min_df 4 max_df 0.97,0.758815,0.764885,0.757083,0.749439
