In [1]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
# English stop-word list from the NLTK stopwords corpus; it is handed to the
# word-level vectorizers via their `stop_words` argument further down.
# NOTE(review): presumably requires the NLTK "stopwords" corpus to be
# downloaded beforehand — confirm for fresh environments.
stopwords_list = stopwords.words("english")

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import recall_score, accuracy_score, f1_score, precision_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

In [3]:
# Read the two source files (pos.csv first, then neg.csv) and rename the
# column headed '0' to 'text' in each before stacking them into one frame.
pos, neg = (
    pd.read_csv(csv_path).rename(columns={'0': 'text'})
    for csv_path in ('pos.csv', 'neg.csv')
)
df = pd.concat([pos, neg])
df.head()

Unnamed: 0,text,sentiment
0,the rock is destined to be the 21st century's ...,1
1,"the gorgeously elaborate continuation of "" the...",1
2,effective but too-tepid biopic,1
3,if you sometimes like to go to the movies to h...,1
4,"emerges as something rare , an issue movie tha...",1


### Характеристики датасета

In [4]:
# Per-document length = number of whitespace-separated tokens in 'text'.
doc_lenghts = df['text'].str.split().apply(len)
doc_count = len(df)

# Aggregate figures are computed first, then reported.
total_words = sum(doc_lenghts)
median_words = np.median(doc_lenghts)

print(f'Количество слов в корпусе {total_words}')
print(f'Количество документов {doc_count}')
print(f'Медианное количестов слов в документе {median_words}')

Количество слов в корпусе 224067
Количество документов 10662
Медианное количестов слов в документе 20.0


In [5]:
# Hold out 25% of the documents for validation; the fixed random_state keeps
# the split identical between runs.
xtrain, xvalid, ytrain, yvalid = train_test_split(df['text'], df['sentiment'], test_size=0.25, random_state=17)

# A word token is a run of ASCII letters, apostrophes and hyphens.
# The letter ranges are spelled out as A-Z and a-z on purpose: the single range
# A-z also covers the ASCII characters between 'Z' and 'a' ([ \ ] ^ _ `),
# which would let that punctuation leak into the vocabulary.
TOKEN_PATTERN = r"[A-Za-z'-]+"

# Candidate models: each pipeline is one vectorizer followed by multinomial
# naive Bayes. The name of the first step doubles as the row label in `score`.
pipelines = [
    # Word unigrams + bigrams, raw counts.
    Pipeline([('ngrams_vectorizer', CountVectorizer(stop_words=stopwords_list, token_pattern=TOKEN_PATTERN, ngram_range=(1, 2))),
              ('bayes', MultinomialNB())]),
    # Character n-grams of length 3..8, raw counts (no stop words / token pattern).
    Pipeline([('char_ngrams_vectorizer', CountVectorizer(analyzer='char', ngram_range=(3, 8))),
              ('bayes', MultinomialNB())]),
    # Plain bag of words (unigram counts).
    Pipeline([('bag_of_words', CountVectorizer(stop_words=stopwords_list, token_pattern=TOKEN_PATTERN)),
              ('bayes', MultinomialNB())]),
    # TF-IDF with document-frequency cut-offs (max_df=0.95, min_df=2).
    Pipeline([('tfidf_params', TfidfVectorizer(max_df=0.95, min_df=2, stop_words=stopwords_list, token_pattern=TOKEN_PATTERN)),
              ('bayes', MultinomialNB())]),
    # TF-IDF with default document-frequency settings.
    Pipeline([('tfidf_plain', TfidfVectorizer(stop_words=stopwords_list, token_pattern=TOKEN_PATTERN)),
              ('bayes', MultinomialNB())])
]

# Empty results table; one row per pipeline is added after evaluation.
score = pd.DataFrame(columns=['accuracy', 'precision', 'f1_score', 'recall'])

def get_pred_score(pipe,
                   xtrain=xtrain, ytrain=ytrain,
                   xvalid=xvalid, yvalid=yvalid):
    """Fit a pipeline and score its predictions on the validation data.

    Parameters
    ----------
    pipe : pipeline object exposing ``fit``, ``predict`` and ``steps``
        The model to train and evaluate.
    xtrain, ytrain : training documents and labels passed to ``pipe.fit``.
        Default to the module-level split as it exists when this function
        is defined.
    xvalid, yvalid : validation documents and their true labels.
        Default to the module-level split as it exists when this function
        is defined.

    Returns
    -------
    pandas.Series
        Entries 'accuracy', 'precision', 'f1_score' and 'recall', with the
        Series name set to the name of the pipeline's first step.
    """
    # Label -> metric function, in the order the entries appear in the result.
    metric_by_label = {
        'accuracy': accuracy_score,
        'precision': precision_score,
        'f1_score': f1_score,
        'recall': recall_score,
    }

    pipe.fit(xtrain, ytrain)
    predicted = pipe.predict(xvalid)

    results = {}
    for label, metric in metric_by_label.items():
        results[label] = metric(yvalid, predicted)

    first_step_name = pipe.steps[0][0]
    return pd.Series(results, name=first_step_name)

# Evaluate every pipeline, collecting one result row per model, and build the
# table once at the end. DataFrame.append is no longer available in
# pandas >= 2.0, and growing a frame row by row copies it on every step.
score_rows = []
for pipe in pipelines:
    score_rows.append(get_pred_score(pipe))

# Each row is a named Series, so the names become the index of the table.
score = pd.DataFrame(score_rows, columns=['accuracy', 'precision', 'f1_score', 'recall'])

In [6]:
# Show the comparison table: one row per pipeline (labelled by its first step
# name), one column per validation metric.
score

Unnamed: 0,accuracy,precision,f1_score,recall
ngrams_vectorizer,0.767442,0.767737,0.768311,0.768886
char_ngrams_vectorizer,0.776444,0.764454,0.782323,0.801047
bag_of_words,0.769692,0.7716,0.769865,0.768138
tfidf_params,0.765191,0.770342,0.763952,0.757666
tfidf_plain,0.768942,0.768429,0.770149,0.771877
