In [20]:
from nltk.corpus import movie_reviews
import nltk
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
import numpy as np
import warnings

In [2]:
def write_answer(string, num):
  with open(f'answer_{num}.txt', 'w') as f:
    f.write(string)

In [3]:
nltk.download('movie_reviews')
negids = movie_reviews.fileids('neg')
posids = movie_reviews.fileids('pos')

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.


In [4]:
negfeats = [' '.join(movie_reviews.words(fileids=[f]))for f in negids]
posfeats = [' '.join(movie_reviews.words(fileids=[f])) for f in posids]

In [5]:
reviews = negfeats + posfeats
labels = [0]*len(negfeats) + [1]*len(posfeats)

In [6]:
def class_text(vect, cls):
  return Pipeline([
                   ('vectorizer', vect),
                   ('classifier', cls)
                  ])


Построение pipeline на частотных признаках и tfidf

In [7]:
warnings.filterwarnings('ignore')
cv_counvect = cross_val_score(class_text(CountVectorizer(), LogisticRegression()),reviews,labels,cv=5, scoring='accuracy')

In [8]:
print(f"Mean accuracy: {cv_counvect.mean()}")
print(f"Mean std: {cv_counvect.std()}")

Mean accuracy: 0.8424999999999999
Mean std: 0.021794494717703363


In [9]:
cv_tfidf = cross_val_score(class_text(TfidfVectorizer(), LogisticRegression()), reviews, labels, cv=5, scoring='accuracy')

In [10]:
print(f"Mean accuracy: {cv_tfidf.mean()}")
print(f"Mean std: {cv_tfidf.std()}")

Mean accuracy: 0.8205
Mean std: 0.003999999999999995


In [11]:
write_answer(' '.join(map(str, [cv_counvect.mean(),cv_counvect.std(),cv_tfidf.mean(),cv_tfidf.std()])),1)

Удаление слишком редких слов

In [12]:
accuracy_n_min = []
for n_min in [10, 50]:
  cv_result = cross_val_score(class_text(CountVectorizer(min_df=n_min), LogisticRegression()), reviews, labels, cv=5, scoring='accuracy')
  accuracy_n_min.append(cv_result.mean())

In [13]:
print(accuracy_n_min)

[0.8400000000000001, 0.8140000000000001]


In [14]:
write_answer(' '.join(map(str, accuracy_n_min)), 2)

Различные классификаторы

In [15]:
np.random.seed(42)
accuracy_dif_cls = []
cls_list = [LogisticRegression(), LinearSVC(), SGDClassifier()]
for cls in cls_list:
  cv_result = cross_val_score(class_text(CountVectorizer(), cls), reviews, labels, cv=5, scoring='accuracy')
  accuracy_dif_cls.append(cv_result.mean())

In [16]:
accuracy_dif_cls

[0.8424999999999999, 0.8325000000000001, 0.835]

In [17]:
write_answer(str(min(accuracy_dif_cls)), 3)

Удаление стоп слов
  

In [25]:
nltk.download('stopwords')
stop_words_nltk = nltk.corpus.stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [26]:
cv_result_nltk = cross_val_score(class_text(CountVectorizer(stop_words=stop_words_nltk), cls), reviews, labels, cv=5, scoring='accuracy')
cvresult_sklearn = cross_val_score(class_text(CountVectorizer(stop_words='english'), cls), reviews, labels, cv=5, scoring='accuracy')

In [29]:
print(f"mean accuarcy without stop_words: {cv_result_nltk.mean():.2f} and {cvresult_sklearn.mean():.2f}")

mean accuarcy without stop_words: 0.83 and 0.83


In [30]:
write_answer(' '.join(map(str, [cv_result_nltk.mean(), cvresult_sklearn.mean()])), 4)

Учет n-грамм

In [34]:
cv_result_2 = cross_val_score(class_text(CountVectorizer(ngram_range=(1,2)), cls), reviews, labels, cv=5, scoring='accuracy')
cv_result_35 = cross_val_score(class_text(CountVectorizer(ngram_range=(3,5), analyzer='char_wb'), cls), reviews, labels, cv=5, scoring='accuracy')

In [35]:
print(f"mean accuarcy for bigram: {cv_result_2.mean():.2f}")
print(f"mean accuarcy for 3 to 5 chargram: {cv_result_35.mean():.2f}")

mean accuarcy for bigram: 0.84
mean accuarcy for 3 to 5 chargram: 0.82


In [36]:
write_answer(' '.join(map(str, [cv_result_2.mean(), cv_result_35.mean()])), 5)