In [1]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV
from clfswitcher import ClfSwitcher
from transformer import Transformer
import pickle
import nltk
reviews = nltk.download('movie_reviews')
#from nltk.corpus import movie_reviews

[nltk_data] Downloading package movie_reviews to
[nltk_data]     /Users/dmitrijbordugov/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


In [2]:
from nltk.corpus import movie_reviews

Generate data

In [3]:
negids = movie_reviews.fileids('neg')
posids = movie_reviews.fileids('pos')

negfeats = [movie_reviews.words(fileids=[f]) for f in negids]
posfeats = [movie_reviews.words(fileids=[f]) for f in posids]

reviews = posfeats + negfeats
ispositive = [1]*len(posfeats) + [0]*len(negfeats)

texts = [None]*len(reviews)
for i, review in enumerate(reviews):
    texts[i]=''
    for word in review:
        texts[i] += ' ' + word

Describe pipeline and find best model with GridSearch

In [4]:
pipeline = Pipeline([
    ('vect', Transformer()),
    ('clf', ClfSwitcher()),
])

In [5]:
parameters = [
    {
        'vect__vectorizer': [TfidfVectorizer(), CountVectorizer()],
        'clf__estimator': [SGDClassifier(loss='log'), LogisticRegression()],
        
    },]
    

gscv = GridSearchCV(pipeline, parameters, cv=3, verbose=0, scoring='accuracy')
gscv.fit(texts, ispositive)



GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('vect',
                                        Transformer(vectorizer=TfidfVectorizer(analyzer='word',
                                                                               binary=False,
                                                                               decode_error='strict',
                                                                               dtype=<class 'numpy.float64'>,
                                                                               encoding='utf-8',
                                                                               input='content',
                                                                               lowercase=True,
                                                                               max_df=1.0,
                                                                           

In [6]:
gscv.best_score_

0.6875

In [7]:
import pandas as pd
results=pd.DataFrame(gscv.cv_results_)
results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf__estimator,param_vect__vectorizer,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,1.804118,0.13377,0.402492,0.023095,"SGDClassifier(alpha=0.0001, average=False, cla...","TfidfVectorizer(analyzer='word', binary=False,...","{'clf__estimator': SGDClassifier(alpha=0.0001,...",0.373313,0.83958,0.276276,0.4965,0.245897,3
1,1.735014,0.154325,0.394176,0.029648,"SGDClassifier(alpha=0.0001, average=False, cla...","CountVectorizer(analyzer='word', binary=False,...","{'clf__estimator': SGDClassifier(alpha=0.0001,...",0.617691,0.836582,0.608108,0.6875,0.105529,1
2,1.831332,0.208126,0.428848,0.025428,"LogisticRegression(C=1.0, class_weight=None, d...","TfidfVectorizer(analyzer='word', binary=False,...","{'clf__estimator': LogisticRegression(C=1.0, c...",0.01949,0.809595,0.016517,0.282,0.373208,4
3,1.805291,0.202477,0.406102,0.037881,"LogisticRegression(C=1.0, class_weight=None, d...","CountVectorizer(analyzer='word', binary=False,...","{'clf__estimator': LogisticRegression(C=1.0, c...",0.595202,0.824588,0.630631,0.6835,0.100844,2


In [25]:
import random
from sklearn.model_selection import StratifiedKFold

#cv = StratifiedKFold(n_splits=3, random_state=42)

from sklearn.model_selection import cross_val_score
clf_pipeline = Pipeline(
            [("vectorizer", CountVectorizer()),
            ("classifier", SGDClassifier())]
        )
print(round(cross_val_score(clf_pipeline, texts, ispositive, cv=3, scoring='accuracy').mean(), 2))
print(round(cross_val_score(gscv.best_estimator_, texts, ispositive, cv=3, scoring='accuracy').mean(), 2))

0.83
0.69


In [24]:
gscv.best_estimator_

Pipeline(memory=None,
         steps=[('vect',
                 Transformer(vectorizer=CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        preprocessor=None,
                                                        stop_words=