In [1]:
from sklearn.datasets import fetch_20newsgroups

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

from sklearn.feature_extraction.text import TfidfVectorizer


from gensim.sklearn_api import W2VTransformer
from gensim.sklearn_api import D2VTransformer
from gensim.models import Word2Vec
from gensim.models import Doc2Vec

from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

#### categories

In [2]:
categories = [
'alt.atheism',
'talk.religion.misc',
'comp.graphics',
]

#### dataset

In [3]:
data = fetch_20newsgroups(subset='train',categories=categories)
print(f"{len(data.filenames)} documents")
print(f"{len(data.target_names)} categories")

1441 documents
3 categories


#### data glance

In [5]:
type(data.data)

list

In [7]:
len(data.data)

1441

In [9]:
data.data[:3]

["From: psyrobtw@ubvmsd.cc.buffalo.edu (Robert Weiss)\nSubject: 18 Apr 93   God's Promise in Philippians 4:9\nOrganization: University at Buffalo\nLines: 8\nNews-Software: VAX/VMS VNEWS 1.41\nNntp-Posting-Host: ubvmsd.cc.buffalo.edu\n\n\n\tThose things,\n\twhich ye have both learned, and received,\n\tand heard, and seen in me,\n\tdo:\n\tand the God of peace shall be with you.\n\n\tPhilippians 4:9\n",
 'From: myless@vaxc.cc.monash.edu.au (Myles Strous)\nSubject: J.C.Jensen\'s bitmap code\nOrganization: Computer Centre, Monash University, Australia\nLines: 18\n\nGreetings all.\n\tAccording to a FAQ I read, on 30 July 1992, Joshua C. Jensen posted an \narticle on bitmap manipulation (specifically, scaling and perspective) to the \nnewsgroup rec.games.programmer. (article 7716)\n\tThe article included source code in Turbo Pascal with inline assembly \nlanguage.\n\n\tI have been unable to find an archive for this newsgroup, or a current \nemail address for Joshua C. Jensen.\n\tIf anyone has

In [10]:
type(data.target)

numpy.ndarray

In [11]:
len(data.target)

1441

In [12]:
set(data.target)

{0, 1, 2}

#### combination1: svm+TfidfVectorizer

In [13]:
pipeline1 = Pipeline([('vect', TfidfVectorizer()),('svc', SVC()),])

In [14]:
parameters1={'vect__max_df': (0.5, 0.75, 1.0),
'vect__max_features': (None, 5000, 10000, 50000),
'vect__ngram_range': ((1, 1), (1, 2)),
'svc__C': (1,5),
'svc__gamma': ('scale','auto'),
'svc__random_state': (30,42)}

In [15]:
gs1 = GridSearchCV(pipeline1, parameters1, cv=5,n_jobs=-1, verbose=1)

In [16]:
gs1.fit(data.data, data.target)

Fitting 5 folds for each of 192 candidates, totalling 960 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('vect', TfidfVectorizer()),
                                       ('svc', SVC())]),
             n_jobs=-1,
             param_grid={'svc__C': (1, 5), 'svc__gamma': ('scale', 'auto'),
                         'svc__random_state': (30, 42),
                         'vect__max_df': (0.5, 0.75, 1.0),
                         'vect__max_features': (None, 5000, 10000, 50000),
                         'vect__ngram_range': ((1, 1), (1, 2))},
             verbose=1)

In [17]:
print("Best score: %0.4f" % gs1.best_score_)

Best score: 0.9577


In [18]:
print("Best parameters set:")
best_parameters1 = gs1.best_estimator_.get_params()
for param_name in sorted(parameters1.keys()):
    print("\t%s: %r" % (param_name, best_parameters1[param_name]))

Best parameters set:
	svc__C: 5
	svc__gamma: 'scale'
	svc__random_state: 30
	vect__max_df: 0.5
	vect__max_features: 5000
	vect__ngram_range: (1, 1)


#### combination2:  DecisionTree+TfidfVectorizer

In [23]:
pipeline4 = Pipeline([('vect', TfidfVectorizer()),('dt', DecisionTreeClassifier()),])

In [24]:
parameters4={'vect__max_df': (0.5, 0.75, 1.0),
'vect__max_features': (None, 5000, 10000, 50000),
'vect__ngram_range': ((1, 1), (1, 2)),
'dt__criterion':('gini','entropy'), 
'dt__splitter':['random'],
'dt__max_depth':(100,200),
}

In [25]:
gs4 = GridSearchCV(pipeline4, parameters4, cv=5,n_jobs=-1, verbose=1)

In [26]:
gs4.fit(data.data, data.target)

Fitting 5 folds for each of 96 candidates, totalling 480 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('vect', TfidfVectorizer()),
                                       ('dt', DecisionTreeClassifier())]),
             n_jobs=-1,
             param_grid={'dt__criterion': ('gini', 'entropy'),
                         'dt__max_depth': (100, 200),
                         'dt__splitter': ['random'],
                         'vect__max_df': (0.5, 0.75, 1.0),
                         'vect__max_features': (None, 5000, 10000, 50000),
                         'vect__ngram_range': ((1, 1), (1, 2))},
             verbose=1)

In [27]:
print("Best score: %0.4f" % gs4.best_score_)

Best score: 0.8231


In [28]:
print("Best parameters set:")
best_parameters4 = gs4.best_estimator_.get_params()
for param_name in sorted(parameters4.keys()):
    print("\t%s: %r" % (param_name, best_parameters4[param_name]))

Best parameters set:
	dt__criterion: 'gini'
	dt__max_depth: 200
	dt__splitter: 'random'
	vect__max_df: 1.0
	vect__max_features: None
	vect__ngram_range: (1, 2)


#### combination3: LogisticRegression+TfidfVectorizer

In [45]:
pipeline7 = Pipeline([('vect', TfidfVectorizer()),('lr', LogisticRegression()),])

In [46]:
parameters7={'vect__max_df': (0.5, 0.75, 1.0),
'vect__max_features': (None, 5000, 10000, 50000),
'vect__ngram_range': ((1, 1), (1, 2)),
'lr__C': (3,5),
'lr__penalty': ('none','l2'),
'lr__max_iter': (80,100)}

In [47]:
gs7 = GridSearchCV(pipeline7, parameters7, cv=5,n_jobs=-1, verbose=1)

In [48]:
gs7.fit(data.data, data.target)

Fitting 5 folds for each of 192 candidates, totalling 960 fits


  "Setting penalty='none' will ignore the C and l1_ratio parameters"


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('vect', TfidfVectorizer()),
                                       ('lr', LogisticRegression())]),
             n_jobs=-1,
             param_grid={'lr__C': (3, 5), 'lr__max_iter': (80, 100),
                         'lr__penalty': ('none', 'l2'),
                         'vect__max_df': (0.5, 0.75, 1.0),
                         'vect__max_features': (None, 5000, 10000, 50000),
                         'vect__ngram_range': ((1, 1), (1, 2))},
             verbose=1)

In [49]:
print("Best score: %0.4f" % gs7.best_score_)

Best score: 0.9605


In [50]:
print("Best parameters set:")
best_parameters7 = gs7.best_estimator_.get_params()
for param_name in sorted(parameters7.keys()):
    print("\t%s: %r" % (param_name, best_parameters7[param_name]))

Best parameters set:
	lr__C: 3
	lr__max_iter: 80
	lr__penalty: 'none'
	vect__max_df: 0.5
	vect__max_features: None
	vect__ngram_range: (1, 1)


#### combination4: Multinomial Naïve Bayes+TfidfVectorizer

In [62]:
pipeline10 = Pipeline([('vect', TfidfVectorizer()),('mnb', MultinomialNB()),])

In [63]:
parameters10={'vect__max_df': (0.5, 0.75, 1.0),
'vect__max_features': (None, 5000, 10000, 50000),
'vect__ngram_range': ((1, 1), (1, 2)),
'mnb__alpha':(0.5,1,1.5)}

In [64]:
gs10 = GridSearchCV(pipeline10, parameters10, cv=5,n_jobs=-1, verbose=1)

In [65]:
gs10.fit(data.data, data.target)

Fitting 5 folds for each of 72 candidates, totalling 360 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('vect', TfidfVectorizer()),
                                       ('mnb', MultinomialNB())]),
             n_jobs=-1,
             param_grid={'mnb__alpha': (0.5, 1, 1.5),
                         'vect__max_df': (0.5, 0.75, 1.0),
                         'vect__max_features': (None, 5000, 10000, 50000),
                         'vect__ngram_range': ((1, 1), (1, 2))},
             verbose=1)

In [67]:
print("Best score: %0.4f" % gs10.best_score_)

Best score: 0.9403


In [68]:
print("Best parameters set:")
best_parameters10 = gs10.best_estimator_.get_params()
for param_name in sorted(parameters10.keys()):
    print("\t%s: %r" % (param_name, best_parameters10[param_name]))

Best parameters set:
	mnb__alpha: 0.5
	vect__max_df: 0.5
	vect__max_features: 5000
	vect__ngram_range: (1, 1)
