In [None]:
from sklearn.datasets import fetch_20newsgroups

# Fetch the predefined train and test subsets of the 20 newsgroups corpus.
train = fetch_20newsgroups(subset='train')
test = fetch_20newsgroups(subset='test')

# Text documents are the inputs; `target` holds the corresponding labels.
Xtrain, ytrain = train.data, train.target
Xtest, ytest = test.data, test.target

# Sanity check: documents and labels should have the same length.
print("X:", len(Xtrain))
print("y:", len(ytrain))

In [None]:
# Inspect the first training document and the label stored alongside it.
print("X[0]:", Xtrain[0])
print("y[0]:", ytrain[0])

In [None]:
# Display the category names of the training subset.
# Presumably the integer labels in `ytrain` index into this list — confirm against the scikit-learn docs.
train.target_names

### Apply Tfidf ([documentation](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html))

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features built from bigrams only, with English stop words removed.
tfidf = TfidfVectorizer(ngram_range=(2, 2), stop_words='english')

# Fit on the training documents only; the test set is transformed later
# with this same fitted vectorizer.
Xtrain_tfidf = tfidf.fit_transform(Xtrain)

# Display the resulting training feature matrix.
Xtrain_tfidf

In [None]:
# Uncomment to inspect the vocabulary learned by the fitted vectorizer.
# Left disabled, presumably because the bigram vocabulary is very large to display — confirm.
#tfidf.vocabulary_

### Classify with Naive Bayes ([documentation](https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html))

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes; `alpha` is the additive smoothing parameter.
# `fit` returns the estimator itself, so construction and training can be chained.
nb = MultinomialNB(alpha=1).fit(Xtrain_tfidf, ytrain)

# Display the fitted estimator.
nb

Evaluate on the test set using [`classification_report`](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.classification_report.html) 

We will focus on the [F1-score](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html)

In [None]:
from sklearn.metrics import classification_report

# Transform the test documents with the vectorizer fitted on the training set
# (transform only, no re-fitting), then predict with the trained classifier.
Xtest_tfidf = tfidf.transform(Xtest)
ypred = nb.predict(Xtest_tfidf)

# Per-class precision / recall / F1 plus the averaged scores.
print(classification_report(y_true=ytest, y_pred=ypred))

### Combine all methods into a [pipeline](https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html)

In [None]:
from sklearn.pipeline import Pipeline

# Chain vectorization and classification so raw text can be passed directly.
# The step names ('tfidf', 'nb') are reused as prefixes when tuning hyperparameters.
pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('nb', MultinomialNB()),
])

# `fit` returns the pipeline itself, so training and prediction can be chained.
ypred = pipeline.fit(Xtrain, ytrain).predict(Xtest)
print(classification_report(y_true=ytest, y_pred=ypred))

Now we will use [grid search cross-validation](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html) to find the model with the best hyperparameters.

![5CV](https://scikit-learn.org/stable/_images/grid_search_cross_validation.png)

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid; keys follow the `<step name>__<parameter>` convention
# of the pipeline defined above.
params = {'tfidf__ngram_range': [(1, 1), (1, 2), (2, 2)],
          'nb__alpha': [0.01, 0.1, 1, 10]}

# 3 n-gram ranges x 4 alphas = 12 candidates, each evaluated with 5-fold CV,
# i.e. 60 pipeline fits (plus the final refit of the best candidate).
# n_jobs=-1 spreads those independent fits across all available CPU cores;
# it only affects wall-clock time, not which candidate is selected.
gridcv = GridSearchCV(pipeline, params, scoring='f1_macro', cv=5, n_jobs=-1)
gridcv.fit(Xtrain, ytrain)

In [None]:
# Display the pipeline that the grid search selected as best
# (ranked by the `f1_macro` cross-validation score configured above).
gridcv.best_estimator_

In [None]:
# Predict on the held-out test set with the fitted grid search object,
# then report the per-class and averaged metrics.
ypred = gridcv.predict(X=Xtest)
print(classification_report(y_true=ytest, y_pred=ypred))

#### Exercise

1. For the Naive Bayes model, use grid search cross-validation across different values of `alpha` and `ngram_range` to find the best model.

2. For the best value of `alpha` and `ngram_range`, compute the `f1_macro` score on the test set. 
* What value of `alpha` and `ngram_range` did you choose?
* What is the model's `f1_macro` score?

Extra: Try `GaussianNB` ([documentation](https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html)). Do you get a better result?