# [新聞資料集](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_20newsgroups.html#sklearn.datasets.fetch_20newsgroups)

## 1. 載入相關套件

In [26]:
"""
REF ADDR : https://scikit-learn.org/stable/auto_examples/text/plot_document_classification_20newsgroups.html#sphx-glr-auto-examples-text-plot-document-classification-20newsgroups-py
"""
from time import time
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer

## 載入資料集

In [27]:
# 篩選新聞類別
categories = [
    "alt.atheism",
    "talk.religion.misc",
    "comp.graphics",
    "sci.space",
]

data_train = fetch_20newsgroups(
    subset="train",
    categories=categories,
    shuffle=True,
)

data_test = fetch_20newsgroups(
    subset="test",
    categories=categories,
    shuffle=True,
)

## 後續步驟可參閱 [Classification of text documents using sparse features](https://scikit-learn.org/stable/auto_examples/text/plot_document_classification_20newsgroups.html#sphx-glr-auto-examples-text-plot-document-classification-20newsgroups-py)

In [28]:
# order of labels in `target_names` can be different from `categories`
target_names = data_train.target_names
print(target_names)

['alt.atheism', 'comp.graphics', 'sci.space', 'talk.religion.misc']


In [29]:
# split target in a training set and a test set
y_train, y_test = data_train.target, data_test.target
print(y_train)
print(y_test)

[1 3 2 ... 1 0 1]
[2 1 1 ... 3 1 1]


In [30]:
# Extracting features from the training data using a sparse vectorizer
t0 = time()
vectorizer = TfidfVectorizer(
    sublinear_tf=True, max_df=0.5, min_df=5, stop_words="english"
)
X_train = vectorizer.fit_transform(data_train.data)
duration_train = time() - t0

print(duration_train)

0.22662997245788574


In [31]:
# Extracting features from the test data using the same vectorizer
t0 = time()
X_test = vectorizer.transform(data_test.data)
duration_test = time() - t0

print(duration_train)

0.22662997245788574


In [32]:
feature_names = vectorizer.get_feature_names_out()
verbose=False

if verbose:
    # compute size of loaded data
    data_train_size_mb = size_mb(data_train.data)
    data_test_size_mb = size_mb(data_test.data)

    print(
        f"{len(data_train.data)} documents - "
        f"{data_train_size_mb:.2f}MB (training set)"
    )
    print(f"{len(data_test.data)} documents - {data_test_size_mb:.2f}MB (test set)")
    print(f"{len(target_names)} categories")
    print(
        f"vectorize training done in {duration_train:.3f}s "
        f"at {data_train_size_mb / duration_train:.3f}MB/s"
    )
    print(f"n_samples: {X_train.shape[0]}, n_features: {X_train.shape[1]}")
    print(
        f"vectorize testing done in {duration_test:.3f}s "
        f"at {data_test_size_mb / duration_test:.3f}MB/s"
    )
    print(f"n_samples: {X_test.shape[0]}, n_features: {X_test.shape[1]}")



In [33]:
print("X_train= \n", X_train)
print("X_test= \n", X_test)
print("y_train= \n", y_train)
print("y_test= \n", y_test)
print("feature_names= \n", feature_names)
print("target_names= \n", target_names)

X_train= 
   (0, 2517)	0.18882559857168102
  (0, 453)	0.14810906626622972
  (0, 7330)	0.13783664104130097
  (0, 254)	0.34404631756778353
  (0, 2276)	0.07033982375105988
  (0, 7070)	0.2944436581666543
  (0, 6173)	0.17819540353245403
  (0, 166)	0.08658058224839943
  (0, 3457)	0.09233043938439052
  (0, 7504)	0.06709283290470455
  (0, 4906)	0.13276131063021313
  (0, 6245)	0.17819540353245403
  (0, 4650)	0.10645464205726671
  (0, 4406)	0.1270798399040038
  (0, 5370)	0.13150813563613234
  (0, 1353)	0.12609338927283967
  (0, 2927)	0.24787945867395397
  (0, 3223)	0.08031114792141571
  (0, 2133)	0.1369871763301335
  (0, 5453)	0.09571658744266878
  (0, 5091)	0.23464700210776418
  (0, 5454)	0.12336340331196224
  (0, 2395)	0.09339764418615404
  (0, 4048)	0.05087600635720216
  (0, 3720)	0.1320941585100957
  :	:
  (2033, 2040)	0.15208661467341683
  (2033, 5540)	0.10109677643672658
  (2033, 2870)	0.1628784470588461
  (2033, 5478)	0.11827544273835017
  (2033, 252)	0.22003887239477773
  (2033, 5592)	0.