In [1]:
import sklearn

import numpy as np
import pandas as pd

In [2]:
from sklearn.datasets import fetch_20newsgroups

In [3]:
df = fetch_20newsgroups()

In [6]:
df.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [7]:
print(df.DESCR)

.. _20newsgroups_dataset:

The 20 newsgroups text dataset
------------------------------

The 20 newsgroups dataset comprises around 18000 newsgroups posts on
20 topics split in two subsets: one for training (or development)
and the other one for testing (or for performance evaluation). The split
between the train and test set is based upon a messages posted before
and after a specific date.

This module contains two loaders. The first one,
:func:`sklearn.datasets.fetch_20newsgroups`,
returns a list of the raw texts that can be fed to text feature
extractors such as :class:`~sklearn.feature_extraction.text.CountVectorizer`
with custom parameters so as to extract feature vectors.
The second one, :func:`sklearn.datasets.fetch_20newsgroups_vectorized`,
returns ready-to-use features, i.e., it is not necessary to use a feature
extractor.

**Data Set Characteristics:**

    Classes                     20
    Samples total            18846
    Dimensionality               1
    Features      

In [8]:
print(df.data[34])

From: joec@hilbert.cyprs.rain.com ( Joe Cipale)
Subject: Re: Clayton Need not Retract
Organization: Cypress Semi, Beaverton OR
Lines: 13

In article <Apr.9.08.39.25.1993.15639@romulus.rutgers.edu> kaldis@romulus.rutgers.edu (Theodore A. Kaldis) writes:
>civilized society.  The _ONLY_ way a homosexual can maintain even a
>modicum of respectability is by remaining in the closet.
>-- 
>  The views expressed herein are   |  Theodore A. Kaldis
>  my own only.  Do you seriously   |  kaldis@remus.rutgers.edu
>  believe that a major university  |  {...}!rutgers!remus.rutgers.edu!kaldis
>  as this would hold such views??? |

Once again, it appears that the one-eyed man has appeared in the land of the sighted
and for some strange resaon has appointed himself the ruler and supreme power.

Joe Cipale



In [9]:
print(df.target[34])

18


In [11]:
np.unique(df.target)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19])

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [14]:
tfidf_vect = TfidfVectorizer(stop_words='english')

df_transformed = tfidf_vect.fit_transform(df.data)

In [15]:
df_transformed.shape

(11314, 129796)

In [19]:
import random

random.sample(list(tfidf_vect.vocabulary_.items()), 10)

[('3fgx', 12763),
 ('e2cyd', 49785),
 ('elevations', 51014),
 ('9ictc67i6o', 22725),
 ('garb', 58029),
 ('misrael', 82316),
 ('734832494', 18659),
 ('v08i008', 119939),
 ('qx8oqvf1', 98380),
 ('dietz', 47121)]

In [22]:
print(df_transformed[34])

  (0, 111782)	0.09002934834336684
  (0, 103277)	0.11556997274541954
  (0, 28872)	0.1002353906716775
  (0, 100970)	0.14195220944487544
  (0, 110697)	0.08130265486678305
  (0, 107103)	0.10959982070258496
  (0, 28802)	0.08526184050829096
  (0, 53587)	0.11980586303089565
  (0, 100651)	0.22088286599997634
  (0, 53434)	0.07177018641688745
  (0, 121239)	0.1425545598349198
  (0, 40450)	0.10880372376338542
  (0, 100564)	0.08839163910676408
  (0, 101125)	0.12350626860807491
  (0, 83152)	0.12154012478825411
  (0, 63756)	0.09046890954202168
  (0, 24173)	0.12846055535347195
  (0, 108389)	0.06722430138360797
  (0, 39975)	0.10362966865975039
  (0, 114289)	0.1873977063214353
  (0, 71539)	0.5103915932700802
  (0, 102686)	0.1996171443767215
  (0, 4900)	0.14195220944487544
  (0, 1376)	0.08130265486678305
  (0, 32376)	0.0974788492379283
  :	:
  (0, 28961)	0.05282268541796294
  (0, 28806)	0.07208684822690764
  (0, 78863)	0.08071595547501147
  (0, 106009)	0.07121681081130549
  (0, 12495)	0.07033836705110541

In [24]:
from sklearn.model_selection import train_test_split

In [25]:
x_train, x_test, y_train, y_test = train_test_split(df_transformed,df.target,shuffle=True,test_size=0.2)

In [26]:
x_train.shape, y_train.shape

((9051, 129796), (9051,))

In [27]:
x_test.shape, y_test.shape

((2263, 129796), (2263,))

In [28]:
from sklearn.neural_network import MLPClassifier

In [31]:
mlp_clf = MLPClassifier(activation='relu',
                        hidden_layer_sizes=(32,),
                        solver='adam',
                        verbose=True,
                        max_iter=50)

In [32]:
mlp_clf.fit(x_train, y_train)

Iteration 1, loss = 2.91192818
Iteration 2, loss = 2.54071599
Iteration 3, loss = 2.02597397
Iteration 4, loss = 1.47213329
Iteration 5, loss = 1.00423113
Iteration 6, loss = 0.67499813
Iteration 7, loss = 0.46342819
Iteration 8, loss = 0.32951203
Iteration 9, loss = 0.24275686
Iteration 10, loss = 0.18418001
Iteration 11, loss = 0.14360312
Iteration 12, loss = 0.11462003
Iteration 13, loss = 0.09360073
Iteration 14, loss = 0.07772750
Iteration 15, loss = 0.06566892
Iteration 16, loss = 0.05637921
Iteration 17, loss = 0.04902550
Iteration 18, loss = 0.04308859
Iteration 19, loss = 0.03833970
Iteration 20, loss = 0.03444132
Iteration 21, loss = 0.03120669
Iteration 22, loss = 0.02851850
Iteration 23, loss = 0.02617604
Iteration 24, loss = 0.02419499
Iteration 25, loss = 0.02252300
Iteration 26, loss = 0.02102388
Iteration 27, loss = 0.01975500
Iteration 28, loss = 0.01874898
Iteration 29, loss = 0.01767226
Iteration 30, loss = 0.01676274
Iteration 31, loss = 0.01605120
Iteration 32, los



In [33]:
y_pred = mlp_clf.predict(x_test)

pred_results = pd.DataFrame({'y_test': y_test,
                             'y_pred': y_pred})

pred_results.sample(10)

Unnamed: 0,y_test,y_pred
1601,4,4
662,17,17
861,5,5
285,10,10
793,17,17
1891,18,18
1462,8,8
1493,16,16
2261,5,5
521,2,2


In [34]:
from sklearn.metrics import accuracy_score

accuracy_score(y_test, y_pred)

0.9164825452938578