# scikit-learn

## Meaning of text

### Dataset for training

In [1]:
from sklearn.datasets import fetch_20newsgroups
import sklearn.datasets as ds

In [2]:
# Load the 20-newsgroups data with the loader's default arguments.
# NOTE: this rebinds `ds`, which the import cell also uses as an alias for
# the `sklearn.datasets` module; from here on `ds` is the loaded dataset.
ds = fetch_20newsgroups()

# Quick look at the dataset object: number of posts and the fields it exposes.
record_count = len(ds.data)
print(f"ds has {record_count} records of data")
print(dir(ds))

# Cell output: every category name paired with its integer label.
[(label, name) for label, name in enumerate(ds.target_names)]

ds has 11314 records of data
['DESCR', 'data', 'filenames', 'target', 'target_names']


[(0, 'alt.atheism'),
 (1, 'comp.graphics'),
 (2, 'comp.os.ms-windows.misc'),
 (3, 'comp.sys.ibm.pc.hardware'),
 (4, 'comp.sys.mac.hardware'),
 (5, 'comp.windows.x'),
 (6, 'misc.forsale'),
 (7, 'rec.autos'),
 (8, 'rec.motorcycles'),
 (9, 'rec.sport.baseball'),
 (10, 'rec.sport.hockey'),
 (11, 'sci.crypt'),
 (12, 'sci.electronics'),
 (13, 'sci.med'),
 (14, 'sci.space'),
 (15, 'soc.religion.christian'),
 (16, 'talk.politics.guns'),
 (17, 'talk.politics.mideast'),
 (18, 'talk.politics.misc'),
 (19, 'talk.religion.misc')]

In [3]:
# first record
# Inspect the first record: its numeric label, the matching category name,
# and the raw post split into individual lines.
first_label = ds.target[0]
print(first_label)
print(ds.target_names[first_label])

first_post_lines = ds.data[0].split('\n')
print(first_post_lines)

7
rec.autos
["From: lerxst@wam.umd.edu (where's my thing)", 'Subject: WHAT car is this!?', 'Nntp-Posting-Host: rac3.wam.umd.edu', 'Organization: University of Maryland, College Park', 'Lines: 15', '', ' I was wondering if anyone out there could enlighten me on this car I saw', 'the other day. It was a 2-door sports car, looked to be from the late 60s/', 'early 70s. It was called a Bricklin. The doors were really small. In addition,', 'the front bumper was separate from the rest of the body. This is ', 'all I know. If anyone can tellme a model name, engine specs, years', 'of production, where this car is made, history, or whatever info you', 'have on this funky looking car, please e-mail.', '', 'Thanks,', '- IL', '   ---- brought to you by your neighborhood Lerxst ----', '', '', '', '', '']


In [4]:
# Newsgroups used for both the training fetch and the later test fetch.
selected_categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']

In [5]:
# Training split limited to the selected categories; the fixed seed makes the
# shuffle reproducible. (Drop `categories=` to load all 20 newsgroups instead.)
data_train = fetch_20newsgroups(
    subset='train',
    categories=selected_categories,
    shuffle=True,
    random_state=42,
)

### Counter

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
# Word-count vectorizer with default settings; it is fitted in the cells below.
count_vect = CountVectorizer()

##### try a small data set first

In [7]:
# Four short sentences used as a toy corpus to show what the vectorizer produces.
samples = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]

In [8]:
# Learn the vocabulary from the toy corpus and build its word-count matrix.
X = count_vect.fit_transform(samples)
print(f"(sample size, dictionary size) = {X.shape}")

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer `get_feature_names_out()` and fall back only on older releases.
# `.tolist()` keeps the printed value a plain list of Python strings.
if hasattr(count_vect, "get_feature_names_out"):
    feature_names = count_vect.get_feature_names_out().tolist()
else:
    feature_names = count_vect.get_feature_names()
print(f"dictionary: {feature_names}")

# X is a sparse matrix of sample vs dictionary (one row per sample, one column
# per dictionary word); it is densified here only because the example is tiny.
print(X.toarray())

(sample size, dictionary size) = (4, 9)
dictionary: ['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']
[[0 1 1 1 0 0 1 0 1]
 [0 2 0 1 0 1 1 0 1]
 [1 0 0 1 1 0 1 1 1]
 [0 1 1 1 0 0 1 0 1]]


##### now the big one: the full training set

In [9]:
# Re-fit the same vectorizer on the training posts; this replaces the
# vocabulary learned from the toy samples.
X_train_counts = count_vect.fit_transform(data_train.data)
# (number of documents, vocabulary size)
print(X_train_counts.shape)

(2257, 35788)


### Frequency

In [10]:
from sklearn.feature_extraction.text import TfidfTransformer
# Fit a tf-idf transformer on the training counts and re-weight them in one step.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
# Printed so the shape can be compared with the count matrix above.
print(X_train_tfidf.shape)

(2257, 35788)


### Training a classifier

In [11]:
from sklearn.naive_bayes import MultinomialNB

# Supervised training: the tf-idf matrix is the input and the newsgroup
# indices in `data_train.target` are the labels.
clf = MultinomialNB().fit(X_train_tfidf, data_train.target)

##### now use the model on new data

In [12]:
# A few hand-written sentences to classify with the trained model.
docs_new = ['God is love', 'OpenGL on the GPU is fast', 'I am sick', 'My wife is with me']

# note: we use transform, not fit_transform, so the new documents are encoded
# with the vectorizer and tf-idf transformer already fitted on the training
# data instead of re-fitting them on these four sentences.
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

In [13]:
# Predict a category index for every new document, then print each document
# next to the name of its predicted newsgroup.
predicted = clf.predict(X_new_tfidf)
for position, doc in enumerate(docs_new):
    category_name = data_train.target_names[predicted[position]]
    print('%r => %s' % (doc, category_name))

'God is love' => soc.religion.christian
'OpenGL on the GPU is fast' => comp.graphics
'I am sick' => sci.med
'My wife is with me' => soc.religion.christian


### Build Pipeline

In [15]:
from sklearn.pipeline import Pipeline
# Chain the same three steps used manually above (counts -> tf-idf -> naive
# Bayes) so that raw text can be passed straight to fit/predict.
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

In [16]:
# single command to train the model: the pipeline is fitted directly on the
# raw training posts and their labels; the cell displays the fitted pipeline.
text_clf.fit(data_train.data, data_train.target)

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf', MultinomialNB())])

In [18]:
import numpy as np

# Held-out split fetched with the same categories and shuffle seed as the
# training data.
data_test = fetch_20newsgroups(
    subset='test',
    categories=selected_categories,
    shuffle=True,
    random_state=42,
)
docs_test = data_test.data

# The pipeline takes the raw test posts and returns one predicted label each.
predicted = text_clf.predict(docs_test)

# Accuracy: the fraction of test posts whose predicted label matches the true one.
np.mean(predicted == data_test.target)

0.8348868175765646

##### use a linear support vector machine (SVM), which is widely regarded as one of the best text classification algorithms (although it’s also a bit slower than naïve Bayes)

In [19]:
from sklearn.linear_model import SGDClassifier
# Same vectorizer and tf-idf steps as before, but the final classifier is an
# SGDClassifier with hinge loss and L2 penalty (the linear SVM described above).
# NOTE: this rebinds `text_clf`, replacing the naive Bayes pipeline.
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    # random_state fixes the seed; max_iter=5 with tol=None caps training at
    # five passes without a tolerance-based stopping criterion.
    ('clf', SGDClassifier(loss='hinge', penalty='l2',
                          alpha=1e-3, random_state=42,
                          max_iter=5, tol=None)),
])

In [21]:
# Train the SVM pipeline and classify the held-out posts in one chained call
# (`fit` returns the fitted pipeline itself).
predicted = text_clf.fit(data_train.data, data_train.target).predict(docs_test)

# Accuracy on the test split.
np.mean(predicted == data_test.target)

0.9101198402130493

### Performance Analysis

In [22]:
from sklearn import metrics

# Per-category precision, recall and F1 for the latest predictions, labelled
# with the newsgroup names instead of their integer indices.
report = metrics.classification_report(
    data_test.target,
    predicted,
    target_names=data_test.target_names,
)
print(report)

                        precision    recall  f1-score   support

           alt.atheism       0.95      0.80      0.87       319
         comp.graphics       0.87      0.98      0.92       389
               sci.med       0.94      0.89      0.91       396
soc.religion.christian       0.90      0.95      0.93       398

              accuracy                           0.91      1502
             macro avg       0.91      0.91      0.91      1502
          weighted avg       0.91      0.91      0.91      1502



In [23]:
# confusion matrix C is such that C_ij is equal to the number of observations known
# to be in group i and predicted to be in group j.

# Rows and columns follow the label indices, i.e. the order of
# data_test.target_names (the same order as the classification report):
# ['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']
# -- not the order in which `selected_categories` was written.
metrics.confusion_matrix(data_test.target, predicted)

array([[256,  11,  16,  36],
       [  4, 380,   3,   2],
       [  5,  35, 353,   3],
       [  5,  11,   4, 378]])