## Downloading the 20newsgroups dataset using sklearn

http://qwone.com/~jason/20Newsgroups/


In [1]:
# Loading the data set - annotated training data.
from sklearn.datasets import fetch_20newsgroups
twenty_train = fetch_20newsgroups(subset='train', shuffle=True)

In [2]:
# You can check the target names (categories) and some data files by following commands.
twenty_train.target_names #prints all the categories

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [3]:
print("\n".join(twenty_train.data[0].split("\n")[:100])) #prints first 100 lines of the first data file

From: lerxst@wam.umd.edu (where's my thing)
Subject: WHAT car is this!?
Nntp-Posting-Host: rac3.wam.umd.edu
Organization: University of Maryland, College Park
Lines: 15

 I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have on this funky looking car, please e-mail.

Thanks,
- IL
   ---- brought to you by your neighborhood Lerxst ----







In [4]:
twenty_train.target[0]

7

In [5]:
len(twenty_train.target)

11314

In [6]:
for i, n in enumerate(twenty_train.target_names):
  print(i, n)

0 alt.atheism
1 comp.graphics
2 comp.os.ms-windows.misc
3 comp.sys.ibm.pc.hardware
4 comp.sys.mac.hardware
5 comp.windows.x
6 misc.forsale
7 rec.autos
8 rec.motorcycles
9 rec.sport.baseball
10 rec.sport.hockey
11 sci.crypt
12 sci.electronics
13 sci.med
14 sci.space
15 soc.religion.christian
16 talk.politics.guns
17 talk.politics.mideast
18 talk.politics.misc
19 talk.religion.misc


## Feature extraction

In [7]:
# Extracting features from text files
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer(max_features=10000)
X_train_counts = count_vect.fit_transform(twenty_train.data)
X_train_counts.shape

(11314, 10000)

In [8]:
count_vect.get_feature_names_out()[1000:2000]

array(['angles', 'angmar', 'angry', 'animal', 'animals', 'animated',
       'animation', 'ankara', 'ann', 'announce', 'announced',
       'announcement', 'annoyed', 'annoying', 'annual', 'anon',
       'anonymity', 'anonymous', 'another', 'ansi', 'answer', 'answered',
       'answering', 'answers', 'antenna', 'anthony', 'anti',
       'antibiotics', 'antonio', 'anwar', 'any', 'anybody', 'anymore',
       'anyone', 'anything', 'anytime', 'anyway', 'anyways', 'anywhere',
       'ao', 'aoj', 'aol', 'ap', 'apana', 'apart', 'apartment', 'apc',
       'api', 'apollo', 'apologies', 'apologize', 'apostle', 'apostles',
       'app', 'apparent', 'apparently', 'appeal', 'appeals', 'appear',
       'appearance', 'appeared', 'appearing', 'appears', 'apple',
       'applelink', 'apples', 'applicable', 'application', 'applications',
       'applied', 'applies', 'apply', 'applying', 'appointed',
       'appreciate', 'appreciated', 'appressian', 'approach',
       'approaches', 'approaching', 'appropri

In [9]:
len(twenty_train.data)

11314

In [10]:
# Alternatively: use TF-IDF features
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(11314, 10000)

## Train a machine learning model to classify the texts

### Naive Bayes

In [11]:
# Machine Learning
# Training Naive Bayes (NB) classifier on training data.
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_tfidf, twenty_train.target)

In [12]:
# Building a pipeline: We can write less code and do all of the above, by building a pipeline as follows:
# The names ‘vect’ , ‘tfidf’ and ‘clf’ are arbitrary but will be used later.
# We will be using the 'text_clf' going forward.
from sklearn.pipeline import Pipeline

text_clf = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', MultinomialNB())])

text_clf = text_clf.fit(twenty_train.data, twenty_train.target)

In [13]:
# Performance of NB Classifier
import numpy as np

twenty_test = fetch_20newsgroups(subset='test', shuffle=True)
predicted = text_clf.predict(twenty_test.data)
np.mean(predicted == twenty_test.target)

0.7738980350504514

In [14]:
predicted

array([ 7, 11,  0, ...,  9,  3, 15])

### SVM

In [15]:
# Training Support Vector Machines - SVM and calculating its performance

from sklearn.linear_model import SGDClassifier
text_clf_svm = Pipeline([('vect', CountVectorizer(max_features=20000)), ('tfidf', TfidfTransformer()),
                         ('clf-svm', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, max_iter=5, random_state=42))])

text_clf_svm = text_clf_svm.fit(twenty_train.data, twenty_train.target)

# Predicting labels on test set
predicted_svm = text_clf_svm.predict(twenty_test.data)
# Measuring performance (which metric??)
np.mean(predicted_svm == twenty_test.target)



0.8159851301115242

### Feature extraction variants

In [16]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [17]:
# Removing stop words
text_clf = Pipeline([('vect', CountVectorizer(stop_words='english')), ('tfidf', TfidfTransformer()), 
                     ('clf', MultinomialNB())])

In [18]:
# Stemming words
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english", ignore_stopwords=True)

class StemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: ([stemmer.stem(w) for w in analyzer(doc)])
    
stemmed_count_vect = StemmedCountVectorizer(stop_words='english', max_features=10000)

text_mnb_stemmed = Pipeline([('vect', stemmed_count_vect), ('tfidf', TfidfTransformer()), 
                             ('mnb', MultinomialNB(fit_prior=False))])

text_mnb_stemmed = text_mnb_stemmed.fit(twenty_train.data, twenty_train.target)

predicted_mnb_stemmed = text_mnb_stemmed.predict(twenty_test.data)

np.mean(predicted_mnb_stemmed == twenty_test.target)

0.8138608603292619

In [19]:
text_mnb_stemmed['vect'].get_feature_names_out()[-1000:]

array(['transmit', 'transmitt', 'transpar', 'transplant', 'transport',
       'trap', 'trash', 'travel', 'travi', 'tray', 'tread', 'treasur',
       'treasuri', 'treat', 'treati', 'treatment', 'tree', 'trek',
       'tremend', 'trend', 'trentu', 'tri', 'trial', 'triangl',
       'triangul', 'tribe', 'trick', 'tricki', 'trident', 'trigger',
       'trillion', 'trim', 'triniti', 'trip', 'tripl', 'triumf',
       'triumph', 'trivia', 'trivial', 'trol', 'troop', 'trophi',
       'troubl', 'troy', 'truck', 'true', 'truecolor', 'truetyp', 'truli',
       'trumpet', 'trunk', 'trust', 'truth', 'trw', 'ts', 'tsd', 'tseng',
       'tsiel', 'tsn', 'tsr', 'tt', 'tti', 'ttl', 'ttu', 'tu', 'tube',
       'tucson', 'tudelft', 'tue', 'tuesday', 'tuft', 'tuinstra',
       'tulkarm', 'tulsa', 'tune', 'turbo', 'turgeon', 'turk', 'turkey',
       'turkish', 'turkiy', 'turn', 'turner', 'turpin', 'tut', 'tutori',
       'tv', 'tvtwm', 'tw', 'twice', 'twin', 'twist', 'twisto', 'twm',
       'tx', 'txt', 'ty'

### Performance metrics

In [20]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report

In [21]:
accuracy_score(predicted_mnb_stemmed, twenty_test.target)

0.8138608603292619

In [22]:
precision_score(predicted_mnb_stemmed, twenty_test.target, average='micro')

0.8138608603292619

In [23]:
recall_score(predicted_mnb_stemmed, twenty_test.target, average='macro')

0.8226006672357936

### Classification report

In [24]:
print(classification_report(predicted_mnb_stemmed, twenty_test.target, target_names=twenty_train.target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.72      0.77      0.75       297
           comp.graphics       0.75      0.69      0.72       423
 comp.os.ms-windows.misc       0.69      0.78      0.73       350
comp.sys.ibm.pc.hardware       0.73      0.65      0.69       444
   comp.sys.mac.hardware       0.83      0.80      0.82       397
          comp.windows.x       0.79      0.83      0.81       378
            misc.forsale       0.80      0.81      0.80       386
               rec.autos       0.90      0.88      0.89       406
         rec.motorcycles       0.93      0.93      0.93       401
      rec.sport.baseball       0.92      0.92      0.92       396
        rec.sport.hockey       0.98      0.92      0.95       428
               sci.crypt       0.94      0.86      0.90       432
         sci.electronics       0.65      0.79      0.72       323
                 sci.med       0.79      0.91      0.85       344
         

Explore the weights of the features in the trained SVM model in order to infer some insights related to feature importance for each class.


In [25]:
# ....

In [26]:
classifier = text_clf_svm['clf-svm']

In [27]:
classifier.coef_.shape
# <- investigate the features with highest coefficients
print(classifier.coef_)

[[-0.05671016  0.04070699 -0.00475401 ...  0.         -0.00507972
  -0.02184114]
 [-0.02280162 -0.0251327   0.         ... -0.00017215  0.10495375
  -0.01687109]
 [-0.11747475  0.004907   -0.01854303 ...  0.00359518 -0.00324281
   0.00083127]
 ...
 [-0.05260286  0.07714953 -0.00125346 ...  0.         -0.00282441
   0.        ]
 [-0.02415987 -0.0005222   0.         ...  0.          0.
  -0.00033254]
 [-0.05066426 -0.00853936  0.00340131 ...  0.          0.
   0.        ]]


In [28]:
sorted(classifier.coef_[-1])[:10]


[-0.15444660450574657,
 -0.13036697154635563,
 -0.12332716112994753,
 -0.11566964422150726,
 -0.11461752304091687,
 -0.11331696540274347,
 -0.11236593127407489,
 -0.10976382494350513,
 -0.10653733910275515,
 -0.10400696732522516]

# Exercises

Try different classification models from sklearn to classify the same data, for example a Logistic Regression model: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html


In [29]:
from sklearn.linear_model import LogisticRegression

Can you find a different preprocessing pipeline which leads to better results? For example, try the following:
- lemmatization instead of stemming
- include or exclude punctuation
- limit the vocabulary size
- include word n-grams in the vocabulary
- ...

In [30]:
# Explore different features for the same dataset

# Count Vectorizer
count_vec_limited_vocab = CountVectorizer(max_features=2500)
X_train_counts_limited_vocab = count_vec_limited_vocab.fit_transform(twenty_train.data)
X_train_counts_limited_vocab.shape

(11314, 2500)

In [31]:
count_vec_limited_vocab.get_feature_names_out()[:100]

array(['00', '000', '01', '02', '03', '04', '05', '06', '07', '08', '09',
       '0d', '0t', '10', '100', '1000', '11', '12', '128', '13', '130',
       '14', '145', '15', '150', '16', '17', '18', '19', '1988', '1989',
       '1990', '1991', '1992', '1993', '1993apr14', '1993apr15',
       '1993apr16', '1993apr19', '1993apr20', '1993apr5', '1993apr6',
       '1d9', '1st', '1t', '20', '200', '2000', '21', '22', '23', '24',
       '241', '25', '250', '256', '26', '27', '28', '29', '2di', '2nd',
       '2tm', '30', '300', '31', '32', '33', '34', '34u', '35', '36',
       '37', '38', '386', '39', '3d', '3rd', '3t', '40', '400', '408',
       '41', '42', '43', '44', '45', '46', '47', '48', '486', '49', '4t',
       '50', '500', '51', '52', '53', '54', '55'], dtype=object)

In [32]:
# Alternatively: use TF-IDF features
from sklearn.feature_extraction.text import TfidfTransformer

tfidf_transformer = TfidfTransformer()
X_train_tfidf_limited_vocab = tfidf_transformer.fit_transform(X_train_counts_limited_vocab)

print("Number of features seen during fit: ", tfidf_transformer.n_features_in_)
X_train_tfidf_limited_vocab.shape

Number of features seen during fit:  2500


(11314, 2500)

In [33]:
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [34]:
# any letter repeated more than three times in a row is replaced by two repetitions of the same letter
def remove_multiple_occurences(text):
    n = len(text)

    if n < 3:
        return text

    i, count = 0, 0
    while i < n - 1:
        i += 1
        if text[i] != text[i-1]:
            count = 0
        else:
            count += 1
            if count >= 2:
                text = text[:i] + text[i+1:]
                n -= 1
                i -= 1

    return text

In [35]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

import string
import unicodedata

wordnet_map = {
    "N": wordnet.NOUN,
    "V": wordnet.VERB,
    "J": wordnet.ADJ,
    "R": wordnet.ADV
}

class LemmatizedCountVectorizer(CountVectorizer):
  def get_tokens(self, text):
    # Normalize text
    preprocessed_text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    # Tokenize text
    tokens = nltk.word_tokenize(preprocessed_text)
    return tokens

  def get_lemmas(self, text):
    tokenized_text = self.get_tokens(text)
    tagged_text = nltk.pos_tag(tokenized_text)
    lemmatizer = WordNetLemmatizer()
    lemmas = [lemmatizer.lemmatize(w, pos=wordnet_map.get(p[0], wordnet.NOUN)) for (w, p) in tagged_text]
  
    return lemmas

  def build_analyzer(self):
    analyzer = super(LemmatizedCountVectorizer, self).build_analyzer()
    return lambda text: (self.get_lemmas(text))


lemmatized_count_vec = LemmatizedCountVectorizer(stop_words="english", max_features=7500)

In [36]:
"""
  Logistic regression
  Reference: https://scikit-learn.org/stable/modules/linear_model.html?highlight=logistic+regression#logistic-regression
"""

text_lr_lemmatized = Pipeline([
    ('bag_of_words', lemmatized_count_vec),
    ('tfidf', TfidfTransformer()),
    ('classifier', LogisticRegression(max_iter=120))
])

text_lr_lemmatized = text_lr_lemmatized.fit(twenty_train.data, twenty_train.target)
predicted_lr_lemmatized = text_lr_lemmatized.predict(twenty_test.data)

print('Accurary: ', np.mean(twenty_test.target == predicted_lr_lemmatized))

Accurary:  0.7635422198619225


In [37]:
text_lr_lemmatized['bag_of_words'].get_feature_names_out()[-100:]

array(['wide', 'widely', 'widespread', 'widget', 'width', 'wife', 'wild',
       'will', 'willing', 'win', 'wind', 'window', 'windows', 'wing',
       'winner', 'winter', 'wipe', 'wire', 'wiretap', 'wiring', 'wisdom',
       'wise', 'wish', 'with', 'withdraw', 'within', 'without', 'witness',
       'wlsmith', 'wo', 'woman', 'wonder', 'wonderful', 'wood', 'word',
       'work', 'worker', 'working', 'workstation', 'world',
       'world.std.com', 'worried', 'worry', 'worship', 'worth',
       'worthless', 'worthwhile', 'worthy', 'would', "wouldn't", 'wound',
       'wrap', 'wrist', 'write', 'writer', 'writes', 'writing', 'wrong',
       'x', 'x-Soviet', 'xdm', 'xpert', 'xterm', 'y', 'ya', 'yard', 'ye',
       'yeah', 'year', 'yeast', 'yell', 'yellow', 'yes', 'yesterday',
       'yet', 'yfn.ysu.edu', 'yield', 'you', "you're", 'young', 'your',
       'yours', 'yourself', 'youth', 'yoyo.cc.monash.edu.au', 'z', 'zero',
       'zone', 'zoo.toronto.edu', 'zuma.UUCP', '{', '|', '|/', '|I', '|_'

In [38]:
# Classification report
print(classification_report(predicted_lr_lemmatized, twenty_test.target, target_names=twenty_train.target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.63      0.64      0.64       313
           comp.graphics       0.70      0.61      0.66       446
 comp.os.ms-windows.misc       0.67      0.65      0.66       407
comp.sys.ibm.pc.hardware       0.66      0.68      0.67       377
   comp.sys.mac.hardware       0.71      0.79      0.75       347
          comp.windows.x       0.75      0.77      0.76       388
            misc.forsale       0.87      0.66      0.75       509
               rec.autos       0.81      0.86      0.83       370
         rec.motorcycles       0.87      0.87      0.87       395
      rec.sport.baseball       0.84      0.84      0.84       396
        rec.sport.hockey       0.91      0.91      0.91       398
               sci.crypt       0.84      0.91      0.87       366
         sci.electronics       0.70      0.65      0.68       424
                 sci.med       0.74      0.83      0.78       355
         

Use one of the sklearn models to train a classifier to predict the author of a text. You can use the `nltk` downloader to build a dataset: for example download literary texts written by Shakespeare and by Jane Austen, then build a binary classifier to distinguish between the two.

Try to use feature selection to use different kinds of features than in the previous task:
- stopword features
- punctuation features
- parts-of-speech (i.e. first apply a POS-tagger on the text, then use the obtained tags as your "vocabulary" instead of word tokens.)
- use character-level representations (remember `analize`='char` for the sklearn vectorizers)

In [39]:
nltk.download('gutenberg')

[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


True

In [40]:
nltk.corpus.gutenberg.fileids()

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

In [41]:
macbeth_sentences = nltk.corpus.gutenberg.sents('shakespeare-macbeth.txt')
hamlet_sentences = nltk.corpus.gutenberg.sents('shakespeare-hamlet.txt')
caesar_sentences = nltk.corpus.gutenberg.sents('shakespeare-caesar.txt')

emma_sentences = nltk.corpus.gutenberg.sents('austen-emma.txt')
sense_sentences = nltk.corpus.gutenberg.sents('austen-sense.txt')
persuasion_sentences = nltk.corpus.gutenberg.sents('austen-persuasion.txt')

In [42]:
macbeth_longest_len = max(len(s) for s in macbeth_sentences)
hamlet_longest_len = max(len(s) for s in hamlet_sentences)
caesar_longest_len = max(len(s) for s in caesar_sentences)

emma_longest_len = max(len(s) for s in emma_sentences)
sense_longest_len = max(len(s) for s in sense_sentences)
persuasion_longest_len = max(len(s) for s in persuasion_sentences)

In [43]:
# Extract all the sentences that have at least half of the length of the longest sentence
data_machbeth = [s for s in macbeth_sentences if len(s) > macbeth_longest_len / 4.]
data_hamlet = [s for s in hamlet_sentences if len(s) > hamlet_longest_len / 4.]
data_caesar = [s for s in caesar_sentences if len(s) > caesar_longest_len / 4.]

data_emma = [s for s in emma_sentences if len(s) > emma_longest_len / 4.]
data_sense = [s for s in sense_sentences if len(s) > sense_longest_len / 4.]
data_persuasion = [s for s in persuasion_sentences if len(s) > persuasion_longest_len / 4.]

In [44]:
num_sentences_shakespeare = len(data_hamlet) + len(data_caesar) + len(data_machbeth)
num_sentences_austen = len(data_emma) + len(data_sense) + len(data_persuasion)

In [45]:
X = np.concatenate([data_machbeth, data_hamlet, data_caesar, data_emma, data_sense, data_persuasion])
y = np.concatenate([np.zeros((num_sentences_shakespeare),), np.ones((num_sentences_austen,))])



In [46]:
import sklearn 
from sklearn.model_selection import train_test_split

# Split data intro train and test
X_train_texts, X_test_texts, y_train_texts, y_test_texts = train_test_split(X, y, test_size=0.33, random_state=0)

In [47]:
class PosTagsCountVectorizer(CountVectorizer):
  def get_pos_tags(self, tokenized_text):
    tagged_text = nltk.pos_tag(tokenized_text)
    pos_tags = [p for (_, p) in tagged_text]  
    return pos_tags

  def build_analyzer(self):
    analyzer = super(PosTagsCountVectorizer, self).build_analyzer()
    return lambda text: (self.get_pos_tags(text))


postags_count_vec = PosTagsCountVectorizer(stop_words="english", max_features=2500, analyzer='char')

In [48]:
"""
  Logistic regression
  Reference: https://scikit-learn.org/stable/modules/linear_model.html?highlight=logistic+regression#logistic-regression
"""
from sklearn.ensemble import RandomForestClassifier

text_rf_postags = Pipeline([
    ('bag_of_words', postags_count_vec),
    ('tfidf', TfidfTransformer()),
    ('classifier', RandomForestClassifier())
])

text_rf_lemmatized = text_rf_postags.fit(X_train_texts, y_train_texts)
predicted_rf_postags = text_rf_postags.predict(X_test_texts)

In [49]:
print('Accurary: ', np.mean(predicted_rf_postags == y_test_texts))

Accurary:  0.9484126984126984
