# CHAPTER 4 Text Classification


## Text Normalization


In [1]:
from contractions import CONTRACTION_MAP
import re
import nltk
import string
from nltk.stem import WordNetLemmatizer


In [2]:
stopword_list = nltk.corpus.stopwords.words('english')
wnl = WordNetLemmatizer()


In [3]:
def tokenize_text(text):
    tokens = nltk.word_tokenize(text) 
    tokens = [token.strip() for token in tokens]
    return tokens

def expand_contractions(text, contraction_mapping):
    
    contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())), 
                                      flags=re.IGNORECASE|re.DOTALL)
    def expand_match(contraction):
        match = contraction.group(0)
        first_char = match[0]
        expanded_contraction = contraction_mapping.get(match)\
                                if contraction_mapping.get(match)\
                                else contraction_mapping.get(match.lower())                       
        expanded_contraction = first_char+expanded_contraction[1:]
        return expanded_contraction
        
    expanded_text = contractions_pattern.sub(expand_match, text)
    expanded_text = re.sub("'", "", expanded_text)
    return expanded_text


In [4]:
from pattern.en import tag
from nltk.corpus import wordnet as wn

# Annotate text tokens with POS tags
def pos_tag_text(text):
    
    def penn_to_wn_tags(pos_tag):
        if pos_tag.startswith('J'):
            return wn.ADJ
        elif pos_tag.startswith('V'):
            return wn.VERB
        elif pos_tag.startswith('N'):
            return wn.NOUN
        elif pos_tag.startswith('R'):
            return wn.ADV
        else:
            return None
    
    tagged_text = tag(text)
    tagged_lower_text = [(word.lower(), penn_to_wn_tags(pos_tag))
                         for word, pos_tag in
                         tagged_text]
    return tagged_lower_text


## Feature Extraction


In [5]:
CORPUS = [
'the sky is blue',
'sky is blue and sky is beautiful',
'the beautiful sky is so blue',
'i love blue cheese'
]

new_doc = ['loving this blue sky today']


In [6]:
import pandas as pd

def display_features(features, feature_names):
    df = pd.DataFrame(data=features,
                      columns=feature_names)
    print df



### Bag of Words Model


In [7]:
from feature_extractors import bow_extractor    


In [8]:
bow_vectorizer, bow_features = bow_extractor(CORPUS)
features = bow_features.todense()
print features


[[0 0 1 0 1 0 1 0 1]
 [1 1 1 0 2 0 2 0 0]
 [0 1 1 0 1 0 1 1 1]
 [0 0 1 1 0 1 0 0 0]]


In [9]:
new_doc_features = bow_vectorizer.transform(new_doc)
new_doc_features = new_doc_features.todense()
print new_doc_features


[[0 0 1 0 0 0 1 0 0]]


In [10]:
feature_names = bow_vectorizer.get_feature_names()
print feature_names


[u'and', u'beautiful', u'blue', u'cheese', u'is', u'love', u'sky', u'so', u'the']


In [11]:
display_features(features, feature_names)
display_features(new_doc_features, feature_names)


   and  beautiful  blue  cheese  is  love  sky  so  the
0    0          0     1       0   1     0    1   0    1
1    1          1     1       0   2     0    2   0    0
2    0          1     1       0   1     0    1   1    1
3    0          0     1       1   0     1    0   0    0
   and  beautiful  blue  cheese  is  love  sky  so  the
0    0          0     1       0   0     0    1   0    0


In [12]:
import numpy as np
from feature_extractors import tfidf_transformer
feature_names = bow_vectorizer.get_feature_names()
    
tfidf_trans, tdidf_features = tfidf_transformer(bow_features)
features = np.round(tdidf_features.todense(), 2)
display_features(features, feature_names)


    and  beautiful  blue  cheese    is  love   sky    so   the
0  0.00       0.00  0.40    0.00  0.49  0.00  0.49  0.00  0.60
1  0.44       0.35  0.23    0.00  0.56  0.00  0.56  0.00  0.00
2  0.00       0.43  0.29    0.00  0.35  0.00  0.35  0.55  0.43
3  0.00       0.00  0.35    0.66  0.00  0.66  0.00  0.00  0.00


In [13]:
nd_tfidf = tfidf_trans.transform(new_doc_features)
nd_features = np.round(nd_tfidf.todense(), 2)
display_features(nd_features, feature_names)


   and  beautiful  blue  cheese   is  love   sky   so  the
0  0.0        0.0  0.63     0.0  0.0   0.0  0.77  0.0  0.0


### TF-IDF Model


In [14]:
import scipy.sparse as sp
from numpy.linalg import norm
feature_names = bow_vectorizer.get_feature_names()

# compute term frequency
tf = bow_features.todense()
tf = np.array(tf, dtype='float64')

# show term frequencies
display_features(tf, feature_names)


   and  beautiful  blue  cheese   is  love  sky   so  the
0  0.0        0.0   1.0     0.0  1.0   0.0  1.0  0.0  1.0
1  1.0        1.0   1.0     0.0  2.0   0.0  2.0  0.0  0.0
2  0.0        1.0   1.0     0.0  1.0   0.0  1.0  1.0  1.0
3  0.0        0.0   1.0     1.0  0.0   1.0  0.0  0.0  0.0


In [15]:
# build the document frequency matrix
df = np.diff(sp.csc_matrix(bow_features, copy=True).indptr)
df = 1 + df # to smoothen idf later

# show document frequencies
display_features([df], feature_names)


   and  beautiful  blue  cheese  is  love  sky  so  the
0    2          3     5       2   4     2    4   2    3


In [16]:
# compute inverse document frequencies
total_docs = 1 + len(CORPUS)
idf = 1.0 + np.log(float(total_docs) / df)

# show inverse document frequencies
display_features([np.round(idf, 2)], feature_names)


    and  beautiful  blue  cheese    is  love   sky    so   the
0  1.92       1.51   1.0    1.92  1.22  1.92  1.22  1.92  1.51


In [17]:
# compute idf diagonal matrix  
total_features = bow_features.shape[1]
idf_diag = sp.spdiags(idf, diags=0, m=total_features, n=total_features)
idf = idf_diag.todense()

# print the idf diagonal matrix
print np.round(idf, 2)


[[ 1.92  0.    0.    0.    0.    0.    0.    0.    0.  ]
 [ 0.    1.51  0.    0.    0.    0.    0.    0.    0.  ]
 [ 0.    0.    1.    0.    0.    0.    0.    0.    0.  ]
 [ 0.    0.    0.    1.92  0.    0.    0.    0.    0.  ]
 [ 0.    0.    0.    0.    1.22  0.    0.    0.    0.  ]
 [ 0.    0.    0.    0.    0.    1.92  0.    0.    0.  ]
 [ 0.    0.    0.    0.    0.    0.    1.22  0.    0.  ]
 [ 0.    0.    0.    0.    0.    0.    0.    1.92  0.  ]
 [ 0.    0.    0.    0.    0.    0.    0.    0.    1.51]]


In [18]:
# compute tfidf feature matrix
tfidf = tf * idf

# show tfidf feature matrix
display_features(np.round(tfidf, 2), feature_names)


    and  beautiful  blue  cheese    is  love   sky    so   the
0  0.00       0.00   1.0    0.00  1.22  0.00  1.22  0.00  1.51
1  1.92       1.51   1.0    0.00  2.45  0.00  2.45  0.00  0.00
2  0.00       1.51   1.0    0.00  1.22  0.00  1.22  1.92  1.51
3  0.00       0.00   1.0    1.92  0.00  1.92  0.00  0.00  0.00


In [19]:
# compute L2 norms 
norms = norm(tfidf, axis=1)

# print norms for each document
print np.round(norms, 2)


[ 2.5   4.35  3.5   2.89]


In [20]:
# compute normalized tfidf
norm_tfidf = tfidf / norms[:, None]

# show final tfidf feature matrix
display_features(np.round(norm_tfidf, 2), feature_names)


    and  beautiful  blue  cheese    is  love   sky    so   the
0  0.00       0.00  0.40    0.00  0.49  0.00  0.49  0.00  0.60
1  0.44       0.35  0.23    0.00  0.56  0.00  0.56  0.00  0.00
2  0.00       0.43  0.29    0.00  0.35  0.00  0.35  0.55  0.43
3  0.00       0.00  0.35    0.66  0.00  0.66  0.00  0.00  0.00


In [21]:
# compute new doc term freqs from bow freqs
nd_tf = new_doc_features
nd_tf = np.array(nd_tf, dtype='float64')

# compute tfidf using idf matrix from train corpus
nd_tfidf = nd_tf*idf
nd_norms = norm(nd_tfidf, axis=1)
norm_nd_tfidf = nd_tfidf / nd_norms[:, None]

# show new_doc tfidf feature vector
display_features(np.round(norm_nd_tfidf, 2), feature_names)


   and  beautiful  blue  cheese   is  love   sky   so  the
0  0.0        0.0  0.63     0.0  0.0   0.0  0.77  0.0  0.0


In [22]:
from feature_extractors import tfidf_extractor
    
tfidf_vectorizer, tdidf_features = tfidf_extractor(CORPUS)
display_features(np.round(tdidf_features.todense(), 2), feature_names)


    and  beautiful  blue  cheese    is  love   sky    so   the
0  0.00       0.00  0.40    0.00  0.49  0.00  0.49  0.00  0.60
1  0.44       0.35  0.23    0.00  0.56  0.00  0.56  0.00  0.00
2  0.00       0.43  0.29    0.00  0.35  0.00  0.35  0.55  0.43
3  0.00       0.00  0.35    0.66  0.00  0.66  0.00  0.00  0.00


In [23]:
nd_tfidf = tfidf_vectorizer.transform(new_doc)
display_features(np.round(nd_tfidf.todense(), 2), feature_names)    


   and  beautiful  blue  cheese   is  love   sky   so  the
0  0.0        0.0  0.63     0.0  0.0   0.0  0.77  0.0  0.0


### Advanced Word Vectorization Models


In [24]:
import gensim
import nltk

TOKENIZED_CORPUS = [nltk.word_tokenize(sentence) 
                    for sentence in CORPUS]
tokenized_new_doc = [nltk.word_tokenize(sentence) 
                    for sentence in new_doc]                        

model = gensim.models.Word2Vec(TOKENIZED_CORPUS, 
                               size=10,
                               window=10,
                               min_count=2,
                               sample=1e-3)


In [29]:
 print model['sky']


[ 0.01608407 -0.04819566  0.04227461 -0.03011346  0.0254148   0.01728328
  0.0155535   0.00774884 -0.02752112  0.01646519]


In [30]:
print model['blue']


[-0.0472235   0.01662185 -0.01221706 -0.04724348 -0.04384995  0.00193343
 -0.03163504 -0.03423524  0.02661656  0.03033725]


In [25]:
from feature_extractors import averaged_word_vectorizer


avg_word_vec_features = averaged_word_vectorizer(corpus=TOKENIZED_CORPUS,
                                                 model=model,
                                                 num_features=10)
print np.round(avg_word_vec_features, 3)


[[ 0.006 -0.01   0.015 -0.014  0.004 -0.006 -0.024 -0.007 -0.001 -0.   ]
 [-0.008 -0.01   0.021 -0.019 -0.002 -0.002 -0.011  0.002  0.003 -0.001]
 [-0.003 -0.007  0.008 -0.02  -0.001 -0.004 -0.014 -0.015  0.002 -0.01 ]
 [-0.047  0.017 -0.012 -0.047 -0.044  0.002 -0.032 -0.034  0.027  0.03 ]]


In [26]:
nd_avg_word_vec_features = averaged_word_vectorizer(corpus=tokenized_new_doc,
                                                    model=model,
                                                    num_features=10)
print np.round(nd_avg_word_vec_features, 3)


[[-0.016 -0.016  0.015 -0.039 -0.009  0.01  -0.008 -0.013 -0.     0.023]]


In [27]:
from feature_extractors import tfidf_weighted_averaged_word_vectorizer

corpus_tfidf = tdidf_features
vocab = tfidf_vectorizer.vocabulary_
wt_tfidf_word_vec_features = tfidf_weighted_averaged_word_vectorizer(corpus=TOKENIZED_CORPUS,
                                                                     tfidf_vectors=corpus_tfidf,
                                                                     tfidf_vocabulary=vocab,
                                                                     model=model, 
                                                                     num_features=10)
print np.round(wt_tfidf_word_vec_features, 3)


[[ 0.011 -0.011  0.014 -0.011  0.007 -0.007 -0.024 -0.008 -0.004 -0.004]
 [-0.    -0.014  0.028 -0.014  0.004 -0.003 -0.012  0.011 -0.001 -0.002]
 [-0.001 -0.008  0.007 -0.019  0.001 -0.004 -0.012 -0.018  0.001 -0.014]
 [-0.047  0.017 -0.012 -0.047 -0.044  0.002 -0.032 -0.034  0.027  0.03 ]]


In [28]:
nd_wt_tfidf_word_vec_features = tfidf_weighted_averaged_word_vectorizer(corpus=tokenized_new_doc,
                                                                     tfidf_vectors=nd_tfidf,
                                                                     tfidf_vocabulary=vocab,
                                                                     model=model, 
                                                                     num_features=10)
print np.round(nd_wt_tfidf_word_vec_features, 3)   


[[-0.012 -0.019  0.018 -0.038 -0.006  0.01  -0.006 -0.011 -0.003  0.023]]


## Classification Algorithms


## Evaluating Classification Models


In [31]:
from sklearn import metrics
import numpy as np
import pandas as pd
from collections import Counter


In [32]:
actual_labels = ['spam', 'ham', 'spam', 'spam', 'spam',
               'ham', 'ham', 'spam', 'ham', 'spam',
               'spam', 'ham', 'ham', 'ham', 'spam',
               'ham', 'ham', 'spam', 'spam', 'ham']
              
predicted_labels = ['spam', 'spam', 'spam', 'ham', 'spam',
                    'spam', 'ham', 'ham', 'spam', 'spam',
                    'ham', 'ham', 'spam', 'ham', 'ham',
                    'ham', 'spam', 'ham', 'spam', 'spam']
                    
ac = Counter(actual_labels)                     
pc = Counter(predicted_labels)  

print 'Actual counts:', ac.most_common()
print 'Predicted counts:', pc.most_common()          


Actual counts: [('ham', 10), ('spam', 10)]
Predicted counts: [('spam', 11), ('ham', 9)]


In [33]:
cm = metrics.confusion_matrix(y_true=actual_labels,
                         y_pred=predicted_labels,
                         labels=['spam','ham'])
print pd.DataFrame(data=cm, 
                   columns=pd.MultiIndex(levels=[['Predicted:'],
                                                 ['spam','ham']], 
                                         labels=[[0,0],[0,1]]), 
                   index=pd.MultiIndex(levels=[['Actual:'],
                                               ['spam','ham']], 
                                       labels=[[0,0],[0,1]]))


             Predicted:    
                   spam ham
Actual: spam          5   5
        ham           6   4


In [34]:
positive_class = 'spam'

true_positive = 5.
false_positive = 6.
false_negative = 5.
true_negative = 4.

accuracy = np.round(
                metrics.accuracy_score(y_true=actual_labels,
                                       y_pred=predicted_labels),2)
accuracy_manual = np.round(
                    (true_positive + true_negative) /
                      (true_positive + true_negative +
                       false_negative + false_positive),2)
print 'Accuracy:', accuracy
print 'Manually computed accuracy:', accuracy_manual                                       


Accuracy: 0.45
Manually computed accuracy: 0.45


In [35]:
precision = np.round(
                metrics.precision_score(y_true=actual_labels,
                                        y_pred=predicted_labels,
                                        pos_label=positive_class),2)
precision_manual = np.round(
                        (true_positive) /
                        (true_positive + false_positive),2)
print 'Precision:', precision
print 'Manually computed precision:', precision_manual


Precision: 0.45
Manually computed precision: 0.45


In [36]:
recall = np.round(
            metrics.recall_score(y_true=actual_labels,
                                 y_pred=predicted_labels,
                                 pos_label=positive_class),2)
recall_manual = np.round(
                    (true_positive) /
                    (true_positive + false_negative),2)
print 'Recall:', recall
print 'Manually computed recall:', recall_manual


Recall: 0.5
Manually computed recall: 0.5


In [37]:
f1_score = np.round(
                metrics.f1_score(y_true=actual_labels,
                                 y_pred=predicted_labels,
                                 pos_label=positive_class),2) 
f1_score_manual = np.round(
                    (2 * precision * recall) /
                    (precision + recall),2)
print 'F1 score:', f1_score
print 'Manually computed F1 score:', f1_score_manual                                 

F1 score: 0.48
Manually computed F1 score: 0.47


## Building a Multi-Class Classification System


In [38]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.cross_validation import train_test_split


In [39]:
def get_data():
    data = fetch_20newsgroups(subset='all',
                              shuffle=True,
                              remove=('headers', 'footers', 'quotes'))
    return data
    
def prepare_datasets(corpus, labels, test_data_proportion=0.3):
    train_X, test_X, train_Y, test_Y = train_test_split(corpus, labels, 
                                                        test_size=0.33, random_state=42)
    return train_X, test_X, train_Y, test_Y

def remove_empty_docs(corpus, labels):
    filtered_corpus = []
    filtered_labels = []
    for doc, label in zip(corpus, labels):
        if doc.strip():
            filtered_corpus.append(doc)
            filtered_labels.append(label)

    return filtered_corpus, filtered_labels
    
    
dataset = get_data()

print dataset.target_names


No handlers could be found for logger "sklearn.datasets.twenty_newsgroups"


['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


In [40]:
corpus, labels = dataset.data, dataset.target
corpus, labels = remove_empty_docs(corpus, labels)

print 'Sample document:', corpus[10]
print 'Class label:',labels[10]
print 'Actual class label:', dataset.target_names[labels[10]]


Sample document: the blood of the lamb.

This will be a hard task, because most cultures used most animals
for blood sacrifices. It has to be something related to our current
post-modernism state. Hmm, what about used computers?

Cheers,
Kent
Class label: 19
Actual class label: talk.religion.misc


In [41]:
train_corpus, test_corpus, train_labels, test_labels = prepare_datasets(corpus,
                                                                        labels,
                                                                        test_data_proportion=0.3)


In [42]:
from normalization import normalize_corpus

norm_train_corpus = normalize_corpus(train_corpus)
norm_test_corpus = normalize_corpus(test_corpus)  

''.strip()


''

In [43]:
from feature_extractors import bow_extractor, tfidf_extractor
from feature_extractors import averaged_word_vectorizer
from feature_extractors import tfidf_weighted_averaged_word_vectorizer
import nltk
import gensim

# bag of words features
bow_vectorizer, bow_train_features = bow_extractor(norm_train_corpus)  
bow_test_features = bow_vectorizer.transform(norm_test_corpus) 

# tfidf features
tfidf_vectorizer, tfidf_train_features = tfidf_extractor(norm_train_corpus)  
tfidf_test_features = tfidf_vectorizer.transform(norm_test_corpus)    


In [44]:
# tokenize documents
tokenized_train = [nltk.word_tokenize(text)
                   for text in norm_train_corpus]
tokenized_test = [nltk.word_tokenize(text)
                   for text in norm_test_corpus]  
# build word2vec model                   
model = gensim.models.Word2Vec(tokenized_train,
                               size=500,
                               window=100,
                               min_count=30,
                               sample=1e-3)                  


In [45]:
# averaged word vector features
avg_wv_train_features = averaged_word_vectorizer(corpus=tokenized_train,
                                                 model=model,
                                                 num_features=500)                   
avg_wv_test_features = averaged_word_vectorizer(corpus=tokenized_test,
                                                model=model,
                                                num_features=500)                                                 


In [46]:
# tfidf weighted averaged word vector features
vocab = tfidf_vectorizer.vocabulary_
tfidf_wv_train_features = tfidf_weighted_averaged_word_vectorizer(corpus=tokenized_train, 
                                                                  tfidf_vectors=tfidf_train_features, 
                                                                  tfidf_vocabulary=vocab, 
                                                                  model=model, 
                                                                  num_features=500)
tfidf_wv_test_features = tfidf_weighted_averaged_word_vectorizer(corpus=tokenized_test, 
                                                                 tfidf_vectors=tfidf_test_features, 
                                                                 tfidf_vocabulary=vocab, 
                                                                 model=model, 
                                                                 num_features=500)



In [47]:
from sklearn import metrics
import numpy as np

def get_metrics(true_labels, predicted_labels):
    
    print 'Accuracy:', np.round(
                        metrics.accuracy_score(true_labels, 
                                               predicted_labels),
                        2)
    print 'Precision:', np.round(
                        metrics.precision_score(true_labels, 
                                               predicted_labels,
                                               average='weighted'),
                        2)
    print 'Recall:', np.round(
                        metrics.recall_score(true_labels, 
                                               predicted_labels,
                                               average='weighted'),
                        2)
    print 'F1 Score:', np.round(
                        metrics.f1_score(true_labels, 
                                               predicted_labels,
                                               average='weighted'),
                        2)
                        

def train_predict_evaluate_model(classifier, 
                                 train_features, train_labels, 
                                 test_features, test_labels):
    # build model    
    classifier.fit(train_features, train_labels)
    # predict using model
    predictions = classifier.predict(test_features) 
    # evaluate model prediction performance   
    get_metrics(true_labels=test_labels, 
                predicted_labels=predictions)
    return predictions    



In [48]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier

mnb = MultinomialNB()
svm = SGDClassifier(loss='hinge', n_iter=100)


In [49]:
# Multinomial Naive Bayes with bag of words features
mnb_bow_predictions = train_predict_evaluate_model(classifier=mnb,
                                           train_features=bow_train_features,
                                           train_labels=train_labels,
                                           test_features=bow_test_features,
                                           test_labels=test_labels)


Accuracy: 0.67
Precision: 0.72
Recall: 0.67
F1 Score: 0.65


In [50]:
# Support Vector Machine with bag of words features
svm_bow_predictions = train_predict_evaluate_model(classifier=svm,
                                           train_features=bow_train_features,
                                           train_labels=train_labels,
                                           test_features=bow_test_features,
                                           test_labels=test_labels)


Accuracy: 0.61
Precision: 0.66
Recall: 0.61
F1 Score: 0.62


In [51]:
# Multinomial Naive Bayes with tfidf features                                           
mnb_tfidf_predictions = train_predict_evaluate_model(classifier=mnb,
                                           train_features=tfidf_train_features,
                                           train_labels=train_labels,
                                           test_features=tfidf_test_features,
                                           test_labels=test_labels)


Accuracy: 0.72
Precision: 0.78
Recall: 0.72
F1 Score: 0.7


In [52]:
# Support Vector Machine with tfidf features
svm_tfidf_predictions = train_predict_evaluate_model(classifier=svm,
                                           train_features=tfidf_train_features,
                                           train_labels=train_labels,
                                           test_features=tfidf_test_features,
                                           test_labels=test_labels)


Accuracy: 0.77
Precision: 0.77
Recall: 0.77
F1 Score: 0.77


In [53]:
# Support Vector Machine with averaged word vector features
svm_avgwv_predictions = train_predict_evaluate_model(classifier=svm,
                                           train_features=avg_wv_train_features,
                                           train_labels=train_labels,
                                           test_features=avg_wv_test_features,
                                           test_labels=test_labels)


Accuracy: 0.56
Precision: 0.57
Recall: 0.56
F1 Score: 0.54


In [54]:
# Support Vector Machine with tfidf weighted averaged word vector features
svm_tfidfwv_predictions = train_predict_evaluate_model(classifier=svm,
                                           train_features=tfidf_wv_train_features,
                                           train_labels=train_labels,
                                           test_features=tfidf_wv_test_features,
                                           test_labels=test_labels)


Accuracy: 0.53
Precision: 0.56
Recall: 0.53
F1 Score: 0.52


In [55]:
import pandas as pd
cm = metrics.confusion_matrix(test_labels, svm_tfidf_predictions)
pd.DataFrame(cm, index=range(0,20), columns=range(0,20))  

class_names = dataset.target_names
print class_names[0], '->', class_names[15]
print class_names[18], '->', class_names[16]  
print class_names[19], '->', class_names[15]  



alt.atheism -> soc.religion.christian
talk.politics.misc -> talk.politics.guns
talk.religion.misc -> soc.religion.christian


In [56]:
import re

num = 0
for document, label, predicted_label in zip(test_corpus, test_labels, svm_tfidf_predictions):
    if label == 0 and predicted_label == 15:
        print 'Actual Label:', class_names[label]
        print 'Predicted Label:', class_names[predicted_label]
        print 'Document:-'
        print re.sub('\n', ' ', document)
        print
        num += 1
        if num == 4:
            break



Actual Label: alt.atheism
Predicted Label: soc.religion.christian
Document:-
I would like a list of Bible contadictions from those of you who dispite being free from Christianity are well versed in the Bible. 

Actual Label: alt.atheism
Predicted Label: soc.religion.christian
Document:-
  They spent quite a bit of time on the wording of the Constitution.  They picked words whose meanings implied the intent.  We have already looked in the dictionary to define the word.  Isn't this sufficient?   But we were discussing it in relation to the death penalty.  And, the Constitution need not define each of the words within.  Anyone who doesn't know what cruel is can look in the dictionary (and we did).

Actual Label: alt.atheism
Predicted Label: soc.religion.christian
Document:-
Our Lord and Savior David Keresh has risen!   	He has been seen alive!   	Spread the word!     --------------------------------------------------------------------------------

Actual Label: alt.atheism
Predicted Label

In [57]:
num = 0
for document, label, predicted_label in zip(test_corpus, test_labels, svm_tfidf_predictions):
    if label == 18 and predicted_label == 16:
        print 'Actual Label:', class_names[label]
        print 'Predicted Label:', class_names[predicted_label]
        print 'Document:-'
        print re.sub('\n', ' ', document)
        print
        num += 1
        if num == 4:
            break

Actual Label: talk.politics.misc
Predicted Label: talk.politics.guns
Document:-
After the initial gun battle was over, they had 50 days to come out peacefully. They had their high priced lawyer, and judging by the posts here they had some public support. Can anyone come up with a rational explanation why the didn't come out (even after they negotiated coming out after the radio sermon) that doesn't include the Davidians wanting to commit suicide/murder/general mayhem?

Actual Label: talk.politics.misc
Predicted Label: talk.politics.guns
Document:-
Yesterday, the FBI was saying that at least three of the bodies had gunshot wounds, indicating that they were shot trying to escape the fire.  Today's paper quotes the medical examiner as saying that there is no evidence of gunshot wounds in any of the recovered bodies.  At the beginning of this siege, it was reported that while Koresh had a class III (machine gun) license, today's paper quotes the government as saying, no, they didn't have a