In [172]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import sklearn.feature_extraction.text as ext
categories = ['talk.politics.misc', 'soc.religion.christian',
               'talk.politics.mideast', 'sci.med', 'sci.space',
              'sci.electronics', 'talk.politics.misc']
from sklearn.datasets import fetch_20newsgroups
from sklearn.naive_bayes import MultinomialNB
%matplotlib

Using matplotlib backend: MacOSX


In [175]:
#bag of words will use numbers to represent words and
#aspects about them like location.
#By simplying the text information it is easier
#to analyze and extract information about it.
#below is gathering the dataset and creating a bag of words
twenty_train = fetch_20newsgroups(subset='train',
                                  categories=categories, 
                                  shuffle=True, 
                                  random_state=42)
#twenty_train is a bunch and bag of words process occurs
#by giving a value to every word in the training set.
#Then it will count every occurence of the words.
count_vect = ext.CountVectorizer()
X_train_count = count_vect.fit_transform(
    twenty_train.data)
#The result will be placed in X_train_count
print X_train_counts.shape #shape shows the entries

(3406, 48556)


In [148]:
twenty_train.target_names

['sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.politics.misc']

In [192]:
#n-grams with the n referring to the size. The items can
#be letters, syllables, words and pairs. The purpose of an
#n-gram is to predict the next sequence in a series
from sklearn.datasets import fetch_20newsgroups
import sklearn.feature_extraction.text as ext
categories=["sci.med"]
med_train = fetch_20newsgroups(subset='train',
                                  categories=categories,
                                  remove=('headers','footers','quotes'),
                                  shuffle=True, 
                                  random_state=42)
count_chars= ext.CountVectorizer(analyzer='char_wb',
        ngram_range=(3,3),max_features=10).fit(med_train['data'])
count_words= ext.CountVectorizer(analyzer='word',
        ngram_range=(2,2),max_features=10,
                                stop_words='english').fit(med_train['data'])
X = count_chars.transform(med_train.data)

In [193]:
print count_words.get_feature_names()

[u'banks n3jxp', u'cadre dsl', u'chastity intellect', u'dsl pitt', u'geb cadre', u'gordon banks', u'intellect geb', u'n3jxp skepticism', u'pitt edu', u'skepticism chastity']


In [194]:
print X[1].todense()

[[2 2 1 9 4 0 3 3 1 5]]


In [195]:
print count_words.get_feature_names()

[u'banks n3jxp', u'cadre dsl', u'chastity intellect', u'dsl pitt', u'geb cadre', u'gordon banks', u'intellect geb', u'n3jxp skepticism', u'pitt edu', u'skepticism chastity']


In [76]:
len(twenty_train.data)

3406

In [77]:
print("\n".join(twenty_train.data[0].split("\n")[:3]))

From: lundby@rtsg.mot.com (Walter F. Lundby)
Subject: Re: Is MSG sensitivity superstition?
Nntp-Posting-Host: accord2


In [78]:
print(twenty_train.target_names[twenty_train.target[0]])

sci.med


In [79]:
twenty_train.target[:10]

array([1, 0, 2, 5, 3, 5, 0, 1, 3, 0])

In [80]:
#names of categories
for t in twenty_train.target[:10]:
     print(twenty_train.target_names[t])

sci.med
sci.electronics
sci.space
talk.politics.misc
soc.religion.christian
talk.politics.misc
sci.electronics
sci.med
soc.religion.christian
sci.electronics


In [95]:
#another example of bag of words
from sklearn.feature_extraction.text import CountVectorizer
count_vect= CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)
X_train_counts.shape

(3406, 48556)

In [96]:
count_vect.vocabulary_.get(u'algorithm')

6892

In [97]:
#tf-idf
from sklearn.feature_extraction.text import TfidfTransformer
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)
X_train_tf.shape

(3406, 48556)

In [98]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(3406, 48556)

In [99]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_tfidf, twenty_train.target)

In [100]:
#training a classifier
docs_new = ['war', 'moon', 'diabetes','stable','terrible','life','death']
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

predicted = clf.predict(X_new_tfidf)

for doc, category in zip(docs_new, predicted):
    print('%r => %s' % (doc, twenty_train.target_names[category]))

'war' => talk.politics.mideast
'moon' => sci.space
'diabetes' => sci.med
'stable' => sci.space
'terrible' => soc.religion.christian
'life' => soc.religion.christian
'death' => soc.religion.christian


In [101]:
#pipeline
from sklearn.pipeline import Pipeline
text_clf = Pipeline([('vect', CountVectorizer()),
                  ('tfidf', TfidfTransformer()),
                  ('clf', MultinomialNB()),
 ])

In [102]:
text_clf.fit(twenty_train.data, twenty_train.target)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        st...False,
         use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [103]:
#evaluating accuracy
twenty_test = fetch_20newsgroups(subset='test',
     categories=categories, shuffle=True, random_state=42)
docs_test = twenty_test.data
predicted = text_clf.predict(docs_test)
np.mean(predicted == twenty_test.target) 

0.84252315835906488

In [104]:
from sklearn.linear_model import SGDClassifier
text_clf = Pipeline([('vect', CountVectorizer()),
                      ('tfidf', TfidfTransformer()),
                      ('clf', SGDClassifier(loss='hinge', penalty='l2',
                                            alpha=1e-3, random_state=42,
                                            max_iter=5, tol=None)),
])
text_clf.fit(twenty_train.data, twenty_train.target)  
predicted = text_clf.predict(docs_test)
np.mean(predicted == twenty_test.target)  

0.92148213498014997

In [159]:
from sklearn import metrics
report =metrics.classification_report(twenty_test.target, predicted,
     target_names=twenty_test.target_names)

In [106]:
from sklearn.model_selection import GridSearchCV
parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
               'tfidf__use_idf': (True, False),
 }

In [107]:
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)

In [108]:
gs_clf = gs_clf.fit(twenty_train.data[:400], twenty_train.target[:400])

In [109]:
twenty_train.target_names[gs_clf.predict(['God is love'])[0]]
'soc.religion.christian'

'soc.religion.christian'

In [110]:
gs_clf.best_score_

0.88

In [111]:
for param_name in sorted(parameters.keys()):
...     print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))
...


tfidf__use_idf: True
vect__ngram_range: (1, 1)


In [112]:
newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
newsgroups_test = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'))

In [113]:
from sklearn.metrics import confusion_matrix

In [119]:
categories[:10]

['talk.politics.misc',
 'soc.religion.christian',
 'talk.politics.mideast',
 'sci.med',
 'sci.space',
 'sci.electronics',
 'talk.politics.misc']

In [121]:
print newsgroups_train.data[5]






Of course.  The term must be rigidly defined in any bill.


I doubt she uses this term for that.  You are using a quote allegedly
from her, can you back it up?




I read the article as presenting first an argument about weapons of mass
destruction (as commonly understood) and then switching to other topics.
The first point evidently was to show that not all weapons should be
allowed, and then the later analysis was, given this understanding, to
consider another class.






In [122]:
#http://epopt.readthedocs.io/en/latest/notebooks/newsgroups.html
from sklearn.feature_extraction import text
vectorizer = text.TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(newsgroups_train.data)
y = newsgroups_train.target
Xtest = vectorizer.transform(newsgroups_test.data)
ytest = newsgroups_test.target

In [123]:
print X.shape

(11314, 5000)


In [134]:
from sklearn.manifold import TSNE

# an example of a tsne model
tsne_model = TSNE(n_components=2, verbose=1, random_state=0, angle=.99, init='pca')

tsne_lda = tsne_model.fit_transform(X_topics)

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 18846 samples in 0.039s...
[t-SNE] Computed neighbors for 18846 samples in 18.760s...
[t-SNE] Computed conditional probabilities for sample 1000 / 18846
[t-SNE] Computed conditional probabilities for sample 2000 / 18846
[t-SNE] Computed conditional probabilities for sample 3000 / 18846
[t-SNE] Computed conditional probabilities for sample 4000 / 18846
[t-SNE] Computed conditional probabilities for sample 5000 / 18846
[t-SNE] Computed conditional probabilities for sample 6000 / 18846
[t-SNE] Computed conditional probabilities for sample 7000 / 18846
[t-SNE] Computed conditional probabilities for sample 8000 / 18846
[t-SNE] Computed conditional probabilities for sample 9000 / 18846
[t-SNE] Computed conditional probabilities for sample 10000 / 18846
[t-SNE] Computed conditional probabilities for sample 11000 / 18846
[t-SNE] Computed conditional probabilities for sample 12000 / 18846
[t-SNE] Computed conditional probabilities for sa

In [141]:
from wordcloud import WordCloud, STOPWORDS

In [154]:
#an example of a classification visualization
def plot_classification_report(cr, title='Classification report ', with_avg_total=False, cmap=plt.cm.Blues):

    lines = cr.split('\n')

    classes = []
    plotMat = []
    for line in lines[2 : (len(lines) - 3)]:
        #print(line)
        t = line.split()
        # print(t)
        classes.append(t[0])
        v = [float(x) for x in t[1: len(t) - 1]]
        print(v)
        plotMat.append(v)

    if with_avg_total:
        aveTotal = lines[len(lines) - 1].split()
        classes.append('avg/total')
        vAveTotal = [float(x) for x in t[1:len(aveTotal) - 1]]
        plotMat.append(vAveTotal)


    plt.imshow(plotMat, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    x_tick_marks = np.arange(3)
    y_tick_marks = np.arange(len(classes))
    plt.xticks(x_tick_marks, ['precision', 'recall', 'f1-score'], rotation=45)
    plt.yticks(y_tick_marks, classes)
    plt.tight_layout()
    plt.ylabel('Classes')
    plt.xlabel('Measures')

In [163]:
plot_classification_report(report, with_avg_total=True)


[0.87, 0.97, 0.92]
[0.94, 0.86, 0.9]
[0.94, 0.94, 0.94]
[0.93, 0.97, 0.95]
[0.95, 0.92, 0.94]
[0.91, 0.84, 0.88]
