# Working with text 

[Tutorial](http://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html#extracting-features-from-text-files)

In [4]:
from sklearn.datasets import fetch_20newsgroups

# Keep only two newsgroups so the classification task below is binary.
categories = ['alt.atheism', 'soc.religion.christian']

# Training split only; shuffling with a fixed seed keeps the document order
# (and therefore every downstream result) reproducible across runs.
twenty_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=42,
)


No handlers could be found for logger "sklearn.datasets.twenty_newsgroups"


In [5]:
# Display the label names carried by the loaded dataset; they should match
# the `categories` list that was requested when fetching it.
twenty_train.target_names

['alt.atheism', 'soc.religion.christian']

In [6]:
# Number of training documents in the two selected categories.
# The parenthesised call form runs under both Python 2 and Python 3,
# whereas the bare `print x` statement is a SyntaxError on Python 3.
print(len(twenty_train.data))

1079


In [13]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words step: fit_transform learns the vocabulary from the training
# documents and returns their sparse document-term count matrix in one call.
# The fitted `count_vect` is kept so new documents can be encoded with the
# same vocabulary later on.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)
# Shape is (number of documents, vocabulary size); the first dimension
# should equal len(twenty_train.data).
X_train_counts.shape

(1079, 19666)

# TF-IDF

Occurrence count is a good start but there is an issue: longer documents will have higher average count values than shorter documents, even though they might talk about the same topics.
To avoid these potential discrepancies it suffices to divide the number of occurrences of each word in a document by the total number of words in the document: these new features are called tf for Term Frequencies.
Another refinement on top of tf is to downscale weights for words that occur in many documents in the corpus and are therefore less informative than those that occur only in a smaller portion of the corpus. This downscaling is called tf-idf, for "Term Frequency times Inverse Document Frequency".


In [14]:
from sklearn.feature_extraction.text import TfidfTransformer

# use_idf=False switches off the inverse-document-frequency reweighting, so
# this transformer produces normalised term frequencies (tf) only.
tf_transformer = TfidfTransformer(use_idf=False)
tf_transformer.fit(X_train_counts)

# Same shape as the count matrix: only the cell values are rescaled.
X_train_tf = tf_transformer.transform(X_train_counts)
X_train_tf.shape


(1079, 19666)

# Entrenando clasificador

Empezamos con un clasificador Naive Bayes multinomial.

In [15]:
from sklearn.naive_bayes import MultinomialNB

# Train a multinomial Naive Bayes classifier on the term-frequency features.
# twenty_train.target supplies the integer class label of each document.
# Note: the features are tf only (tf_transformer was built with
# use_idf=False), not full tf-idf.
clf = MultinomialNB().fit(X_train_tf, twenty_train.target)

In [18]:
# Two unseen example sentences to classify.
docs_new = ['God is love', 'OpenGL on the GPU is fast']

# Use transform (not fit_transform) so the new documents are encoded with the
# vocabulary and weighting already fitted on the training set.
X_new_counts = count_vect.transform(docs_new)
# NOTE(review): despite the `_tfidf` suffix this holds tf-only features,
# because tf_transformer was created with use_idf=False.
X_new_tfidf = tf_transformer.transform(X_new_counts)

In [19]:
# Inspect the encoded new documents: one row per entry of docs_new, with the
# same number of columns as the training matrix.
X_new_tfidf

<2x19666 sparse matrix of type '<type 'numpy.float64'>'
	with 7 stored elements in Compressed Sparse Row format>

In [21]:
# Predict an integer class label for each new document (same label encoding
# as twenty_train.target, which the model was fit on; presumably an index
# into twenty_train.target_names).
clf.predict(X_new_tfidf)

array([1, 1])

# Pipeline


In [27]:
from sklearn.pipeline import Pipeline

# Chain vectoriser -> tf-idf weighting -> classifier into a single estimator
# that accepts raw text. Unlike the manual steps above, TfidfTransformer() is
# left at its defaults here, so idf reweighting is enabled.
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

# Fit every stage on the raw training documents and their labels; as the last
# expression of the cell, the fitted pipeline's repr is displayed.
text_clf.fit(twenty_train.data, twenty_train.target)

Pipeline(steps=[('vect', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        st...False,
         use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [28]:
# The pipeline takes raw strings directly: vectorising and weighting happen
# internally before the classifier predicts a label for each document.
text_clf.predict(docs_new)

array([1, 1])