# Working with text 

[Tutorial](http://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html#extracting-features-from-text-files)

In [11]:
from sklearn.datasets import fetch_20newsgroups


categories = ['alt.atheism', 'soc.religion.christian']

twenty_train = fetch_20newsgroups(subset='train',categories=categories, shuffle=True, random_state=42)


In [12]:
print twenty_train.target_names

print twenty_train.data[0]

['alt.atheism', 'soc.religion.christian']
From: nigel.allen@canrem.com (Nigel Allen)
Subject: library of congress to host dead sea scroll symposium april 21-22
Lines: 96


 Library of Congress to Host Dead Sea Scroll Symposium April 21-22
 To: National and Assignment desks, Daybook Editor
 Contact: John Sullivan, 202-707-9216, or Lucy Suddreth, 202-707-9191
          both of the Library of Congress

   WASHINGTON, April 19  -- A symposium on the Dead Sea 
Scrolls will be held at the Library of Congress on Wednesday,
April 21, and Thursday, April 22.  The two-day program, cosponsored
by the library and Baltimore Hebrew University, with additional
support from the Project Judaica Foundation, will be held in the
library's Mumford Room, sixth floor, Madison Building.
   Seating is limited, and admission to any session of the symposium
must be requested in writing (see Note A).
   The symposium will be held one week before the public opening of a
major exhibition, "Scrolls from the Dead Sea

In [13]:
print len(twenty_train.data)

1079


In [14]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer(max_features=20)
X_train_counts = count_vect.fit_transform(twenty_train.data)
X_train_counts.shape


print count_vect.get_feature_names()
X_train_counts[:10].toarray()

[u'and', u'are', u'as', u'be', u'but', u'edu', u'for', u'from', u'god', u'have', u'in', u'is', u'it', u'not', u'of', u'that', u'the', u'this', u'to', u'you']


array([[26,  1,  0,  8,  0,  0,  3,  9,  0,  1,  8,  2,  0,  0, 44,  1, 57,
         0,  7,  0],
       [14,  3,  4,  9,  3,  4,  9,  4,  5,  6, 11, 10, 11,  3, 10,  4, 17,
         5, 19,  7],
       [ 4,  1,  2,  1,  1,  3,  0,  1,  3,  3,  3,  3,  1,  1,  5,  4,  8,
         1,  5,  0],
       [ 4,  6,  1,  6,  3,  2,  1,  2,  0,  4,  1,  7,  7,  4, 10,  7, 17,
         5, 13, 13],
       [ 5,  1,  2,  3,  0,  3,  0,  3,  0,  0,  5,  5,  3,  2,  5,  1, 13,
         3,  5,  2],
       [10, 14,  0,  3,  1,  2,  0,  2,  0,  4, 16, 22,  4,  5, 10, 10, 16,
         9, 14, 14],
       [17,  7,  5, 13, 15,  5, 10,  2,  5, 12, 16, 33, 26, 17, 17, 26, 39,
        10, 42, 14],
       [ 2,  2,  3,  2,  1,  4,  2,  1,  1,  1,  1,  1,  9,  1,  7,  3,  4,
         3,  3,  3],
       [ 1,  2,  0,  0,  1,  2,  0,  2,  0,  0,  5,  1,  0,  1,  2,  5,  8,
         0,  5,  4],
       [ 3,  0,  0,  0,  0,  1,  0,  1,  0,  0,  1,  0,  2,  0,  1,  1,  1,
         0,  2,  1]])

# Construir dataframe desde vectorizador

Queremos armar el dataframe sin ninguna cosa rala ni nada por el estilo

Esta vez usamos el vectorizador pero sacando stop words (para que no queden 'and' 'is' y otros yuyos como primeras palabras)

In [15]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer(max_features=40, stop_words='english')

X_train_counts = count_vect.fit_transform(twenty_train.data)
X_train_counts.shape


df = pd.DataFrame(X_train_counts.toarray(), columns=count_vect.get_feature_names())

print df[:10]

   article  atheists  believe  bible  christ  christian  christians  church  \
0        0         0        0      2       0          0           0       0   
1        0         0        2      4       0          2           0       1   
2        1         0        0      0       0          0           0       0   
3        0         1        0      0       0          0           0       0   
4        1         1        0      0       0          1           1       0   
5        1         0        0      2       0          0           2       3   
6        5         0        1      0       0          0           0       0   
7        1         0        0      0       0          0           0       0   
8        1         0        0      0       0          0           0       0   
9        0         0        0      0       0          0           0       0   

   com  did   ...    subject  things  think  time  true  truth  university  \
0    1    0   ...          1       0      0     1   

# TD-IDF

Occurrence count is a good start but there is an issue: longer documents will have higher average count values than shorter documents, even though they might talk about the same topics.
To avoid these potential discrepancies it suffices to divide the number of occurrences of each word in a document by the total number of words in the document: these new features are called tf for Term Frequencies.
Another refinement on top of tf is to downscale weights for words that occur in many documents in the corpus and are therefore less informative than those that occur only in a smaller portion of the corpus.


In [22]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

transformer = TfidfVectorizer(max_features=100, stop_words='english').fit(twenty_train.data)

X_train_tf = transformer.transform(twenty_train.data)
print X_train_tf.shape


df = pd.DataFrame(X_train_tf.toarray(), columns=transformer.get_feature_names())

print df.iloc[:10]

(1079, 100)
       1993       apr  argument   article  atheism  atheist  atheists  belief  \
0  0.000000  0.000000  0.000000  0.000000      0.0      0.0  0.000000     0.0   
1  0.000000  0.000000  0.000000  0.000000      0.0      0.0  0.000000     0.0   
2  0.165481  0.000000  0.000000  0.116009      0.0      0.0  0.000000     0.0   
3  0.000000  0.000000  0.000000  0.000000      0.0      0.0  0.087735     0.0   
4  0.163866  0.169518  0.649793  0.114876      0.0      0.0  0.189245     0.0   
5  0.075125  0.077716  0.000000  0.052665      0.0      0.0  0.000000     0.0   
6  0.000000  0.000000  0.000000  0.164614      0.0      0.0  0.000000     0.0   
7  0.227492  0.235339  0.000000  0.159480      0.0      0.0  0.000000     0.0   
8  0.000000  0.000000  0.000000  0.072758      0.0      0.0  0.000000     0.0   
9  0.000000  0.000000  0.000000  0.000000      0.0      0.0  0.000000     0.0   

    believe     bible  ...    university       use        ve      want  \
0  0.000000  0.185854 

# Entrenando clasificador

Empezamos Naive Bayes

In [17]:
from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB().fit(X_train_tf, twenty_train.target)

In [18]:
docs_new = ['God is love', 'OpenGL on the GPU is fast']

X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tf_transformer.transform(X_new_counts)

NameError: name 'tf_transformer' is not defined

In [None]:
X_new_tfidf

In [None]:
clf.predict(X_new_tfidf)

# Pipeline


In [None]:
from sklearn.pipeline import Pipeline
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

text_clf.fit(twenty_train.data, twenty_train.target)

In [None]:
text_clf.predict(docs_new)