# Datasets

### Newsgroups

In [3]:
from sklearn.datasets import fetch_20newsgroups
# Fetch the 20 newsgroups posts, shuffled with a fixed seed so the sample is
# the same on every run; headers, footers and quoted replies are removed.
dataset = fetch_20newsgroups(
    remove=('headers', 'footers', 'quotes'),
    shuffle=True,
    random_state=2)

# The rest of the notebook works on the first 1000 posts only.
text_samples = dataset.data[:1000]

# A single post used by the tokenizing example below.
text = text_samples[0]
text

u"Something about how Koresh had threatened to cause local \nproblems with all these wepaons he had and was alleged to\nhave.  \n\nSomeone else will post more details soon, I'm sure.\n\nOther News:\nSniper injures 9 outside MCA buildling in L.A.  Man arrested--suspect\nwas disgruntled employee of Universal Studios, which\nis a division of M.C.A.\n\n\nQUESTION:\nWhat will Californians do with all those guns after the Reginald\ndenny trial?"

# Tokenizing

In [6]:
import nltk
# Split the sample post into sentences, then split every sentence into word
# tokens, collecting all tokens in one flat list.
tokens = []
for sent in nltk.sent_tokenize(text):
    tokens.extend(nltk.word_tokenize(sent))

# Stemming

In [3]:
import nltk
from nltk.stem.snowball import SnowballStemmer
# Snowball stemmer for English, applied to every token of the sample post.
stemmer = SnowballStemmer("english")
stems = list(map(stemmer.stem, tokens))

In [5]:
import nltk
from nltk.stem.porter import PorterStemmer
# Porter stemmer, applied to every token of the sample post.
# Rebinds `stemmer` and `stems` from the previous stemming cell.
stemmer = PorterStemmer()
stems = list(map(stemmer.stem, tokens))

In [6]:
import nltk
from nltk.stem.lancaster import LancasterStemmer
# Lancaster stemmer, applied to every token of the sample post.
# Rebinds `stemmer` and `stems` from the previous stemming cells.
stemmer = LancasterStemmer()
stems = list(map(stemmer.stem, tokens))

# Stop Words

In [1]:
from nltk.corpus import stopwords
# English stop words from NLTK's stopwords corpus; the word2vec cell further
# down filters tokens against this list.
stoplist = stopwords.words('english')

## Lemmatization

In [2]:
from nltk.stem import WordNetLemmatizer 
# Lemmatize with and without a part-of-speech hint.
# print() is called with a single argument, so the cell runs unchanged on
# both Python 2 and Python 3 (the bare `print x` statement is Python 2 only).
lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize('go', pos='v'))
print(lemmatizer.lemmatize('went'))  # no POS tag given
print(lemmatizer.lemmatize('went', pos='v'))

# The lemmatize() function takes an optional second argument, the POS tag. Without it the lemmatizer treats the word as a noun, so inflected verbs are often returned unchanged (as with 'went' above). Compare:

# lemmatizer.lemmatize('am') 
# lemmatizer.lemmatize('am', pos='v') 
# lemmatizer.lemmatize('is') 
# lemmatizer.lemmatize('is', pos='v') 
# lemmatizer.lemmatize('are')  
# lemmatizer.lemmatize('are', pos='v') 

# The available POS tags for the WordNet-based lemmatizer are grouped into macro-categories: adjectives (a), nouns (n), verbs (v), and adverbs (r).

go
went
go


## Part of Speech Tagging

In [3]:
import nltk
# Tokenize a short example sentence and tag every token with its part of speech.
# NOTE: this rebinds `tokens`, which the tokenizing cell defined and the
# stemming cells read; re-running those cells after this one stems this sentence instead.
tokens = nltk.word_tokenize("This sentence is short, nice and to the point") 
nltk.pos_tag(tokens)

[('This', 'DT'),
 ('sentence', 'NN'),
 ('is', 'VBZ'),
 ('short', 'JJ'),
 (',', ','),
 ('nice', 'JJ'),
 ('and', 'CC'),
 ('to', 'TO'),
 ('the', 'DT'),
 ('point', 'NN')]

# Vectorizing
### CountVectorizer

In [15]:
import pandas
from sklearn.feature_extraction.text import CountVectorizer

# Vocabulary pruning options:
#   max_df: when building the vocabulary, ignore terms whose document
#           frequency is strictly higher than the given threshold
#   min_df: ignore terms whose document frequency is lower than the given threshold
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=1000,
                                stop_words='english')
tf_matrix = tf_vectorizer.fit_transform(text_samples)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when the installed version provides it, otherwise fall back to the old name.
if hasattr(tf_vectorizer, 'get_feature_names_out'):
    tf_vocab = list(tf_vectorizer.get_feature_names_out())
else:
    tf_vocab = tf_vectorizer.get_feature_names()

# Preview a small window of the document-term matrix:
# 7 documents starting at row j, 10 vocabulary terms starting at column i.
i = 130
j = 10
tf_sample_words = tf_vocab[i:i+10]  # reuse tf_vocab instead of recomputing it
pandas.DataFrame(tf_matrix[j:j+7,i:i+10].todense(), columns=tf_sample_words)

Unnamed: 0,areas,aren,argic,argument,armenian,armenians,arms,army,article,articles
0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0
5,0,0,1,0,5,4,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0


### TfidfVectorizer

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF features, pruned with max_df=0.8 and min_df=0.1 on document frequency.
# Other options worth trying: use_idf=True, tokenizer=nltk.word_tokenize, ngram_range=(1,3)
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=20000,
                                 min_df=0.1)
tfidf_matrix = tfidf_vectorizer.fit_transform(text_samples)  # fit the vectorizer to the newsgroup posts

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when the installed version provides it, otherwise fall back to the old name.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    tfidf_vocab = list(tfidf_vectorizer.get_feature_names_out())
else:
    tfidf_vocab = tfidf_vectorizer.get_feature_names()
tfidf_matrix

<1000x108 sparse matrix of type '<type 'numpy.float64'>'
	with 25613 stored elements in Compressed Sparse Row format>

### gensim word2vec

In [102]:
import gensim

# Word2Vec is given a list of tokenized documents (each a list of words):
# lower-case every post, split on whitespace and drop English stop words.
# The stop words are put in a set first: membership tests on a set do not
# scan the whole list for every word, and the filtering result is identical.
stopword_set = set(stoplist)
texts = [[word for word in document.lower().split() if word not in stopword_set]
         for document in text_samples]

# NOTE(review): gensim >= 4.0 renamed `size` to `vector_size` -- adjust the
# keyword if the installed gensim rejects it.
model = gensim.models.Word2Vec(texts, size=100, window=5, min_count=1, workers=4,sg=1)

In [133]:
# Some useful attributes.
# print() is called with a single argument, so the cell runs unchanged on
# both Python 2 and Python 3 (the bare `print x` statement is Python 2 only).
# NOTE(review): these attributes are read directly off the model; newer
# gensim releases expose them through `model.wv` -- confirm against the
# installed version.
print(model.vocab[u'koresh'])
print(model.vocab[u'cult'])
print(model.vocab[u'waco'])
print(model.vocab[u'motherboard'])
print(model.most_similar('usenet', topn=4))
print(model.similarity('woman', 'man'))
print(model.n_similarity(['woman', 'girl'], ['man', 'boy']))
print(model.doesnt_match("waco koresh cult motherboard".split()))  # this doesn't work very well

Vocab(count:11, index:1408, sample_int:4294967296)
Vocab(count:6, index:2670, sample_int:4294967296)
Vocab(count:5, index:3018, sample_int:4294967296)
Vocab(count:9, index:1696, sample_int:4294967296)
[(u'engine', 0.9990893006324768), (u'explained', 0.9990804195404053), (u'property', 0.9990782141685486), (u'swap', 0.9990752935409546)]
0.997880684459
0.998612564475
koresh


# Topic Modeling and Dimensionality Reduction
## LDA: Latent Dirichlet Allocation


### sklearn

In [None]:
from sklearn.decomposition import LatentDirichletAllocation
# `n_topics` was not defined anywhere above this cell, so a fresh
# "Run All" stopped here with a NameError. 20 matches the number of topics
# used by the gensim example below.
n_topics = 20

# The number of topics is passed as `n_components`: scikit-learn renamed the
# `n_topics` keyword to `n_components` and later removed the old name.
lda = LatentDirichletAllocation(n_components=n_topics, max_iter=5,
                                learning_method='online', learning_offset=50.,
                                random_state=0)
# Fit on the raw term counts produced by the CountVectorizer cell.
lda.fit(tf_matrix)

### gensim

In [17]:
from gensim.models.ldamodel import LdaModel

def lda_model(docs, num_topics=20):
    """Build a gensim LDA model from ``docs``.

    docs: corpus handed straight to ``LdaModel`` as its first argument.
    num_topics: number of topics to extract; defaults to 20, the value that
        was previously hard-coded, so existing calls behave the same.

    Returns the ``LdaModel`` instance.
    """
    return LdaModel(docs, num_topics=num_topics)

def lda_vector(lda_model, doc):
    """Generate features for a new document by indexing the model with it.

    lda_model: object supporting ``lda_model[doc]``.
    doc: the document to look up.

    Returns whatever ``lda_model[doc]`` evaluates to.
    """
    features = lda_model[doc]
    return features

from gensim.utils import mock_data
# Build a corpus with gensim's mock_data() helper (presumably synthetic data,
# unrelated to the newsgroup posts -- confirm) and train the LDA model on it.
gensim_corpus = mock_data()
# NOTE: this rebinds `lda`, which the sklearn cell above bound to a
# LatentDirichletAllocation estimator.
lda = lda_model(gensim_corpus)
lda

<gensim.models.ldamodel.LdaModel at 0x12404ac90>

## Latent Semantic Analysis

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

def latent_semantic_analysis(docs, n_components=100, random_state=None):
    """Vectorize ``docs`` with TF-IDF and reduce the result with truncated SVD.

    docs: the documents passed to ``TfidfVectorizer``.
    n_components: number of SVD components to keep; defaults to 100, the
        value that was previously hard-coded.
    random_state: forwarded to ``TruncatedSVD``; pass an int for repeatable
        results. The default ``None`` keeps the previous unseeded behavior.

    Returns the SVD-transformed document vectors.
    """
    tfidf = TfidfVectorizer()  # using default parameters
    # Learn the vocabulary and vectorize in one pass instead of calling
    # fit() and transform() separately, which analyzes every document twice.
    vecs = tfidf.fit_transform(docs)
    svd = TruncatedSVD(n_components=n_components, random_state=random_state)
    svd.fit(vecs)  # compute the SVD components
    return svd.transform(vecs)  # project the documents onto the components

from sklearn.datasets import fetch_20newsgroups
# Load the training subset of 20 newsgroups. Unlike the Newsgroups cell at the
# top of the notebook, no `remove=` argument is passed here.
newsgroups_train = fetch_20newsgroups(subset='train')
# Run LSA over every training document; as the last expression of the cell,
# the returned vectors are displayed.
latent_semantic_analysis(newsgroups_train.data)

## PCA

In [72]:
from sklearn.decomposition import PCA
# Project the TF-IDF features onto 20 principal components.
# The sparse matrix is converted with .toarray() before fitting -- presumably
# because PCA here does not accept sparse input (confirm for the installed version).
pca = PCA(n_components=20)
X = pca.fit_transform(tfidf_matrix.toarray()) 

## SVD

In [76]:
from sklearn.decomposition import TruncatedSVD
# Reduce the TF-IDF matrix to 20 components with truncated SVD, seeded for
# repeatable results; the sparse matrix is passed directly.
# NOTE: this rebinds `X`, which the PCA cell above also assigns.
svd = TruncatedSVD(n_components=20, random_state=42)
X = svd.fit_transform(tfidf_matrix)

# Machine Learning Algorithms