In [1]:
import pandas as pd

In [2]:
import spacy
from spacy import displacy
from spacy.lang.en.stop_words import STOP_WORDS

In [4]:
sample = u"I can't imagine spending $3000 for a single bedroom apartment in N.Y.C."
nlp = spacy.load('en')
doc = nlp(sample)

In [5]:
# Print out tokens
for token in doc:
    print(token)

I
ca
n't
imagine
spending
$
3000
for
a
single
bedroom
apartment
in
N.Y.C.


In [6]:
# Store tokens as list, print out
tokens = [token for token in doc]
print(tokens)

[I, ca, n't, imagine, spending, $, 3000, for, a, single, bedroom, apartment, in, N.Y.C.]


In [7]:
for word in doc:
    if word.is_stop == True:
        print(word)

I
ca
n't
for
a
in


In [8]:
for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
          token.shape_, token.is_alpha, token.is_stop)

I -PRON- PRON PRP nsubj X True True
ca can AUX MD aux xx True True
n't not ADV RB neg x'x False True
imagine imagine VERB VB ROOT xxxx True False
spending spend VERB VBG xcomp xxxx True False
$ $ SYM $ nmod $ False False
3000 3000 NUM CD dobj dddd False False
for for ADP IN prep xxx True True
a a DET DT det x True True
single single ADJ JJ amod xxxx True False
bedroom bedroom NOUN NN compound xxxx True False
apartment apartment NOUN NN pobj xxxx True False
in in ADP IN prep xx True True
N.Y.C. N.Y.C. PROPN NNP pobj X.X.X. False False


In [9]:
displacy.render(doc, style='dep', jupyter=True, options={'distance': 70})

In [10]:
# Print out named entities
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

3000 26 30 MONEY
N.Y.C. 65 71 GPE


In [11]:
displacy.render(doc, style='ent', jupyter=True)

# TruncatedSVD

In [12]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option("display.max_colwidth", 200)

In [13]:
from sklearn.datasets import fetch_20newsgroups

dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'))
documents = dataset.data
len(documents)

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


11314

In [14]:
dataset.target_names 

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [15]:
news_df = pd.DataFrame({'document':documents})

# removing everything except alphabets`
news_df['clean_doc'] = news_df['document'].str.replace("[^a-zA-Z#]", " ")

# removing short words
news_df['clean_doc'] = news_df['clean_doc'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))

# make all text lowercase
news_df['clean_doc'] = news_df['clean_doc'].apply(lambda x: x.lower())

In [16]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

# tokenization
tokenized_doc = news_df['clean_doc'].apply(lambda x: x.split())

# remove stop-words
tokenized_doc = tokenized_doc.apply(lambda x: [item for item in x if item not in stop_words])

# de-tokenization
detokenized_doc = []
for i in range(len(news_df)):
    t = ' '.join(tokenized_doc[i])
    detokenized_doc.append(t)

news_df['clean_doc'] = detokenized_doc

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words='english', 
max_features= 1000, # keep top 1000 terms 
max_df = 0.5, 
smooth_idf=True)

X = vectorizer.fit_transform(news_df['clean_doc'])

X.shape # check shape of the document-term matrix

(11314, 1000)

In [18]:
from sklearn.decomposition import TruncatedSVD

# SVD represent documents and terms in vectors 
svd_model = TruncatedSVD(n_components=20, algorithm='randomized', n_iter=100, random_state=122)

svd_model.fit(X)

len(svd_model.components_)

20

In [20]:
terms = vectorizer.get_feature_names()

for i, comp in enumerate(svd_model.components_):
    terms_comp = zip(terms, comp)
    sorted_terms = sorted(terms_comp, key= lambda x:x[1], reverse=True)[:7]
    print("Topic "+str(i)+": ")
    for t in sorted_terms:
        print(t[0])
        #print(" ")

Topic 0: 
like
know
people
think
good
time
thanks
Topic 1: 
thanks
windows
card
drive
mail
file
advance
Topic 2: 
game
team
year
games
season
players
good
Topic 3: 
drive
scsi
disk
hard
card
drives
problem
Topic 4: 
windows
file
window
files
program
using
problem
Topic 5: 
government
chip
mail
space
information
encryption
data
Topic 6: 
like
bike
know
chip
sounds
looks
look
Topic 7: 
card
sale
video
offer
monitor
price
jesus
Topic 8: 
know
card
chip
video
government
people
clipper
Topic 9: 
good
know
time
bike
jesus
problem
work
Topic 10: 
think
chip
good
thanks
clipper
need
encryption
Topic 11: 
thanks
right
problem
good
bike
time
window
Topic 12: 
good
people
windows
know
file
sale
files
Topic 13: 
space
think
know
nasa
problem
year
israel
Topic 14: 
space
good
card
people
time
nasa
thanks
Topic 15: 
people
problem
window
time
game
want
bike
Topic 16: 
time
bike
right
windows
file
need
really
Topic 17: 
time
problem
file
think
israel
long
mail
Topic 18: 
file
need
card
files
problem


In [29]:
import umap

X_topics = svd_model.fit_transform(X)
embedding = umap.get(getn_neighbors=150, min_dist=0.5, random_state=12).fit_transform(X_topics)

plt.figure(figsize=(7,5))
plt.scatter(embedding[:, 0], embedding[:, 1], 
c = dataset.target,
s = 10, # size
edgecolor='none'
)
plt.show()

AttributeError: module 'umap' has no attribute 'get'

In [28]:
import umap
umap.get()

AttributeError: module 'umap' has no attribute 'get'

# LsiModel


# HDP Model

In [44]:
from gensim.test.utils import common_corpus, common_dictionary
from gensim.models import HdpModel
hdp = HdpModel(common_corpus, common_dictionary)

In [45]:
unseen_document = [(1, 3.), (2, 4)]
doc_hdp = hdp[unseen_document]

In [46]:
topic_info = hdp.print_topics(num_topics=20, num_words=10)

In [47]:
hdp.update([[(1, 2)], [(1, 1), (4, 5)]])

In [48]:
hdp.lda_alpha

array([2.13309276e-01, 1.75163850e-01, 1.29753306e-01, 1.01323391e-01,
       7.96114031e-02, 6.36590246e-02, 5.00145521e-02, 3.96599836e-02,
       3.11042809e-02, 2.47680162e-02, 1.95200267e-02, 1.55039136e-02,
       1.22568569e-02, 9.62353071e-03, 7.50439563e-03, 5.92189079e-03,
       4.69037813e-03, 3.65328467e-03, 2.87947784e-03, 2.24828375e-03,
       1.75038524e-03, 1.36600541e-03, 1.05927248e-03, 8.28600490e-04,
       6.43405058e-04, 5.00912390e-04, 3.86693499e-04, 2.98307820e-04,
       2.30323295e-04, 1.78047248e-04, 1.36625218e-04, 1.06141911e-04,
       8.16317073e-05, 6.26820299e-05, 4.81158212e-05, 3.70072848e-05,
       2.82193312e-05, 2.14363923e-05, 1.64072104e-05, 1.23035108e-05,
       9.37297711e-06, 7.16142582e-06, 5.43998126e-06, 4.07847505e-06,
       3.09440953e-06, 2.33063068e-06, 1.73493642e-06, 1.31383383e-06,
       9.88967768e-07, 7.33526647e-07, 5.46014206e-07, 4.04712642e-07,
       3.04237218e-07, 2.24979579e-07, 1.66688861e-07, 1.22422543e-07,
      

# pLSA

In [57]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from enstop import PLSA

news = fetch_20newsgroups(subset='all')
data = CountVectorizer().fit_transform(news.data)

model = PLSA(n_components=20).fit(data)
topics = model.components_
doc_vectors = model.embedding_

ModuleNotFoundError: No module named 'enstop'

In [55]:
fit(data)

NameError: name 'fit' is not defined