In [1]:
%matplotlib inline
import pandas as pd

# Load the training set; the preview printed below shows the columns
# Id, Popularity and 'Page content' (the latter holds raw HTML).
df = pd.read_csv('./data/train.csv')
print(df.head(5))

   Id  Popularity                                       Page content
0   0          -1  <html><head><div class="article-info"> <span c...
1   1           1  <html><head><div class="article-info"><span cl...
2   2           1  <html><head><div class="article-info"><span cl...
3   3          -1  <html><head><div class="article-info"><span cl...
4   4          -1  <html><head><div class="article-info"><span cl...


In [2]:
import re
from bs4 import BeautifulSoup

def preprocessor(text):
    """Strip HTML markup and normalise text while keeping emoticons.

    Parameters
    ----------
    text : str
        Raw document, possibly containing HTML markup.

    Returns
    -------
    str
        Lower-cased text in which every run of non-word characters is
        collapsed to a single space, followed by one space and the emoticons
        found in the document (noses removed, separated by spaces). When no
        emoticon is found the result still ends with that separating space.
    """
    # remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # regex for matching emoticons, keep emoticons, ex: :), :-P, :-D
    # Raw string: '\)' and '\(' are invalid escapes in an ordinary literal.
    emoticon_pattern = r'(?::|;|=|X)(?:-)?(?:\)|\(|D|P)'
    emoticons = re.findall(emoticon_pattern, text)
    text = re.sub(emoticon_pattern, '', text)

    # convert to lowercase and append all emoticons behind (with space in between)
    # replace('-','') removes nose of emoticons
    text = re.sub(r'[\W]+', ' ', text.lower()) + ' ' + ' '.join(emoticons).replace('-', '')
    return text

In [3]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Make sure the NLTK stop-word corpus is available locally
# (the log below reports when the package is already up to date).
nltk.download('stopwords')
# English stop words; tokenizer_stem_nostop filters tokens against this list.
stop = stopwords.words('english')

def tokenizer_stem_nostop(text):
    """Split text on whitespace, drop stop words and Porter-stem the rest.

    Only tokens that begin with an ASCII letter are kept (``re.match``
    anchors at the start of the token), so tokens that begin with a digit or
    punctuation -- emoticons included -- are discarded.

    Parameters
    ----------
    text : str
        Text to tokenize.

    Returns
    -------
    list of str
        Stemmed tokens, in their original order.
    """
    porter = PorterStemmer()
    # Set membership is O(1); scanning the module-level `stop` list for every
    # token is linear in the number of stop words.
    stop_words = set(stop)
    return [porter.stem(w) for w in re.split(r'\s+', text.strip())
            if w not in stop_words and re.match('[a-zA-Z]+', w)]

# Quick check on a short sentence: stop words are removed and the remaining words are stemmed.
print(tokenizer_stem_nostop('runners like running and thus they run'))

['runner', 'like', 'run', 'thu', 'run']


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ginag\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### BoW

In [4]:
import numpy as np
import scipy as sp
from sklearn.feature_extraction.text import CountVectorizer

# Two toy documents used to illustrate the bag-of-words model.
doc_dummy = [
    "Study hard, then you will be happy and I will be happy",
    "\"I'm not happy :(\" \", because you don't study hard",
]
print('[example documents]\n{}\n'.format('\n'.join(doc_dummy)))

# ngram_range is (min_n, max_n); (1, 1) means unigrams only, which is also the default.
count = CountVectorizer(
    ngram_range=(1, 1),
    preprocessor=preprocessor,
    tokenizer=tokenizer_stem_nostop,
)

# fit() returns the vectorizer itself; the learnt term -> column-index
# dictionary is exposed as vocabulary_.
BoW = count.fit(doc_dummy).vocabulary_
print('[vocabulary]\n{}'.format(BoW))

[example documents]
Study hard, then you will be happy and I will be happy
"I'm not happy :(" ", because you don't study hard

[vocabulary]
{'studi': 2, 'hard': 1, 'happi': 0}




In [5]:
# get matrix (doc_id, vocabulary_id) --> tf
doc_bag = count.transform(doc_dummy)
print('(did, vid)\ttf')
# Printing the matrix lists one "(row, column)  value" entry per line (see output below).
print(doc_bag)

# NOTE(review): `sp.sparse` is reached through `import scipy as sp`; presumably
# scipy.sparse has already been loaded (e.g. by scikit-learn) -- confirm.
print('\nIs document-term matrix a scipy.sparse matrix? {}'.format(sp.sparse.issparse(doc_bag)))

(did, vid)	tf
  (0, 0)	2
  (0, 1)	1
  (0, 2)	1
  (1, 0)	1
  (1, 1)	1
  (1, 2)	1

Is document-term matrix a scipy.sparse matrix? True


In [6]:
# Densify the toy document-term matrix. Transforming the documents again
# (instead of calling .toarray() on the existing `doc_bag`) keeps this cell
# re-runnable: once `doc_bag` is an ndarray it no longer has a .toarray() method.
doc_bag = count.transform(doc_dummy).toarray()
print(doc_bag)

print('\nAfter calling .toarray(), is it a scipy.sparse matrix? {}'.format(sp.sparse.issparse(doc_bag)))

[[2 1 1]
 [1 1 1]]

After calling .toarray(), is it a scipy.sparse matrix? False


In [7]:
# Work with the raw HTML of the first 100 training articles.
doc = df['Page content'].iloc[:100]
# Refit `count` on these articles (replacing the vocabulary learnt from the
# toy documents) and convert the resulting counts to a dense array.
doc_bag = count.fit_transform(doc).toarray()

In [17]:
print("[most frequent vocabularies]")
# Column-wise sum: total count of each vocabulary term over all rows of doc_bag.
bag_cnts = np.sum(doc_bag, axis=0)
# Number of top-ranked terms shown by the following cells.
top = 10
print(bag_cnts.shape)

[most frequent vocabularies]
(7127,)


In [9]:
print(count, '\n')

# inverse_transform is documented to take a 2-D (n_samples, n_features) input,
# so pass a single all-ones row. It returns a one-element list whose only item
# is the array of every vocabulary term in feature (column) order.
# Computed once and reused instead of being rebuilt for each print.
all_terms = count.inverse_transform(np.ones((1, bag_cnts.shape[0])))

print(all_terms)

print(all_terms[0])
print(all_terms[0].shape)

# argsort()[::-1] -> indices in descending count order -> keep the first `top`
print(all_terms[0][bag_cnts.argsort()[::-1][:top]])

CountVectorizer(preprocessor=<function preprocessor at 0x000001CA9AA28AF0>,
                tokenizer=<function tokenizer_stem_nostop at 0x000001CABF765DC0>) 

[array(['a330', 'a340', 'aan', ..., 'zuffa', 'zvlfremqgq', 'zvqxb2f5xi'],
      dtype='<U22')]
['a330' 'a340' 'aan' ... 'zuffa' 'zvlfremqgq' 'zvqxb2f5xi']
(7127,)
['imag' 'getti' 'new' 'also' 'one' 'said' 'see' 'time' 'year' 'world']


In [10]:
# [::-1] reverses a list since sort is in ascending order
for tok, v in zip(count.inverse_transform(np.ones(bag_cnts.shape[0]))[0][bag_cnts.argsort()[::-1][:top]], \
                  np.sort(bag_cnts)[::-1][:top]):
    print('{}: {}'.format(tok, v))

imag: 639
getti: 180
new: 168
also: 165
one: 162
said: 151
see: 147
time: 144
year: 142
world: 135


### TFIDF

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Same preprocessing and tokenization as the bag-of-words vectorizer above.
tfidf = TfidfVectorizer(ngram_range=(1,1),
                        preprocessor=preprocessor,
                        tokenizer=tokenizer_stem_nostop)

tfidf.fit(doc)

top = 10
# Vocabulary terms in feature (column) order, built once from the fitted
# term -> column-index mapping. This avoids rebuilding the whole feature list
# on every loop iteration and does not rely on get_feature_names(), which is
# not available in recent scikit-learn releases.
tfidf_terms = np.array(sorted(tfidf.vocabulary_, key=tfidf.vocabulary_.get))

# get idf score of vocabularies
idf = tfidf.idf_
print('[vocabularies with smallest idf scores]')
sorted_idx = idf.argsort()

# Slicing (rather than range(top)) also copes with vocabularies smaller than `top`.
for i in sorted_idx[:top]:
    print('%s: %.2f' % (tfidf_terms[i], idf[i]))

doc_tfidf = tfidf.transform(doc).toarray()
# Column-wise sum: accumulated tf-idf weight of each term over all documents.
tfidf_sum = np.sum(doc_tfidf, axis=0)
print("\n[vocabularies with highest tf-idf scores]")
# One descending index order is used for both the terms and their scores.
tfidf_top_idx = tfidf_sum.argsort()[::-1][:top]
for tok, v in zip(tfidf_terms[tfidf_top_idx], tfidf_sum[tfidf_top_idx]):
    print('{}: {}'.format(tok, v))

[vocabularies with smallest idf scores]
topic: 1.00
see: 1.12
also: 1.13
imag: 1.21
one: 1.32
new: 1.52
world: 1.54
year: 1.54
like: 1.55
time: 1.61

[vocabularies with highest tf-idf scores]
imag: 4.1122911268509705
app: 2.3867465879041356
video: 2.3321333885238404
game: 2.1911895842442406
said: 2.0142987671015766
new: 2.0010914875633707
compani: 1.9319822679769407
getti: 1.7677938352965423
time: 1.7601195175854225
twitter: 1.7333793682275045


In [12]:
# tfidf_sum holds one accumulated tf-idf score per vocabulary term.
print(tfidf_sum.shape)


(7127,)


In [13]:
# Dense tf-idf matrix: one row per document, one column per vocabulary term.
print(doc_tfidf.shape)

(100, 7127)


7127 features may be too large to store as a dense matrix.

In [14]:
# Print every term with a positive tf-idf weight in the first document.
# The term list is built once from the fitted term -> column-index mapping,
# instead of calling tfidf.get_feature_names() again for each printed feature
# (that method is also unavailable in recent scikit-learn releases).
feature_terms = sorted(tfidf.vocabulary_, key=tfidf.vocabulary_.get)
first_doc_weights = doc_tfidf[0]
for idx in np.flatnonzero(first_doc_weights > 0):
    print(feature_terms[idx], ": ", first_doc_weights[idx])

abl :  0.019160789088677892
academia :  0.036004167476229934
accomplish :  0.036004167476229934
administr :  0.05172688646057352
agenc :  0.06530958424107364
aim :  0.02586344323028676
alreadi :  0.02079308110731518
also :  0.049441296554212
alter :  0.03303819576921464
anim :  0.026840223536662797
announc :  0.0369271928601622
anoth :  0.016699499290719625
approach :  0.02796783364624306
articl :  0.019535998833908295
ask :  0.01880388970667592
associ :  0.03288561353478174
asteroid :  0.6120708470959089
astronaut :  0.15466902676629174
author :  0.02423115121164948
begin :  0.022311960213999275
benefit :  0.025001861939227772
bill :  0.050003723878455544
build :  0.019160789088677892
cancel :  0.030933805353258348
capsul :  0.036004167476229934
captur :  0.050003723878455544
capturerespons :  0.036004167476229934
caus :  0.022311960213999275
centauri :  0.03303819576921464
challeng :  0.09692460484659791
chelyabinsk :  0.036004167476229934
china :  0.02586344323028676
citi :  0.01782

In [15]:
# tf-idf weights of the first document (row 0 of the dense matrix)
doc_tfidf[0]

array([0., 0., 0., ..., 0., 0., 0.])

In [16]:
# Raw HTML of the article stored under label 0 in `doc`.
doc[0]

'<html><head><div class="article-info"> <span class="byline basic">Clara Moskowitz</span> for <a href="/publishers/space-com/">Space.com</a> <time datetime="Wed, 19 Jun 2013 15:04:30 +0000">2013-06-19 15:04:30 UTC</time> </div></head><body><h1 class="title">NASA\'s Grand Challenge: Stop Asteroids From Destroying Earth</h1><figure class="article-image"><img class="microcontent" data-fragment="lead-image" data-image="http://i.amz.mshcdn.com/I7b9cUsPSztew7r1WT6_iBLjflo=/950x534/2013%2F06%2F19%2Ffe%2FDactyl.44419.jpg" data-micro="1" data-url="http://mashable.com/2013/06/19/nasa-grand-challenge-asteroid/" src="http://i.amz.mshcdn.com/I7b9cUsPSztew7r1WT6_iBLjflo=/950x534/2013%2F06%2F19%2Ffe%2FDactyl.44419.jpg"/></figure><article data-channel="world"><section class="article-content"> <p>There may be killer asteroids headed for Earth, and NASA has decided to do something about it. The space agency announced a new "Grand Challenge" on June 18 to find all dangerous space rocks and figure out how

In [51]:
# Text of the first <time> element of the first article.
text = BeautifulSoup(doc[0], 'html.parser').select('time')[0].text.strip()

# regex for matching emoticons, keep emoticons, ex: :), :-P, :-D
# Raw string: '\)' and '\(' are invalid escapes in an ordinary string literal.
r = r'(?::|;|=|X)(?:-)?(?:\)|\(|D|P)'
emoticons = re.findall(r, text)
text = re.sub(r, '', text)

# convert to lowercase and append all emoticons behind (with space in between)
# replace('-','') removes nose of emoticons
text = re.sub(r'[\W]+', ' ', text.lower()) + ' ' + ' '.join(emoticons).replace('-','')

In [52]:
# Display the normalised timestamp string built in the previous cell.
text

'2013 06 19 15 04 30 utc '

In [46]:
# Parse the first article here instead of relying on `text`: the earlier cell
# leaves `text` bound to a plain str, which has no .select() method, so this
# cell would fail when the notebook is run from top to bottom.
soup = BeautifulSoup(doc[0], 'html.parser')
print(soup.select('h1')[0].text.strip())
print(soup.select('time')[0].text.strip())
print(soup.select('body')[0].text.strip())

NASA's Grand Challenge: Stop Asteroids From Destroying Earth
2013-06-19 15:04:30 UTC
NASA's Grand Challenge: Stop Asteroids From Destroying Earth There may be killer asteroids headed for Earth, and NASA has decided to do something about it. The space agency announced a new "Grand Challenge" on June 18 to find all dangerous space rocks and figure out how to stop them from destroying our planet. The new mission builds on projects already underway at NASA, including a plan to capture an asteroid, pull it in toward the moon and send astronauts to visit it. As part of the Grand Challenge, the agency issued a "request for information" today — aiming to solicit ideas from industry, academia and the public on how to improve the asteroid mission plan. "We're asking for you to think about concepts and different approaches for what we've described here," William Gerstenmaier, NASA's associate administrator for human explorations and operations, said yesterday during a NASA event announcing the in

# Competition