In [1]:
import re
import nltk
from bs4 import BeautifulSoup
import numpy as np
from urllib.request import urlopen
import string
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.spatial.distance import euclidean
from scipy.spatial.distance import cosine
from gensim import corpora, models
from gensim.models.ldamodel import LdaModel
from gensim import matutils

import pyLDAvis.gensim

In [2]:
wiki = "http://en.wikipedia.org/wiki/"
titles = ["Integral", "Riemann_integral", "Riemann-Stieltjes_integral", "Derivative",
    "Limit_of_a_sequence", "Edvard_Munch", "Vincent_van_Gogh", "Jan_Matejko",
    "Lev_Tolstoj", "Franz_Kafka", "J._R._R._Tolkien","Warsaw","England","Eiffel_Tower","Taipei_101"]

def parse(url):        
    x = urlopen(url)
    x = x.read()
    x = BeautifulSoup(x,'html.parser')
    x = x.find("div",id="bodyContent")
    x = x.find_all("p")
    return(x)

articles_raw = [parse(url) for url in [wiki+x for x in titles]]

In [3]:
articles_raw[0][1]

<p>is defined informally as the signed <a class="mw-redirect" href="/wiki/Area_(geometry)" title="Area (geometry)">area</a> of the region in the <span class="texhtml mvar" style="font-style:italic;">xy</span>-plane that is bounded by the <a href="/wiki/Graph_of_a_function" title="Graph of a function">graph</a> of <span class="texhtml mvar" style="font-style:italic;">f</span>, the <span class="texhtml mvar" style="font-style:italic;">x</span>-axis and the vertical lines <span class="texhtml"><i>x</i> = <i>a</i></span> and <span class="texhtml"><i>x</i> = <i>b</i></span>. The area above the <span class="texhtml mvar" style="font-style:italic;">x</span>-axis adds to the total and that below the <span class="texhtml mvar" style="font-style:italic;">x</span>-axis subtracts from the total.
</p>

## Preprocessing

### Removing html tags

In [4]:
articles_non_tags = [re.sub('<.*?>','',str(a)) for a in articles_raw]

### Convert into lower letters 

In [5]:
articles_lower = [art.lower() for art in articles_non_tags]

In [6]:
articles_lower[1][:1000]

'[in the branch of mathematics known as real analysis, the riemann integral, created by bernhard riemann, was the first rigorous definition of the integral of a function on an interval. it was presented to the faculty at the university of göttingen in 1854, but not published in a journal until 1868.[1] for many functions and practical applications, the riemann integral can be evaluated by the fundamental theorem of calculus or approximated by numerical integration., the riemann integral is unsuitable for many theoretical purposes. some of the technical deficiencies in riemann integration can be remedied with the riemann–stieltjes integral, and most disappear with the lebesgue integral., , , let f be a nonnegative real-valued function on the interval [a, b], and let, be the region of the plane under the graph of the function f and above the interval [a, b] (see the figure on the top right). we are interested in measuring the area of s. once we have measured it, we will denote the area b

## Segmentation

### Word tokenize - document segmentation

In [7]:
nltk.word_tokenize(articles_lower[0])

['[',
 'in',
 'mathematics',
 ',',
 'an',
 'integral',
 'assigns',
 'numbers',
 'to',
 'functions',
 'in',
 'a',
 'way',
 'that',
 'can',
 'describe',
 'displacement',
 ',',
 'area',
 ',',
 'volume',
 ',',
 'and',
 'other',
 'concepts',
 'that',
 'arise',
 'by',
 'combining',
 'infinitesimal',
 'data',
 '.',
 'integration',
 'is',
 'one',
 'of',
 'the',
 'two',
 'main',
 'operations',
 'of',
 'calculus',
 ',',
 'with',
 'its',
 'inverse',
 'operation',
 ',',
 'differentiation',
 ',',
 'being',
 'the',
 'other',
 '.',
 'given',
 'a',
 'function',
 'f',
 'of',
 'a',
 'real',
 'variable',
 'x',
 'and',
 'an',
 'interval',
 '[',
 'a',
 ',',
 'b',
 ']',
 'of',
 'the',
 'real',
 'line',
 ',',
 'the',
 'definite',
 'integral',
 ',',
 'is',
 'defined',
 'informally',
 'as',
 'the',
 'signed',
 'area',
 'of',
 'the',
 'region',
 'in',
 'the',
 'xy-plane',
 'that',
 'is',
 'bounded',
 'by',
 'the',
 'graph',
 'of',
 'f',
 ',',
 'the',
 'x-axis',
 'and',
 'the',
 'vertical',
 'lines',
 'x',
 '=',

In [8]:
article_tokenized = [nltk.word_tokenize(art) for art in articles_lower]

In [9]:
article_tokenized[0][:10]

['[',
 'in',
 'mathematics',
 ',',
 'an',
 'integral',
 'assigns',
 'numbers',
 'to',
 'functions']

## Cleaning segments

### Removing punctuation

In [17]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [18]:
articles_no_punctuation = [
    [token for token in art if token not in string.punctuation]
    for art in article_tokenized
]

In [19]:
articles_no_punctuation[5][:10]

['edvard',
 'munch',
 '/mʊŋk/',
 '1',
 'norwegian',
 'ˈedvɑʈ',
 'ˈmuŋk',
 'listen',
 '12',
 'december']

### Removing 'stopwords'

In [20]:
stopwords = nltk.corpus.stopwords.words('english')

In [29]:
stopwords[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [21]:
len(stopwords)

179

In [22]:
articles_no_stopwords = [
    [token for token in art if not token in stopwords]
    for art in articles_no_punctuation
]

In [23]:
articles_no_stopwords[0][:10]

['mathematics',
 'integral',
 'assigns',
 'numbers',
 'functions',
 'way',
 'describe',
 'displacement',
 'area',
 'volume']

## Segments normalization

### Stemming - identifying a common stem among various forms of a word (removing suffixes, plural from, various form fo verbs, stem is not required to be a propery spelled word)
### Most popular : Porter and Snowball stemmers

### More sxtensive normalization down to the semmantic root of a word is called lemmatization, more accurate than stemming as it takes into account the meaning of a word, but stemmers are faster, and less complex

W nltk mamy dostępny stemmer dla języka angielskiego. Dla języka polskiego mamy dostępny na przykład stemmer Morfeusz (http://sgjp.pl/morfeusz/index.html),

In [30]:
stemmer = nltk.PorterStemmer()

In [31]:
stemmer.stem('driving')

'drive'

In [32]:
articles_stemmed = [
    [stemmer.stem(token) for token in art]
    for art in articles_no_stopwords
]

In [33]:
articles_stemmed[0][:20]

['mathemat',
 'integr',
 'assign',
 'number',
 'function',
 'way',
 'describ',
 'displac',
 'area',
 'volum',
 'concept',
 'aris',
 'combin',
 'infinitesim',
 'data',
 'integr',
 'one',
 'two',
 'main',
 'oper']

### Merging into one text

In [34]:
articles_cleaned = [' '.join(tokens) for tokens in articles_stemmed]

In [35]:
articles_cleaned[0][:400]

'mathemat integr assign number function way describ displac area volum concept aris combin infinitesim data integr one two main oper calculu invers oper differenti given function f real variabl x interv b real line definit integr defin inform sign area region xy-plan bound graph f x-axi vertic line x x b. area x-axi add total x-axi subtract total oper integr addit constant invers oper differenti re'

### Count vectorizer (reguires text not separate tokens) - represent documents as vectors of word counts

In [36]:
vectorizer = CountVectorizer(token_pattern='(?u)\\b\\w+\\b',min_df=2, max_df=0.7) 
# default pattern remove one letter long words

In [37]:
vectorizer.fit(articles_cleaned)
dtm_count = vectorizer.transform(articles_cleaned)
dtm_count

<15x3710 sparse matrix of type '<class 'numpy.int64'>'
	with 14569 stored elements in Compressed Sparse Row format>

In [39]:
dtm_count.shape

(15, 3710)

In [40]:
dtm_count.getcol(800).toarray()

array([[0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [2],
       [0],
       [0]])

In [41]:
vectorizer.get_feature_names()[800]

'birthplac'

In [42]:
vectorizer.vocabulary_

{'mathemat': 2259,
 'integr': 1990,
 'assign': 686,
 'function': 1702,
 'displac': 1303,
 'area': 658,
 'volum': 3574,
 'concept': 1064,
 'aris': 661,
 'combin': 1025,
 'infinitesim': 1961,
 'data': 1195,
 'main': 2222,
 'oper': 2497,
 'calculu': 891,
 'invers': 2016,
 'differenti': 1276,
 'f': 1554,
 'real': 2812,
 'variabl': 3527,
 'x': 3676,
 'interv': 2005,
 'b': 731,
 'definit': 1225,
 'defin': 1224,
 'inform': 1964,
 'sign': 3086,
 'region': 2851,
 'plan': 2636,
 'bound': 832,
 'graph': 1773,
 'axi': 730,
 'vertic': 3544,
 'add': 525,
 'total': 3417,
 'subtract': 3274,
 'constant': 1089,
 'reason': 2820,
 'refer': 2842,
 'relat': 2860,
 'notion': 2459,
 'deriv': 1251,
 'case': 916,
 'written': 3671,
 'discuss': 1296,
 'articl': 675,
 'fundament': 1704,
 'theorem': 3376,
 'connect': 1081,
 'valu': 3523,
 'principl': 2713,
 'formul': 1665,
 'independ': 1945,
 'isaac': 2025,
 'newton': 2427,
 'gottfri': 1758,
 'wilhelm': 3634,
 'leibniz': 2147,
 'late': 2117,
 '17th': 104,
 'thought

### statistics for  word 'kingdom' in documents

In [43]:
vectorizer.vocabulary_["kingdom"]

2088

In [45]:
vectorizer.get_feature_names()[2088]

'kingdom'

In [46]:
dtm_count.getcol(vectorizer.vocabulary_["kingdom"]).toarray()

array([[ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 1],
       [ 0],
       [ 0],
       [ 7],
       [ 2],
       [ 3],
       [ 5],
       [56],
       [ 0],
       [ 0]])

### statistics of words in first document

In [54]:
dtm_count.getrow(0).toarray()[0].shape

(3710,)

In [55]:
df = pd.DataFrame({
    'counts':dtm_count.getrow(0).toarray()[0],# pandas potrzebuje 1d
    'words':vectorizer.get_feature_names()
}
)

In [56]:
df = df.sort_values(by="counts",ascending=False)

In [57]:
df.head()

Unnamed: 0,counts,words
1990,267,integr
1702,132,function
1554,74,f
3676,71,x
731,50,b


### Top k words for all documents

In [58]:
def top_words(matrix,vectorizer,k):
    
    words = np.array(vectorizer.get_feature_names())
    top_words = []
    
    for art in range(matrix.shape[0]):

        words_counter =  matrix.getrow(art).toarray()[0]

        words_counter = np.argsort(words_counter)
        words_counter = words_counter[::-1]
        
        top_words_indices = words_counter[:k]
        top_document_words = words[top_words_indices]
        
        
        top_words.append(top_document_words) 
        
    return np.array(top_words)

    
        

In [59]:
top_words(dtm_count,vectorizer,5)

array([['integr', 'function', 'f', 'x', 'b'],
       ['integr', 'riemann', 'sum', 'interv', 'function'],
       ['integr', 'g', 'riemann', 'stieltj', 'function'],
       ['deriv', 'f', 'function', 'x', 'h'],
       ['displaystyl', 'x', 'sequenc', 'b', 'to'],
       ['cm', 'paint', 'museum', 'art', 'portrait'],
       ['van', 'gogh', 'paint', 'art', 'portrait'],
       ['paint', 'kraków', 'polish', 'jan', 'art'],
       ['war', 'russian', 'novel', 'anarchist', 'peac'],
       ['stori', 'german', 'write', 'der', 'die'],
       ['tolkien', 'ring', 'english', 'lord', 'stori'],
       ['warsaw', 'citi', 'polish', 'poland', 'palac'],
       ['england', 'english', 'world', 'kingdom', 'london'],
       ['tower', 'level', 'top', 'pari', 'de'],
       ['101', 'floor', 'build', 'tower', 'ft']], 
      dtype='<U14')

### TF-IDF - Term Frequency-Inverse Documents Frequency

In [60]:
sentences = [
    "morze plaza slonce",
    'morze piasek slonce lato',
    'plaza lato woda'
]

In [61]:
CountVectorizer().fit_transform(sentences).toarray()

array([[0, 1, 0, 1, 1, 0],
       [1, 1, 1, 0, 1, 0],
       [1, 0, 0, 1, 0, 1]], dtype=int64)

In [62]:
TfidfVectorizer(norm=None).fit_transform(sentences).toarray()

array([[ 0.        ,  1.28768207,  0.        ,  1.28768207,  1.28768207,
         0.        ],
       [ 1.28768207,  1.28768207,  1.69314718,  0.        ,  1.28768207,
         0.        ],
       [ 1.28768207,  0.        ,  0.        ,  1.28768207,  0.        ,
         1.69314718]])

In [63]:
# im częściej występuje slowo w danym dokumencie tym to słowo ma większe TF-IDF
# im częściej słowo występuje w wielu artykułąch tym to słowo ma mniejsze TF-IDF

In [64]:
tf_idf = TfidfVectorizer(norm=None,min_df=2,max_df=0.7,token_pattern='(?u)\\b\\w+\\b')

In [65]:
tf_idf.fit(articles_cleaned)
dtm_tfidf = tf_idf.transform(articles_cleaned)
dtm_tfidf

<15x3710 sparse matrix of type '<class 'numpy.float64'>'
	with 14569 stored elements in Compressed Sparse Row format>

In [66]:
top_words(dtm_tfidf,tf_idf,5)

array([['integr', 'function', 'x', 'f', 'interv'],
       ['riemann', 'integr', 'interv', 'sum', 'partit'],
       ['integr', 'riemann', 'stieltj', 'g', 'x'],
       ['f', 'deriv', 'x', 'function', 'vector'],
       ['displaystyl', 'x', 'sequenc', 'b', 'to'],
       ['cm', 'paint', 'museum', 'portrait', 'scream'],
       ['gogh', 'van', 'paint', 'gauguin', '1888'],
       ['kraków', 'polish', 'paint', 'jan', 'fine'],
       ['anarchist', 'war', 'russian', 'novel', 'peac'],
       ['stori', 'der', 'franz', 'jewish', 'czech'],
       ['tolkien', 'ring', 'lord', 'english', 'beowulf'],
       ['warsaw', 'citi', 'poland', 'polish', 'palac'],
       ['england', 'english', 'london', 'kingdom', 'britain'],
       ['tower', 'level', 'top', 'pari', 'ft'],
       ['floor', '101', 'tower', 'build', 'ft']], 
      dtype='<U14')

In [None]:
top_words(dtm_count,vectorizer,5)

### Cosine similarity 

In [67]:
sentences = ["morze plaza slonce","morze plaza slonce morze plaza slonce"]

In [68]:
matrix_repr = CountVectorizer().fit_transform(sentences).toarray()

In [69]:
euclidean(matrix_repr[0],matrix_repr[1])

1.7320508075688772

In [70]:
cosine(matrix_repr[0],matrix_repr[1])

0.0

In [71]:
dtm_count.shape

(15, 3710)

In [72]:
cosine(dtm_count[0].toarray(),dtm_count[1].toarray())

0.26571334929199597

### according to CountVectorizer

In [73]:
for art in range(dtm_count.shape[0]):
    distances = [cosine(dtm_count[art].toarray(),dtm_count[art2].toarray())for art2 in range(dtm_count.shape[0])]
    distances[art] = 1.0
    print('Article {} is most similar to {}'.format(titles[art],titles[np.argmin(distances)]))

Article Integral is most similar to Riemann_integral
Article Riemann_integral is most similar to Integral
Article Riemann-Stieltjes_integral is most similar to Integral
Article Derivative is most similar to Integral
Article Limit_of_a_sequence is most similar to Derivative
Article Edvard_Munch is most similar to Jan_Matejko
Article Vincent_van_Gogh is most similar to Edvard_Munch
Article Jan_Matejko is most similar to Edvard_Munch
Article Lev_Tolstoj is most similar to Franz_Kafka
Article Franz_Kafka is most similar to Lev_Tolstoj
Article J._R._R._Tolkien is most similar to Franz_Kafka
Article Warsaw is most similar to Jan_Matejko
Article England is most similar to J._R._R._Tolkien
Article Eiffel_Tower is most similar to Taipei_101
Article Taipei_101 is most similar to Eiffel_Tower


### according to TF-IDF

In [74]:
for art in range(dtm_tfidf.shape[0]):
    distances = [cosine(dtm_tfidf[art].toarray(),dtm_tfidf[art2].toarray())for art2 in range(dtm_tfidf.shape[0])]
    distances[art] = 1.0
    print('Article {} is most similar to {}'.format(titles[art],titles[np.argmin(distances)]))

Article Integral is most similar to Riemann_integral
Article Riemann_integral is most similar to Riemann-Stieltjes_integral
Article Riemann-Stieltjes_integral is most similar to Riemann_integral
Article Derivative is most similar to Integral
Article Limit_of_a_sequence is most similar to Derivative
Article Edvard_Munch is most similar to Jan_Matejko
Article Vincent_van_Gogh is most similar to Edvard_Munch
Article Jan_Matejko is most similar to Edvard_Munch
Article Lev_Tolstoj is most similar to Franz_Kafka
Article Franz_Kafka is most similar to Lev_Tolstoj
Article J._R._R._Tolkien is most similar to Franz_Kafka
Article Warsaw is most similar to Jan_Matejko
Article England is most similar to Lev_Tolstoj
Article Eiffel_Tower is most similar to Taipei_101
Article Taipei_101 is most similar to Eiffel_Tower


## LSI - Latent Semantic Indexing, LSA - Latent Semantic Analysis (SVD)

In [75]:
dtm_tfidf.shape

(15, 3710)

In [76]:
# 15 documents and 3724 terms - OVERFITTTING
# We want to end up with fewer topics than words, so we can use those topic vector 
# as a reduced-dimension reprezentation of the original TF-IDF vectors

In [77]:
svd = TruncatedSVD(5,n_iter=100) # 5 topics

In [78]:
svd.fit(dtm_tfidf)

TruncatedSVD(algorithm='randomized', n_components=5, n_iter=100,
       random_state=None, tol=0.0)

In [79]:
svd_topic_vectors = svd.transform(dtm_tfidf)#transform document_term_matrix 
                                            #into topic_document matrix(topic vectors)

In [94]:
svd_topic_vectors.shape

(15, 5)

In [95]:
svd_topic_vectors

array([[ 0.13342,  0.7143 , -0.23265,  0.08995, -0.03971],
       [ 0.06601,  0.37659, -0.12372,  0.04717, -0.03153],
       [ 0.02303,  0.13258, -0.04335,  0.01696, -0.00983],
       [ 0.08651,  0.4311 , -0.14329,  0.05106,  0.00121],
       [ 0.01549,  0.07816, -0.02557,  0.0097 , -0.00172],
       [ 0.07614,  0.00274,  0.05944,  0.05603,  0.01432],
       [ 0.38324,  0.02781,  0.58426,  0.70161, -0.06677],
       [ 0.03633,  0.0044 ,  0.03243, -0.00517, -0.00305],
       [ 0.04247, -0.0018 ,  0.00394,  0.00021,  0.02554],
       [ 0.08089, -0.00309,  0.00348,  0.00016,  0.03435],
       [ 0.70351, -0.34161, -0.57473,  0.08044, -0.21938],
       [ 0.45065,  0.14333,  0.47872, -0.68491, -0.27022],
       [ 0.32168,  0.00613,  0.02733, -0.12384,  0.93004],
       [ 0.06266,  0.01492,  0.03084, -0.00625,  0.0562 ],
       [ 0.04285,  0.00965,  0.01857, -0.01785,  0.03937]])

In [80]:
np.set_printoptions(5, suppress=True)

In [82]:
cumsum = np.cumsum(svd.explained_variance_ratio_)
cumsum # it shows us how each topic explains the variance of our documnets in our new topic vectos space 

array([ 0.13686,  0.29676,  0.48757,  0.67109,  0.78101])

In [83]:
svd.singular_values_

array([ 859.97824,  773.54179,  770.23736,  756.31047,  588.55946])

In [84]:
svd_topic_vectors.shape

(15, 5)

In [85]:
svd.components_.shape # (topics,terms)

(5, 3710)

In [89]:
#Top words in each topic
k=8
words = np.array(vectorizer.get_feature_names())
for topic in svd.components_:
    indexes = np.argsort(topic)
    indexes = indexes[::-1]
    top_words_indices = indexes[:k]
    print(words[top_words_indices])
    print(topic[indexes[:k]])
# each topic is a linear combiantations of terms

['tolkien' 'warsaw' 'gogh' 'van' 'england' 'english' 'citi' 'paint']
[ 0.58943  0.32555  0.22585  0.19602  0.17519  0.12583  0.11924  0.10898]
['integr' 'function' 'f' 'x' 'riemann' 'deriv' 'interv' 'warsaw']
[ 0.56607  0.31472  0.3      0.26081  0.20707  0.15383  0.12978  0.11498]
['warsaw' 'gogh' 'van' 'paint' 'citi' 'polish' 'poland' 'gauguin']
[ 0.38583  0.3836   0.32278  0.15324  0.12908  0.08367  0.08249  0.07587]
['gogh' 'van' 'paint' 'gauguin' '1888' 'portrait' 'vincent' 'tolkien']
[ 0.46511  0.39173  0.17347  0.09257  0.0862   0.08334  0.08214  0.07606]
['england' 'english' 'london' 'kingdom' 'britain' 'world' 'british' 'roman']
[ 0.62128  0.23917  0.1669   0.15622  0.14267  0.11179  0.11077  0.0897 ]


In [90]:
# topics in documents
svd_topic_vectors = svd_topic_vectors/svd.singular_values_#(documents,topics)
svd_topic_vectors

array([[ 0.13342,  0.7143 , -0.23265,  0.08995, -0.03971],
       [ 0.06601,  0.37659, -0.12372,  0.04717, -0.03153],
       [ 0.02303,  0.13258, -0.04335,  0.01696, -0.00983],
       [ 0.08651,  0.4311 , -0.14329,  0.05106,  0.00121],
       [ 0.01549,  0.07816, -0.02557,  0.0097 , -0.00172],
       [ 0.07614,  0.00274,  0.05944,  0.05603,  0.01432],
       [ 0.38324,  0.02781,  0.58426,  0.70161, -0.06677],
       [ 0.03633,  0.0044 ,  0.03243, -0.00517, -0.00305],
       [ 0.04247, -0.0018 ,  0.00394,  0.00021,  0.02554],
       [ 0.08089, -0.00309,  0.00348,  0.00016,  0.03435],
       [ 0.70351, -0.34161, -0.57473,  0.08044, -0.21938],
       [ 0.45065,  0.14333,  0.47872, -0.68491, -0.27022],
       [ 0.32168,  0.00613,  0.02733, -0.12384,  0.93004],
       [ 0.06266,  0.01492,  0.03084, -0.00625,  0.0562 ],
       [ 0.04285,  0.00965,  0.01857, -0.01785,  0.03937]])

In [91]:
pd.DataFrame(svd_topic_vectors,
             index=titles,
             columns=['topic{}'.format(i) for i in range(svd_topic_vectors.shape[1])])

Unnamed: 0,topic0,topic1,topic2,topic3,topic4
Integral,0.133416,0.714304,-0.232649,0.089952,-0.039705
Riemann_integral,0.066007,0.376593,-0.12372,0.047167,-0.03153
Riemann-Stieltjes_integral,0.02303,0.132584,-0.043353,0.016959,-0.009829
Derivative,0.086505,0.431104,-0.143288,0.051058,0.001215
Limit_of_a_sequence,0.01549,0.078165,-0.025569,0.0097,-0.001717
Edvard_Munch,0.07614,0.002742,0.059437,0.056028,0.014324
Vincent_van_Gogh,0.383241,0.027809,0.58426,0.70161,-0.066774
Jan_Matejko,0.036327,0.004399,0.032435,-0.005167,-0.003052
Lev_Tolstoj,0.042466,-0.0018,0.003943,0.000212,0.025536
Franz_Kafka,0.080888,-0.003088,0.003484,0.000163,0.034353


In [96]:
for col in range(svd_topic_vectors.shape[1]):
    indexes = np.argsort(svd_topic_vectors[:,col])
    indexes = indexes[::-1]
    print('topic_{}'.format(col))
    print([titles[ind] for ind in indexes])

topic_0
['J._R._R._Tolkien', 'Warsaw', 'Vincent_van_Gogh', 'England', 'Integral', 'Derivative', 'Franz_Kafka', 'Edvard_Munch', 'Riemann_integral', 'Eiffel_Tower', 'Taipei_101', 'Lev_Tolstoj', 'Jan_Matejko', 'Riemann-Stieltjes_integral', 'Limit_of_a_sequence']
topic_1
['Integral', 'Derivative', 'Riemann_integral', 'Warsaw', 'Riemann-Stieltjes_integral', 'Limit_of_a_sequence', 'Vincent_van_Gogh', 'Eiffel_Tower', 'Taipei_101', 'England', 'Jan_Matejko', 'Edvard_Munch', 'Lev_Tolstoj', 'Franz_Kafka', 'J._R._R._Tolkien']
topic_2
['Vincent_van_Gogh', 'Warsaw', 'Edvard_Munch', 'Jan_Matejko', 'Eiffel_Tower', 'England', 'Taipei_101', 'Lev_Tolstoj', 'Franz_Kafka', 'Limit_of_a_sequence', 'Riemann-Stieltjes_integral', 'Riemann_integral', 'Derivative', 'Integral', 'J._R._R._Tolkien']
topic_3
['Vincent_van_Gogh', 'Integral', 'J._R._R._Tolkien', 'Edvard_Munch', 'Derivative', 'Riemann_integral', 'Riemann-Stieltjes_integral', 'Limit_of_a_sequence', 'Lev_Tolstoj', 'Franz_Kafka', 'Jan_Matejko', 'Eiffel_Tow

In [93]:
# documents' similarity according to topics 
for art_ind in range(svd_topic_vectors.shape[0]):
    distances = [cosine(svd_topic_vectors[art_ind,:],art2) for art2 in svd_topic_vectors]
    distances[art_ind] = 1.0
    print('Article {} is most similar to {}'.format(titles[art_ind],titles[np.argmin(distances)]))

Article Integral is most similar to Riemann-Stieltjes_integral
Article Riemann_integral is most similar to Riemann-Stieltjes_integral
Article Riemann-Stieltjes_integral is most similar to Riemann_integral
Article Derivative is most similar to Limit_of_a_sequence
Article Limit_of_a_sequence is most similar to Derivative
Article Edvard_Munch is most similar to Vincent_van_Gogh
Article Vincent_van_Gogh is most similar to Edvard_Munch
Article Jan_Matejko is most similar to Edvard_Munch
Article Lev_Tolstoj is most similar to Franz_Kafka
Article Franz_Kafka is most similar to Lev_Tolstoj
Article J._R._R._Tolkien is most similar to Franz_Kafka
Article Warsaw is most similar to Jan_Matejko
Article England is most similar to Taipei_101
Article Eiffel_Tower is most similar to Taipei_101
Article Taipei_101 is most similar to Eiffel_Tower


## LDA (Latent Dirichlet Allocation) 
Motywacja: przedstawienie tekstu jako mieszanki tematów.
Temat - rozkład prawdopodobieństwa na zbiorze słów.

**Rozkład Dirichleta**
Jest to rozkład, na którym opiera się model LDA

Gęstość trójwymiarowego rozkładu Dirichleta Dir( αα ).

Wektor losowy  (x1,...,xK)(x1,...,xK)  z  KK -wymiarowego rozkładu Dirichleta to punkt na  (K−1)(K−1) -wymiarowym "trójkącie" (sympleksie), czyli  x1+...+xK=1x1+...+xK=1 ,  xi≥0xi≥0 .

<img src="Grafika/3d_simplex.png" width="500">

<img src="Grafika/Smoothed_LDA.png">


,gdzie

$\theta_d \sim Dir(\alpha)$  - rozkład tematów w dokumencie

$Z \sim Discr(\theta)$ - temat, którego pochodz słowo

$W \sim Discr(\phi_Z)$ - słowo

$\phi_i \sim Dir(\beta)$ - tematy

Dokumenty będą składać sie tylko z kilku tematów (alfa będzie mała).

Tematy będą charakteryzowane również tylko przez cześć słów (beta małe). Dzięki temu możemy ludzkim okiem rozróżnić i zintepretować tematy.

### LDA w module GENSIM
gensim to modul stworzony do analizy tekstów: https://radimrehurek.com/gensim/

### LDA assumes that each document is a mixture (linear combination) of some arbitrary number of topics. LDA also assumes taht each topic can be represented by a distribution of words

### Create corpus from text

In [97]:
dictionary = corpora.Dictionary(articles_stemmed)# create a dictionary from all words

In [98]:
dictionary

<gensim.corpora.dictionary.Dictionary at 0x7f9292d25b00>

In [99]:
dictionary.keys()[-10:]

[10521, 10522, 10523, 10524, 10525, 10526, 10527, 10528, 10529, 10530]

In [100]:
dictionary.get(300)

'determin'

In [101]:
dictionary.id2token[300]

'determin'

In [102]:
dictionary.doc2bow(['mathemat' ,'mathemat','mathemat'])# word nr 644 apears 2 times in this doc

[(626, 3)]

In [103]:
corpus = [dictionary.doc2bow(art) for art in articles_stemmed] # create corpus

In [104]:
len(corpus) # corpus made of all words from all documents

15

In [105]:
len(dictionary) #number of tokens in dictionary 

10531

In [106]:
len(corpus[0])# number of words in first document

1084

In [107]:
corpus[0] # word nb 0 epears 18 times

[(0, 18),
 (1, 1),
 (2, 15),
 (3, 1),
 (4, 1),
 (5, 1),
 (6, 1),
 (7, 1),
 (8, 1),
 (9, 2),
 (10, 1),
 (11, 1),
 (12, 12),
 (13, 2),
 (14, 1),
 (15, 1),
 (16, 1),
 (17, 18),
 (18, 6),
 (19, 1),
 (20, 1),
 (21, 1),
 (22, 1),
 (23, 1),
 (24, 2),
 (25, 1),
 (26, 1),
 (27, 4),
 (28, 1),
 (29, 2),
 (30, 1),
 (31, 1),
 (32, 1),
 (33, 1),
 (34, 1),
 (35, 1),
 (36, 1),
 (37, 10),
 (38, 1),
 (39, 1),
 (40, 1),
 (41, 1),
 (42, 1),
 (43, 1),
 (44, 1),
 (45, 1),
 (46, 1),
 (47, 1),
 (48, 1),
 (49, 1),
 (50, 1),
 (51, 1),
 (52, 2),
 (53, 3),
 (54, 1),
 (55, 1),
 (56, 1),
 (57, 1),
 (58, 1),
 (59, 1),
 (60, 1),
 (61, 2),
 (62, 1),
 (63, 1),
 (64, 1),
 (65, 1),
 (66, 1),
 (67, 1),
 (68, 1),
 (69, 1),
 (70, 13),
 (71, 1),
 (72, 4),
 (73, 1),
 (74, 1),
 (75, 3),
 (76, 18),
 (77, 1),
 (78, 2),
 (79, 1),
 (80, 1),
 (81, 1),
 (82, 1),
 (83, 1),
 (84, 3),
 (85, 6),
 (86, 2),
 (87, 1),
 (88, 1),
 (89, 2),
 (90, 2),
 (91, 3),
 (92, 1),
 (93, 2),
 (94, 1),
 (95, 1),
 (96, 5),
 (97, 1),
 (98, 1),
 (99, 1),
 (1

In [108]:
model_corpus = LdaModel(corpus=corpus,id2word=dictionary,num_topics=5)

### Create corpus from sparse matrix

In [109]:
vectorizer = CountVectorizer(min_df=0.2,max_df=0.5)
dtm_cv = vectorizer.fit_transform(articles_cleaned)

In [110]:
corpus_sparse = matutils.Sparse2Corpus(dtm_cv,documents_columns=False) # by default documents in columns

In [111]:
dictionary_sparse = vectorizer.get_feature_names()

In [112]:
len(dictionary_sparse)

1908

In [113]:
dictionary_sparse_dict = {i:j for i,j in enumerate(dictionary_sparse)}

In [114]:
dictionary_sparse_dict[615]

'convinc'

In [115]:
dictionary_sparse = corpora.Dictionary.from_corpus(corpus=corpus_sparse,id2word=dictionary_sparse_dict)

In [116]:
model_corpus_sparse = LdaModel(corpus=corpus_sparse,id2word=dictionary_sparse,num_topics=5)

### LDA model

In [117]:
model_corpus = LdaModel(corpus=corpus_sparse,id2word=dictionary_sparse,num_topics=5, alpha='auto',eta='auto')

In [118]:
model_corpus.get_document_topics(corpus_sparse[5],minimum_probability=0)

[(0, 0.00031131925),
 (1, 0.00042808364),
 (2, 0.067101486),
 (3, 0.93182719),
 (4, 0.00033188384)]

In [119]:
for i in range(len(corpus_sparse)):
    print(model_corpus.get_document_topics(corpus_sparse[i],minimum_probability=0))

[(0, 0.93745089), (1, 0.0020810517), (2, 0.059966374), (3, 0.0002389888), (4, 0.00026269618)]
[(0, 0.65862131), (1, 0.0004315454), (2, 0.34018096), (3, 0.00037473935), (4, 0.0003914712)]
[(0, 0.84734547), (1, 0.076737665), (2, 0.073287539), (3, 0.0012819829), (4, 0.0013473193)]
[(0, 0.94320649), (1, 0.00059398008), (2, 0.055219535), (3, 0.00047249213), (4, 0.00050747831)]
[(0, 0.0013685753), (1, 0.0013787752), (2, 0.9946236), (3, 0.0012973673), (4, 0.0013316747)]
[(0, 0.00031130385), (1, 0.00038182785), (2, 0.068758428), (3, 0.93021882), (4, 0.00032957399)]
[(0, 0.00014048049), (1, 0.99939388), (2, 0.00017576713), (3, 0.00014461015), (4, 0.00014524098)]
[(0, 0.0009358605), (1, 0.001053397), (2, 0.0011465134), (3, 0.26354808), (4, 0.73331618)]
[(0, 0.00042136171), (1, 0.00099713844), (2, 0.62272626), (3, 0.00041702477), (4, 0.37543818)]
[(0, 0.00021689587), (1, 0.00022829071), (2, 0.00025986691), (3, 0.0027350439), (4, 0.99655986)]
[(0, 0.00015377723), (1, 0.009764472), (2, 0.0024394239

In [120]:
titles

['Integral',
 'Riemann_integral',
 'Riemann-Stieltjes_integral',
 'Derivative',
 'Limit_of_a_sequence',
 'Edvard_Munch',
 'Vincent_van_Gogh',
 'Jan_Matejko',
 'Lev_Tolstoj',
 'Franz_Kafka',
 'J._R._R._Tolkien',
 'Warsaw',
 'England',
 'Eiffel_Tower',
 'Taipei_101']

In [123]:
model_corpus.get_topic_terms?

In [125]:
for tup in model_corpus.get_topic_terms(topicid=1):
    print(dictionary_sparse[tup[0]],tup[1])

integr 0.0213524
gogh 0.0194905
van 0.0173988
tower 0.0119824
warsaw 0.0115928
riemann 0.00612462
pari 0.00514521
interv 0.00411954
1888 0.00400462
level 0.00370289


In [126]:
model_corpus.alpha

array([ 0.33863,  0.34356,  0.36412,  0.33029,  0.33625], dtype=float32)

In [127]:
model_corpus.eta

array([ 0.3478 ,  0.4259 ,  0.33266, ...,  0.34632,  0.32715,  0.40179], dtype=float32)

In [128]:
model_corpus.print_topics(num_words=8)

[(0,
  '0.023*"integr" + 0.017*"warsaw" + 0.014*"riemann" + 0.008*"tower" + 0.007*"england" + 0.006*"interv" + 0.005*"level" + 0.005*"defin"'),
 (1,
  '0.021*"integr" + 0.019*"gogh" + 0.017*"van" + 0.012*"tower" + 0.012*"warsaw" + 0.006*"riemann" + 0.005*"pari" + 0.004*"interv"'),
 (2,
  '0.019*"integr" + 0.015*"displaystyl" + 0.013*"van" + 0.009*"gogh" + 0.009*"riemann" + 0.009*"sequenc" + 0.007*"sum" + 0.006*"tower"'),
 (3,
  '0.014*"warsaw" + 0.008*"gogh" + 0.007*"van" + 0.007*"integr" + 0.007*"england" + 0.006*"tower" + 0.005*"polish" + 0.004*"stori"'),
 (4,
  '0.020*"england" + 0.015*"integr" + 0.008*"warsaw" + 0.006*"kingdom" + 0.005*"stori" + 0.005*"british" + 0.005*"london" + 0.004*"tower"')]

In [None]:
# zsumowac wystapienia tematow w corpusie i zobaczyc ktory jest najczestszy

In [129]:
sum_percent = np.zeros(5)
for i in range(len(corpus_sparse)):
    dict_topics = model_corpus.get_document_topics(corpus_sparse[i],minimum_probability=0)
    doc_topics = [topic[1] for topic in dict_topics]
    added = [doc+sum_ for doc,sum_ in zip(doc_topics,sum_percent)]
    sum_percent = added.copy()
    
print(np.argsort(sum_percent)[::-1])


[0 4 2 1 3]


In [130]:
pyLDAvis.enable_notebook()

In [131]:
# topics are sorted from most to least popular! the order of topics is changed
pyLDAvis.gensim.prepare(model_corpus,corpus=corpus_sparse,dictionary=dictionary_sparse)  