In [92]:
%matplotlib inline


# Topic extraction with Non-negative Matrix Factorization and Latent Dirichlet Allocation


This is an application of Latent Dirichilet Allocation and Non-negative Matrix Factorization on a corpus
of documents and extract additive models of the topic structure of the
corpus.

Non-negative Matrix Factorization is applied with the objective
function: the Frobenius norm.



In [1]:
from time import time
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.datasets import fetch_20newsgroups
import pandas as pd

n_samples = 2000
n_features = 1000
n_components = 10
n_top_words = 20


In [2]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()

def process(x):
    x = pd.Series(x)
    x = tfidf_vectorizer.transform(x)
    return x

def result(x):
    x = lda.transform(x)
    x = x.reshape(10,)
    xd = {'Themes':themes,'Percent Probability': x} 
    xd = pd.DataFrame(xd)
    print(xd)

In [3]:
read = pd.read_csv('people_wiki.csv')
data_samples = read.text

In [4]:
read

Unnamed: 0,URI,name,text
0,<http://dbpedia.org/resource/Digby_Morrell>,Digby Morrell,digby morrell born 10 october 1979 is a former...
1,<http://dbpedia.org/resource/Alfred_J._Lewy>,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from un...
2,<http://dbpedia.org/resource/Harpdog_Brown>,Harpdog Brown,harpdog brown is a singer and harmonica player...
3,<http://dbpedia.org/resource/Franz_Rottensteiner>,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lowe...
4,<http://dbpedia.org/resource/G-Enka>,G-Enka,henry krvits born 30 december 1974 in tallinn ...
...,...,...,...
59066,<http://dbpedia.org/resource/Olari_Elts>,Olari Elts,olari elts born april 27 1971 in tallinn eston...
59067,<http://dbpedia.org/resource/Scott_F._Crago>,Scott F. Crago,scott francis crago born july 26 1963 twin bro...
59068,<http://dbpedia.org/resource/David_Cass_(footb...,David Cass (footballer),david william royce cass born 27 march 1962 in...
59069,<http://dbpedia.org/resource/Keith_Elias>,Keith Elias,keith hector elias born february 3 1972 in lac...


In [5]:
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,
                                   max_features=n_features,
                                   stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(data_samples)

In [6]:
lda = LatentDirichletAllocation(n_components=n_components, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
t0 = time()
lda.fit(tfidf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in LDA model:")
tf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)

done in 56.329s.

Topics in LDA model:
Topic #0: world won championships championship team tour racing olympics champion race olympic finished competed event title professional win medal time record
Topic #1: party election minister elected member state served politician parliament democratic assembly committee candidate government republican house council district seat president
Topic #2: league season football played team coach games club baseball player game career seasons basketball playing cup signed professional goals hockey
Topic #3: business board president university company executive director chairman development international management served public member ceo chief global technology policy new
Topic #4: court law united church states judge served military police prison general president army bishop district rights justice chief years appointed
Topic #5: music album band released song songs records jazz albums singer rock recorded guitar solo recording new label single musi

In [7]:
final = lda.transform(tfidf)
final

array([[0.0154394 , 0.01544066, 0.86102581, ..., 0.01546194, 0.01543905,
        0.01543824],
       [0.01649907, 0.01649879, 0.01649867, ..., 0.01649605, 0.52282269,
        0.01650284],
       [0.01352432, 0.01352331, 0.01352514, ..., 0.01352253, 0.01352579,
        0.01352455],
       ...,
       [0.01428888, 0.01428937, 0.87137404, ..., 0.01428816, 0.01429388,
        0.0142924 ],
       [0.01372446, 0.0137222 , 0.87649441, ..., 0.01372098, 0.01372296,
        0.01372273],
       [0.01540958, 0.01541149, 0.01540975, ..., 0.01540815, 0.01541295,
        0.01541231]])

In [9]:
res = pd.DataFrame(data= final,columns=themes)
res

Unnamed: 0,international athletics,politics,team sports,business,Millitary,general music,TV and film,Matches,science and research,art and publishing
0,0.015439,0.015441,0.861026,0.015439,0.015440,0.015438,0.015438,0.015462,0.015439,0.015438
1,0.016499,0.016499,0.016499,0.345178,0.016501,0.016499,0.016503,0.016496,0.522823,0.016503
2,0.013524,0.013523,0.013525,0.013524,0.013524,0.878282,0.013525,0.013523,0.013526,0.013525
3,0.014588,0.014587,0.014588,0.014588,0.014587,0.014586,0.014587,0.014585,0.014590,0.868714
4,0.016797,0.016795,0.016796,0.016795,0.016795,0.848844,0.016795,0.016794,0.016794,0.016795
...,...,...,...,...,...,...,...,...,...,...
59066,0.018898,0.018900,0.018899,0.018900,0.018900,0.018898,0.829864,0.018943,0.018899,0.018898
59067,0.014199,0.014196,0.014199,0.014197,0.014197,0.872185,0.014197,0.014236,0.014197,0.014197
59068,0.014289,0.014289,0.871374,0.014300,0.014292,0.014291,0.014290,0.014288,0.014294,0.014292
59069,0.013724,0.013722,0.876494,0.013723,0.013722,0.013724,0.013723,0.013721,0.013723,0.013723


In [8]:
themes = [ 
          'international athletics', 
          'politics',
          'team sports',
          'business',
          'Millitary',
          'general music',
          'TV and film',
          'Matches',
          'science and research',
          'art and publishing'  
        ]

In [100]:
read.text[2:3]

2    harpdog brown is a singer and harmonica player...
Name: text, dtype: object

In [105]:
test = process(read.text[2:3])
result(test)

                    Themes  Percent Probability
0  international athletics             0.013524
1                 politics             0.013523
2              team sports             0.013525
3                 business             0.013524
4                Millitary             0.013524
5            general music             0.878282
6              TV and film             0.013525
7                  Matches             0.013523
8     science and research             0.013526
9       art and publishing             0.013525


In [46]:
s = ["At cricket he was equally good as a bat and as a wicket-keeper. Here is the Sussex county cricket ground. The company owns a park with football and cricket grounds."]

In [107]:
test = process(s)
result(test)

                    Themes  Percent Probability
0  international athletics             0.031009
1                 politics             0.031015
2              team sports             0.629184
3                 business             0.122733
4                Millitary             0.031011
5            general music             0.031011
6              TV and film             0.031018
7                  Matches             0.031003
8     science and research             0.031005
9       art and publishing             0.031011


In [69]:
from sklearn.pipeline import Pipeline
pipeline = Pipeline(steps=[
                          ('tfidf',tfidf_vectorizer),
                          ('lda',lda)])
pipeline.fit(data_samples)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=0.95, max_features=1000,
                                 min_df=2, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words='english', strip_accents=None,
                                 sublinear_tf=False,
                                 token_pat...
                                 vocabulary=None)),
                ('lda',
                 LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                                           evaluate_every=-1,
                                           learning_decay=0.7,
                    

In [50]:
import math
def recommend(x):
    m = np.argmax(x['Percent Probability'])
    am = np.max(x['Percent Probability']) 
    high = 10
    for i,j in enumerate(res[res.columns[0]].values):
        if abs(j-am) < high:
            high = i
    most = read.URI[high]
    s = ""
    for i in most:
        if i not in "<>":
            s += i
    return s

In [45]:
import joblib
joblib.dump(pipeline,filename="Text_Classification1.joblib")

In [19]:
x = joblib.load("Text_Classification.joblib")



In [51]:
def result_pipeline(x):
    x = x.reshape(10,)
    xd = {'Themes':themes,'Percent Probability': x} 
    xd = pd.DataFrame(xd)
    print(xd)
    return xd

In [48]:
r1 = result_pipeline(x.transform(s))

                    Themes  Percent Probability
0  international athletics             0.031009
1                 politics             0.031015
2              team sports             0.629184
3                 business             0.122733
4                Millitary             0.031011
5            general music             0.031011
6              TV and film             0.031018
7                  Matches             0.031003
8     science and research             0.031005
9       art and publishing             0.031011


In [49]:
print(recommend(r1))

0.015409577023628463
http://dbpedia.org/resource/Digby_Morrell
