## Part II - Latent Semantic Analysis to Grab Related Articles Given a Query

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

%matplotlib inline

In [13]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
from os import chdir
# NOTE(review): hard-coded absolute path; presumably this makes the `library`
# package imported in the next cell resolvable — confirm before running this
# notebook in a different environment.
chdir('/home/jovyan/')

In [7]:
import library.db_helper as db
import library.functions as fy

### Grab Text from the Page table

In [8]:
# Load only the article text column. `X['text']` is the corpus that
# `tfidf_vectorizer` fits its vocabulary on.
query = '''
SELECT text
FROM page
'''

X = db.query_to_dataframe(query)

In [9]:
X.sample(5)

Unnamed: 0,text
2129,the sun java system communications express pro...
455,journal of machine learning research abbreviat...
246,gradient boosting is a machine learning techni...
536,a schema is a template in computer science use...
25,the accuracy paradox for predictive analytics ...


In [21]:
query = '''
SELECT * 
FROM page 
'''

# All columns of `page`; `grab_related_articles` looks up `page['title']` by
# row position (`.iloc`) using positions computed from the rows of `X`.
# NOTE(review): neither query has an ORDER BY, so this assumes both SELECTs
# return rows in the same order — confirm.
page = db.query_to_dataframe(query)

### Setup Latent Semantic Analysis - Article Modeling

![](http://interactive.blockdiag.com/image?compression=deflate&encoding=base64&src=eJxLyslPzk7JTExXqOZSUFAqSixXSEksSVRS0LVTUCpILCpOhTCTc1IT85SsQWogTLBoSGpRrq5LfnJpbmpeiUJMnm9iSVFmBUQZDjmwvuAwFwjDM68ktaigKBWkICi1uDSnpBiouxYAYKwuOg)

#### 1. Vectorize the Search Term and the Corpus (X['text'])

In [14]:
def tfidf_vectorizer(search_query, min_df=1):
    """Build TF-IDF vectors for the corpus and for a single search query.

    A fresh ``TfidfVectorizer`` (configured with ``stop_words='english'``) is
    fitted on the notebook-level corpus ``X['text']`` on every call; the query
    is then transformed with that same fitted vectorizer so both results share
    one vocabulary.

    Parameters
    ----------
    search_query : str
        The query text to vectorize.
    min_df : int or float, default 1
        Passed straight through to ``TfidfVectorizer(min_df=...)``.

    Returns
    -------
    tuple
        ``(corpus_tfidf, query_tfidf)``: the fitted document-term matrix for
        ``X['text']`` and the one-row matrix for ``search_query``.
    """
    vectorizer = TfidfVectorizer(min_df=min_df, stop_words='english')

    # Fit on the corpus first; the query must only be *transformed* so that it
    # is expressed in the corpus vocabulary.
    corpus_tfidf = vectorizer.fit_transform(X['text'])
    query_tfidf = vectorizer.transform([search_query])

    return corpus_tfidf, query_tfidf

In [15]:
tfidf_vectorizer('oracle', min_df=2)

(<2787x42182 sparse matrix of type '<class 'numpy.float64'>'
 	with 825514 stored elements in Compressed Sparse Row format>,
 <1x42182 sparse matrix of type '<class 'numpy.float64'>'
 	with 1 stored elements in Compressed Sparse Row format>)

#### 2. Use SVD to reduce dimensionality for the sparse document matrix

In [17]:
def SVD_lsa(search_query, n_components=300, min_df=1, random_state=42):
    """Project the TF-IDF corpus matrix and the query into a shared LSA space.

    The corpus and the query are vectorized by ``tfidf_vectorizer``; a
    ``TruncatedSVD`` is fitted on the corpus matrix and then applied to the
    query vector so that both live in the same reduced space.

    Parameters
    ----------
    search_query : str
        The query text.
    n_components : int, default 300
        Number of SVD components to keep.
    min_df : int or float, default 1
        Forwarded unchanged to ``tfidf_vectorizer``.
    random_state : int or None, default 42
        Seed forwarded to ``TruncatedSVD``. Its default solver is randomized,
        so without a fixed seed the embeddings (and any ranking derived from
        them) can differ between runs. Pass ``None`` to leave it unseeded.

    Returns
    -------
    tuple
        ``(lsa_doc_term, search_query_lsa)``: the reduced corpus matrix and
        the reduced one-row query matrix.
    """
    doc_term_matrix, search_query_vec = tfidf_vectorizer(search_query, min_df=min_df)

    svd = TruncatedSVD(n_components=n_components, random_state=random_state)

    # Fit on the corpus only; the query is projected with the fitted components.
    lsa_doc_term = svd.fit_transform(doc_term_matrix)
    search_query_lsa = svd.transform(search_query_vec)

    return lsa_doc_term, search_query_lsa

In [18]:
SVD_lsa ('oracle', min_df=2, n_components=400)

(array([[  2.19187973e-01,  -9.22950396e-02,   6.84874674e-02, ...,
           7.34548933e-03,   1.14847961e-02,   2.76274111e-02],
        [  5.97464119e-02,  -1.09973624e-02,   4.15869659e-02, ...,
           1.93461839e-03,   1.87320767e-02,   1.52070552e-03],
        [  1.42714207e-01,  -1.10184687e-01,  -2.99154346e-02, ...,
           2.28358944e-02,   2.83076098e-02,   2.67475152e-02],
        ..., 
        [  1.10979565e-01,  -5.73086253e-02,  -2.86838234e-02, ...,
          -1.66288794e-02,   1.34649422e-02,   2.32041595e-02],
        [  2.70213254e-01,   8.72708550e-02,   1.75249455e-01, ...,
          -2.06202565e-02,  -7.56653666e-05,  -3.00825094e-02],
        [  1.59352198e-01,  -1.23852041e-01,  -7.50155022e-02, ...,
          -1.26395381e-02,  -5.67572018e-03,  -1.87137407e-02]]),
 array([[  3.64857226e-02,  -3.00911160e-02,  -1.52674420e-02,
          -8.03341810e-02,  -5.74284470e-02,   1.18168588e-01,
          -9.92074447e-02,  -1.93431701e-02,   2.47798289e-02,
   

#### 3. Apply sklearn's cosine_similarity to return article matches for the given search term

In [11]:
X.shape # corpus

(2787, 1)

In [22]:
def grab_related_articles(search_query, n_results=5, n_components = 300, min_df=1):
    """Return the titles of the articles most similar to ``search_query``.

    Documents and query are embedded with ``SVD_lsa``; documents are ranked by
    cosine similarity to the query, and the corresponding ``page['title']``
    values are returned with duplicates removed.

    Parameters
    ----------
    search_query : str
        The query text.
    n_results : int, default 5
        Maximum number of distinct titles to return. If fewer distinct titles
        exist, all of them are returned.
    n_components : int, default 300
        Forwarded to ``SVD_lsa``.
    min_df : int or float, default 1
        Forwarded to ``SVD_lsa``.

    Returns
    -------
    list
        Distinct titles ordered from most to least similar. An empty list is
        returned when ``n_results`` is not positive.

    Notes
    -----
    Titles are looked up by row position in the notebook-level ``page`` frame.
    NOTE(review): this assumes ``page`` rows are in the same order as the
    corpus rows the embeddings were built from — confirm.
    """
    if n_results <= 0:
        return []

    lsa_doc_term, search_query_lsa = SVD_lsa(search_query, n_components=n_components, min_df=min_df)
    cos_sim_arr = cosine_similarity(lsa_doc_term, search_query_lsa).ravel()

    # Sort once, best match first. Re-sorting inside a widening loop was
    # redundant and never terminated when fewer than n_results distinct
    # titles exist.
    ranked_positions = np.argsort(cos_sim_arr)[::-1]

    # drop_duplicates keeps the first (best-ranked) occurrence of each title,
    # so the similarity order survives de-duplication (a set would discard it).
    ranked_titles = page['title'].iloc[ranked_positions].drop_duplicates()

    return ranked_titles.head(n_results).tolist()

In [23]:
grab_related_articles("oracle", n_results=10, n_components=400, min_df=2)

['oracle applications',
 'oracle application server',
 'oracle enterprise resource planning cloud',
 'oracle corporation',
 'oracle fusion applications',
 'oracle fusion middleware',
 'peoplesoft',
 'oracle soa suite',
 'oracle reports',
 'oracle fusion architecture']