## Part II - Latent Semantic Analysis to Grab Related Articles Given a Query

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

%matplotlib inline

### Apply the same text cleaner used to clean the data to the search query

In [2]:
from os import chdir
# Change the working directory before importing the project's `library` package.
# NOTE(review): presumably this makes `library` importable from the container
# home; the absolute path is environment-specific — confirm before running elsewhere.
chdir('/home/jovyan/')

In [3]:
import library.db_helper as db
import library.functions as fy

### Grab Text from the Page table

In [7]:
# Load only the article body text from the `page` table. The resulting frame
# `X` is the corpus that the TF-IDF / LSA functions below fit on (via X['text']).
query = '''
SELECT text
FROM page
'''

X = db.query_to_dataframe(query)

In [8]:
# Spot-check five randomly chosen rows of the corpus.
X.sample(5)

Unnamed: 0,text
2523,fleet management software fms is computer soft...
2120,qlootype of siteprivatefoundedheadquartersnew ...
1518,apache openwebbeansdeveloper s apache software...
1085,numentatypeprivately heldindustryanalytics art...
1986,in machine learning and natural language proce...


In [10]:
# Load every column of the `page` table (pid, text, title). `page['title']`
# is used later to turn row positions of the best matches into article titles.
# NOTE(review): `X` and `page` come from two separate queries with no ORDER BY,
# yet they are aligned purely by row position downstream — confirm the row
# order is identical, or derive the corpus from `page['text']` instead.
query = '''
SELECT * 
FROM page
'''
page = db.query_to_dataframe(query)
# Display five random rows as a sanity check.
page.sample(5)

Unnamed: 0,pid,text,title
2051,20027065,the structured support vector machine is a mac...,structured support vector machine
1422,9452661,tom clancy s ruthless comdeveloper s red storm...,tom clancy s ruthless com
1375,19172663,enns redirects here for other uses see enns t...,european neural network society
2418,36494971,this article is an orphan as no other articles...,mexican international conference on artificial...
1175,8220913,this article is about the neural network for o...,adaline


In [13]:
# Run the project's text cleaner over every article title and display the result.
page['title'].apply(fy.text_cleaner)

0       user custintelmngt sandbox customer intelligen...
1                                                   timit
2           microsoft dynamics for finance and operations
3                                          tag management
4                                 milestones professional
5                                        data exploration
6                                dimensionality reduction
7                                  microsoft dynamics erp
8                                  team foundation server
9                                                netpoint
10         list of datasets for machine learning research
11                      canonical correspondence analysis
12                                    millennium software
13                                               teamnote
14                                               omniplan
15                                       machine learning
16                                correspondence analysis
17            

### Setup Latent Semantic Analysis - Article Modeling

![](http://interactive.blockdiag.com/image?compression=deflate&encoding=base64&src=eJxLyslPzk7JTExXqOZSUFAqSixXSEksSVRS0LVTUCpILCpOhTCTc1IT85SsQWogTLBoSGpRrq5LfnJpbmpeiUJMnm9iSVFmBUQZDjmwvuAwFwjDM68ktaigKBWkICi1uDSnpBiouxYAYKwuOg)

In [14]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics.pairwise import cosine_similarity

#### 1. Vectorize the Search Term and the Corpus (X['text'])

In [12]:
def tfidf_vectorizer(search_query, min_df=1, corpus=None):
    """Fit TF-IDF on the corpus and vectorize the search query in the same space.

    Parameters
    ----------
    search_query : str
        Free-text query, transformed with the vocabulary learned from the corpus.
    min_df : int or float, default 1
        Forwarded to ``TfidfVectorizer``; terms whose document frequency is
        below this threshold are dropped from the vocabulary.
    corpus : iterable of str, optional
        Documents to fit on. When omitted, the notebook-level ``X['text']``
        is used, which is the original behavior.

    Returns
    -------
    doc_term_matrix
        TF-IDF matrix with one row per corpus document.
    search_query_vec
        Single-row TF-IDF matrix for ``search_query``.
    """
    if corpus is None:
        # Fall back to the global corpus so existing callers keep working.
        corpus = X['text']

    vectorizer = TfidfVectorizer(stop_words='english', min_df=min_df)
    doc_term_matrix = vectorizer.fit_transform(corpus)
    # Only transform (never fit) the query so it shares the corpus vocabulary.
    search_query_vec = vectorizer.transform([search_query])
    return doc_term_matrix, search_query_vec

#### 2. Use SVD to reduce dimensionality for the sparse document matrix

In [13]:
def SVD_lsa(search_query, n_components=300, min_df=1, random_state=None):
    """Project the TF-IDF corpus and the search query into a reduced LSA space.

    Parameters
    ----------
    search_query : str
        Free-text query to embed alongside the corpus.
    n_components : int, default 300
        Number of dimensions kept by ``TruncatedSVD``.
    min_df : int or float, default 1
        Forwarded to ``tfidf_vectorizer``.
    random_state : int, RandomState instance or None, default None
        Forwarded to ``TruncatedSVD``. Its default solver is randomized, so
        pass a fixed seed to get reproducible components; ``None`` keeps the
        previous unseeded behavior.

    Returns
    -------
    lsa_doc_term : ndarray
        Reduced representation of every corpus document (one row each).
    search_query_lsa : ndarray
        Reduced representation of the query (single row).
    """
    doc_term_matrix, search_query_vec = tfidf_vectorizer(search_query, min_df=min_df)

    svd = TruncatedSVD(n_components=n_components, random_state=random_state)
    lsa_doc_term = svd.fit_transform(doc_term_matrix)
    # Reuse the components fitted on the corpus so both live in the same space.
    search_query_lsa = svd.transform(search_query_vec)

    return lsa_doc_term, search_query_lsa

#### 3. Apply sklearn's cosine_similarity to return article matches for the given search term

In [27]:
# Corpus dimensions as (rows, columns); X holds the single `text` column selected from `page`.
X.shape # corpus

(2787, 1)

In [15]:
def grab_related_articles(search_query, n_results=5, n_components=300, min_df=1):
    """Return the titles of the articles most similar to ``search_query``.

    The corpus and the query are embedded with ``SVD_lsa`` and ranked by
    cosine similarity; titles are looked up by row position in the
    notebook-level ``page`` frame.

    Parameters
    ----------
    search_query : str
        Free-text query.
    n_results : int, default 5
        Maximum number of titles to return (most similar first).
    n_components : int, default 300
        Number of LSA dimensions, forwarded to ``SVD_lsa``.
    min_df : int or float, default 1
        Minimum document frequency, forwarded to ``SVD_lsa``.

    Returns
    -------
    list
        Up to ``n_results`` titles ordered from most to least similar.

    Raises
    ------
    ValueError
        If ``n_results`` is negative, or if ``page`` does not have exactly one
        row per scored document (positional title lookup would be wrong).
    """
    if n_results < 0:
        # A negative count used to slice silently and return almost the whole corpus.
        raise ValueError("n_results must be non-negative, got {}".format(n_results))

    lsa_doc_term, search_query_lsa = SVD_lsa(
        search_query, n_components=n_components, min_df=min_df
    )
    cos_sim_arr = cosine_similarity(lsa_doc_term, search_query_lsa).ravel()

    # Scores are positional over the corpus; titles come from `page`, so the
    # two must at least have the same number of rows.
    if len(cos_sim_arr) != len(page):
        raise ValueError(
            "corpus has {} documents but page has {} rows".format(len(cos_sim_arr), len(page))
        )

    # argsort is ascending: reverse it, then keep the leading n_results positions.
    indices = np.argsort(cos_sim_arr)[::-1][:n_results]

    related_articles = list(page['title'].iloc[indices])
    return related_articles

In [51]:
# Example: the 20 best matches for "oracle", using 400 LSA components and
# ignoring terms that appear in fewer than 2 documents.
grab_related_articles("oracle", n_results=20, n_components=400, min_df=2)

['oracle applications',
 'oracle corporation',
 'oracle soa suite',
 'oracle application server',
 'oracle fusion middleware',
 'oracle enterprise resource planning cloud',
 'oracle reports',
 'oracle fusion applications',
 'oracle fusion architecture',
 'oracle policy automation',
 'oracle beehive',
 'oracle health sciences',
 'proactive learning',
 'oracle data mining',
 'peoplesoft',
 'oracle webcenter',
 'hyperion planning',
 'netsuite',
 'oracle bi publisher',
 'oracle weblogic server']

### 4.A Write functions to perform this on all types of search queries
#### 4.B Write a class to run these different steps 
    - tfidf (vectorize search term & create vectorized document)
    - lsa (complete lsa)
    - return the 5 most relevant articles (use a method like cosine similarity; compare against nearest neighbors if time permits)
#### 4.C Build a model pipeline to predict which Wikipedia articles are most relevant for the search query

In [None]:
class tfidf_lsa:
    """Work-in-progress wrapper for the TF-IDF -> LSA -> similarity workflow.

    Only the skeleton exists so far. The methods are stubs so that the cell
    parses and the notebook can run top to bottom.
    """

    def __init__(self):
        """Create an instance; no state is configured yet."""
        pass

    def tfdif_vectorizor(self):
        """Placeholder for the TF-IDF vectorization step.

        Raises
        ------
        NotImplementedError
            Always, until the step is implemented.
        """
        # TODO: vectorize the search term and build the document-term matrix.
        raise NotImplementedError("tfidf_lsa.tfdif_vectorizor is not implemented yet")

In [4]:
def tokenize(text):
    """Clean ``text`` with the project's text cleaner and split it into tokens.

    Parameters
    ----------
    text : str
        Raw text to tokenize.

    Returns
    -------
    list of str
        Lower-cased, whitespace-separated tokens of the cleaned text.
    """
    # The cleaner is only available through the `fy` module alias in this
    # notebook; the bare name `text_cleaner` is never imported.
    clean_text = fy.text_cleaner(text)
    return clean_text.lower().split()

In [6]:
# Stand-alone TF-IDF vectorizer: English stop words removed, and a term is kept
# once it appears in at least one document.
# NOTE(review): this rebinds the name of the `tfidf_vectorizer` function
# defined earlier in the notebook; on a top-to-bottom run any later call to
# that function (directly or through SVD_lsa) would hit this object instead.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', min_df=1)