In [26]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

%matplotlib inline

### Apply the same text cleaner used to clean the data to the search query

In [27]:
from os import chdir
# Switch the kernel's working directory to /home/jovyan/ before the project imports.
# NOTE(review): presumably needed so the `library` package imported in the next
# cell resolves from that directory — confirm. The absolute path ties the
# notebook to this particular environment.
chdir('/home/jovyan/')

In [28]:
import library.db_helper as db
import library.functions as fy

In [29]:
# Example free-text query; it is passed to fy.text_cleaner in the next cell.
search = 'support vector machine'

In [30]:
# Normalise the query with the project's text cleaner so it is processed the
# same way as the stored article text (see the heading above this section).
clean_search = fy.text_cleaner(search)

In [31]:
# Echo the cleaned query so it can be compared with the raw `search` string.
clean_search

'support vector machine'

### Grab Text from the Page table

In [32]:
# SQL for the corpus: only the `text` column of every row in the `page` table.
# NOTE(review): there is no ORDER BY. grab_related_articles later maps row
# positions of this frame onto `page['title']`, so both queries must return
# rows in the same order — confirm the database guarantees that.
query = '''
SELECT text
FROM page
'''

# Run the query through the project's db helper. The result is used as a
# DataFrame below (X.head(), X.shape, X['text']).
X = db.query_to_dataframe(query)

In [35]:
# Preview the first rows of the corpus frame.
X.head()

Unnamed: 0,text
0,this article is an orphan as no other articles...
1,this article has multiple issues please help i...
2,this article has multiple issues please help i...
3,a block diagram of the cmac system for a singl...
4,choose and book was an e booking software appl...


In [37]:
# Same table again, this time with every column (the output below shows pid,
# text and title). This rebinds `query`, replacing the text-only SQL string.
query = '''
SELECT * 
FROM page
'''
# `page` supplies the titles that grab_related_articles looks up by row position.
page = db.query_to_dataframe(query)
page.head()

Unnamed: 0,pid,text,title
0,48844125,this article is an orphan as no other articles...,structured sparsity regularization
1,2506529,this article has multiple issues please help i...,cellular neural network
2,33085387,this article has multiple issues please help i...,computhink
3,8160211,a block diagram of the cmac system for a singl...,cerebellar model articulation controller
4,1917193,choose and book was an e booking software appl...,choose and book


In [38]:
# Number of distinct page ids; comparing it with the row count from page.shape
# (next cell) shows whether `pid` is unique per row.
page['pid'].nunique()

2439

In [39]:
# (rows, columns) of the full page frame.
page.shape

(2439, 3)

In [40]:
# Clean every title with the project's text cleaner; the result is only
# displayed, it is not assigned back to `page`.
page['title'].apply(fy.text_cleaner)

0                      structured sparsity regularization
1                                 cellular neural network
2                                              computhink
3                cerebellar model articulation controller
4                                         choose and book
5                                                    codi
6                                             clinicalkey
7                                    competitive learning
8       user custintelmngt sandbox customer intelligen...
9                                        data exploration
10         list of datasets for machine learning research
11                                       machine learning
12                            outline of machine learning
13                                      random projection
14                        statistical relational learning
15                                 stochastic block model
16                                       draft alldone io
17            

### Setup Latent Semantic Analysis - Article Modeling

![](http://interactive.blockdiag.com/image?compression=deflate&encoding=base64&src=eJxLyslPzk7JTExXqOZSUFAqSixXSEksSVRS0LVTUCpILCpOhTCTc1IT85SsQWogTLBoSGpRrq5LfnJpbmpeiUJMnm9iSVFmBUQZDjmwvuAwFwjDM68ktaigKBWkICi1uDSnpBiouxYAYKwuOg)

In [11]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics.pairwise import cosine_similarity

#### 1. Vectorize the Search Term and the Corpus (X['text'])

In [12]:
def tfidf_vectorizer(search_query, min_df=1):
    """Fit a TF-IDF model on the corpus and express the search query in it.

    The corpus is read from the notebook-level frame ``X`` (column ``'text'``),
    so the cell that builds ``X`` must have been run first. A new vectorizer
    is fitted on every call.

    Parameters
    ----------
    search_query : str
        Query text to vectorize.
        NOTE(review): presumably already cleaned with the same text cleaner
        as the corpus — confirm at the call sites.
    min_df : int or float, default 1
        Forwarded unchanged to ``TfidfVectorizer(min_df=...)``.

    Returns
    -------
    tuple
        ``(doc_term_matrix, search_query_vec)``: the TF-IDF matrix of the
        corpus and the query transformed with the same fitted vectorizer.
    """
    vectorizer = TfidfVectorizer(min_df=min_df, stop_words='english')
    corpus_tfidf = vectorizer.fit_transform(X['text'])
    # The query is wrapped in a list because transform() expects an iterable
    # of documents, not a bare string.
    query_tfidf = vectorizer.transform([search_query])
    return corpus_tfidf, query_tfidf

#### 2. Use SVD to reduce dimensionality for the sparse document matrix

In [13]:
def SVD_lsa(search_query, n_components=300, min_df=1, random_state=42):
    """Reduce the TF-IDF corpus and query to a shared LSA space.

    Calls ``tfidf_vectorizer`` to obtain the document-term matrix and the
    query vector, fits a ``TruncatedSVD`` on the documents, and applies the
    same fitted transform to the query.

    Parameters
    ----------
    search_query : str
        Query text, forwarded to ``tfidf_vectorizer``.
    n_components : int, default 300
        Number of SVD components to keep.
    min_df : int or float, default 1
        Forwarded to ``tfidf_vectorizer``.
    random_state : int or None, default 42
        Seed for TruncatedSVD's randomized solver. A fixed seed makes repeated
        calls return the same components (and therefore the same rankings);
        pass ``None`` to restore the previous unseeded behaviour.

    Returns
    -------
    tuple
        ``(lsa_doc_term, search_query_lsa)``: the reduced corpus matrix and
        the reduced query vector.
    """
    doc_term_matrix, search_query_vec = tfidf_vectorizer(search_query, min_df=min_df)

    # Without a seed the default randomized solver gives slightly different
    # components on every run, which can reorder near-tied search results.
    svd = TruncatedSVD(n_components=n_components, random_state=random_state)

    lsa_doc_term = svd.fit_transform(doc_term_matrix)
    search_query_lsa = svd.transform(search_query_vec)

    return lsa_doc_term, search_query_lsa

#### 3. Apply sklearn's cosine_similarity to return article matches for the given search term

In [27]:
# NOTE(review): the output recorded for this cell (2787 rows) differs from the
# page.shape output recorded earlier (2439 rows). The execution counts are out
# of order, so one of the two is probably stale — re-run both and confirm that
# X and page have the same number of rows before trusting positional lookups.
X.shape  # corpus size: (rows, columns) of the text-only frame

(2787, 1)

In [15]:
def grab_related_articles(search_query, n_results=5, n_components=300, min_df=1):
    """Return the titles of the articles most similar to ``search_query``.

    The query and corpus are projected into LSA space by ``SVD_lsa``; every
    document is scored against the query with cosine similarity and the
    best-scoring row positions are looked up in ``page['title']``.

    NOTE(review): the scores are computed from ``X`` while the titles come
    from ``page``; this assumes both frames hold the same rows in the same
    order — confirm.

    Parameters
    ----------
    search_query : str
        Query text.
    n_results : int, default 5
        How many titles to return.
    n_components : int, default 300
        Forwarded to ``SVD_lsa``.
    min_df : int or float, default 1
        Forwarded to ``SVD_lsa``.

    Returns
    -------
    list
        Titles ordered from most to least similar.
    """
    doc_vectors, query_vector = SVD_lsa(search_query, n_components=n_components, min_df=min_df)
    similarities = cosine_similarity(doc_vectors, query_vector).ravel()

    # argsort is ascending: flip it so the best match comes first, then keep
    # the leading n_results positions.
    ranked_positions = np.argsort(similarities)[::-1]
    top_positions = ranked_positions[:n_results]

    return list(page['title'].iloc[top_positions])

In [43]:
# Example search: top five matches for "bayes", using 400 LSA components and
# min_df=2 for the TF-IDF step.
grab_related_articles(
    "bayes",
    n_results=5,
    n_components=400,
    min_df=2,
)

['naive bayes classifier',
 'averaged one dependence estimators',
 'ensemble learning',
 'labeled data',
 'bayesian network']

#### 4.A Write functions to perform this on all types of search queries
#### 4.B Write a class to run these different steps 
    - tfidf (vectorize search term & create vectorized document)
    - lsa (complete lsa)
    - return the 5 most relevant articles (use a method like cosine similarity; compare with nearest neighbor if I have time)
#### 4.C Build a model pipeline to predict which Wikipedia articles are most relevant for the search query

In [None]:
class tfidf_lsa:
    """Class form of the notebook's TF-IDF / LSA search steps (work in progress).

    The original cell was an unfinished stub that did not parse: ``__init__``
    had no body and ``tfdif_vectorizor`` had no parameter list. This version
    makes the cell runnable and implements the vectorize step with the same
    settings as the module-level ``tfidf_vectorizer`` function.

    TODO: add the LSA and ranking steps listed in the plan above.
    NOTE(review): the method name ``tfdif_vectorizor`` is kept exactly as it
    was spelled in the stub; rename it once the intended API is settled.
    """

    def __init__(self, min_df=1):
        """Store the TF-IDF setting; nothing is fitted until a method is called.

        Parameters
        ----------
        min_df : int or float, default 1
            Passed to ``TfidfVectorizer(min_df=...)``.
        """
        self.min_df = min_df
        # Holds the most recently fitted vectorizer so later steps can reuse it.
        self.vectorizer = None

    def tfdif_vectorizor(self, search_query, corpus):
        """Fit TF-IDF on ``corpus`` and transform ``search_query`` with it.

        Parameters
        ----------
        search_query : str
            Query text to vectorize.
        corpus : iterable of str
            Documents to fit on (for example ``X['text']``).

        Returns
        -------
        tuple
            ``(doc_term_matrix, search_query_vec)``.
        """
        self.vectorizer = TfidfVectorizer(stop_words='english', min_df=self.min_df)
        doc_term_matrix = self.vectorizer.fit_transform(corpus)
        # transform() expects an iterable of documents, hence the list.
        search_query_vec = self.vectorizer.transform([search_query])
        return doc_term_matrix, search_query_vec

In [4]:
def tokenize(text):
    """Clean ``text`` with the project's text cleaner and split it into tokens.

    The cleaner is reached through the ``fy`` alias
    (``import library.functions as fy``), the same way the other cells call
    it. The bare name ``text_cleaner`` used previously is never imported in
    this notebook and raised ``NameError``.

    Parameters
    ----------
    text : str
        Raw text to tokenize.

    Returns
    -------
    list of str
        Lower-cased, whitespace-separated tokens of the cleaned text.
    """
    clean_text = fy.text_cleaner(text)
    return clean_text.lower().split()

In [6]:
# NOTE(review): this rebinds the name `tfidf_vectorizer`, which an earlier cell
# defines as a function. After this cell runs, SVD_lsa's call
# `tfidf_vectorizer(search_query, ...)` fails because a TfidfVectorizer
# instance is not callable. Consider giving the instance its own name.
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    min_df=1,
)