In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

%matplotlib inline


1. clean the input string the same way you cleaned your data
1. use your tfidf or count-vectorizer to transform your search query (transform)
1. use your TruncatedSVC to transform your document-term matrix
1. use cosine similarity OR nearest neighbors (or both) to find the top 5 most similar documents

### Apply the same text cleaner used to clean the data to the search query

In [3]:
from os import chdir
chdir('/home/jovyan/')

In [4]:
import library.db_helper as db
import library.functions as fy

In [5]:
search_query = 'support vector machine'

In [7]:
clean_search = fy.text_cleaner(search_query)

In [8]:
clean_search

'support vector machine'

### Grab Text from the Page table

In [12]:
query = '''
SELECT text
FROM page
'''

X = db.query_to_dataframe(query)

In [13]:
X.head()

Unnamed: 0,text
0,this is not a wikipedia article it is an indiv...
1,this article has multiple issues please help i...
2,machine learning anddata miningproblemsclassif...
3,for the journal see machine learning journal m...
4,the following outline is provided as an overvi...


In [14]:
query = '''
SELECT * 
FROM page
'''
page = db.query_to_dataframe(query)
page.head()

Unnamed: 0,pid,text,title
0,54972729,this is not a wikipedia article it is an indiv...,user custintelmngt sandbox customer intelligen...
1,43385931,this article has multiple issues please help i...,data exploration
2,49082762,machine learning anddata miningproblemsclassif...,list of datasets for machine learning research
3,233488,for the journal see machine learning journal m...,machine learning
4,53587467,the following outline is provided as an overvi...,outline of machine learning


### Setup Latent Semantic Analysis - Article Modeling

![](http://interactive.blockdiag.com/image?compression=deflate&encoding=base64&src=eJxLyslPzk7JTExXqOZSUFAqSixXSEksSVRS0LVTUCpILCpOhTCTc1IT85SsQWogTLBoSGpRrq5LfnJpbmpeiUJMnm9iSVFmBUQZDjmwvuAwFwjDM68ktaigKBWkICi1uDSnpBiouxYAYKwuOg)

In [15]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics.pairwise import cosine_similarity

#### 1. Vetorize the Search Term and the Corpus (X['text'])

In [16]:
# Instantiate the tfidf vectorizer and apply it to the corpus and to the searh term
tfidf_vectorizer = TfidfVectorizer(stop_words='english', min_df=1)

In [17]:
# fit and transform the corpus into a sparse matrix using tfidf vectorizer
doc_term_matrix = tfidf_vectorizer.fit_transform(X['text'])

In [18]:
# transform the search term to a vector (do not fit)
search_term_vec = tfidf_vectorizer.transform([search_query])

In [19]:
def tfidf_vectorizer (search_query, min_df=1):
    tfidf_vec = TfidfVectorizer(stop_words = 'english', min_df=min_df)
    doc_term_matrix = tfidf_vec.fit_transform(X['text'])
    search_query_vec = tfidf_vec.transform([search_query])
    return doc_term_matrix, search_query_vec

#### 2. Use SVD to reduce dimensionality for the sparse document matrix

In [20]:
# Instantiate TruncatedSVD
SVD = TruncatedSVD(n_components=300)

In [21]:
# fit & transform the document term matrix to reduce the dimensionality of the doc_term_matrix
latent_semantic_analysis = SVD.fit_transform(doc_term_matrix)

In [22]:
# transform the vectorized search term (do not fit)
search_term_vec_lsa = SVD.transform(search_term_vec)
type(search_term_vec_lsa)

numpy.ndarray

In [32]:
def SVD_lsa (search_query, n_components=300):
    SVD = TruncatedSVD(n_components=n_components)
    doc_term_matrix, search_query_vec = tfidf_vectorizer (search_query)
    lsa_doc_term = SVD.fit_transform(doc_term_matrix)
    search_query_lsa = SVD.transform(search_query_vec)
    return lsa_doc_term, search_query_lsa

#### 3. Apply sklearn's cosine_similiarity to return article matches for the given search term

In [24]:
cos_sim = cosine_similarity

In [25]:
cos_sim_arr = cos_sim(latent_semantic_analysis, search_term_vec_lsa).ravel()
cos_sim_arr

array([ 0.06200576,  0.08903066,  0.05471174, ...,  0.03365297,
       -0.00027005, -0.01681301])

In [26]:
indices = np.argsort(cos_sim_arr)[:-6:-1]

In [27]:
X.shape # corpus

(2787, 1)

In [28]:
page.shape # page table in my SQL db

(2787, 3)

In [29]:
list(page['title'].iloc[indices])

['relevance vector machine',
 'relevance vector machine',
 'corinna cortes',
 'libsvm',
 'international conference on machine learning']

In [35]:
def grab_related_articles (search_query, n_results=5):
    lsa_doc_term, search_query_lsa = SVD_lsa(search_query)
    cos_sim_arr = cosine_similarity(lsa_doc_term, search_query_lsa).ravel()
    
    first_term = -1*(n_results) - 1 
    np.argsort(cos_sim_arr)[:first_term: -1]
    related_articles = list(page['title'].iloc[indices])
    return related_articles   

In [36]:
grab_related_articles("oracle")

['relevance vector machine',
 'relevance vector machine',
 'corinna cortes',
 'libsvm',
 'international conference on machine learning']

#### 4.A Write functions to perform this on all types of search queries
#### 4.B Write a class to run these different steps 
    - tfidf (vectorize search term & create vectorized document)
    - lsa (complete lsa)
    - return the 5 most relevant articles (use a method like cosine similiarty, nearest neighbor if I have time to compare)
#### 4.C Build a model pipeline to predict which Wikipedia articles are most relevant for the search query

In [None]:
class tfidf_lsa:
    
    def __init__(self):
        
    def tfdif_vectorizor:
        
        
        

In [4]:
def tokenize (text):
    clean_text = text_cleaner(text)
    return clean_text.lower().split()

In [6]:
tfidf_vectorizer = TfidfVectorizer(min_df = 1, stop_words = 'english')