In [2]:
import pandas as pd
import numpy as np
from pymongo import MongoClient
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
import re
import spacy
from tqdm import tqdm_notebook

nlp = spacy.load('en')


def cleaner(text):
    mapping = [('&#39;', ''), 
               ('<br />', ''), 
               ('<.*>.*</.*>', ''), 
               ('\\ufeff', ''), 
               ('[\d]', ''),  
               ('\[.*\]', ''),  
               ('[^a-z ]', '')]
    for k, v in mapping:
        text = re.sub(k, v, text)
        
    text = ' '.join(i.lemma_ for i in nlp(text))
    text = ' '.join(text.split())
    
    return text

In [None]:
client = MongoClient('ec2-54-69-203-249.us-west-2.compute.amazonaws.com', 27016)

In [None]:
client.database_names()

In [None]:
pages_db = client['wiki']

In [None]:
pages_db.collection_names()

In [None]:
pages_coll = pages_db['wiki_pages']

In [None]:
cursor = pages_coll.find()

In [None]:
n = pages_coll.count()
wiki_corpus = pd.DataFrame(columns = ['_id', 'pageid', 'text', 'title'])

for i in tqdm_notebook(range(n)):
    try:
        tmp = next(cursor)
        tmp.pop('categories') # remove categories as it is causing some issues
        tmp['text'] = cleaner(tmp['text'].lower().replace("\n", " "))
        
        pages_coll.update_one({"_id": tmp["_id"]}, {"$set": {"clean_text": tmp['text']}})

        
        tmp_df = pd.DataFrame(tmp, index=[i])

        wiki_corpus = pd.concat([wiki_corpus, tmp_df])
    except:
        pass

In [None]:
wiki_corpus.shape

In [None]:
wiki_corpus.head()

In [None]:
client.close()

In [None]:
wiki_corpus.columns

In [None]:
wiki_corpus = wiki_corpus.drop('text', axis = 1)

In [None]:
pass_notability_mask = wiki_corpus['clean_text'].str.contains('this article may not meet wikipedia general notability guideline') == False
pass_issues_mask = wiki_corpus['clean_text'].str.contains('this article have multiple issue please help improve') == False

In [None]:
wiki_corpus = wiki_corpus.loc[pass_notability_mask & pass_issues_mask, :]

In [None]:
wiki_corpus.shape

In [None]:
wiki_corpus.to_pickle('wiki_corpus.pkl')

In [3]:
wiki_corpus = pd.read_pickle('wiki_corpus.pkl')

In [17]:
tfidf_vectorizer = TfidfVectorizer(min_df = 25, max_df = 0.50, stop_words = 'english', ngram_range=(1,2))

In [18]:
document_term_matrix_sps = tfidf_vectorizer.fit_transform(wiki_corpus['clean_text'])

In [19]:
# document_term_matrix_df = pd.DataFrame(document_term_matrix_sps.toarray(),
#                                        index=wiki_corpus.index,
#                                        columns=tfidf_vectorizer.get_feature_names())

# document_term_matrix_df.head()
document_term_matrix_sps.shape

(42839, 59284)

In [20]:
from sklearn.externals import joblib

In [21]:
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl') 

['tfidf_vectorizer.pkl']

## Document Term Matrix is over determined

Use SVD to drop the number of features

In [22]:
n_components = 100
SVD = TruncatedSVD(n_components, random_state=42)
component_names = ["component_"+str(i+1) for i in range(n_components)]

In [23]:
svd_matrix = SVD.fit_transform(document_term_matrix_sps)

In [24]:
# svd_matrix_df = pd.DataFrame(svd_matrix,
#                              index=wiki_corpus.index, 
#                              columns=component_names)
# svd_matrix_df['title'] = wiki_corpus['title']

svd_matrix_df = pd.DataFrame(svd_matrix,
                             index=wiki_corpus['title'], 
                             columns=component_names)

In [25]:
svd_matrix_df.shape

(42839, 100)

In [26]:
svd_matrix_df.head()

Unnamed: 0_level_0,component_1,component_2,component_3,component_4,component_5,component_6,component_7,component_8,component_9,component_10,...,component_91,component_92,component_93,component_94,component_95,component_96,component_97,component_98,component_99,component_100
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Activity recognition,0.071389,0.040053,0.028408,-0.069883,-0.001867,0.005348,0.006114,-0.047876,-0.018117,-0.031499,...,0.019571,-0.012486,-0.003922,0.026974,0.027402,0.01491,0.00502,0.008259,0.011405,-0.005507
AlchemyAPI,0.073155,0.026081,0.040616,-0.067207,-0.010691,0.002985,0.00645,-0.037876,-0.058823,-0.037097,...,0.011816,0.016043,0.010867,-0.008517,0.016883,-0.018888,-0.009693,0.006752,0.023394,0.012028
BigDL,0.054866,0.017018,0.025618,-0.036519,-0.008558,0.003645,-0.015233,-0.019314,-0.048705,-0.047201,...,0.000354,0.009501,0.046713,-0.035232,-0.042077,0.002097,-0.009872,0.012512,-0.034184,-0.021799
Caffe (software),0.082547,0.021633,0.017583,-0.050028,-0.012164,0.004306,-0.009593,-0.030145,-0.076603,-0.053127,...,0.016038,0.009571,0.043851,-0.018377,-0.024303,-0.001933,0.012483,0.020335,-0.010086,-0.015337
Comparison of deep learning software,0.056958,0.028135,0.024462,-0.049099,-0.009325,0.003126,-0.003438,-0.030655,-0.007986,-0.035328,...,-0.017045,0.006319,0.003174,-0.003137,0.000909,0.000975,0.00642,0.006358,-0.004187,0.010001


In [27]:
svd_matrix_df.to_pickle('svd_matrix_df.pkl')

In [28]:
def search_for_pages(search_term):
    tmp_svd_df = svd_matrix_df.copy()
    
    clean_search = search_term.lower().replace("\n", " ") # converts to lowercase and drops any \n
    clean_search = cleaner(clean_search) #cleans other special characters, whitespace, numbers, etc.
    clean_search = [clean_search]
    
    tfdif_search = tfidf_vectorizer.transform(clean_search)
    svd_search = SVD.transform(tfdif_search)
    
    cosine = cosine_similarity(tmp_svd_df, svd_search)
    pearson = tmp_svd_df.apply(lambda x: np.corrcoef(x, svd_search)[0][1], axis = 1)
    
    tmp_svd_df['cosine_similarity'] = cosine
    tmp_svd_df['pearson_similarity'] = pearson
#     print(tmp_svd_df.shape)
#     print(svd_search[0])

    return tmp_svd_df[['cosine_similarity', 'pearson_similarity']]
    

In [29]:
search_string = """
learning algorithms heuristic implicit latent
"""

In [30]:
df = search_for_pages(search_string)

  c /= stddev[:, None]
  c /= stddev[None, :]


In [31]:
df.sort_values('cosine_similarity', ascending=False).head()

Unnamed: 0_level_0,cosine_similarity,pearson_similarity
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Relief (feature selection),0.96249,0.962898
Matrix regularization,0.958635,0.95941
Distribution learning theory,0.954589,0.954722
BrownBoost,0.953753,0.954082
Algorithmic learning theory,0.953635,0.95368


In [32]:
joblib.dump(SVD, 'SVD.pkl') 

['SVD.pkl']

`min_df` parameter in `TfidfVectorizer` and `n_components` in `TruncatedSVD` were chosen to keep the saved models under 100 MB in size (so they can be pushed to git)