In [1]:
import re
import requests
import pandas as pd
import numpy as np
import string
from string import punctuation
from pymongo import MongoClient

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns

from IPython.display import display
from scipy import stats
from sklearn.model_selection import train_test_split

%matplotlib inline 
np.random.seed(42)

from sklearn.preprocessing import LabelEncoder

### I saved the page contents in two different ways
- Written to a MongoDB server
- Pickled "Machine learning" and "Business software" pages (3 nested levels deep each)

### This is using the pickled data

In [107]:
# Load the two pickled objects from the current working directory.
# NOTE(review): unpickling can execute arbitrary code, so only load pickle
# files that come from a trusted source.
ml_df = pd.read_pickle('ml.p')
bs_df = pd.read_pickle('bs.p')

In [108]:
# Stack both frames row-wise into a single corpus. Each frame keeps its own
# index labels (ignore_index is not set), so the labels of the result are only
# unique if the two inputs do not share any.
corpus_df = pd.concat([ml_df, bs_df])

In [111]:
# Preview the first rows of the combined corpus.
corpus_df.head()

Unnamed: 0,category,page_tilte,page_contents
12424,ML,Genetic programming,In artificial intelligence genetic programming...
20412,ML,MATLAB,MATLAB matrix laboratory is a multiparadigm nu...
20926,ML,Supervised learning,Supervised learning is the machine learning ta...
21523,ML,Artificial neural network,Artificial neural networks ANNs or connectioni...
25984,ML,Ray Kurzweil,Raymond Ray Kurzweil KURZwyl born February 12...


### This loads the data from the MongoDB server (recommended); it replaces the `corpus_df` built from the pickled data above

In [2]:
# Connect to the MongoDB server and get handles to the `wikipedia_database`
# database and its `wikipedia_collection` collection.
# NOTE(review): host and port are hardcoded here; consider reading them from
# configuration or environment variables.
client = MongoClient('54.245.184.134', 27016)
db_ref = client.wikipedia_database
coll_ref = db_ref.wikipedia_collection


# Pull every document of the collection into memory and build a DataFrame from
# them. This rebinds corpus_df, replacing any frame assigned to it earlier.
corpus_df = pd.DataFrame(list(coll_ref.find()))

In [4]:
# Show 10 randomly chosen rows as a sanity check of the loaded data.
# No random_state is passed, so which rows appear depends on NumPy's global
# RNG state at the time the cell runs.
corpus_df.sample(10)

Unnamed: 0,_id,category,page_contents,page_title
4329,5a24d03ddd01662634004d4a,machine learning,Quantum machine learning is an emerging interd...,Quantum machine learning
2543,5a249b77dd016621942e6753,Free groupware,phpBB is an Internet forum package in the PHP ...,PhpBB
3526,5a24b766dd016621942e6b7e,Automatic Data Processing,,File:Automatic Data Processing (logo).svg
302,5a248632dd0166215cc2b350,Deep learning,Hierarchical temporal memory HTM is a biologic...,Hierarchical temporal memory
1022,5a248c60dd0166215cc2b642,Learning in computer vision,ImageNets is an open source and platform indep...,ImageNets
2811,5a24a09fdd016621942e6872,ERP software companies,Comarch is a Polish multinational software hou...,Comarch
2586,5a249c65dd016621942e6782,Collaborative real-time editors,Etherpad previously known as EtherPad is an op...,Etherpad
2098,5a24968add016621942e6580,Financial software companies,Blackbaud Inc NASDAQBLKB is a supplier of soft...,Blackbaud
2515,5a249a62dd016621942e6732,Groupware,GNU Ring formerly SFLphone is a SIPcompatible ...,Ring (software)
2880,5a24a0dcdd016621942e68b7,ERP software companies,Visma is a privately held company based in Osl...,Visma


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [6]:
# TF-IDF vectorizer using scikit-learn's built-in English stop-word list;
# min_df=1 keeps every remaining term that occurs in at least one document.
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    min_df=1,
)

In [7]:
# Learn the vocabulary and IDF weights from the page texts and return the
# TF-IDF document-term matrix (one row per page, one column per term).
document_term_matrix_sps = tfidf_vectorizer.fit_transform(corpus_df.page_contents)

In [8]:
# TfidfVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2 in favour of get_feature_names_out(). Prefer the new method
# and fall back to the old one so this cell runs on either version.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    feature_names = tfidf_vectorizer.get_feature_names_out()
else:
    feature_names = tfidf_vectorizer.get_feature_names()

# Dense, labelled copy of the TF-IDF matrix: rows carry corpus_df's index,
# columns are the vocabulary terms.
# NOTE: .toarray() materialises every zero entry, so this frame can be very
# large for a big vocabulary.
document_term_matrix_df = pd.DataFrame(document_term_matrix_sps.toarray(),
                                       index=corpus_df.index,
                                       columns=feature_names)

In [9]:
# Preview four random pages next to their TF-IDF rows.
# Pick the row positions first and concatenate only those rows: concatenating
# the full text column with the full dense matrix would copy the entire matrix
# just to display four rows. Positional selection keeps both pieces on the
# same index labels, so the column-wise concat needs no realignment.
sample_rows = np.random.choice(len(document_term_matrix_df), size=4, replace=False)
pd.concat([corpus_df.page_contents.iloc[sample_rows],
           document_term_matrix_df.iloc[sample_rows]], axis=1)

Unnamed: 0,page_contents,00,000,000001,00001,0001,000198,000198ttt01584tft000198ttt0288ttf01584tft00tfffrac,0001l,0002,...,金融帝国ii,金融帝國ii,電話,飲む,高橋書店ゲーム攻略本シリーズ,자이오넥스,ﬂexibility,ﬂock,ﬂocking,ａ列車で行こうｍｄ
903,Linear genetic programming is unrelated to lin...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3142,Palo is a memory resident multidimensional onl...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2807,JFire was an Enterprise Resource Planning and ...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
875,The weasel program Dawkins weasel or the Dawki...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
from sklearn.decomposition import TruncatedSVD

In [11]:
# Number of latent (LSA) dimensions kept by the truncated SVD.
n_components = 300
# Pin random_state: the default solver is randomized, and with
# random_state=None it draws from NumPy's global RNG, whose state depends on
# which cells (e.g. DataFrame.sample) were executed before this one.
SVD = TruncatedSVD(n_components=n_components, random_state=42)
# Column labels "component_1" ... "component_<n_components>" for the projected matrix.
component_names = ["component_"+str(i+1) for i in range(n_components)]

In [12]:
# Fit the SVD on the sparse TF-IDF matrix instead of its dense DataFrame copy.
# TruncatedSVD accepts sparse input directly, which avoids pushing the fully
# densified matrix through the solver. Fitting on a plain matrix also keeps
# the estimator free of DataFrame column names, so a later
# SVD.transform(<sparse matrix>) does not trigger a feature-name mismatch
# warning. Row order is the same as in document_term_matrix_df.
svd_matrix = SVD.fit_transform(document_term_matrix_sps)

In [13]:
# Total share of the variance retained by the kept SVD components.
SVD.explained_variance_ratio_.sum()

0.35335004561591754

In [14]:
# Wrap the projected matrix in a labelled frame and attach each page's text
# and title (both aligned on the shared index) for easier inspection.
latent_semantic_analysis = pd.DataFrame(
    svd_matrix,
    index=document_term_matrix_df.index,
    columns=component_names,
).assign(
    page_contents=corpus_df.page_contents,
    title=corpus_df.page_title,
)

In [15]:
# Show four randomly chosen rows of the LSA frame as a sanity check.
latent_semantic_analysis.sample(4)

Unnamed: 0,component_1,component_2,component_3,component_4,component_5,component_6,component_7,component_8,component_9,component_10,...,component_293,component_294,component_295,component_296,component_297,component_298,component_299,component_300,page_contents,title
1513,0.067674,0.155391,-0.060427,-0.012821,-0.087862,-0.022598,-0.042114,-0.02348,-0.037189,-0.038193,...,0.013989,-0.005269,-0.016745,-0.001041,0.01018,-0.021153,-0.0019,0.017872,Get Satisfaction is a customer community softw...,Get Satisfaction
1192,0.125714,0.110448,0.203185,0.003921,-0.034661,0.014937,-0.045996,-0.069583,-0.18123,0.076562,...,0.021044,-0.013238,0.01036,-0.002464,-0.013335,-0.012884,-0.00286,-0.00566,Jaime Guillermo Carbonell born July 29 1953 is...,Jaime Carbonell
3156,0.079726,0.193538,-0.071775,-0.08685,0.086289,0.120314,0.075462,-0.011626,-0.021798,-0.020229,...,-0.01572,-0.025364,0.018442,-0.01718,-0.008082,-0.036944,-0.009276,0.006938,Moqui is an open source enterprise resource pl...,Moqui
2602,0.051434,0.099882,-0.014902,-0.02542,0.045325,-0.014064,-0.021402,-0.008719,-0.006276,0.008389,...,-0.002201,-0.010815,0.008486,-0.006319,0.006397,0.02166,-0.003375,0.006941,CoCalc formerly called SageMathCloud is a webb...,CoCalc


### Using cosine similarity to find the top 5 articles most related to a given search term

In [36]:
# Project the query into the same space as the documents: encode it with the
# already-fitted TF-IDF vectorizer, then apply the already-fitted SVD.
search_term = 'clustering'
search_term_vec = tfidf_vectorizer.transform([search_term])
search_term_lsa = SVD.transform(search_term_vec)

In [37]:
# Cosine similarity between every document and the query in LSA space:
#     cos(d, q) = (d . q) / (||d|| * ||q||)
# A bare dot product is not a cosine similarity: without dividing by the
# norms, documents with large vectors are favoured regardless of direction.
dot_products = svd_matrix.dot(search_term_lsa.T).ravel()
norm_products = np.linalg.norm(svd_matrix, axis=1) * np.linalg.norm(search_term_lsa)
# Zero-norm vectors (e.g. a page with no text, or a query made only of
# out-of-vocabulary words) have no direction; give them similarity 0 instead
# of dividing by zero.
cosine_similarities = np.divide(dot_products, norm_products,
                                out=np.zeros_like(dot_products, dtype=float),
                                where=norm_products > 0)

In [38]:
# Row positions of the 5 highest scores, best first: sort ascending, reverse,
# then keep the leading five.
indexes = np.argsort(cosine_similarities)[::-1][:5]


In [39]:
# Print the title of each selected page, in the order given by `indexes`
# (positional lookup into corpus_df).
for i in indexes:
    print(corpus_df['page_title'].iloc[i])

Cluster analysis
Consensus clustering
Single-linkage clustering
Hierarchical clustering
Correlation clustering
