# Topic Modeling
---
#### Imports

In [1]:
# processing
from operator import methodcaller
import csv
import re
import numpy as np
import pandas as pd
from pprint import pprint

# gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# plotting tools
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt

#sci-kit
from sklearn import feature_extraction



#### Processing

In [2]:
blacklist = [t.strip() for t in next(csv.reader(open("blacklist.csv", 'r')))]

inPath = "input.csv"
outPath = "out.csv"
wordBound = 10
charBound = 70

inFile = open(inPath, 'r')
inReader = csv.reader(inFile)

outFile = open(outPath, 'w')
outWriter = csv.writer(outFile)


docTokens = dict()


next(inReader)
for inRow in inReader:

    charDist = int(inRow[0])
    wordDist = int(inRow[1])

    if wordDist < wordBound and charDist < charBound:

        #predTerm, subTerm, objTerm = map(methodcaller("split", ":"), inRow[2:5])
        #allTerms = predTerm + subTerm + objTerm

        subTerm, objTerm = map(methodcaller("split", ":"), inRow[3:5])
        
        sub = "_".join([t for t in subTerm if re.match(r'[^\W\d]*$', t) and not t in blacklist])
        obj = "_".join([t for t in subTerm if re.match(r'[^\W\d]*$', t) and not t in blacklist])
        
        tokens = list()
        if not sub in blacklist and not obj in blacklist and len(sub) > 0 and len(obj) > 0:
            tokens = [sub, obj]

            docID = inRow[5]

            if docID in docTokens:
                docTokens[docID] += tokens
            else:
                docTokens[docID] = tokens
docIDs = list(docTokens.keys())
data = list(docTokens.values())

#### Model

In [13]:
id2word = corpora.Dictionary(data)
texts = data

corpus = [id2word.doc2bow(text) for text in texts]

lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=10, 
                                       random_state=100,
                                       update_every=1,
                                       chunksize=20,
                                       passes=1,
                                       alpha='auto',
                                       per_word_topics=True)


pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]


[(0,
  '0.109*"crystal_structure" + 0.020*"agc_family" + '
  '0.020*"detector_technology" + 0.020*"study_reliability" + '
  '0.020*"phosphate_iron_vanadate" + 0.015*"hydrogenous_material" + '
  '0.015*"recent_time" + 0.015*"morphological_information" + '
  '0.015*"alternative_quantification" + 0.015*"obtain_approach"'),
 (1,
  '0.063*"diffraction_datum" + 0.023*"important_role" + 0.023*"da_minus" + '
  '0.019*"weak_pi" + 0.017*"kda_protein" + 0.015*"recent_research" + '
  '0.015*"e_pathway" + 0.015*"vm_value" + '
  '0.015*"nucleocapsid_protein_interest" + 0.015*"monomer_position"'),
 (2,
  '0.033*"system_performance" + 0.022*"qualitative_analysis" + '
  '0.022*"transform" + 0.022*"concentrate_polymer_solution" + 0.022*"mix_film" '
  '+ 0.018*"molecular_replacement" + 0.016*"quantitative_analysis" + '
  '0.016*"supervise_quantitative_analysis" + 0.015*"datum_set" + '
  '0.015*"ring_system"'),
 (3,
  '0.159*"cyano_group" + 0.080*"pyrazole_ring_nh_h_atom" + 0.080*"nh_h_atom" + '
  '0.080*

#### Visualize

In [156]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
pyLDAvis.show(vis)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))



Note: if you're in the IPython notebook, pyLDAvis.show() is not the best command
      to use. Consider using pyLDAvis.display(), or pyLDAvis.enable_notebook().
      See more information at http://pyLDAvis.github.io/quickstart.html .

You must interrupt the kernel to end this command

Serving to http://127.0.0.1:8889/    [Ctrl-C to exit]


127.0.0.1 - - [03/Jul/2019 22:08:27] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [03/Jul/2019 22:08:27] "GET /LDAvis.css HTTP/1.1" 200 -
127.0.0.1 - - [03/Jul/2019 22:08:27] "GET /d3.js HTTP/1.1" 200 -
127.0.0.1 - - [03/Jul/2019 22:08:27] "GET /LDAvis.js HTTP/1.1" 200 -
127.0.0.1 - - [03/Jul/2019 22:08:27] code 404, message Not Found
127.0.0.1 - - [03/Jul/2019 22:08:27] "GET /favicon.ico HTTP/1.1" 404 -



stopping Server...


In [4]:

from sklearn.feature_extraction.text import TfidfVectorizer

newDocs = list()
for doc in data:
    newDocs.append(" ".join(doc))


#define vectorizer parameters
tfidf_vectorizer = TfidfVectorizer(max_df=0.05, max_features=200000,
                                 min_df=0.000, stop_words='english',
                                 use_idf=True, tokenizer=None, ngram_range=(1, 1))

tfidf_matrix = tfidf_vectorizer.fit_transform(newDocs) #fit the vectorizer to synopses

print(tfidf_matrix.shape)

(923, 2200)


In [5]:
terms = tfidf_vectorizer.get_feature_names()

from sklearn.metrics.pairwise import cosine_similarity
dist = 1 - cosine_similarity(tfidf_matrix)

In [161]:
from sklearn.cluster import KMeans

num_clusters = 20

km = KMeans(n_clusters=num_clusters)

%time km.fit(tfidf_matrix)

clusters = km.labels_.tolist()

Wall time: 342 ms


In [8]:
import sys
import urllib.request
from urllib.error import HTTPError


BASE_URL = 'http://dx.doi.org/'

def getTitle(doi):
    url = BASE_URL + doi
    req = urllib.request.Request(url)
    req.add_header('Accept', 'application/x-bibtex')
    try:
        with urllib.request.urlopen(req) as f:
            bibtex = f.read().decode()
        start = bibtex.find("title = {")
        end = bibtex.find("},", start)
        return bibtex[start + 9:end]
        
        
    except HTTPError as e:
        if e.code == 404:
            return('DOI not found.')
        else:
            return('Service unavailable.')

In [162]:
from sklearn.externals import joblib

#uncomment the below to save your model 
#since I've already run my model I am loading from the pickle

joblib.dump(km,  'doc_cluster.pkl')

km = joblib.load('doc_cluster.pkl')
clusters = km.labels_.tolist()


print(clusters)

[7, 7, 7, 7, 7, 7, 7, 7, 13, 1, 7, 1, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 1, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 1, 7, 7, 7, 7, 1, 7, 7, 1, 7, 19, 12, 7, 7, 7, 1, 7, 7, 7, 7, 7, 1, 7, 9, 1, 7, 1, 1, 7, 7, 1, 7, 7, 7, 3, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 16, 1, 7, 0, 1, 7, 7, 7, 6, 7, 1, 3, 7, 17, 7, 7, 7, 7, 7, 9, 7, 1, 2, 7, 7, 7, 1, 7, 10, 1, 7, 7, 7, 1, 7, 7, 11, 7, 1, 7, 6, 7, 7, 0, 8, 7, 1, 1, 7, 7, 7, 1, 7, 10, 7, 7, 1, 7, 1, 7, 1, 7, 7, 7, 7, 7, 7, 7, 10, 7, 1, 7, 7, 19, 7, 7, 1, 14, 7, 1, 7, 1, 7, 7, 7, 7, 7, 1, 16, 7, 7, 7, 7, 7, 7, 7, 1, 7, 1, 7, 1, 1, 7, 7, 1, 1, 7, 7, 13, 7, 7, 7, 1, 7, 1, 1, 1, 7, 7, 1, 9, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 1, 7, 7, 7, 7, 7, 1, 1, 7, 7, 7, 1, 7, 7, 7, 7, 7, 7, 7, 7, 7, 1, 7, 9, 7, 7, 7, 7, 7, 7, 7, 7, 7, 5, 7, 7, 1, 7, 7, 7, 7, 1, 9, 7, 7, 5, 7, 7, 7, 7, 7, 7, 0, 1, 14, 7, 7, 0, 1, 7, 1, 1, 7, 7, 7, 7, 7, 7, 7, 7, 7, 1, 7, 14, 7, 18, 7, 1, 1, 1, 7, 1, 1, 0, 1, 7, 7, 7, 7, 7, 7, 0, 1, 1, 7, 7, 7, 7, 7, 16, 7, 7, 1, 7, 1, 7, 7, 7, 7, 7, 7, 7

In [163]:
documents = {'docID': docIDs, 'text': data, 'cluster': clusters }

frame = pd.DataFrame(documents, index = [clusters] , columns = ['docID','cluster'])

frame['cluster'].value_counts() #number of films per cluster (clusters from 0 to 4)



7     614
1     141
2      31
9      22
14     14
10     13
6      11
5      10
0      10
11     10
17      9
3       7
15      6
4       6
8       5
19      5
16      4
13      3
18      1
12      1
Name: cluster, dtype: int64

In [None]:
from __future__ import print_function

print("Top terms per cluster:")
print()
#sort cluster centers by proximity to centroid

outWriter = csv.writer(open("out.csv", 'w'), lineterminator = "\n")


for i in range(num_clusters):
    print("Cluster %d docIDs:" % i, end='')
    
    DOIs = frame.ix[i]['docID'].values.tolist()
    titles = [getTitle(ID) for ID in DOIs]
    print(titles)
    outWriter.writerow(titles)
    print() #add whitespace
    print() #add whitespace

print()
print()

Top terms per cluster:

Cluster 0 docIDs:

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  del sys.path[0]


['Di-2-pyridyl ketonep-aminobenzoylhydrazone hydrate', 'A novel metallo-organically templated pentaborate: acetato[N,N$\\prime$-bis(2-aminoethyl)ethane-1,2-diamine]zinc({II}) 4,4$\\prime$,6,6$\\prime$-tetrahydroxy-2,2$\\prime$-spirobi[cyclotriboroxane](1-)', 'Crystal structure of $\\upbeta$-luffin, a ribosome-inactivating protein, at 2.0{\\hspace{0.25em}}{\\{AA}} resolution', 'Gene design, expression, crystallization and preliminary diffraction analysis of two isolectins from the {fungusCoprinus} cinereus: a model for studying functional diversification of galectins in the same organism and their evolutionary pathways', 'Structural basis of high-order oligomerization of the cullin-3 adaptor {SPOP}', 'Preliminary crystallographic characterization of anin vitroevolved biotin-binding {RNA} pseudoknot', 'Purification and crystallization of Cor a 9, a major hazelnut allergen', 'Cloning, expression, purification and crystallization of an endotoxin-biosynthesis enzyme {fromNeisseria} meningit

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  del sys.path[0]


['Polarised neutron investigation of iron composite nanoparticles', 'Accuracy in Rietveld quantitative phase analysis of Portland cements', 'Combined synchrotron X-ray and image-correlation analyses of biaxially deformed W/Cu nanocomposite thin films on Kapton', 'A dynamic study of the crystallization of polyethylene from the melt', 'A new method to determine the exact values of the fiber identity period of polyamides', 'Whole-powder-pattern fitting without reference to a structural model: application to X-ray powder diffraction data', 'Structure refinement of {GeO}2 polymorphs at high pressures and temperatures by energy-dispersive spectra of powder diffraction', 'Relation between the optical properties and structure of {KLiSO}4in the room-temperature phase', 'The structure of pumice by neutron diffraction', 'An open-flow cryogenic cooler for single-crystal diffraction experiments', 'Determination of the Cation Distribution in {NiFe}2({PO}4)2 using Resonant X-ray and Neutron Powder Di

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  del sys.path[0]


['Do C{\\textemdash}H...O and C{\\textemdash}H...$\\uppi$ interactions help to stabilize a non-centrosymmetric structure for racemic 2,3-dibromo-1,3-diphenylpropan-1-one?', '21$\\upalpha$-Fluoro-7-nor-12,13,15,16-tetrahydrovouacapane-17$\\upbeta$,21$\\upalpha$-lactone', 'N$\\prime$-(Benzenesulfonyl)-4-methylthiazole-5-carbohydrazide', '2,5,7-Trinitro-2,5,7,9-tetraazabicyclo[4.3.0]nonan-8-one', '2-Amino-5-chloro-1,3-benzoxazole', '(E)-2-Chlorobenzaldehyde oxime', '1-(5,6-Dimethyl-1,2,4-triazin-3-yl)-2-methyl-1H-benzimidazole', '2-(2-Pyridylmethylammonio)ethanesulfonate dihydrate', '9-(Bromoacetyl)anthracene', 'Bis[iodidobis(1,10-phenanthroline-$\\upkappa$2N,N$\\prime$)copper({II})] tetraiodidocadmate({II})', 'Bis[$\\upmu$-5-(pyrazin-2-yl)tetrazol-1-ido]bis[azido(2,2$\\prime$-bipyridine)copper({II})]', 'N,N,N$\\prime$,N$\\prime$-Tetraphenylnaphthalene-1,4-dicarboxamide', '(E)-1,3-Benzodioxole-5-carbaldehyde 4-nitrophenylhydrazone', '1,t-3-Dimethyl-r-2,c-6-diphenylpiperidin-c-4-yl acetate

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  del sys.path[0]


['Asperuloside monohydrate', 'Hydrogen-bonding and C{\\textemdash}H...$\\uppi$ interactions in 1,7-bis(4-hydroxy-3-methoxyphenyl)heptane-3,5-dione (tetrahydrocurcumin)', 'N-(9-Anthrylmethyl)propylaminium diphenylphosphinate monohydrate', '1,4-Dibromonaphthalene-2,3-diol', '1,3-Bis(2-chlorophenyl)thiourea: a monoclinic polymorph', 'N-(4-Chlorobutanoyl)-N$\\prime$-[2-(trifluoromethyl)phenyl]thiourea', 'Crystal structure of 2-(diphenylphosphanyl)phenyl 4-(hydroxymethyl)benzoate']


Cluster 4 docIDs:

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  del sys.path[0]


['1-(1H-1,3-Benzimidazol-2-yl)guanidinium dihydrogendodecamolybdophosphate{\\textendash}methanol{\\textendash}water (1/2.5/4)', 'Bupropion hydrobromide propanol hemisolvate', '$\\lbrace$4-Bromo-2-[(5-chloro-2-oxidophenyl)iminomethyl]phenolato-$\\upkappa$3O,N,O$\\prime$$\\rbrace$(methanol-$\\upkappa$O)(methanolato-$\\upkappa$O)oxidovanadium(V)', 'Bis[N-(2-hydroxyethyl)-N-methyldithiocarbamato-$\\upkappa$S][2,4,6-tris(pyridin-2-yl)-1,3,5-triazine-$\\upkappa$3N1,N2,N6]zinc dioxane sesquisolvate', '2-Vinylpyridine{\\textendash}tris(pentafluorophenyl)borane hexane monosolvate', 'Twofac-tricarbonylrhenium(I) azadipyrromethene ({ADPM}) complexes: ligand-substitution effect on crystal structure']


Cluster 5 docIDs:

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  del sys.path[0]


['The negative-quartet relation from electron-density considerations', 'Separating nucleation and growth in protein crystallization using dynamic light scattering', 'Local structure study of dilute Er in {III}{\\textendash}V semiconductors by fluorescence {EXAFS}', 'Ethyl 2-acetyl-3-anilinobutanoate', '(Acetonitrile-$\\upkappa$N)(3-amino-4-methylbenzenesulfonato-$\\upkappa$N)aqua(triphenylphosphine-$\\upkappa$P)silver(I) hemihydrate', '1-(Benzylideneamino)pyridinum iodide', 'N-[11-(4-Chlorophenyl)-11,12-dihydrobenzo[c]phenanthridin-6-yl]benzamide', '1,3-Bis(4-fluorophenyl)-N,N$\\prime$-(propane-1,3-diylidene)dihydroxylamine', '4-tert-Butylpyridinium chloride{\\textendash}4,4$\\prime$-(propane-2,2-diyl)bis(2,6-dimethylphenol){\\textendash}toluene (2/2/1)', 'Crystallization and preliminary crystallographic analysis of the catechol 2,3-dioxygenase {PheB} {fromBacillus} {stearothermophilusBR}219']


Cluster 6 docIDs:

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  del sys.path[0]


['Dinicotinamidium squarate', 'Four 2-amino-6-aryl-4-methoxy-11H-pyrimido[4,5-b][1,4]benzodiazepines: similar molecular structures but different crystal structures', 'Bis[1-(2-hydroxyethyliminomethyl)-2-naphtholato-$\\upkappa$2N,O]nickel({II})', 'Triaquabis(4-formylbenzoato-$\\upkappa$2O,O$\\prime$)cadmium({II}) 3.5-hydrate', '3-Phenyl-1-[2-(3-phenylisoquinolin-1-yl)diselanyl]isoquinoline', '2-(1H-Benzotriazol-1-yl)-1-phenylethanol', '3,5-Bis(4-hydroxyphenyl)-4H-1,2,4-triazol-4-amine monohydrate', '4-Hydrazinylidene-1-methyl-3H-2$\\uplambda$6,1-benzothiazine-2,2-dione', '2-(4-Chloro-1H-indol-3-yl)acetonitrile', 'Ergotaminine', 'Methyl 4-(benzyloxy)-3-methoxybenzoate']


Cluster 7 docIDs:

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  del sys.path[0]


In [11]:

from scipy.cluster.hierarchy import ward, dendrogram


import matplotlib as mpl
mpl.rcParams['text.usetex'] = True
mpl.rcParams['text.latex.preamble'] = [r'\usepackage{amsmath}']


linkage_matrix = ward(dist) #define the linkage_matrix using ward clustering pre-computed distances

titles = [getTitle(ID) for ID in docIDs]

fig, ax = plt.subplots(figsize=(15, 20)) # set size
ax = dendrogram(linkage_matrix, orientation="right", labels=titles);

plt.tick_params(\
    axis= 'x',          # changes apply to the x-axis
    which='both',      # both major and minor ticks are affected
    bottom='off',      # ticks along the bottom edge are off
    top='off',         # ticks along the top edge are off
    labelbottom='off')

plt.tight_layout() #show plot with tight layout

#uncomment below to save figure
plt.savefig('ward_clusters.png', dpi=200) #save figure as ward_clusters


FileNotFoundError: [WinError 2] The system cannot find the file specified

Error in callback <function install_repl_displayhook.<locals>.post_execute at 0x000001E5E570CC80> (for post_execute):


FileNotFoundError: [WinError 2] The system cannot find the file specified

FileNotFoundError: [WinError 2] The system cannot find the file specified

<Figure size 1080x1440 with 1 Axes>