In [81]:
import os
import numpy as np
import pandas as pd
from pprint import pprint
import matplotlib.pyplot as plt

import networkx as nx

import warnings
warnings.filterwarnings("ignore")

In [5]:
!pip install spacy
!pip install pytextrank

Collecting pytextrank
  Downloading pytextrank-3.2.4-py3-none-any.whl (30 kB)
Collecting icecream>=2.1
  Downloading icecream-2.1.3-py2.py3-none-any.whl (8.4 kB)
Collecting scipy>=1.7
  Using cached scipy-1.7.3-cp37-cp37m-macosx_10_9_x86_64.whl (33.0 MB)
Collecting pygments>=2.7.4
  Downloading Pygments-2.13.0-py3-none-any.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 10.6 MB/s eta 0:00:01
[?25hCollecting graphviz>=0.13
  Downloading graphviz-0.20.1-py3-none-any.whl (47 kB)
[K     |████████████████████████████████| 47 kB 10.7 MB/s eta 0:00:01
[?25hCollecting networkx[default]>=2.6
  Downloading networkx-2.6.3-py3-none-any.whl (1.9 MB)
[K     |████████████████████████████████| 1.9 MB 11.4 MB/s eta 0:00:01
Collecting executing>=0.3.1
  Downloading executing-1.1.1-py2.py3-none-any.whl (22 kB)
Collecting asttokens>=2.0.1
  Downloading asttokens-2.0.8-py2.py3-none-any.whl (23 kB)
Collecting matplotlib>=3.3; extra == "default"
  Downloading matplotlib

In [67]:
!python -m spacy download en_core_web_sm

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [52]:
text = ""

file = open('/Users/megan/work/ml-study/text-mining-example/covid19/docs.txt', 'r')
lines = file.readlines()[0:5]

for line in lines:
    text += line.strip()

print(text)



In [68]:
import spacy
import pytextrank

# text = ("When Sebastian Thrun started working on self-driving cars at "
#         "Google in 2007, few people outside of the company took him "
#         "seriously. “I can tell you very senior CEOs of major American "
#         "car companies would shake my hand and turn away because I wasn’t "
#         "worth talking to,” said Thrun, in an interview with Recode earlier "
#         "this week.")

nlp = spacy.load('en_core_web_sm')
doc = nlp(text)

# for token in doc[:2]:
#     print(token.text, token.lemma_, token.pos_, token.tag_)
# mutations mutation NOUN NNS
# reveal reveal VERB VBP

# add PyTextRank to the spaCy pipeline
nlp.add_pipe("textrank")
doc = nlp(text)

keyword_list = []
ranks_list = []
for phrase in doc._.phrases:
    # print(phrase.text)
    keyword_list.append(phrase.text)
    print(phrase.rank, phrase.count)
    ranks_list.append(phrase.rank)
    print(phrase.chunks)

print(keyword_list[:5])
print(ranks_list[:5])
print(len(keyword_list))
print(len(ranks_list))

0.08641318107468375 1
[initiation di rna replication wild type leader]
0.0809209027940595 1
[rna molecules leader]
0.08006492392426426 1
[rna replication test hypothesis]
0.076783555087456 1
[natural mouse hepatitis coronavirus di rnas di rna bovine coronavirus]
0.0761753149359331 1
[genome replication]
0.07294000568981611 1
[polymerase gene replication packagingmechanism genome transcription]
0.07032408708048372 1
[genome transcription critical stage life cycle virus process viral genetic information]
0.06988864204467865 1
[synthetic rna transcripts]
0.06808785862768796 1
[subsequent replication]
0.06676039914849197 1
[viral genome transcription]
0.06545486451774161 1
[release messenger rna transcripts]
0.06529602837103311 1
[cyclodextrin glycyrrhetinic acid conjugates potential anti influenza virus agents]
0.06374629072284257 1
[new virus]
0.06342979164604863 1
[similar virus]
0.06314378934803085 2
[virus, virus]
0.06302701081194946 1
[leader subgenomic mrnas]
0.062470606339517 1
[pa

In [69]:
print(nlp.pipe_names)

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner', 'textrank']


In [60]:
import spacy
from spacy import displacy

# displacy.serve(doc, style="dep")
displacy.serve(doc, style="ent")



OSError: [Errno 48] Address already in use

In [55]:
rank_cutoff = 0.01

top_keywords = []
for keywords, ranks in zip(keyword_list, ranks_list):
    top_keywords.append((np.array(keywords)[np.array(ranks) > rank_cutoff]).tolist())

print(top_keywords[:5])

[['initiation di rna replication wild type leader'], ['rna molecules leader'], ['rna replication test hypothesis'], ['natural mouse hepatitis coronavirus di rnas di rna bovine coronavirus'], ['genome replication']]


In [73]:
from itertools import combinations, chain
from collections import Counter

# form pairs of keywords from each document
keyword_pair = list(chain(*[list(combinations(str(kw_list).split(), 2)) for kw_list in top_keywords]))

# count all the unique pairs
pair_counter = Counter(keyword_pair).items()

print(len(pair_counter))

845


In [74]:
def get_pair_graph(pair_counter, weight_times=1, degree_cutoff=50):
    G = nx.Graph()

    #construct the graph from the edges
    for pair, weight in pair_counter:
        G.add_edge(*pair, weight=weight_times * (weight))

    # remove nodes with degrees smaller than the cutoff
    node_list = []
    for node in np.copy(G.nodes):
        if G.degree(node) < degree_cutoff:
            G.remove_node(node)

    return G

In [75]:
# get the keyword pair graph
G = get_pair_graph(pair_counter, degree_cutoff=50)

In [78]:
# calculate the node sizes using arbitrary transformation
node_sizes = [20 * G.degree[node] ** 2 + 100 for node in G.nodes]

# construct the label dictionary
labels = {i: i for i in list(G.nodes)}

In [79]:
print(len(G.nodes))

0


In [80]:
# draw the graph
plt.figure(figsize=(10, 10), dpi=100)

pos = nx.spring_layout(G, k=3,
                       fixed=["viruses"], pos={"viruses": (0, 0)},
                       dim=2, iterations=50)

nx.draw_networkx_nodes(G, pos,
                       #with_labels=True,
                       node_color="tab:orange",
                       node_size=node_sizes,
                       node_shape="8",
                       edgecolors="tab:red",
                       )

nx.draw_networkx_edges(G, pos,
                       #with_labels=True,
                       edgecolors="grey",
                       alpha=0.1,
                       )

_ = nx.draw_networkx_labels(G, pos,
                            labels=labels,
                            )

KeyError: 'viruses'

<Figure size 1000x1000 with 0 Axes>