# Keyphrase Extraction & Visualization

**Step 1: Keyphrase Extraction**

In [1]:
# Uncomment the pip install lines if necessary
# !pip install rake_nltk
# !pip install yake
from rake_nltk import Rake
import yake

In [2]:
r = Rake(min_length=1, max_length=4)
y = yake.KeywordExtractor()

In [3]:
sentences = []
sentences.append("Since the release of the first novel, Harry Potter and the Philosopher's Stone, on 26 June 1997, the books have found immense popularity, critical acclaim and commercial success worldwide.")
sentences.append("The Harry Potter novels are mainly directed at a young adult audience as opposed to an audience of middle grade readers, children, or adults.")

In [4]:
for sentence in sentences:
    print("sentence: ", sentence)
    r.extract_keywords_from_text(sentence)
    rake_keywords = r.get_ranked_phrases_with_scores()
    print("rake keyphrases: ")
    for kw in rake_keywords:
        print(kw)
    yake_keywords = y.extract_keywords(sentence)
    print("yake keyphrases: ")
    for kw in yake_keywords:
        print(kw)

sentence:  Since the release of the first novel, Harry Potter and the Philosopher's Stone, on 26 June 1997, the books have found immense popularity, critical acclaim and commercial success worldwide.
rake keyphrases: 
(9.0, 'found immense popularity')
(9.0, 'commercial success worldwide')
(9.0, '26 june 1997')
(4.0, 'harry potter')
(4.0, 'first novel')
(4.0, 'critical acclaim')
(1.0, 'stone')
(1.0, 'since')
(1.0, 'release')
(1.0, 'philosopher')
(1.0, 'books')
yake keyphrases: 
(0.00601189725323029, 'found immense popularity')
(0.00601189725323029, 'commercial success worldwide')
(0.007237648981780116, 'harry potter')
(0.007237648981780116, 'philosopher stone')
(0.024805549895152507, 'immense popularity')
(0.024805549895152507, 'critical acclaim')
(0.024805549895152507, 'success worldwide')
(0.04567049649971613, 'books have found')
(0.04567049649971613, 'found immense')
(0.04567049649971613, 'acclaim and commercial')
(0.04567049649971613, 'commercial success')
(0.057012387690331526, 'ju

In [5]:
paragraphs = []
paragraphs.append("Harry Potter is a series of fantasy novels written by British author J. K. Rowling. The novels chronicle the lives of a young wizard, Harry Potter, and his friends Hermione Granger and Ron Weasley, all of whom are students at Hogwarts School of Witchcraft and Wizardry. The main story arc concerns Harry's struggle against Lord Voldemort, a dark wizard who intends to become immortal, overthrow the wizard governing body known as the Ministry of Magic and subjugate all wizards and Muggles (non-magical people).")
paragraphs.append("The central character in the series is Harry Potter, a boy who lives in the fictional town of Little Whinging, Surrey with his aunt, uncle, and cousin – the Dursleys – and discovers at the age of eleven that he is a wizard, though he lives in the ordinary world of non-magical people known as Muggles. The wizarding world exists parallel to the Muggle world, albeit hidden and in secrecy. His magical ability is inborn, and children with such abilities are invited to attend exclusive magic schools that teach the necessary skills to succeed in the wizarding world.")
print("paragraphs: ", paragraphs)

paragraphs:  ["Harry Potter is a series of fantasy novels written by British author J. K. Rowling. The novels chronicle the lives of a young wizard, Harry Potter, and his friends Hermione Granger and Ron Weasley, all of whom are students at Hogwarts School of Witchcraft and Wizardry. The main story arc concerns Harry's struggle against Lord Voldemort, a dark wizard who intends to become immortal, overthrow the wizard governing body known as the Ministry of Magic and subjugate all wizards and Muggles (non-magical people).", 'The central character in the series is Harry Potter, a boy who lives in the fictional town of Little Whinging, Surrey with his aunt, uncle, and cousin – the Dursleys – and discovers at the age of eleven that he is a wizard, though he lives in the ordinary world of non-magical people known as Muggles. The wizarding world exists parallel to the Muggle world, albeit hidden and in secrecy. His magical ability is inborn, and children with such abilities are invited to at

In [6]:
def get_keyphrases(text):
    if text.strip() == "":
        return [""]
    keyphrases = []
    r.extract_keywords_from_text(text)
    rake_keywords = r.get_ranked_phrases_with_scores()
    yake_keywords = y.extract_keywords(text)
    for kw in rake_keywords:
        if kw[0] > 3:
            keyphrases.append(kw[1])
    for kw in yake_keywords:
        if kw[0] < 0.4:
            keyphrases.append(kw[1])
    return list(set(keyphrases)) # to remove duplicates

for paragraph in paragraphs:
    print("paragraph: ", paragraph)
    print(get_keyphrases(paragraph))

paragraph:  Harry Potter is a series of fantasy novels written by British author J. K. Rowling. The novels chronicle the lives of a young wizard, Harry Potter, and his friends Hermione Granger and Ron Weasley, all of whom are students at Hogwarts School of Witchcraft and Wizardry. The main story arc concerns Harry's struggle against Lord Voldemort, a dark wizard who intends to become immortal, overthrow the wizard governing body known as the Ministry of Magic and subjugate all wizards and Muggles (non-magical people).
['author', 'harry', 'written by british', 'lord voldemort', 'young wizard', 'ron weasley', 'granger and ron', 'british author j', 'hermione granger', 'witchcraft and wizardry', 'potter', 'rowling', 'school of witchcraft', 'ministry of magic', 'magical people ).', 'wizard', 'wizard governing body known', 'british author', 'become immortal', 'hogwarts school', 'british', 'dark wizard', 'series of fantasy', 'friends hermione granger', 'harry potter', 'novels chronicle', 'fan

**Step 2: Visualization**

In [7]:
# Imports
import kmapper as km
import numpy as np
from sklearn.datasets import fetch_20newsgroups

In [8]:
# Get data
newsgroups = fetch_20newsgroups(subset='train')
raw_data = newsgroups.data
X, target, target_names = np.array(raw_data), np.array(newsgroups.target), np.array(newsgroups.target_names)
print("SAMPLE: ", X[0])
print("SHAPE: ", X.shape)
print("TARGET: ", target_names[target[0]])

SAMPLE:  From: lerxst@wam.umd.edu (where's my thing)
Subject: WHAT car is this!?
Nntp-Posting-Host: rac3.wam.umd.edu
Organization: University of Maryland, College Park
Lines: 15

 I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have on this funky looking car, please e-mail.

Thanks,
- IL
   ---- brought to you by your neighborhood Lerxst ----





SHAPE:  (11314,)
TARGET:  rec.autos


In [None]:
# Format data
import re
# get rid of headers
headers = ["Lines: ", "NNTP-Posting-Host: ", "NNTP Posting Host: "]
data_formatted = []
for entry in raw_data[:1000]:
    occurrences = []
    for header in headers:
        occurrences.append(entry.lower().find(header.lower()))
    champ = max(occurrences)
    if champ != -1:
        start = entry.find("\n", champ)
        data_formatted.append(entry[start:])
    else:
        data_formatted.append(entry)
data_formatted = [re.sub("[\n\t-]", " ", entry) for entry in data_formatted]
extracted_phrases = [get_keyphrases(entry) for entry in data_formatted]
print("SAMPLE EXTRACTED PHRASES", extracted_phrases[0])
extracted_phrases_joined = [" ".join(phrase) for phrase in extracted_phrases]
print("ORIGINAL SENTENCE", data_formatted[0])
print("JOINED SENTENCE", extracted_phrases_joined[0])

In [None]:
# Project data
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import Isomap
from sklearn.preprocessing import MinMaxScaler

mapper = km.KeplerMapper(verbose=2)

projected_X = mapper.fit_transform(np.array(extracted_phrases_joined),
    projection=[TfidfVectorizer(analyzer="char",
                                ngram_range=(1,6),
                                max_df=0.83,
                                min_df=0.05),
                TruncatedSVD(n_components=100,
                             random_state=1729),
                Isomap(n_components=2,
                       n_jobs=-1)],
    scaler=[None, None, MinMaxScaler()])

print("SHAPE",projected_X.shape)

In [None]:
# Cluster data
from sklearn import cluster
graph = mapper.map(projected_X, clusterer=cluster.DBSCAN(eps=0.5, min_samples=3))

In [None]:
# Get features from data
vec = TfidfVectorizer(analyzer="word",
                      strip_accents="unicode",
                      stop_words="english",
                      ngram_range=(1,3),
                      max_df=0.97,
                      min_df=0.02)

interpretable_inverse_X = vec.fit_transform(X).toarray()
interpretable_inverse_X_names = vec.get_feature_names()

print("SHAPE", interpretable_inverse_X.shape)
print("FEATURE NAMES SAMPLE", interpretable_inverse_X_names[:400])

In [None]:
# Visualize data!
html = mapper.visualize(graph,
                        X=interpretable_inverse_X,
                        X_names=interpretable_inverse_X_names,
                        path_html="newsgroups20.html",
                        lens=projected_X,
                        lens_names=["ISOMAP1", "ISOMAP2"],
                        title="Newsgroups20: Latent Semantic Char-gram Analysis with Isometric Embedding",
                        custom_tooltips=np.array([target_names[ys] for ys in target]),
                        color_function=target)