In [None]:
from nltk import pos_tag, ne_chunk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tree import Tree
from nltk import FreqDist

In [None]:
# Sample text used throughout this notebook: it is tokenized by the manual pipeline below
# and later passed unchanged to TopicSelector.
# NOTE(review): the text has almost no sentence punctuation and contains what look like
# transcription errors ("Funk turning", "are Sensa") — presumably speech-to-text output kept
# as realistic noisy input; confirm before "correcting" it.
txt = """This Washington paper describes the psychosocial effects of a program of Supported Employment (SE) 
for persons with severe mental illness the SE program involves extended individualized supported employment 
for clients through a mobile job support worker and jsw who maintains contact with the client after job 
placement and supports the client in a variety of ways a 50% simple random sample was taken of all persons 
who entered the thresholds agency between March 1st 93 and February 28th 95 and who met study criteria the 
resulting 484 cases were randomly assigned to either the SE condition treatment group or the usual protocol 
control group which consisted of life skills training and employment in an in-house sheltered workshop setting 
all participants were measured at intake and at three months after beginning employment on two measures of 
psychological Funk turning the bprs and gas and two measures of self-esteem are Sensa significant treatment 
effects were found on all four measures but they were in the opposite direction from what was hypothesized 
instead of functioning better and having more self-esteem persons in SE had lower-functioning levels and 
lower self-esteem the most likely explanation is that people who work in low-paying service jobs in real-world 
settings generally do not like them and experienced significant job stress whether they have severe mental 
illness or not the implications for theory in psychosocial Rehabilitation are considered."""

In [None]:
# Remove stop words (very common words such as "this", "that", "a") from the sample text.
stop_words = set(stopwords.words("english"))
words = word_tokenize(txt, language="english")  # Split the text into individual tokens
# Tokens are case-folded before the lookup so capitalised stop words ("This") are dropped too.
filtered_words = list(filter(lambda token: token.casefold() not in stop_words, words))
print(filtered_words)

In [None]:
# Lemmatize the filtered words, i.e. reduce each one to its dictionary base form
# (for example "flights" -> "flight").
# NOTE(review): lemmatize() is called without a part-of-speech argument; NLTK then appears to
# treat every word as a noun, so verb forms such as "running" are probably left unchanged — confirm.
lemmatizer = WordNetLemmatizer()
filtered_lematized_words = list(map(lemmatizer.lemmatize, filtered_words))
print(filtered_lematized_words)

In [None]:
# Tag all the words as verbs, proper nouns, nouns, etc.
# The result is a sequence of (word, tag) pairs; later cells unpack these pairs and look
# for the "NNP" tag.
# NOTE(review): the tagger receives the stop-word-filtered, lemmatized tokens rather than the
# original running text, which may reduce tagging accuracy — confirm this ordering is intended.
words_with_tags = pos_tag(filtered_lematized_words)
print(words_with_tags)

In [None]:
# Find all named entities.
# ne_chunk groups the tagged tokens into a tree: ordinary words stay as plain (token, tag)
# tuples, while each recognised entity becomes a Tree child holding its tokens.
# binary=True requests a single generic entity label rather than per-type labels.
tree = ne_chunk(words_with_tags, binary=True)
named_entities = []  # unique entity strings, in order of first appearance

for node in tree:
    # Plain (token, tag) tuples are ordinary words; only Tree children are entities.
    if not isinstance(node, Tree):
        continue
    # A multi-word entity has several leaves; join their tokens into one string.
    # The string is built from this subtree only, so a repeated entity can never
    # bleed into the text of the entity that follows it.
    named_entity = " ".join(token for token, _tag in node.leaves())
    # Skip entities that were already recorded.
    if named_entity not in named_entities:
        named_entities.append(named_entity)

In [None]:
# Collect the frequent words: those among the 20 most common lemmas that occur
# at least FREQ_THRESHOLD times.
FREQ_THRESHOLD = 3
frequency_distribution = FreqDist(filtered_lematized_words)
most_common = [
    word
    for word, count in frequency_distribution.most_common(20)
    if count >= FREQ_THRESHOLD
]

In [None]:
# Collect every word whose part-of-speech tag is exactly "NNP" (proper noun).
# Duplicates are kept here; they are removed later when the keyword set is built.
prop_nouns = [word for word, tag in words_with_tags if tag == "NNP"]

In [None]:
# Show the three intermediate results, one per line:
# named entities, frequent words, then proper nouns.
print(named_entities, most_common, prop_nouns, sep="\n")

In [None]:
import itertools

# De-duplicate each candidate list on its own first.
ne = set(named_entities)
mc = set(most_common)
pn = set(prop_nouns)

# Chain the three sets end to end and collect the result into one set, so a word that
# appears in more than one source is kept only once.
keywords = set(itertools.chain.from_iterable((ne, mc, pn)))

print(keywords)

In [None]:
# Make the project's `src` package importable from this notebook.
# NOTE(review): "../" is resolved against the kernel's working directory, so this assumes the
# notebook is run from a folder directly below the project root that contains `src/` — confirm.
import sys

# Add the entry only once so that re-running this cell does not keep growing sys.path.
if "../" not in sys.path:
    sys.path.append("../")
from src.inc.topic_selector import TopicSelector

In [None]:
# TopicSelector class includes all the prior code in the notebook
# It is built from the same sample text (`txt`) used by the manual walkthrough above.
# NOTE(review): min_freq=4 differs from FREQ_THRESHOLD = 3 used in the manual walkthrough, so the
# frequent-word results may not match the earlier cell — confirm which threshold is intended.
t = TopicSelector(text=txt, min_freq=4, lang="english")

In [None]:
# Print the result of each TopicSelector getter on its own line.
# NOTE(review): these presumably correspond to most_common, named_entities, prop_nouns and
# keywords computed manually earlier in the notebook — verify against the TopicSelector source.
print(t.get_common_words())
print(t.get_named_entities())
print(t.get_prop_nouns())
print(t.get_keywords())