In [1]:
from nltk import pos_tag, ne_chunk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tree import Tree
from nltk import FreqDist

In [2]:
# Input text for the pipeline: it is tokenized in the stop-word filtering cell.
# NOTE(review): the text has no commas and only one sentence-ending period, so
# sentence boundaries cannot be recovered from it — presumably transcribed
# speech; confirm the source.
txt = "This Washington paper describes the psychosocial effects of a program of Supported Employment (SE) for persons with severe mental illness the SE program involves extended individualized supported employment for clients through a mobile job support worker and jsw who maintains contact with the client after job placement and supports the client in a variety of ways a 50% simple random sample was taken of all persons who entered the thresholds agency between March 1st 93 and February 28th 95 and who met study criteria the resulting 484 cases were randomly assigned to either the SE condition treatment group or the usual protocol control group which consisted of life skills training and employment in an in-house sheltered workshop setting all participants were measured at intake and at three months after beginning employment on two measures of psychological Funk turning the bprs and gas and two measures of self-esteem are Sensa significant treatment effects were found on all four measures but they were in the opposite direction from what was hypothesized instead of functioning better and having more self-esteem persons in SE had lower-functioning levels and lower self-esteem the most likely explanation is that people who work in low-paying service jobs in real-world settings generally do not like them and experienced significant job stress whether they have severe mental illness or not the implications for theory in psychosocial Rehabilitation are considered."

In [3]:
# Tokenize the text, then discard English stop words.
# The comparison uses casefold() so capitalised stop words ("This") are dropped too.
words = word_tokenize(txt, language="english")
stop_words = set(stopwords.words("english"))
filtered_words = [
    token
    for token in words
    if not token.casefold() in stop_words
]

In [4]:
# Reduce every remaining token to its lemma.
# lemmatize() is called with the word only, so its default part of speech applies.
lemmatizer = WordNetLemmatizer()
filtered_lematized_words = list(map(lemmatizer.lemmatize, filtered_words))

In [5]:
# Part-of-speech tag the tokens; the result is consumed as (word, tag) pairs by
# the named-entity and proper-noun cells.
# NOTE(review): the tagger is fed tokens that already had stop words removed and
# were lemmatized, not the original running text — this may reduce tag accuracy;
# confirm this ordering is intended.
words_with_tags = pos_tag(filtered_lematized_words)

In [6]:
# Find all named entities
# With binary=True the chunker marks entities with a single generic label rather
# than typed ones; each entity is a Tree child of the returned tree.
tree = ne_chunk(words_with_tags, binary=True)
named_entities = []

for node in tree:
    # Non-Tree children are plain (token, tag) pairs for ordinary words; skip them.
    if not isinstance(node, Tree):
        continue

    # Build the entity text from this subtree's own leaves only. No state is
    # carried between iterations, so a repeated entity can never be glued onto
    # the entity that follows it.
    named_entity = " ".join(token for token, _pos in node.leaves())

    # Keep first-seen order while skipping duplicates.
    if named_entity not in named_entities:
        named_entities.append(named_entity)

In [7]:
# Find all words that occur at least FREQ_THRESHOLD times
FREQ_THRESHOLD = 3
frequency_distribution = FreqDist(filtered_lematized_words)
# most_common() without an argument yields every (word, count) pair in
# descending count order, so no qualifying word is cut off by a top-N cap.
most_common = [
    word
    for word, count in frequency_distribution.most_common()
    if count >= FREQ_THRESHOLD
]

In [8]:
# Proper nouns: keep the word of every (word, tag) pair whose tag is "NNP".
# Repeated words are kept, in the order they were tagged.
prop_nouns = [word for word, tag in words_with_tags if tag == "NNP"]

In [9]:
# Show the three result lists, one per line:
# named entities, frequent words, proper nouns.
print(named_entities, most_common, prop_nouns, sep="\n")

['Washington']
['SE', 'job', 'person', 'employment', 'client', 'measure', 'self-esteem']
['Washington', 'Employment', 'SE', 'SE', 'March', 'February', 'SE', 'Funk', 'Sensa', 'SE', 'Rehabilitation']
