In [2]:
import pandas as pd
import nltk
from collections import Counter
import string
import wikipedia
import pipeline

## Data Pre-Processing

In [4]:
# Build the cleaned corpus from text.txt: skip every line that mentions the
# author byline or is written entirely in upper case, keep all other lines.
with open('text.txt', 'r') as file:
    lines = file.readlines()

# The previous `text = text.join(line)` used the accumulated text as a
# separator between the characters of each line instead of appending the line.
# Collect the kept lines and concatenate them once; readlines() leaves the
# trailing newline on each line, so no extra separator is needed.
kept_lines = [
    line for line in lines
    if 'J.K. Rowling' not in line and not line.isupper()
]
text = "".join(kept_lines)
print(text)
# NOTE(review): the original note here read 'Delete all of the "Capitol 1: ....."'
# — presumably chapter-heading lines that are not fully upper case still need
# to be removed; confirm against the input file.

: 

: 

## POS Tagging

In [None]:

# Tag the corpus and print the ten most frequent (token, tag) pairs, first
# over every token and then after removing punctuation and stopwords.
with open('text.txt', 'r') as file:
    text = file.read()
tokens = nltk.word_tokenize(text)
pos = nltk.pos_tag(tokens)

count = Counter(pos)
# most_common() sorts by descending count and keeps first-seen order for ties,
# which is what the explicit sorted(..., reverse=True) call did.
sort_pos = count.most_common()
print('POS Top 10')
print(sort_pos[:10])
print()

# Load the stopword list once; it used to be re-evaluated for every token
# inside the comprehension.
stop_words = set(nltk.corpus.stopwords.words('english'))
# `token not in string.punctuation` is a substring test against the
# punctuation string, so it drops single punctuation characters.
# NOTE(review): the stopword comparison is case-sensitive, so capitalised
# tokens such as "The" are presumably kept — confirm whether that is intended.
filtered_tokens = [token for token in tokens
                   if token not in string.punctuation
                   and token not in stop_words]
tagged = nltk.pos_tag(filtered_tokens)

count = Counter(tagged)
sort_tagged = count.most_common()
print('POS Top 10 filtered')
print(sort_tagged[:10])

## NER with entity classification (using nltk.ne_chunk)

In [None]:
# Run nltk's built-in named-entity chunker over the reviews and print the 20
# most frequently mentioned entities as (name, label) pairs.
with open('reviews.txt', 'r') as file:
    text = file.read()

tokens = nltk.word_tokenize(text)
tagged = nltk.pos_tag(tokens)

ne_chunked = nltk.ne_chunk(tagged)
ner = {}                # entity name -> label of its most recent mention
ner_counts = Counter()  # entity name -> number of mentions
for entity in ne_chunked:
    # Only subtrees are named entities; plain (word, tag) tuples are skipped.
    if isinstance(entity, nltk.tree.Tree):
        # Use a dedicated name so the document held in `text` is not clobbered.
        name = " ".join(word for word, tag in entity.leaves())
        ner[name] = entity.label()
        ner_counts[name] += 1

# Rank by how often each entity occurs. The previous key, entity[1][1], was the
# second character of the label string, which is not a frequency. The items
# stay (name, label) pairs, so code reading sort_ner[i][0] keeps working.
sort_ner = sorted(ner.items(), key=lambda item: ner_counts[item[0]], reverse=True)
print('NER Top 20')
print(sort_ner[:20])

## NER with custom patterns

In [None]:
# Custom NER: an entity is a run of noun tokens (tags starting with "NN"),
# optionally linked by prepositions (tags starting with "IN"), whose first
# character is upper case. Prints the 20 most frequent such phrases.
with open('reviews.txt', 'r') as file:
    text = file.read()

tokens = nltk.word_tokenize(text)
tagged = nltk.pos_tag(tokens)


def _close_entity(entity, found):
    """Finish the candidate `entity` and append its phrase to `found` if valid.

    Args:
        entity: list of (word, tag) tuples collected so far; trailing
            preposition entries are removed from it in place.
        found: list that receives the joined phrase when it is non-empty and
            starts with an upper-case character.
    """
    # Strip every dangling preposition, not just the last one.
    while entity and entity[-1][1].startswith("IN"):
        entity.pop()
    if entity:
        phrase = " ".join(word for word, tag in entity)
        if phrase[:1].isupper():
            found.append(phrase)


entity = []
custom_ner = []
for tagged_entry in tagged:
    tag = tagged_entry[1]
    # A preposition may extend an entity but never start one.
    if tag.startswith("NN") or (entity and tag.startswith("IN")):
        entity.append(tagged_entry)
    else:
        _close_entity(entity, custom_ner)
        entity = []
# Flush the entity still open when the token stream ends; it used to be lost.
_close_entity(entity, custom_ner)
entity = []

count = Counter(custom_ner)
# most_common() sorts by descending count, keeping first-seen order for ties.
sort_custom_ner = count.most_common()
print('Custom NER Top 20')
print(sort_custom_ner[:20])

## Custom entity classification

In [None]:
def wiki(name):
    """Return the first sentence of the Wikipedia summary for `name`.

    The lookup is best-effort: if fetching the page or its summary raises any
    exception, or the summary contains no sentence, an empty string is
    returned instead of propagating the error.

    Args:
        name: title or search term passed to `wikipedia.page`.

    Returns:
        The first sentence of the page summary, or "" when unavailable.
    """
    try:
        summary = wikipedia.page(name).summary
    except Exception:
        # `except Exception` instead of a bare `except:` so KeyboardInterrupt
        # and SystemExit still stop a long-running loop over many names.
        return ""
    # An empty summary would make `[0]` raise IndexError; treat it as "no text".
    sentences = nltk.sent_tokenize(summary) if summary else []
    return sentences[0] if sentences else ""

def wikidescription(name):
    """Describe `name` using the noun phrases of its Wikipedia lead sentence.

    The first summary sentence returned by `wiki` is POS-tagged and chunked
    with the grammar ``NP: {<DT>?<JJ>*<NN|NNS>}``. Each distinct chunk, in
    order of first appearance, is emitted followed by a single space.

    Args:
        name: entity name forwarded to `wiki`.

    Returns:
        "a Thing" when `wiki` yields an empty string; otherwise the
        space-terminated concatenation of the distinct noun-phrase chunks
        (an empty string when no chunk matches).
    """
    first_sentence = wiki(name)
    if first_sentence == "":
        return "a Thing"

    tagged_words = nltk.pos_tag(nltk.word_tokenize(first_sentence))
    chunker = nltk.RegexpParser("NP: {<DT>?<JJ>*<NN|NNS>}")
    tree = chunker.parse(tagged_words)

    # dict.fromkeys de-duplicates the chunk texts while keeping first-seen order.
    phrases = dict.fromkeys(
        " ".join(word for word, tag in node.leaves())
        for node in tree
        if isinstance(node, nltk.tree.Tree)
    )
    # Every phrase is followed by one space unless it already ends with one.
    return "".join(
        phrase if phrase.endswith(' ') else phrase + ' '
        for phrase in phrases
    )
    

In [None]:
# Print a Wikipedia-based description for at most the first 20 entries of
# sort_ner; slicing replaces the manual counter and break.
for entry in sort_ner[:20]:
    print(entry[0], ' - ', wikidescription(entry[0]))

In [None]:
# Print a Wikipedia-based description for at most the first 20 entries of
# sort_custom_ner; slicing replaces the manual counter and break.
for entry in sort_custom_ner[:20]:
    print(entry[0], ' - ', wikidescription(entry[0]))

In [None]:
# Load the "dslim/bert-base-NER" tokenizer and token-classification model,
# wrap them in an "ner" pipeline, run it over reviews.txt and print the raw
# result.
# NOTE(review): AutoTokenizer and AutoModelForTokenClassification are not
# imported anywhere in the visible source, and `pipeline` is bound by a plain
# `import pipeline` in the first cell, which yields a module object rather
# than a callable. Presumably
# `from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline`
# was intended — confirm and fix the import cell before running this cell.
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")

nlp = pipeline("ner", model=model, tokenizer=tokenizer)
text = None
with open('reviews.txt', 'r') as file:
    text = file.read()

# NOTE(review): the whole file is passed to the pipeline as one string;
# presumably long inputs need splitting to fit the model's input limit — verify.
ner_results = nlp(text)
print(ner_results)