<a href="https://colab.research.google.com/github/MSaydurRahman/An-Analysis-of-Character-Representation-from-Print-Book-vs-Visual-Movie-Media/blob/master/Name_Entity_Recognition_(Stanford_%26_Spacy).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Connect With Google Drive**

In [13]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


**Data**

In [14]:
t = open('/content/drive/My Drive/499/Sample.txt', "r")
text = t.readline()
print(text)

Mr. and Mrs. Dursley, of number four, Privet Drive, were proud to say that they were perfectly normal, thank you very much. They were the last people you’d expect to be involved in anything strange or mysterious, because they just didn’t hold with such nonsense. Mr. Dursley was the director of a firm called Grunnings, which made drills. He was a big, beefy man with hardly any neck, although he did have a very large mustache. Mrs. Dursley was thin and blonde and had nearly twice the usual amount of neck, which came in very useful as she spent so much of her time craning over garden fences, spying on the neighbors. The Dursleys had a small son called Dudley and in their opinion there was no finer boy anywhere. The Dursleys had everything they wanted, but they also had a secret, and their greatest fear was that somebody would discover it. They didn’t think they could bear it if anyone found out about the Potters. Mrs. Potter was Mrs. Dursley’s sister, but they hadn’t met for several years

# **Stanford NLP NER**

**Dependencies**

In [15]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [16]:
import nltk
import pandas as pd
from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize
import os

In [17]:
model = '/content/drive/My Drive/499/stanford-ner-4.0.0/classifiers/english.all.3class.distsim.crf.ser.gz'
jar = '/content/drive/My Drive/499/stanford-ner-4.0.0/stanford-ner.jar'



st = StanfordNERTagger(model, jar,encoding='utf-8')

The StanfordTokenizer will be deprecated in version 3.2.5.
Please use [91mnltk.tag.corenlp.CoreNLPPOSTagger[0m or [91mnltk.tag.corenlp.CoreNLPNERTagger[0m instead.
  super(StanfordNERTagger, self).__init__(*args, **kwargs)


In [18]:
tokenized_text = nltk.word_tokenize(text)
classified_text = st.tag(tokenized_text)

classified_text_df = pd.DataFrame(classified_text)

classified_text_df.drop_duplicates(keep='first', inplace=True)
classified_text_df.reset_index(drop=True, inplace=True)
classified_text_df.columns = ["Entities", "Labels"]
classified_text_df

Unnamed: 0,Entities,Labels
0,Mr.,O
1,and,O
2,Mrs.,O
3,Dursley,PERSON
4,",",O
...,...,...
185,wrestled,O
186,screaming,O
187,into,O
188,high,O


In [19]:
tokenized_text = nltk.word_tokenize(text)
classified_text = st.tag(tokenized_text)

netagged_words = classified_text

entities = []
labels = []

from itertools import groupby
for tag, chunk in groupby(classified_text, lambda x:x[1]):
    if tag != "O":
        entities.append(' '.join(w for w, t in chunk))
        labels.append(tag)
        
    
entities_all = list(zip(entities, labels))
entities_unique = list(set(zip(entities, labels))) #unique entities   
entities_df = pd.DataFrame(entities_unique)
entities_df.columns = ["Entities", "Labels"]
entities_df

Unnamed: 0,Entities,Labels
0,Dursleys,PERSON
1,Dursleys,ORGANIZATION
2,Dursley,PERSON
3,Privet Drive,LOCATION
4,Potter,PERSON
5,Dudley,PERSON


# **Using Spacy**

**Dependencies**

In [20]:
import spacy 
from spacy import displacy
#SpaCy 2.x brough significant speed and accuracy improvements
spacy.__version__

'2.2.4'

In [21]:
# Load SpaCy model
nlp = spacy.load("en_core_web_sm")
#nlp = spacy.load("en_core_web_md")
#nlp = spacy.load("en_core_web_lg")

In [22]:
doc = nlp(text)

entities = []
labels = []
position_start = []
position_end = []

for ent in doc.ents:
    entities.append(ent)
    labels.append(ent.label_)
    position_start.append(ent.start_char)
    position_end.append(ent.end_char)
    
df = pd.DataFrame({'Entities':entities,'Labels':labels,'Position_Start':position_start, 'Position_End':position_end})

df

Unnamed: 0,Entities,Labels,Position_Start,Position_End
0,(Dursley),PERSON,13,20
1,"(number, four)",CARDINAL,25,36
2,"(Privet, Drive)",GPE,38,50
3,(Dursley),PERSON,267,274
4,(Grunnings),ORG,309,318
5,(Dursley),PERSON,434,441
6,(Dursleys),PERSON,625,633
7,(Dudley),PERSON,657,663
8,(Dursleys),PERSON,722,730
9,(Potters),ORG,915,922


In [23]:
spacy.explain("PROPN")

'proper noun'

**1st Process To count word frquency**

In [24]:

from collections import Counter
nlp = spacy.load('en')
# all tokens that arent stop words or punctuations
words = [token.text for token in doc if token.is_stop != True and token.is_punct != True]

# Proper noun tokens that arent stop words or punctuations
proper_nouns = [token.text for token in doc if token.is_stop != True and token.is_punct != True and token.pos_ == "PROPN"]

# noun tokens that arent stop words or punctuations
nouns = [token.text for token in doc if token.is_stop != True and token.is_punct != True and token.pos_ == "NOUN"]

# noun tokens that arent stop words or punctuations
adj = [token.text for token in doc if token.is_stop != True and token.is_punct != True and token.pos_ == "ADJ"]

# noun tokens that arent stop words or punctuations
verb = [token.text for token in doc if token.is_stop != True and token.is_punct != True and token.pos_ == "VERB"]

# common tokens
word_freq = Counter(words)
common_words = word_freq.most_common()
print("Common Words:", common_words)

# common proper noun tokens
prop_noun_freq = Counter(proper_nouns)
common_prop_nouns = prop_noun_freq.most_common()
print("Common Proper Nouns:", common_prop_nouns)

# common noun tokens
noun_freq = Counter(nouns)
common_nouns = noun_freq.most_common()
print("Common Nouns:", common_nouns)

# common adj tokens
adj_freq = Counter(adj)
common_adj = adj_freq.most_common()
print("Common Adjectives:", common_adj)

# common verb tokens
verb_freq = Counter(verb)
common_verb = verb_freq.most_common()
print("Common Verbs:", common_verb)

Common Words: [('Dursley', 8), ('Mrs.', 7), ('Mr.', 4), ('Dursleys', 4), ('Potters', 4), ('Dudley', 3), ('sister', 3), ('strange', 2), ('mysterious', 2), ('called', 2), ('neck', 2), ('neighbors', 2), ('small', 2), ('son', 2), ('boy', 2), ('think', 2), ('good', 2), ('away', 2), ('number', 1), ('Privet', 1), ('Drive', 1), ('proud', 1), ('perfectly', 1), ('normal', 1), ('thank', 1), ('people', 1), ('expect', 1), ('involved', 1), ('hold', 1), ('nonsense', 1), ('director', 1), ('firm', 1), ('Grunnings', 1), ('drills', 1), ('big', 1), ('beefy', 1), ('man', 1), ('hardly', 1), ('large', 1), ('mustache', 1), ('thin', 1), ('blonde', 1), ('nearly', 1), ('twice', 1), ('usual', 1), ('came', 1), ('useful', 1), ('spent', 1), ('time', 1), ('craning', 1), ('garden', 1), ('fences', 1), ('spying', 1), ('opinion', 1), ('finer', 1), ('wanted', 1), ('secret', 1), ('greatest', 1), ('fear', 1), ('somebody', 1), ('discover', 1), ('bear', 1), ('found', 1), ('Potter', 1), ('met', 1), ('years', 1), ('fact', 1), (

**2nd Process To count word frquency**

In [25]:
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation

keyword = []
stopwords = list(STOP_WORDS)
pos_tag = ['PROPN', 'ADJ', 'NOUN', 'VERB']
for token in doc:
    if(token.text in stopwords or token.text in punctuation):
        continue
    if(token.pos_ in pos_tag):
        keyword.append(token.text)

freq_word = Counter(keyword)
print(freq_word.most_common())

[('Dursley', 8), ('Mrs.', 7), ('Mr.', 4), ('Dursleys', 4), ('Potters', 4), ('Dudley', 3), ('sister', 3), ('strange', 2), ('mysterious', 2), ('called', 2), ('neck', 2), ('neighbors', 2), ('small', 2), ('son', 2), ('boy', 2), ('think', 2), ('number', 1), ('Privet', 1), ('Drive', 1), ('proud', 1), ('normal', 1), ('thank', 1), ('people', 1), ('expect', 1), ('involved', 1), ('hold', 1), ('nonsense', 1), ('director', 1), ('firm', 1), ('Grunnings', 1), ('drills', 1), ('big', 1), ('beefy', 1), ('man', 1), ('large', 1), ('mustache', 1), ('thin', 1), ('blonde', 1), ('usual', 1), ('came', 1), ('useful', 1), ('spent', 1), ('time', 1), ('craning', 1), ('garden', 1), ('fences', 1), ('spying', 1), ('opinion', 1), ('finer', 1), ('wanted', 1), ('secret', 1), ('greatest', 1), ('fear', 1), ('discover', 1), ('bear', 1), ('found', 1), ('Potter', 1), ('met', 1), ('years', 1), ('fact', 1), ('pretended', 1), ('fornothing', 1), ('husband', 1), ('unDursleyish', 1), ('possible', 1), ('shuddered', 1), ('arrived',

In [26]:
from spacy import displacy

entities=[(i, i.label_, i.label) for i in doc.ents]
entities

[(Dursley, 'PERSON', 380),
 (number four, 'CARDINAL', 397),
 (Privet Drive, 'GPE', 384),
 (Dursley, 'PERSON', 380),
 (Grunnings, 'ORG', 383),
 (Dursley, 'PERSON', 380),
 (Dursleys, 'PERSON', 380),
 (Dudley, 'PERSON', 380),
 (Dursleys, 'PERSON', 380),
 (Potters, 'ORG', 383),
 (Potter, 'PERSON', 380),
 (Dursley, 'PERSON', 380),
 (several years, 'DATE', 391),
 (Dursley, 'PERSON', 380),
 (Potters, 'ORG', 383),
 (Potters, 'ORG', 383),
 (Potters, 'WORK_OF_ART', 388),
 (Dudley, 'PERSON', 380),
 (Dursley, 'PERSON', 380),
 (Tuesday, 'DATE', 391),
 (Dursley, 'PERSON', 380),
 (Dursley, 'PERSON', 380),
 (Dudley, 'PERSON', 380)]

In [27]:
displacy.render(nlp(str(entities)), jupyter=True, style='ent')