# Notebook to develop NER, POS, and SVAO Generation

## 1. Imports

In [1]:
import spacy
import nltk
from spacy import displacy
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\farja\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\farja\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

## 2. NER and POS

### 2.1 NER using SpaCy

In [3]:
NER = spacy.load("en_core_web_sm")

In [17]:
raw_text = "I don't know much about COVID and the vaccines for it"
result = NER(raw_text.title())
for word in result.ents:
    print(word.text, word.label_)

In [18]:
displacy.render(result,style="ent",jupyter=True)

### 2.2 POS Tagging using NLTK

In [6]:
stop_words = set(stopwords.words('english'))

In [7]:
def tag_sentence(sentence):
    wordsList = word_tokenize(sentence)
    # wordsList = [w for w in wordsList if not w in stop_words]
    tagged = nltk.pos_tag(wordsList)
    return tagged

In [8]:
while (True):
    raw_text = input("User: ")
    tokenized = sent_tokenize(raw_text)
    
    for sentence in tokenized:
        tagged = tag_sentence(sentence)
        
        print("Model: ", tagged)

User: Who beat New Zealand in the 1992 World Cup Final?
Model:  [('Who', 'WP'), ('beat', 'VBD'), ('New', 'NNP'), ('Zealand', 'NNP'), ('in', 'IN'), ('the', 'DT'), ('1992', 'CD'), ('World', 'NNP'), ('Cup', 'NNP'), ('Final', 'NNP'), ('?', '.')]


KeyboardInterrupt: Interrupted by user

### 2.3 Building a combined Approach

In [10]:
raw_text = '''Cinematography is the science or art of motion-picture photography by recording light or other electromagnetic radiation, either electronically by means of an image sensor, or chemically by means of a light-sensitive material such as film stock.
Typically, a lens is used to repeatedly focus the light reflected from objects into real images on the light-sensitive surface inside a camera during a questioned exposure, creating multiple images.
With an electronic image sensor, this produces an electrical charge at each pixel, which is electronically processed and stored in a video file for subsequent display or processing.'''

entities = []

ner_result = NER(raw_text.title())

sentences = sent_tokenize(raw_text)
for sentence in sentences:
    pos_tagged = tag_sentence(sentence)
    for tags in pos_tagged:
        if tags[1] in ["NNP", "NN"]:
            entities.append(tags[0].lower())
        # End if
    # End for
# End for
print (entities)

['cinematography', 'science', 'art', 'motion-picture', 'photography', 'light', 'radiation', 'image', 'sensor', 'material', 'film', 'stock', 'lens', 'light', 'surface', 'camera', 'exposure', 'image', 'sensor', 'charge', 'pixel', 'video', 'file', 'display', 'processing']


## 3. Subject Verb Object Generation

In [11]:
raw_text = '''Cinematography is the science or art of motion-picture photography by recording light or other electromagnetic radiation, either electronically by means of an image sensor, or chemically by means of a light-sensitive material such as film stock.
Typically, a lens is used to repeatedly focus the light reflected from objects into real images on the light-sensitive surface inside a camera during a questioned exposure, creating multiple images.
With an electronic image sensor, this produces an electrical charge at each pixel, which is electronically processed and stored in a video file for subsequent display or processing.'''

### 3.1 My Custom Library

In [12]:
from subjectVerbObject import findSubjectVerbObjects, findSubjectVerbAdjectiveObjects, nlp
tokens = nlp(raw_text)
svos = findSubjectVerbObjects(tokens)
for svo in svos:
    print (svo)
print ("--------------------------------------------------------")
svaos = findSubjectVerbAdjectiveObjects(tokens)
for svao in svaos:
    print (svao)

('light', 'record', 'the science')
('other electromagnetic radiation', 'record', 'the science')
('the light', 'focus', 'a lens')
('objects into real images on inside a camera', 'reflect', 'the light')
('a questioned exposure', 'reflect', 'the light')
('an electrical charge at each pixel ,', 'produce', 'this')
('which', 'process', 'a video file for subsequent display')
('which', 'store', 'a video file for subsequent display')
('a video file for', 'store', 'each pixel ,')
--------------------------------------------------------
('cinematography', 'is', 'science')
('cinematography', 'is', 'means')
('cinematography', 'is', 'art')
('science', 'recording', 'light')
('science', 'recording', 'other electromagnetic radiation')
('means', 'material', 'stock')
('lens', 'focus', 'repeatedly focus creating')
('lens', 'focus', 'light')
('light', 'reflected', 'objects')
('light', 'reflected', 'questioned exposure')
('light', 'objects', 'real images')
('objects', 'images', 'light sensitive surface')
('

### 3.2 Using an existing codebase

#### 3.2.1 Solution with more SVOs identified

In [13]:
from subject_verb_object_extract import findSVOs, nlp
tokens = nlp(raw_text)
svos = findSVOs(tokens)
for svo in svos:
    print (svo)

ModuleNotFoundError: No module named 'subject_verb_object_extract'

#### 3.2.2 Solution with "IS" SVOs identified

In [14]:
from subject_object_extraction import findSVAOs, findSVOs
import en_core_web_sm
parser = en_core_web_sm.load(disable=['ner','textcat'])
parse = parser(raw_text)
svaos = findSVAOs(parse)
for svao in svaos:
    print (svao)

ModuleNotFoundError: No module named 'subject_object_extraction'

### 3.3 Testing Interface Functions

In [15]:
from subjectVerbObject import extractSimilarSVAOs

In [16]:
raw_text = '''Cinematography is the science or art of motion-picture photography by recording light or other electromagnetic radiation, either electronically by means of an image sensor, or chemically by means of a light-sensitive material such as film stock.
Typically, a lens is used to repeatedly focus the light reflected from objects into real images on the light-sensitive surface inside a camera during a questioned exposure, creating multiple images.
With an electronic image sensor, this produces an electrical charge at each pixel, which is electronically processed and stored in a video file for subsequent display or processing.'''

user_text_1_1 = '''Hi buddy, What do you think about cinematography'''


raw_text_2 = '''Blue is one of the three primary colours of pigments in painting and traditional colour theory, as well as in the RGB colour model.
It lies between violet and green on the spectrum of visible light.
The eye perceives blue when observing light with a dominant wavelength between approximately 450 and 495 nanometres.
Most blues contain a slight mixture of other colors; azure contains some green, while ultramarine contains some violet.
The clear daytime sky and the deep sea appear blue because of an optical effect known as Rayleigh scattering.
An optical effect called Tyndall scattering explains blue eyes.
Distant objects appear more blue because of another optical effect called atmospheric perspective.'''
user_text_2_1 = '''Blue is my favorite primary color.'''

raw_text_3 = '''The Royal Blue was the Baltimore and Ohio Railroad (B&O)'s flagship passenger train between New York City and Washington, D.C., in the United States, beginning in 1890.'''
user_text_3_1 = '''Blue is always nice. I like royal blue.'''


In [17]:
similar_svaos = extractSimilarSVAOs(user_text_2_1, raw_text_2)
for svaos in similar_svaos:
    print (svaos)

('The eye', 'perceives', 'blue')
('Most blues', 'contain', 'a slight mixture of other colors')
('The clear daytime sky', 'appear', 'blue')
('the deep sea', 'appear', 'blue')
('An optical effect', 'explains', 'blue eyes')
('Distant objects', 'appear', 'more blue')
('blue', 'is', 'one')
('eye', 'perceives', 'blue')
('blues', 'contain', 'slight mixture')
('daytime sky', 'appear', 'blue')
('sea', 'appear', 'blue')
('effect', 'explains', 'blue eyes')
('objects', 'appear', 'more blue')


## 4. Intent Recognition

### 4.1 Training a new model

In [1]:
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report

# Downloading Chat model
nltk.download('nps_chat')

[nltk_data] Downloading package nps_chat to
[nltk_data]     C:\Users\farja\AppData\Roaming\nltk_data...
[nltk_data]   Package nps_chat is already up-to-date!


True

In [2]:
# Dataset
posts = nltk.corpus.nps_chat.xml_posts()
posts_text = [post.text for post in posts]

In [3]:
# Dividing the Train and Test Set in 80:20
train_text = posts_text[:int(len(posts_text)*0.8)]
test_text = posts_text[int(len(posts_text)*0.2):]

In [4]:
# Get TFIDF features
vectorizer = TfidfVectorizer(ngram_range=(1,3), min_df=0.001, max_df=0.7, analyzer='word')

In [5]:
X_train = vectorizer.fit_transform(train_text)
X_test = vectorizer.transform(test_text)
y = [post.get('class') for post in posts]

y_train = y[:int(len(posts_text) * 0.8)]
y_test = y[int(len(posts_text) * 0.2):]

In [6]:
# Fitting Gradient Boosting classifier to the Training set
gb = GradientBoostingClassifier(n_estimators = 700, random_state=0)
gb.fit(X_train, y_train)

GradientBoostingClassifier(n_estimators=700, random_state=0)

In [7]:
# Saving the model to the file
import pickle
pickle.dump(gb, open('intent.sav', 'wb'))

#### 4.1.2 Making New Prediction

In [None]:
print (gb.predict(vectorizer.transform(["I love cinematography"])))

### 4.2 Code for intent module

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
import pickle

# Downloading NLTK Corpus
nltk.download('nps_chat')

# Setting up Vectorizer
vectorizer = TfidfVectorizer(ngram_range=(1,3), min_df=0.001, max_df=0.7, analyzer='word')
posts = nltk.corpus.nps_chat.xml_posts()
posts_text = [post.text for post in posts]
train_text = posts_text[:int(len(posts_text)*0.8)]
vectorizer.fit_transform(train_text)

# Setting up Model
intentRecognizer = pickle.load(open('intent.sav', 'rb'))

# Testing
text = "I love cinematography"
pred = intentRecognizer.predict(vectorizer.transform([text]))
print (pred)