# NLP Steps

### Tokenizing
### Filtering Stop Words
### Stemming
### Tagging Parts of Speech
### Named Entity Recognition also known as Chunking
### Using Named Entity Recognition (NER)
### Getting Text to Analyze

#### Using NLP, we are going to do a sentiment analysis of McDonald's reviews: we take some positive, negative, and neutral reviews and process them to see their scores

In [1]:
#import the warnings library to suppress all warnings raised during program execution
import warnings
warnings.filterwarnings('ignore')

#import necessary nlp libraries and its modules
import nltk
from nltk.tokenize import word_tokenize as wordtoken
from nltk.tokenize import sent_tokenize as sentensetoken
from nltk.corpus import stopwords as stopwords
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.chunk import ne_chunk as chunk
from nltk.stem import PorterStemmer
from nltk.tag import pos_tag
from nltk.probability import FreqDist
from nltk.stem import WordNetLemmatizer
from nltk.chunk import RegexpParser

#import tfidf 
from sklearn.feature_extraction.text import TfidfVectorizer

#import bag of words 
from sklearn.feature_extraction.text import CountVectorizer

#download the NLTK data packages that the helper functions below rely on
nltk.download("stopwords")  # stop-word lists, read via stopwords.words("english")
nltk.download("words")  # English word list; presumably needed by the named-entity chunker - TODO confirm
nltk.download("punkt")  # tokenizer models used by word_tokenize / sent_tokenize
nltk.download('averaged_perceptron_tagger')  # model used by pos_tag
nltk.download('maxent_ne_chunker')  # model used by ne_chunk
nltk.download('vader_lexicon')  # lexicon loaded by SentimentIntensityAnalyzer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dhruv\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\Dhruv\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Dhruv\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Dhruv\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\Dhruv\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Dhruv\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is

True

In [13]:
#define the NLP helper functions used for the analysis

#lowercase the text, tokenize it, and filter out the English stop words
def word_processing(text):
    """Lowercase ``text``, tokenize it, and drop English stop words.

    Parameters:
        text: the raw review string to process.

    Returns:
        A list of the lowercased tokens that are not in NLTK's English
        stop-word list, in their original order. Only stop words are
        removed, so other tokens such as punctuation are kept.
    """
    lowered_tokens = wordtoken(text.lower())
    english_stops = set(stopwords.words("english"))
    kept = []
    for word in lowered_tokens:
        if word in english_stops:
            continue
        kept.append(word)
    return kept

#to count how often each token occurs (most frequent first)
def freq_distribution(tokens):
    """Count how often each token occurs in ``tokens``.

    Parameters:
        tokens: an iterable of tokens to count.

    Returns:
        The result of ``FreqDist(tokens).most_common()``: a list of
        ``(token, count)`` pairs ordered from most to least frequent.
    """
    return FreqDist(tokens).most_common()

#to tag parts of speech
def pos(tokens):
    """Tag every token in ``tokens`` with its part of speech.

    Parameters:
        tokens: a list of token strings.

    Returns:
        Whatever ``pos_tag`` produces for the tokens: a list of
        ``(token, tag)`` pairs.
    """
    return pos_tag(tokens)

#to find named entities (chunks such as people, organizations and places) in the text
def name_entity(text):
    """Run NLTK's named-entity chunker over ``text``.

    The text is tokenized as-is (no lowercasing or stop-word removal),
    POS-tagged, and the tagged tokens are passed to ``ne_chunk``.

    Parameters:
        text: the raw string to analyse.

    Returns:
        The chunk tree returned by ``ne_chunk``.
    """
    words = wordtoken(text)
    tagged_words = pos_tag(words)
    return chunk(tagged_words)

#to analyze and get sentiment scores of negative,positive and neutral along with compound
def sentiment_analyze(text):
    """Score the sentiment of ``text`` with ``SentimentIntensityAnalyzer``.

    Parameters:
        text: the raw string to score.

    Returns:
        The dictionary returned by ``polarity_scores`` (keys ``neg``,
        ``neu``, ``pos`` and ``compound``).
    """
    # A fresh analyzer is created on every call, as in the original code.
    analyzer = SentimentIntensityAnalyzer()
    return analyzer.polarity_scores(text)

In [23]:
#sample McDonald's review texts, named after the sentiment each one is meant to illustrate;
#replace any of them with review text of your choice
neutral_text = "Hey, There I am in McDonalds Fast Food Chain"
neutral_text_2 = "Decent McDonald's location.  The team here has always been friendly.  Drive thru can get pretty backed up during normal meal periods, so often it is better to go inside.  Make sure you go close to regular meal periods for fresher food vs the ones under the heat lamp for a while.Store is always clean."
too_negative = "One star because everytime I go early in the morning this lady is always so rude. Granted I know its early but if u can't handle being up this early then maybe you should consider getting a different job. It's not okay to be rude."
too_negative_2 = "They treated me very badly."
too_positive = "The staff are very friendly and they do their job perfectly"
too_positive_2 = "Excellent location and very good atmosphere, excellent service"

# Run every helper on one of the sample reviews defined above
# (swap `too_positive_2` for any other review text to analyse it instead).
filters = word_processing(too_positive_2)
freq = freq_distribution(filters)
# NOTE(review): the tagger is given the lowercased, stop-word-filtered tokens,
# so it loses the sentence context; consider tagging the tokens of the
# original sentence instead - confirm which behaviour is intended.
parts_of_speech = pos(filters)
entity = name_entity(too_positive_2)
senti_analysis = sentiment_analyze(too_positive_2)

#print all the features
print("Text filters : ", filters)
# Label fixed: this is a frequency distribution; no stemming is performed.
print("Frequency Distribution : ", freq)
print("Parts of Speech : ", parts_of_speech)
print("Named Entity relationship : ", entity)
# Label fixed: "Semntiment" was a typo.
print("Sentiment Analysis Scores : ", senti_analysis)

Text filters :  ['excellent', 'location', 'good', 'atmosphere', ',', 'excellent', 'service']
Frequency Distribution or Stemming :  [('excellent', 2), ('location', 1), ('good', 1), ('atmosphere', 1), (',', 1), ('service', 1)]
Parts of Speech :  [('excellent', 'JJ'), ('location', 'NN'), ('good', 'NN'), ('atmosphere', 'NN'), (',', ','), ('excellent', 'JJ'), ('service', 'NN')]
Named Entity relationship :  (S
  Excellent/JJ
  location/NN
  and/CC
  very/RB
  good/JJ
  atmosphere/NN
  ,/,
  excellent/JJ
  service/NN)
Semntiment Analysis Scores :  {'neg': 0.0, 'neu': 0.315, 'pos': 0.685, 'compound': 0.8969}


In [25]:
#build a bag-of-words matrix (token counts per review) with CountVectorizer

# The reviews to vectorize; this list is reused by the TF-IDF cell.
sentense = [neutral_text, neutral_text_2, too_positive, too_negative]

cv = CountVectorizer()

# Learn the vocabulary and count the tokens in one step, then densify the result.
BOW = cv.fit_transform(sentense).toarray()
features_names = cv.get_feature_names_out()
vocab = cv.vocabulary_

print(f"Vocabulary list :  {vocab}")
print(f"Features names :  {features_names}")
print(BOW)

Vocabulary list :  {'hey': 36, 'there': 75, 'am': 1, 'in': 38, 'mcdonalds': 51, 'fast': 23, 'food': 24, 'chain': 12, 'decent': 16, 'mcdonald': 50, 'location': 47, 'the': 72, 'team': 71, 'here': 35, 'has': 33, 'always': 0, 'been': 7, 'friendly': 27, 'drive': 19, 'thru': 78, 'can': 11, 'get': 28, 'pretty': 62, 'backed': 4, 'up': 81, 'during': 20, 'normal': 54, 'meal': 52, 'periods': 61, 'so': 66, 'often': 56, 'it': 41, 'is': 40, 'better': 9, 'to': 79, 'go': 30, 'inside': 39, 'make': 48, 'sure': 70, 'you': 85, 'close': 14, 'regular': 63, 'for': 25, 'fresher': 26, 'vs': 83, 'ones': 59, 'under': 80, 'heat': 34, 'lamp': 46, 'while': 84, 'store': 69, 'clean': 13, 'staff': 67, 'are': 3, 'very': 82, 'and': 2, 'they': 76, 'do': 18, 'their': 73, 'job': 43, 'perfectly': 60, 'one': 58, 'star': 68, 'because': 6, 'everytime': 22, 'early': 21, 'morning': 53, 'this': 77, 'lady': 45, 'rude': 64, 'granted': 31, 'know': 44, 'its': 42, 'but': 10, 'if': 37, 'handle': 32, 'being': 8, 'then': 74, 'maybe': 49,

In [26]:
#build TF-IDF (Term Frequency - Inverse Document Frequency) features for the same reviews

# norm=None is passed explicitly instead of relying on the vectorizer's default.
vectorize = TfidfVectorizer(norm=None)

# Fit on the review list and keep a dense copy of the weights.
tfidf = vectorize.fit_transform(sentense).toarray()
tfidf_params = vectorize.get_params()
tfidf_feature_names = vectorize.get_feature_names_out()
tfidf_vocab = vectorize.vocabulary_

print(f"TF-IDF vocabulary :  {tfidf_vocab}")
print(f"TF-IDF features :  {tfidf_feature_names}")
print(f"TF-IDF parameters :  {tfidf_params}")

TF-IDF vocabulary :  {'hey': 36, 'there': 75, 'am': 1, 'in': 38, 'mcdonalds': 51, 'fast': 23, 'food': 24, 'chain': 12, 'decent': 16, 'mcdonald': 50, 'location': 47, 'the': 72, 'team': 71, 'here': 35, 'has': 33, 'always': 0, 'been': 7, 'friendly': 27, 'drive': 19, 'thru': 78, 'can': 11, 'get': 28, 'pretty': 62, 'backed': 4, 'up': 81, 'during': 20, 'normal': 54, 'meal': 52, 'periods': 61, 'so': 66, 'often': 56, 'it': 41, 'is': 40, 'better': 9, 'to': 79, 'go': 30, 'inside': 39, 'make': 48, 'sure': 70, 'you': 85, 'close': 14, 'regular': 63, 'for': 25, 'fresher': 26, 'vs': 83, 'ones': 59, 'under': 80, 'heat': 34, 'lamp': 46, 'while': 84, 'store': 69, 'clean': 13, 'staff': 67, 'are': 3, 'very': 82, 'and': 2, 'they': 76, 'do': 18, 'their': 73, 'job': 43, 'perfectly': 60, 'one': 58, 'star': 68, 'because': 6, 'everytime': 22, 'early': 21, 'morning': 53, 'this': 77, 'lady': 45, 'rude': 64, 'granted': 31, 'know': 44, 'its': 42, 'but': 10, 'if': 37, 'handle': 32, 'being': 8, 'then': 74, 'maybe': 4