In [1]:
import nltk

In [3]:
text = """Machine learning (ML) is a field of study in artificial intelligence concerned with
the development and study of statistical algorithms that can effectively generalize
and thus perform tasks without explicit instructions. Recently, generative artificial
neural networks have been able to surpass many previous approaches in performance.
Machine learning approaches have been applied to large language models, computer
vision, speech recognition, email filtering, agriculture and medicine, where it
is too costly to develop algorithms to perform the needed tasks.
The mathematical foundations of ML are provided by mathematical optimization
(mathematical programming) methods. Data mining is a related (parallel)
field of study, focusing on exploratory data analysis through unsupervised learning.
ML is known in its application across business problems under the
name predictive analytics. Although not all machine learning
is statistically based, computational statistics is an important
source of the field's methods."""

In [None]:
### Tokenization

In [5]:
 nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [6]:
sentences =  nltk.sent_tokenize(text)
sentences

['Machine learning (ML) is a field of study in artificial intelligence concerned with\nthe development and study of statistical algorithms that can effectively generalize\nand thus perform tasks without explicit instructions.',
 'Recently, generative artificial\nneural networks have been able to surpass many previous approaches in performance.',
 'Machine learning approaches have been applied to large language models, computer\nvision, speech recognition, email filtering, agriculture and medicine, where it\nis too costly to develop algorithms to perform the needed tasks.',
 'The mathematical foundations of ML are provided by mathematical optimization\n(mathematical programming) methods.',
 'Data mining is a related (parallel)\nfield of study, focusing on exploratory data analysis through unsupervised learning.',
 'ML is known in its application across business problems under the\nname predictive analytics.',
 "Although not all machine learning\nis statistically based, computational sta

In [7]:
len(sentences)

7

In [8]:
words = nltk.word_tokenize(text)
words

['Machine',
 'learning',
 '(',
 'ML',
 ')',
 'is',
 'a',
 'field',
 'of',
 'study',
 'in',
 'artificial',
 'intelligence',
 'concerned',
 'with',
 'the',
 'development',
 'and',
 'study',
 'of',
 'statistical',
 'algorithms',
 'that',
 'can',
 'effectively',
 'generalize',
 'and',
 'thus',
 'perform',
 'tasks',
 'without',
 'explicit',
 'instructions',
 '.',
 'Recently',
 ',',
 'generative',
 'artificial',
 'neural',
 'networks',
 'have',
 'been',
 'able',
 'to',
 'surpass',
 'many',
 'previous',
 'approaches',
 'in',
 'performance',
 '.',
 'Machine',
 'learning',
 'approaches',
 'have',
 'been',
 'applied',
 'to',
 'large',
 'language',
 'models',
 ',',
 'computer',
 'vision',
 ',',
 'speech',
 'recognition',
 ',',
 'email',
 'filtering',
 ',',
 'agriculture',
 'and',
 'medicine',
 ',',
 'where',
 'it',
 'is',
 'too',
 'costly',
 'to',
 'develop',
 'algorithms',
 'to',
 'perform',
 'the',
 'needed',
 'tasks',
 '.',
 'The',
 'mathematical',
 'foundations',
 'of',
 'ML',
 'are',
 'provi

In [9]:
len(words)

162

In [None]:
### Lemmatization + remove stop words

In [10]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [12]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [15]:
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [16]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [18]:
sentences = nltk.sent_tokenize(text)
lemma = WordNetLemmatizer()
for i in range(len(sentences)):
  words = nltk.word_tokenize(sentences[i])
  final_words = []
  for x in words:
    if x not in stopwords.words('english'):
      final_words.append(lemma.lemmatize(x))
      sentences[i] = " ".join(final_words)

In [19]:
sentences

['Machine learning ( ML ) field study artificial intelligence concerned development study statistical algorithm effectively generalize thus perform task without explicit instruction .',
 'Recently , generative artificial neural network able surpass many previous approach performance .',
 'Machine learning approach applied large language model , computer vision , speech recognition , email filtering , agriculture medicine , costly develop algorithm perform needed task .',
 'The mathematical foundation ML provided mathematical optimization ( mathematical programming ) method .',
 'Data mining related ( parallel ) field study , focusing exploratory data analysis unsupervised learning .',
 'ML known application across business problem name predictive analytics .',
 "Although machine learning statistically based , computational statistic important source field 's method ."]

In [None]:
### bag of words

In [20]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 500)
features = cv.fit_transform(sentences).toarray()
features

array([[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0,
        1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
        0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
        2, 0, 1, 0, 1, 0, 0, 1],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
        0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1,
        0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0,
        1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0,
        0, 0, 1, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 1, 0, 1,
        0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0,

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer
tf = TfidfVectorizer(max_features = 500)
features1 = tf.fit_transform(sentences).toarray()
features1

array([[0.        , 0.        , 0.        , 0.19898575, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.19898575, 0.        , 0.        , 0.        , 0.        ,
        0.23971706, 0.        , 0.        , 0.        , 0.23971706,
        0.23971706, 0.        , 0.23971706, 0.        , 0.17008642,
        0.        , 0.        , 0.        , 0.23971706, 0.        ,
        0.        , 0.23971706, 0.23971706, 0.        , 0.        ,
        0.        , 0.14767036, 0.17008642, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.17008642, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.19898575, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.23971706,
        0.        , 0.39797149, 0.        , 0.19898575, 0.        ,
        0.23971706, 0.        , 0.        , 0.23

In [None]:
### Named Entity Recognition

In [22]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [23]:
text = nlp('Google is hiring a Data Scientist having experience of 5 years in London and offering $50 Million package')

In [25]:
text.ents

(Google, Data Scientist, 5 years, London, $50 Million)

In [26]:
for i in text.ents:
  print(i.text,'-',i.label_,'-',spacy.explain(i.label_))

Google - ORG - Companies, agencies, institutions, etc.
Data Scientist - ORG - Companies, agencies, institutions, etc.
5 years - DATE - Absolute or relative dates or periods
London - GPE - Countries, cities, states
$50 Million - MONEY - Monetary values, including unit


In [None]:
### Sentiment Analysis

In [27]:
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

In [28]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()

In [30]:
sia.polarity_scores("good")

{'neg': 0.0, 'neu': 0.0, 'pos': 1.0, 'compound': 0.4404}

In [31]:
sia.polarity_scores("best")

{'neg': 0.0, 'neu': 0.0, 'pos': 1.0, 'compound': 0.6369}

In [32]:
sia.polarity_scores("very good")

{'neg': 0.0, 'neu': 0.238, 'pos': 0.762, 'compound': 0.4927}

In [33]:
sia.polarity_scores("bad")

{'neg': 1.0, 'neu': 0.0, 'pos': 0.0, 'compound': -0.5423}

In [34]:
sia.polarity_scores("gud")

{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}

In [35]:
sia.polarity_scores("lol")

{'neg': 0.0, 'neu': 0.0, 'pos': 1.0, 'compound': 0.4215}

In [36]:
sia.polarity_scores(":)")

{'neg': 0.0, 'neu': 0.0, 'pos': 1.0, 'compound': 0.4588}

In [37]:
sia.polarity_scores(":(")

{'neg': 1.0, 'neu': 0.0, 'pos': 0.0, 'compound': -0.4404}

In [38]:
sia.polarity_scores("the movie was very much entertaining")

{'neg': 0.0, 'neu': 0.611, 'pos': 0.389, 'compound': 0.4902}

In [39]:
sia.polarity_scores("I got a leakage product")

{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}

In [40]:
sia.polarity_scores("I got a leakage product, I will never buy from here again")

{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}

In [41]:
sia.polarity_scores("I got a leakage product, It was a very bad experience")

{'neg': 0.351, 'neu': 0.649, 'pos': 0.0, 'compound': -0.5849}

In [42]:
sia.polarity_scores("It's better to be at home instead of going out for that movie")

{'neg': 0.0, 'neu': 0.805, 'pos': 0.195, 'compound': 0.4404}

In [43]:
review = """Staff was good. Food was nice, veg kebabs were nice filling and sweet. Paneer was a little heavy, could have been softer in my preference. Would have liked less sauce on the platter but overall a good choice.

Lots of seating. Well done in decor. Spacious 2 floors

Liked the caramel custard"""
sia.polarity_scores(review)

{'neg': 0.0, 'neu': 0.674, 'pos': 0.326, 'compound': 0.9505}