# preprocessing the text

In [1]:
sentence = 'John has been selected for the trial phase this time. Congrats!!'

sentence = sentence.lower()

In [2]:
positive_words = ['awesome', 'good', 'nice', 'super','fun',
                 'delightful','congrats']
negative_words = ['awful','lame','horrible','bad']
sentence = sentence.replace('!','')
sentence

'john has been selected for the trial phase this time. congrats'

In [4]:
words = sentence.split(' ')
words

['john',
 'has',
 'been',
 'selected',
 'for',
 'the',
 'trial',
 'phase',
 'this',
 'time.',
 'congrats']

In [5]:
result = set(words) - set(positive_words)
result

{'been',
 'for',
 'has',
 'john',
 'phase',
 'selected',
 'the',
 'this',
 'time.',
 'trial'}

In [9]:
result_two = set(words) - set(negative_words)
result_two

{'been',
 'congrats',
 'for',
 'has',
 'john',
 'phase',
 'selected',
 'the',
 'this',
 'time.',
 'trial'}

In [12]:
# accessing text from the web

import urllib3
from bs4 import BeautifulSoup
pool_object = urllib3.PoolManager()
target_url= 'https://www.gutenberg.org/files/2554/2554-h/2554-h.htm#link2HCH0008'
response = pool_object.request('GET', target_url)
final_html_txt= BeautifulSoup(response.data)
#print(final_html_txt)



In [16]:
from urllib.request import urlopen
html = urlopen(target_url).read()

soup = BeautifulSoup(html)


for script in soup(["script", "style"]):

    script.decompose()

strips = list(soup.stripped_strings)

print(strips[:5])

['Crime and Punishment, by Fyodor Dostoevsky', 'The Project Gutenberg EBook of Crime and Punishment, by Fyodor Dostoevsky\r\n\r\nThis eBook is for the use of anyone anywhere at no cost and with\r\nalmost no restrictions whatsoever.  You may copy it, give it away or\r\nre-use it under the terms of the Project Gutenberg License included\r\nwith this eBook or online at www.gutenberg.org\r\n\r\n\r\nTitle: Crime and Punishment\r\n\r\nAuthor: Fyodor Dostoevsky\r\n\r\nRelease Date: March 28, 2006 [EBook #2554]\r\nLast Updated: October 27, 2016\r\n\r\nLanguage: English\r\n\r\nCharacter set encoding: UTF-8\r\n\r\n*** START OF THIS PROJECT GUTENBERG EBOOK CRIME AND PUNISHMENT ***\r\n\r\n\r\n\r\n\r\nProduced by John Bickers; and Dagny and David Widger', 'CRIME AND PUNISHMENT', 'By Fyodor Dostoevsky', 'Translated By Constance Garnett']


In [18]:
len(strips)

4523

# removal of stopwords

In [20]:
import nltk
from nltk import word_tokenize
sentence = "This book is about Deep Learning and Natural Language processing!"
tokens = word_tokenize(sentence)
tokens

['This',
 'book',
 'is',
 'about',
 'Deep',
 'Learning',
 'and',
 'Natural',
 'Language',
 'processing',
 '!']

In [21]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/odemakinde/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [28]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

In [30]:
# stopword is a commonly used word (such as the)) that a 
#search engine has been programmed to ignore
len(stop_words)

179

In [35]:
list(stop_words)[:10]

['as', 't', 'do', 'ma', 'her', 'whom', "wasn't", 'now', "haven't", "couldn't"]

In [36]:
new_tokens = [w for w in tokens if not w in stop_words]
new_tokens

['This', 'book', 'Deep', 'Learning', 'Natural', 'Language', 'processing', '!']

In [37]:
tokens

['This',
 'book',
 'is',
 'about',
 'Deep',
 'Learning',
 'and',
 'Natural',
 'Language',
 'processing',
 '!']

# count vectorization

This is a scikit-learn library tool that takes any mass of text and returns each unique word as a feature

In [38]:
from sklearn.feature_extraction.text import CountVectorizer
texts = ['Ramiess sings classic songs', 'he listens to old pop',
        'and rock music','and also listens to classic songs']
cv = CountVectorizer()
cv_fit= cv.fit_transform(texts)
print(cv.get_feature_names())

['also', 'and', 'classic', 'he', 'listens', 'music', 'old', 'pop', 'ramiess', 'rock', 'sings', 'songs', 'to']


In [40]:
print(cv_fit.toarray())

[[0 0 1 0 0 0 0 0 1 0 1 1 0]
 [0 0 0 1 1 0 1 1 0 0 0 0 1]
 [0 1 0 0 0 1 0 0 0 1 0 0 0]
 [1 1 1 0 1 0 0 0 0 0 0 1 1]]


In [45]:
cv_fit.toarray().shape

(4, 13)

# TF-IDF Score
TF-IDF is an acronymn of two terms: term frequency and inverse document frequency. TF is the ratio representing the count of specific words to the total number of words in a document.

In [42]:
from sklearn.feature_extraction.text import TfidfVectorizer
texts= ['Ramiess sings classic songs', 'he listens to old pop',
       'and rock music','and also listens to classical songs']
vect = TfidfVectorizer()
X = vect.fit_transform(texts)
print(X.todense())

[[0.         0.         0.52547275 0.         0.         0.
  0.         0.         0.         0.52547275 0.         0.52547275
  0.41428875 0.        ]
 [0.         0.         0.         0.         0.48546061 0.38274272
  0.         0.48546061 0.48546061 0.         0.         0.
  0.         0.38274272]
 [0.         0.48693426 0.         0.         0.         0.
  0.61761437 0.         0.         0.         0.61761437 0.
  0.         0.        ]
 [0.47212003 0.37222485 0.         0.47212003 0.         0.37222485
  0.         0.         0.         0.         0.         0.
  0.37222485 0.37222485]]


In [44]:
X.toarray().shape

(4, 14)

In [47]:
!pip install textblob

Collecting textblob
[?25l  Downloading https://files.pythonhosted.org/packages/60/f0/1d9bfcc8ee6b83472ec571406bd0dd51c0e6330ff1a51b2d29861d389e85/textblob-0.15.3-py2.py3-none-any.whl (636kB)
[K     |████████████████████████████████| 645kB 379kB/s eta 0:00:01
Installing collected packages: textblob
Successfully installed textblob-0.15.3


In [48]:
# Text Classifier

from textblob import TextBlob
from textblob.classifiers import NaiveBayesClassifier
data = [
    ('I love my country','pos'),
    ('This is an amazing place!', 'pos'),
    ('I do not like the smell of this place.', 'neg'),
    ('I do not like this restaurant','neg'),
    ('I am tired of hearing your nonsense.','neg'),
    ('I always aspire to be like him', 'pos'),
    ("it's a horrible performance.", 'neg')
]

model = NaiveBayesClassifier(data)

In [49]:
model.classify("It's an awesome place!")

'pos'

In [50]:
model.classify("i am here")

'neg'

In [52]:
model.classify("you are not serious")

'neg'

In [53]:
model.classify("I do love you")

'pos'