In [1]:
# Corpora is a large body of text
# NLTK library of python allows you to import large corpora and apply text processing

# Text processing
# a key part of NLP is transforming text into mathematical objects.
# NLTK provides various functions that help us transform the text into vectors. The most
# basic NLTK function for this purpose is tokenization, which splits a document into a list 
# of units. These units could be words, alphabets, or sentences.

In [2]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/shivangi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
from nltk.tokenize import word_tokenize

text = "i will work harder and f*ck that ieee, ill prove them im better!!!"

tokens = word_tokenize(text)
print(tokens)

['i', 'will', 'work', 'harder', 'and', 'f', '*', 'ck', 'that', 'ieee', ',', 'ill', 'prove', 'them', 'im', 'better', '!', '!', '!']


In [4]:
l = text.split(' ')

In [5]:
l

['i',
 'will',
 'work',
 'harder',
 'and',
 'f*ck',
 'that',
 'ieee,',
 'ill',
 'prove',
 'them',
 'im',
 'better!!!']

In [6]:
nltk.download('stopwords')
sw = nltk.corpus.stopwords.words('english')
sw

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/shivangi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [7]:
# Stemming and Lemmatization

# Lemmatisation in linguistics is the process of grouping together the inflected forms of 
# a word so they can be analysedas a single item, identified by the word's lemma

# danced, dancing, dancers are all stemmed to the root word 'dance'

In [8]:
from nltk.stem import WordNetLemmatizer

text = "Im going to eats chocolates now"

tokens = word_tokenize(text)
tokens

['Im', 'going', 'to', 'eats', 'chocolates', 'now']

In [9]:
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()
t = [lemmatizer.lemmatize(word) for word in tokens]
t

[nltk_data] Downloading package wordnet to /home/shivangi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


['Im', 'going', 'to', 'eats', 'chocolate', 'now']

In [10]:
# Lemmatization was able to only remove plurals here

In [11]:
from nltk.stem import PorterStemmer

text = "im going to eat chocolates and dancing around"
tokens = word_tokenize(text.lower())

ps = PorterStemmer()
t = [ps.stem(word) for word in tokens]
t

['im', 'go', 'to', 'eat', 'chocol', 'and', 'danc', 'around']

In [12]:
# Part of speech tagging (POS tagging)

In [13]:
nltk.download('averaged_perceptron_tagger')

nltk.pos_tag(['eat'])

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/shivangi/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


[('eat', 'NN')]

In [14]:
text = "she eats food delicious one"

tokens = word_tokenize(text)
t = [nltk.pos_tag([word]) for word in tokens]
t

[[('she', 'PRP')],
 [('eats', 'NNS')],
 [('food', 'NN')],
 [('delicious', 'JJ')],
 [('one', 'CD')]]

In [15]:
nltk.download('tagsets')
nltk.help.upenn_tagset()

[nltk_data] Downloading package tagsets to /home/shivangi/nltk_data...


$: dollar
    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
'': closing quotation mark
    ' ''
(: opening parenthesis
    ( [ {
): closing parenthesis
    ) ] }
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ! ?
:: colon or ellipsis
    : ; ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
EX: existential there
    there
FW: foreign word
    gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous
    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte
    terram fiche oui corporis ...
IN: preposition or

[nltk_data]   Unzipping help/tagsets.zip.


In [17]:
! pip install -U textblob

Defaulting to user installation because normal site-packages is not writeable
Requirement already up-to-date: textblob in /home/shivangi/.local/lib/python3.8/site-packages (0.15.3)


In [18]:
from textblob import TextBlob

TextBlob('I am so happy').sentiment

Sentiment(polarity=0.8, subjectivity=1.0)

In [19]:
TextBlob('I fucking love this frickin pizza').sentiment

Sentiment(polarity=0.5, subjectivity=0.6)

In [21]:
TextBlob('That was terrible').sentiment

Sentiment(polarity=-1.0, subjectivity=1.0)

In [22]:
# TextBlob can do tagging also

In [23]:
TextBlob("I am eating my delicious pizza").tags

[('I', 'PRP'),
 ('am', 'VBP'),
 ('eating', 'VBG'),
 ('my', 'PRP$'),
 ('delicious', 'JJ'),
 ('pizza', 'NN')]

In [24]:
! pip install vaderSentiment

Defaulting to user installation because normal site-packages is not writeable
Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[K     |████████████████████████████████| 125 kB 2.2 MB/s eta 0:00:01
Installing collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2


In [25]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyser = SentimentIntensityAnalyzer()

In [26]:
analyser.polarity_scores("This book is very good")

{'neg': 0.0, 'neu': 0.556, 'pos': 0.444, 'compound': 0.4927}

In [31]:
analyser.polarity_scores("OMG! The book is so cool")

{'neg': 0.0, 'neu': 0.604, 'pos': 0.396, 'compound': 0.5079}

In [32]:
analyser.polarity_scores("This book is like very very good")

{'neg': 0.0, 'neu': 0.456, 'pos': 0.544, 'compound': 0.7159}

In [33]:
# WebScraping

In [34]:
# Web Scraping is a technique to programmatically scrape and exract text from the net

In [35]:
# Web scraping is extracting useful information from websites
# To scrape information, we use 'requests' and 'beautifulsoup'

In [37]:
# Parsing info from websites is made easy using beautifulsoup
# Beautiful Soup is a Python library for pulling data out of HTML and XML files

In [38]:
# XML is abbreviation for eXtensible Markup Language whereas HTML stands for 
# Hypertext Markup Language. XML mainly focuses on transfer of data while HTML is focused
# on presentation of the data

In [39]:
import requests
from bs4 import BeautifulSoup
import pandas as pd


In [41]:
url = 'https://webscraper.io/test-sites/e-commerce/allinone/computers/laptops'
request = requests.get(url)

In [46]:
request

<Response [200]>

In [47]:
# Moethods in beautiful soup
# find : returns the first html tag that matches our condition
# find_all : returns all of them that match, in a list

In [62]:
url = 'https://webscraper.io/test-sites/e-commerce/allinone/computers/laptops'
request = requests.get(url)

# we use the requests library to "get" the source code of the website (webpage that we 
# are viewing)

In [52]:
soup = BeautifulSoup(request.text, "html.parser")

In [66]:
soup.find('div', class_='col-sm-4 col-lg-4 col-md-4')

<div class="col-sm-4 col-lg-4 col-md-4">
<div class="thumbnail">
<img alt="item" class="img-responsive" src="/images/test-sites/e-commerce/items/cart2.png"/>
<div class="caption">
<h4 class="pull-right price">$295.99</h4>
<h4>
<a class="title" href="/test-sites/e-commerce/allinone/product/545" title="Asus VivoBook X441NA-GA190">Asus VivoBook X4...</a>
</h4>
<p class="description">Asus VivoBook X441NA-GA190 Chocolate Black, 14", Celeron N3450, 4GB, 128GB SSD, Endless OS, ENG kbd</p>
</div>
<div class="ratings">
<p class="pull-right">14 reviews</p>
<p data-rating="3">
<span class="glyphicon glyphicon-star"></span>
<span class="glyphicon glyphicon-star"></span>
<span class="glyphicon glyphicon-star"></span>
</p>
</div>
</div>
</div>

In [58]:
len(soup.find_all('div', class_='col-sm-4 col-lg-4 col-md-4'))

117

In [79]:
laptops = soup.find_all('div', class_='col-sm-4 col-lg-4 col-md-4')
titles = []
print(laptops[0].find('a', class_='title').text)
print(laptops[0].find('h4', class_='pull-right price').text)
rating = laptops[0].find('div', class_='ratings')
print(len(rating.find_all('span', class_='glyphicon glyphicon-star')))
'''
for each in laptops:
    titles.append(each.find('a', class_='title').title)
'''

Asus VivoBook X4...
$295.99
3


"\nfor each in laptops:\n    titles.append(each.find('a', class_='title').title)\n"

In [80]:
# we first check for each, then we appen to our lists

In [84]:
titles = []
prices = []
ratings = []

for laptop in laptops:
    titles.append(laptop.find('a', class_='title').text)
    prices.append(laptop.find('h4', class_='pull-right price').text)
    rating = laptop.find('div', class_='ratings')
    ratings.append(len(rating.find_all('span', class_='glyphicon glyphicon-star')))

In [85]:
import pandas as pd

In [87]:
df = pd.DataFrame(zip(titles, prices, ratings), columns=['Title', 'Price', 'Ratings'])
df

Unnamed: 0,Title,Price,Ratings
0,Asus VivoBook X4...,$295.99,3
1,Prestigio SmartB...,$299.00,2
2,Prestigio SmartB...,$299.00,4
3,Aspire E1-510,$306.99,3
4,Lenovo V110-15IA...,$321.94,3
...,...,...,...
112,Lenovo Legion Y7...,$1399.00,3
113,Asus ROG Strix G...,$1399.00,3
114,Asus ROG Strix G...,$1769.00,4
115,Asus ROG Strix G...,$1769.00,1
