In [19]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import import_ipynb
import WikiCrawler as wk
import DataCleaner as dc
import pandas as pd
from bs4 import BeautifulSoup

In [20]:
nltk.download('vader_lexicon')
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\gaura\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gaura\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\gaura\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [21]:
analyzer = SentimentIntensityAnalyzer()
#url = "https://en.wikipedia.org/wiki/2019"

SentimentAnalyzerScore returns the sentiment score of the text.

In [22]:
def sentimentAnalyzerScore(text):
    score = analyzer.polarity_scores(text)
    return score

textMiner extracts the title and context from the Wikipedia page and stores it in a dataframe.

In [23]:
def textMiner(url):
    response = wk.initResp(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    title = wk.titleExtractor(soup)
    context = wk.contextExtractor(soup)
    wikiPageData = pd.DataFrame([[title,context]])
    dc.lowerOnly(wikiPageData)
    dc.alphaCharOnly(wikiPageData)
    return wikiPageData

stopWordDel removes all the stopwords from a given sentence and returns a filtered sentence.

In [24]:
def stopWordDel(sentence):
    stop_words = set(stopwords.words('english'))
    filteredSent = " "
    pageWord = sentence.split()
    for r in pageWord:
        if not r in stop_words:
            filteredSent = filteredSent + " " + r
    return filteredSent


posNegWordList() is a function that prints the top 10 positive and negative words in the text passed to it.

In [30]:
def posNegWordList(sentence):
    wordList = pd.DataFrame(columns=['word','score'])
    allWords = sentence.split()
    for j in allWords:
        if not j in list(wordList.iloc[:,0]):
            wordList = wordList.append({'word':j,'score':sentimentAnalyzerScore(j)['compound']} , ignore_index=True)
    wordList.sort_values(by=['score'] , inplace=True)
    print("\nNegative Words: ","\n", list(wordList.iloc[0:10,0]))
    print("\nPositive Words: ","\n",list(wordList.iloc[-10:-1,0]))

posNegSentList() is a function that prints the top 10 postive and negative sentences in the text passed to it.

In [31]:
def posNegSentList(sentence):
    sentList = pd.DataFrame(columns=['sentence','score'])
    allSent = nltk.tokenize.sent_tokenize(sentence)
    for s in allSent:
        if not s in list(sentList.iloc[:,0]):
            sentList = sentList.append({'sentence':s , 'score':sentimentAnalyzerScore(s)['compound']}, ignore_index=True)
    sentList.sort_values(by=['score'] , inplace=True)
    print("\nNegative Sentences: ", "\n", list(sentList.iloc[0:5,0]))
    print("\nPositive Sentences: ","\n", list(sentList.iloc[-5:-1,0]))

getPageSentiment combines all three functions and returns the sentiment score of the passed Wikipedia page url.

In [32]:
def getPageSentiment(url):
    df = textMiner(url)
    text = stopWordDel(df.loc[0,1])
    posNegWordList(text)
    posNegSentList(text)
    score = sentimentAnalyzerScore(text)
    return score

Here we have done the analysis of three different wikipedia pages. 
<br><br>
The first topic was  black death. It got a compound score of -0.9999. The sentiment score is as expected as the topic is related to a plague which killed 25 million people. The top negative words and positive words are printed along with the top negative sentences and positive sentences.
<br><br>
The next topic was the Indian Independence day. It got a compound sentiment score of 0.7893. It can be said that the page has a neutral sentiment, neither positive nor negative.
<br><br>
The next topic was Mona Lisa. It got a compound sentiment score of 0.9986. The wikipedia page has a overall positive sentiment according to the score given.

In [33]:
print("title: ", textMiner("https://en.wikipedia.org/wiki/Black_Death").iloc[0,0]," \n score:",
    getPageSentiment(url = "https://en.wikipedia.org/wiki/Black_Death"),"\n")

print( "title: ", textMiner("https://en.wikipedia.org/wiki/Independence_Day_(India)").iloc[0,0]," \n score:",
      getPageSentiment(url = "https://en.wikipedia.org/wiki/Independence_Day_(India)"),"\n")

print("title: ", textMiner("https://en.wikipedia.org/wiki/Mona_Lisa").iloc[0,0]," \n score:",
    getPageSentiment(url = "https://en.wikipedia.org/wiki/Mona_Lisa"),"\n")



Negative Words:  
 ['killed', 'fatality', 'killing', 'murdered.', 'dead.', 'dead', 'crisis.', 'disaster', 'worst', 'devastated']

Positive Words:  
 ['survived.', 'winning', 'divine', 'remarkable', 'great.', 'great', '143', 'paradise.', 'best']

Negative Sentences:  
 ['71 ... length came gloucester yea even oxford london finally spread england wasted people scarce tenth person sort left alive.geoffrey baker chronicon angliae 72 plague reportedly first introduced europe via genoese traders port city kaffa crimea 1347. protracted siege city 1345 1346 mongol golden horde army jani beg whose mainly tatar troops suffering disease catapulted infected corpses city walls kaffa infect inhabitants 73 though likely infected rats travelled across siege lines spread epidemic inhabitants.', 'one wept death awaited death.', '113 first outbreak two thirds population contracted illness patients died next half population became ill died third tenth affected many survived fourth occurrence one twenty p