We will create a utility method to read the content of the wikipedia page in plain text. For this we will use `wikipedia` package. To intsall `wikipedia` use the command `pip intsall wikipedia`.

In [77]:
import wikipedia
import nltk

def get_page_content_as_text(page_text):
    p = wikipedia.page(page_text)
    print('fetched content from {}'.format(p.url))
    return p.content

In [110]:
raw_data = get_page_content_as_text("World War II").lower()

fetched content from https://en.wikipedia.org/wiki/World_War_I


## Tokenizing 

Now we have loaded the data into `raw_data` varaible, we can start processing it. We start by converting the text which is a string into an array of sentences. This process of breaking a large text into smaller units like sentence or word is called `tokenization`. 

`nltk.tokenize` module provides `sent_tokenize` method which can convert a text into array of sentences.

In [117]:
from nltk.tokenize import sent_tokenize

sentences = sent_tokenize(raw_data)

print(sentences)



In [118]:
import re, string, unicodedata 
from nltk.corpus import wordnet as wn
from nltk.tokenize import word_tokenize
from collections import defaultdict
from nltk.stem.wordnet import WordNetLemmatizer
import warnings
warnings.filterwarnings("ignore")
nltk.download('punkt') 
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')


def normalize(sentence):
    remove_punct_dict = dict((ord(punct), None) for punct in string.punctuation)
    #word tokenization
    word_token = word_tokenize(sentence.lower().translate(remove_punct_dict))
    
    #remove ascii
    new_words = []
    for word in word_token:
        new_word = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        new_words.append(new_word)
    
    #Remove tags
    rmv = []
    for w in new_words:
        sentence = re.sub("&lt;/?.*?&gt;","&lt;&gt;",w)
        rmv.append(sentence)
        
    #pos tagging and lemmatization
    tag_map = defaultdict(lambda : wn.NOUN)
    tag_map['J'] = wn.ADJ
    tag_map['V'] = wn.VERB
    tag_map['R'] = wn.ADV
    lmtzr = WordNetLemmatizer()
    lemma_list = []
    rmv = [i for i in rmv if i]
    for token, tag in nltk.pos_tag(rmv):
        lemma = lmtzr.lemmatize(token, tag_map[tag[0]])
        lemma_list.append(lemma)
    return lemma_list

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Gamer\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Gamer\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Gamer\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [123]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel


def generateResponse(user_response):
    robo_response=''
    #sentences.append(user_response)
    TfidfVec = TfidfVectorizer(tokenizer=normalize, stop_words='english')
    tfidf = TfidfVec.fit_transform(sentences)
    #vals = cosine_similarity(tfidf[-1], tfidf)
    vals = linear_kernel(tfidf[-1], tfidf)
    idx = vals.argsort()[0][-2]
    #print(sentences)
    #print(sentences[idx:])
    flat = vals.flatten()
    flat.sort()
    req_tfidf = flat[-2]
    if(req_tfidf == 0 ) or "tell me about" in user_response:
        print("Checking Wikipedia")
        if user_response:
            robo_response = wikipedia_data(user_response)
            return robo_response
    else:
        robo_response = robo_response + sentences[idx]
        return robo_response
#wikipedia search
def wikipedia_data(input):
    reg_ex = re.search('tell me about (.*)', input)
    try:
        if reg_ex:
            topic = reg_ex.group(1)
            wiki = wk.summary(topic, sentences = 3)
            return wiki
    except Exception as e:
            print("No content has been found")

In [124]:
print(generateResponse('what is ww2'))

0.27455556999903485
the german army destroyed 15,000–20,000 buildings—most famously the university library at louvain—and generated a wave of refugees of over a million people.
