# Dependencies


In [2]:
!pip install --quiet flashtext
!pip install --quiet git+https://github.com/boudinfl/pke.git

  Preparing metadata (setup.py) ... [?25l[?25hdone


In [6]:
import json
import requests
import string
import re
import nltk
import string
import itertools
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')
import pke
from nltk.corpus import stopwords
from nltk.corpus import wordnet
import traceback
from nltk.tokenize import sent_tokenize
from flashtext import KeywordProcessor

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


# Text preprocessing

In [18]:
import textwrap
text = """HTML (HyperText Markup Language) serves as the structural backbone of web content, defining elements through tags such as <div>, <p>, and <header>. HTML5, the latest iteration, introduced semantic elements like <article> and <aside>, APIs for offline storage, and multimedia support through <audio> and <video> tags. CSS (Cascading Style Sheets) manages the presentation layer, allowing precise control over typography, spacing, and layouts. CSS3 extended its capabilities with pseudo-classes, media queries for responsive design, and advanced features like Flexbox, Grid, and keyframe-based animations. JavaScript (JS), a versatile programming language, enables interactivity by manipulating the DOM (Document Object Model), handling events, and incorporating asynchronous operations via promises and async/await. Bootstrap, a powerful CSS and JS framework, streamlines web development with a mobile-first grid system, customizable components like modals and carousels, and extensive utility classes, fostering rapid, responsive design.”"""
wrapper = textwrap.TextWrapper(width=150)
word_list = wrapper.wrap(text=text)
for element in word_list:
  print(element)


HTML (HyperText Markup Language) serves as the structural backbone of web content, defining elements through tags such as <div>, <p>, and <header>.
HTML5, the latest iteration, introduced semantic elements like <article> and <aside>, APIs for offline storage, and multimedia support through <audio>
and <video> tags. CSS (Cascading Style Sheets) manages the presentation layer, allowing precise control over typography, spacing, and layouts. CSS3
extended its capabilities with pseudo-classes, media queries for responsive design, and advanced features like Flexbox, Grid, and keyframe-based
animations. JavaScript (JS), a versatile programming language, enables interactivity by manipulating the DOM (Document Object Model), handling events,
and incorporating asynchronous operations via promises and async/await. Bootstrap, a powerful CSS and JS framework, streamlines web development with a
mobile-first grid system, customizable components like modals and carousels, and extensive utility classes

In [19]:
def tokenize_sentence(text):
  sentences = sent_tokenize(text)
  sentences = [sentence.strip() for sentence in sentences if len(sentence) > 20]
  return sentences

sentences = tokenize_sentence(text)
for element in sentences:
  print(element)

HTML (HyperText Markup Language) serves as the structural backbone of web content, defining elements through tags such as <div>, <p>, and <header>.
HTML5, the latest iteration, introduced semantic elements like <article> and <aside>, APIs for offline storage, and multimedia support through <audio> and <video> tags.
CSS (Cascading Style Sheets) manages the presentation layer, allowing precise control over typography, spacing, and layouts.
CSS3 extended its capabilities with pseudo-classes, media queries for responsive design, and advanced features like Flexbox, Grid, and keyframe-based animations.
JavaScript (JS), a versatile programming language, enables interactivity by manipulating the DOM (Document Object Model), handling events, and incorporating asynchronous operations via promises and async/await.
Bootstrap, a powerful CSS and JS framework, streamlines web development with a mobile-first grid system, customizable components like modals and carousels, and extensive utility classes

In [20]:
def get_noun_adj_verb(text):
  output = []
  try:
    extractor = pke.unsupervised.MultipartiteRank()
    extractor.load_document(input=text,language='en')
    pos = {'VERB', 'ADJ', 'NOUN'}
    stoplist = list(string.punctuation)
    stoplist += ['-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-']
    stoplist += stopwords.words('english')
    extractor.candidate_selection(pos=pos)
    extractor.candidate_weighting(alpha=1.1,
                                  threshold=0.75,
                                  method='average')
    keyphrases = extractor.get_n_best(n=30)
    for val in keyphrases:
            output.append(val[0])
  except:
        out = []
        traceback.print_exc()
  return output

noun_verbs_adj = get_noun_adj_verb(text)
print ("keywords: ",noun_verbs_adj)

keywords:  ['web content', 'defining elements', 'css', 'responsive design', 'structural backbone', 'tags', 'allowing precise control', 'cascading', 'presentation layer', 'typography', 'capabilities', 'video', 'pseudo-classes', 'spacing', 'manages', 'extended', 'media queries', 'layouts', 'offline storage', 'multimedia support', 'audio', 'apis', 'serves', 'promises', 'async', 'enables interactivity', 'incorporating asynchronous operations', 'latest iteration', 'customizable components', 'introduced semantic elements']


In [21]:
from pprint import pprint
def get_sentence_for_keyword(keywords,sentences):
  keyword_processor = KeywordProcessor()
  keyword_sentences = {}
  for word in keywords:
    keyword_sentences[word] = []
    keyword_processor.add_keyword(word)
  for sentence in sentences:
    keywords_found = keyword_processor.extract_keywords(sentence)
    for key in keywords_found:
        keyword_sentences[key].append(sentence)
  for key in keyword_sentences.keys():
    values = keyword_sentences[key]
    values = sorted(values, key=len, reverse=True)
    keyword_sentences[key] = values
  return keyword_sentences

mapping = get_sentence_for_keyword(noun_verbs_adj,sentences)
pprint(mapping)

{'allowing precise control': ['CSS (Cascading Style Sheets) manages the '
                              'presentation layer, allowing precise control '
                              'over typography, spacing, and layouts.'],
 'apis': ['HTML5, the latest iteration, introduced semantic elements like '
          '<article> and <aside>, APIs for offline storage, and multimedia '
          'support through <audio> and <video> tags.'],
 'async': ['JavaScript (JS), a versatile programming language, enables '
           'interactivity by manipulating the DOM (Document Object Model), '
           'handling events, and incorporating asynchronous operations via '
           'promises and async/await.'],
 'audio': ['HTML5, the latest iteration, introduced semantic elements like '
           '<article> and <aside>, APIs for offline storage, and multimedia '
           'support through <audio> and <video> tags.'],
 'capabilities': ['CSS3 extended its capabilities with pseudo-classes, media '
       

In [28]:
def get_fill_in_the_blanks(mapping_sentences):
  output = {}
  blank_sentences = []
  processed = []
  keys=[]
  for key in mapping_sentences:
    if len(mapping_sentences[key])>0:
        sent = mapping_sentences[key][0]
        # Compile a regular expression pattern into a regular expression object, which can be used for matching and other methods
        insensitive_sent = re.compile(re.escape(key), re.IGNORECASE)
        no_of_replacements =  len(re.findall(re.escape(key),sent,re.IGNORECASE))
        line = insensitive_sent.sub(' _________ ', sent)
        if (mapping_sentences[key][0] not in processed) and no_of_replacements<2:
            show = {
                "question":line,
                "answer":key
            }
            blank_sentences.append(show)
            processed.append(mapping_sentences[key][0])
            keys.append(key)
  output["sentences"]=blank_sentences[:10]
  output["keys"]=keys[:10]
  return output

fill_in_the_blanks = get_fill_in_the_blanks(mapping)
print(fill_in_the_blanks['sentences'][3])

{'question': 'CSS (Cascading Style Sheets) manages the presentation layer,  _________  over typography, spacing, and layouts.', 'answer': 'allowing precise control'}
