# NTLK Project: Analysis of Snippets

The aim of this project is to design an approach that makes use of Google and msn snippet in order to compute the semantic similarity between sentences. Given two sentences S1 and S2, the key is to input each of the sentences to the search engine and investigate the overlapping that may exist between the generated snippets. 

Seminar report date: 11.12.2018.
Project delivery deadline: 7.1.2018.

1.	Define two sentences S1 and S2.

  S1: "Several research groups have discovered new pharmaceuticals from nordic berries."
  
  S2: "New substances discovered from fruits, vegetables and berries have positive health effects."
 

In [40]:
import nltk
import string
import math 
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem.porter import PorterStemmer
import pandas as pd
import re
import sys
import numpy
numpy.set_printoptions(threshold=sys.maxsize)

pd.set_option('display.max_columns', None)      # or 1000
pd.set_option('display.max_rows', None)         # or 1000
pd.set_option('display.max_colwidth', -1)       # or 199



# snippets / sentences examples ##########

document_0 = "Keywords and their placing versus highly defined featured snippets from Google are more important for getting traffic on webpage."
document_1 = "How to build up super snippets for new web content?"
document_2 = "Webpage will earn new featured snippets from Google if it includes the same keywords like competitors high ranked featured snippets."
document_3 = "Most of the regions in Finland will enjoy white snow and temperatures below freezing point on Christmas Eve."

all_documents = [document_0, document_1, document_2, document_3]

data_all = pd.DataFrame(all_documents)
data_all.columns = ['Academic sentence - short example']
display(data_all.head())  

low_documents = []
for document in all_documents:
    low_documents.append(document.lower())
    
data_low = pd.DataFrame(low_documents)
data_low.columns = ['Lower case sentence']
display(data_low.head())  
    
# tokenization by split # Sentences Tokenized into Words - split by whitespace

sentences_documents = []
#document_counter = 0
for document in low_documents:
    sentences_documents.append(document.split())

printableList1 = []
for sentence1 in sentences_documents:
    sentence1AsString = ''
    for idx1, aWord1 in enumerate(sentence1):        
        if idx1 == len(sentence1) - 1:
            sentence1AsString = sentence1AsString + aWord1
        else:
            str1 = aWord1 + ', '
            sentence1AsString = sentence1AsString + str1
    printableList1.append(sentence1AsString)

data_sentences1 = pd.DataFrame(printableList1)
data_sentences1.columns = ['Sentence tokenized into words   - string form and comma separated for display']
display(data_sentences1.head())     


Unnamed: 0,Academic sentence - short example
0,Keywords and their placing versus highly defined featured snippets from Google are more important for getting traffic on webpage.
1,How to build up super snippets for new web content?
2,Webpage will earn new featured snippets from Google if it includes the same keywords like competitors high ranked featured snippets.
3,Most of the regions in Finland will enjoy white snow and temperatures below freezing point on Christmas Eve.


Unnamed: 0,Lower case sentence
0,keywords and their placing versus highly defined featured snippets from google are more important for getting traffic on webpage.
1,how to build up super snippets for new web content?
2,webpage will earn new featured snippets from google if it includes the same keywords like competitors high ranked featured snippets.
3,most of the regions in finland will enjoy white snow and temperatures below freezing point on christmas eve.


Unnamed: 0,Sentence tokenized into words - string form and comma separated for display
0,"keywords, and, their, placing, versus, highly, defined, featured, snippets, from, google, are, more, important, for, getting, traffic, on, webpage."
1,"how, to, build, up, super, snippets, for, new, web, content?"
2,"webpage, will, earn, new, featured, snippets, from, google, if, it, includes, the, same, keywords, like, competitors, high, ranked, featured, snippets."
3,"most, of, the, regions, in, finland, will, enjoy, white, snow, and, temperatures, below, freezing, point, on, christmas, eve."


In [41]:

# change compound words to separate words ie. 'conditional-statements' -> 'conditional', 'statements' 
print("\n" 'Single words' "\n")
single_word_documents = []
for sentence_words in sentences_documents:
    single_word_list = []
    for word in sentence_words:
        regex = re.compile("[-_]")
        trimmed = regex.sub(' ', word)
        separate = trimmed.split( )
        for item in separate:
            single_word_list.append(item)        
    single_word_documents.append(single_word_list)
print(single_word_documents)

printableList2 = []
for sentence2 in single_word_documents:
    sentence2AsString = ''
    for idx2, aWord2 in enumerate(sentence2):        
        if idx2 == len(sentence2) - 1:
            sentence2AsString = sentence2AsString + aWord2
        else:
            str2 = aWord2 + ', '
            sentence2AsString = sentence2AsString + str2
    printableList2.append(sentence2AsString)

data_sentences2 = pd.DataFrame(printableList2)
data_sentences2.columns = ['Single words   - string form and comma separated for display']
display(data_sentences2.head())      
    
    
    
# remove all tokens that are not alphabetic #############
print("\n" 'Tokenized with alphabetic chars only' "\n")
alpha_documents = []
for single_word_sentence in single_word_documents:
    cleaned_list = []
    for single_word in single_word_sentence:
        regex = re.compile('[^a-zA-Z]')
        #First parameter is the replacement, second parameter is your input string
        nonAlphaRemoved = regex.sub('', single_word)
        # add string to list only if it has content
        if nonAlphaRemoved:
            cleaned_list.append(nonAlphaRemoved)
    alpha_documents.append(cleaned_list)
print(alpha_documents)

printableList3 = []
for sentence3 in alpha_documents:
    sentence3AsString = ''
    for idx3, aWord3 in enumerate(sentence3):        
        if idx3 == len(sentence3) - 1:
            sentence3AsString = sentence3AsString + aWord3
        else:
            str3 = aWord3 + ', '
            sentence3AsString = sentence3AsString + str3
    printableList3.append(sentence3AsString)

data_sentences3 = pd.DataFrame(printableList3)
data_sentences3.columns = ['Tokenized with alphabetic chars only   - string form and comma separated for display']
display(data_sentences3.head())     


# filter out stopwords ########
print("\n" 'English stopwords filtered tokens' "\n")
stop_filtered_tokens = []
english_stop_words = set(stopwords.words('english'))

for fword in alpha_documents:
    fword_list = []
    for sword in fword:
        #fword_list = [sword for sword in alpha_documents if not sword in english_stop_words]
        if not sword in english_stop_words:
            fword_list.append(sword)
    stop_filtered_tokens.append(fword_list)
print(stop_filtered_tokens)  


printableList4 = []
for sentence4 in stop_filtered_tokens:
    sentence4AsString = ''
    for idx4, aWord4 in enumerate(sentence4):        
        if idx4 == len(sentence4) - 1:
            sentence4AsString = sentence4AsString + aWord4
        else:
            str4 = aWord4 + ', '
            sentence4AsString = sentence4AsString + str4
    printableList4.append(sentence4AsString)

data_sentences4 = pd.DataFrame(printableList4)
data_sentences4.columns = ['English stopwords filtered tokens   - comma separated for display']
display(data_sentences4.head())     


# tokenization by PorterStemmer ############
print("\n" 'Word Stemming by PorterStemmer' "\n")
porter_documents = []
for ps_word_list in stop_filtered_tokens:
    PS = PorterStemmer()
    porter_list = []
    for ps_word in ps_word_list:
        porter_list.append(PS.stem(ps_word))
    porter_documents.append(porter_list)
print(porter_documents)

printableList5 = []
for sentence5 in porter_documents:
    sentence5AsString = ''
    for idx5, aWord5 in enumerate(sentence5):        
        if idx5 == len(sentence5) - 1:
            sentence5AsString = sentence5AsString + aWord5
        else:
            str5 = aWord5 + ', '
            sentence5AsString = sentence5AsString + str5
    printableList5.append(sentence5AsString)

data_sentences5 = pd.DataFrame(printableList5)
data_sentences5.columns = ['Word Stemming by PorterStemmer   - comma separated for display']
display(data_sentences5.head())     


Single words

[['keywords', 'and', 'their', 'placing', 'versus', 'highly', 'defined', 'featured', 'snippets', 'from', 'google', 'are', 'more', 'important', 'for', 'getting', 'traffic', 'on', 'webpage.'], ['how', 'to', 'build', 'up', 'super', 'snippets', 'for', 'new', 'web', 'content?'], ['webpage', 'will', 'earn', 'new', 'featured', 'snippets', 'from', 'google', 'if', 'it', 'includes', 'the', 'same', 'keywords', 'like', 'competitors', 'high', 'ranked', 'featured', 'snippets.'], ['most', 'of', 'the', 'regions', 'in', 'finland', 'will', 'enjoy', 'white', 'snow', 'and', 'temperatures', 'below', 'freezing', 'point', 'on', 'christmas', 'eve.']]


Unnamed: 0,Single words - string form and comma separated for display
0,"keywords, and, their, placing, versus, highly, defined, featured, snippets, from, google, are, more, important, for, getting, traffic, on, webpage."
1,"how, to, build, up, super, snippets, for, new, web, content?"
2,"webpage, will, earn, new, featured, snippets, from, google, if, it, includes, the, same, keywords, like, competitors, high, ranked, featured, snippets."
3,"most, of, the, regions, in, finland, will, enjoy, white, snow, and, temperatures, below, freezing, point, on, christmas, eve."



Tokenized with alphabetic chars only

[['keywords', 'and', 'their', 'placing', 'versus', 'highly', 'defined', 'featured', 'snippets', 'from', 'google', 'are', 'more', 'important', 'for', 'getting', 'traffic', 'on', 'webpage'], ['how', 'to', 'build', 'up', 'super', 'snippets', 'for', 'new', 'web', 'content'], ['webpage', 'will', 'earn', 'new', 'featured', 'snippets', 'from', 'google', 'if', 'it', 'includes', 'the', 'same', 'keywords', 'like', 'competitors', 'high', 'ranked', 'featured', 'snippets'], ['most', 'of', 'the', 'regions', 'in', 'finland', 'will', 'enjoy', 'white', 'snow', 'and', 'temperatures', 'below', 'freezing', 'point', 'on', 'christmas', 'eve']]


Unnamed: 0,Tokenized with alphabetic chars only - string form and comma separated for display
0,"keywords, and, their, placing, versus, highly, defined, featured, snippets, from, google, are, more, important, for, getting, traffic, on, webpage"
1,"how, to, build, up, super, snippets, for, new, web, content"
2,"webpage, will, earn, new, featured, snippets, from, google, if, it, includes, the, same, keywords, like, competitors, high, ranked, featured, snippets"
3,"most, of, the, regions, in, finland, will, enjoy, white, snow, and, temperatures, below, freezing, point, on, christmas, eve"



English stopwords filtered tokens

[['keywords', 'placing', 'versus', 'highly', 'defined', 'featured', 'snippets', 'google', 'important', 'getting', 'traffic', 'webpage'], ['build', 'super', 'snippets', 'new', 'web', 'content'], ['webpage', 'earn', 'new', 'featured', 'snippets', 'google', 'includes', 'keywords', 'like', 'competitors', 'high', 'ranked', 'featured', 'snippets'], ['regions', 'finland', 'enjoy', 'white', 'snow', 'temperatures', 'freezing', 'point', 'christmas', 'eve']]


Unnamed: 0,English stopwords filtered tokens - comma separated for display
0,"keywords, placing, versus, highly, defined, featured, snippets, google, important, getting, traffic, webpage"
1,"build, super, snippets, new, web, content"
2,"webpage, earn, new, featured, snippets, google, includes, keywords, like, competitors, high, ranked, featured, snippets"
3,"regions, finland, enjoy, white, snow, temperatures, freezing, point, christmas, eve"



Word Stemming by PorterStemmer

[['keyword', 'place', 'versu', 'highli', 'defin', 'featur', 'snippet', 'googl', 'import', 'get', 'traffic', 'webpag'], ['build', 'super', 'snippet', 'new', 'web', 'content'], ['webpag', 'earn', 'new', 'featur', 'snippet', 'googl', 'includ', 'keyword', 'like', 'competitor', 'high', 'rank', 'featur', 'snippet'], ['region', 'finland', 'enjoy', 'white', 'snow', 'temperatur', 'freez', 'point', 'christma', 'eve']]


Unnamed: 0,Word Stemming by PorterStemmer - comma separated for display
0,"keyword, place, versu, highli, defin, featur, snippet, googl, import, get, traffic, webpag"
1,"build, super, snippet, new, web, content"
2,"webpag, earn, new, featur, snippet, googl, includ, keyword, like, competitor, high, rank, featur, snippet"
3,"region, finland, enjoy, white, snow, temperatur, freez, point, christma, eve"


In [42]:
# define jaccard similarity for python ################
def jaccard_similarity(query, jdoc):
    intersection = set(query).intersection(set(jdoc))
    union = set(query).union(set(jdoc))
    return len(intersection)/len(union)

# calculate jaccard similarity
print('Jaccard similarity')
result = jaccard_similarity(porter_documents[0], porter_documents[1])
j_string = "{:.4f}".format(result)
print(j_string)


#data_table1 = pd.DataFrame(tableJaccSim1)
# data_table1.columns = ['n', 'sentence', 'n', 'sentence', 'JaccardSim'  ]
#display(data_table1.head())

def listToString (sourceList):
    subListAsString = ''    
    for listIndex, listWord in enumerate(sourceList):        
        if listIndex == len(sourceList) - 1:
            subListAsString = subListAsString + listWord
        else:
            strWithComma = listWord + ', '
            subListAsString = subListAsString + strWithComma        
    return subListAsString

# compare the first porter document to the rest in the porter docs list
def printJaccardSimilarities (porterDocs):    
    printableJaccardList = []    
    for porterIndex, porter in enumerate(porterDocs):
        if porterIndex > 0:            
            jresult = jaccard_similarity(porterDocs[0], porter)    
            j_string = "{:.4f}".format(jresult)
            porterParamStr1 = listToString(porterDocs[0])
            porterParamStr2 = listToString(porter)
            data = [0, porterParamStr1, porterIndex, porter, j_string]
            printableJaccardList.append(data)
    df = pd.DataFrame(printableJaccardList,columns=['First Index','First Sentence','Second Index','Second Sentence','Jaccard similarity'])
    display(df.head()) 

printJaccardSimilarities(porter_documents)

    


Jaccard similarity
0.0588


Unnamed: 0,First Index,First Sentence,Second Index,Second Sentence,Jaccard similarity
0,0,"keyword, place, versu, highli, defin, featur, snippet, googl, import, get, traffic, webpag",1,"[build, super, snippet, new, web, content]",0.0588
1,0,"keyword, place, versu, highli, defin, featur, snippet, googl, import, get, traffic, webpag",2,"[webpag, earn, new, featur, snippet, googl, includ, keyword, like, competitor, high, rank, featur, snippet]",0.2632
2,0,"keyword, place, versu, highli, defin, featur, snippet, googl, import, get, traffic, webpag",3,"[region, finland, enjoy, white, snow, temperatur, freez, point, christma, eve]",0.0


2.	Use Google search API and msn search API to generate the first ten snippets associated to each sentence. 

In [78]:
from googleapiclient.discovery import build
import pprint
import json

my_api_key = "AIzaSyBN0zRiSDC_IdQrYWQaTcbCheyKLRopqOA"
my_cse_id = "009592823161165690347:wrkvjhigeuw"

# searchTerms = 'build:featur:snippet:paragraph:question:new:content'
searchTerms = 'Keywords and their placing versus highly defined featured snippets from Google are more important for getting traffic on webpage.' 

def google_search(search_term, api_key, cse_id, **kwargs):    
    service = build("customsearch", "v1", developerKey=api_key)
    res = service.cse().list(q=search_term, cx=cse_id, **kwargs).execute()
    resultItems = res['items']
    gQueries = res.get('queries', [])
    gTotalResultCount = 0
    gRequestObj = gQueries.get('request', [])
    for gReqItems in gRequestObj:
        gJsonItems = json.dumps(gReqItems)        
        gJsonDict = json.loads(gJsonItems)
        for key, value in gJsonDict.items():
            if key == 'totalResults':
                gTotalResultCount = value
                         
    resultDict = {'total':gTotalResultCount,'items':resultItems}    
    return resultDict


for google_doc in all_documents: 
    resultsDict = google_search(
        google_doc, my_api_key, my_cse_id, num=10)
    
    googleSearchSnippetlist = []
    for result in resultsDict['items']:
        jsonResult = json.dumps(result)
        jsonDict = json.loads(jsonResult)    
        for key, value in jsonDict.items():
            if key == 'snippet':
                googleSearchSnippetlist.append(value)

    gSnippetDf = pd.DataFrame(googleSearchSnippetlist, columns=['Google search snippets for search terms : ' + google_doc + ' : Total Result count : ' + resultsDict['total']])
    display(gSnippetDf.head()) 
    

Unnamed: 0,Google search snippets for search terms : Keywords and their placing versus highly defined featured snippets from Google are more important for getting traffic on webpage. : Total Result count : 54600
0,"Aug 23, 2017 ... You have chances to get featured for the terms your pages are already ... health \nor finance, you have the highest probability of getting featured ... When \nperforming keyword research with featured snippets in mind, note that: .... queries \ndetermine the subheadings of the article and thus define its structure."
1,"May 29, 2017 ... Google is actively rolling out featured snippets for a wide variety of search \nqueries because they ... I mean, let's say that you rank #7 for some keyword. ... \nBut if there's a featured snippet opportunity, you can get there almost instantly \nwith just a ... They occupy more real estate, and they clearly catch the eye."
2,"Aug 15, 2018 ... The Google featured snippet was first designed when the Google ... a highlighted \nbox that sometimes appears at the very top of a Google ... types in a question or \nkeywords that Google recognizes as a question. ... featured-snippets-traffic ... \nglance would mean millions of more eyes on your website's name."
3,"Feb 14, 2018 ... A website's traffic reflects how well a business is doing online. It is also an ... To \nbe successful at this, you need to understand how Google defines better content. \n... (PS: doing this well will increase your ability to get featured snippets too) ... \nThis is the most important, most effective SEO tactic or tips there is."
4,"Obtaining traffic from Google means that you've got to keep up with all of \nGoogle's ... Featured snippets are the first thing that most people see when \nsearching for a word ... or finance, your chances of getting featured are higher \nthan if you publish .... Plus, if you already rank high for a particular keyword, it's \nworth finding out if ..."


Unnamed: 0,Google search snippets for search terms : How to build up super snippets for new web content? : Total Result count : 1410000
0,"Jan 2, 2018 ... Here's how to configure WP Super Cache to serve up your site's content from the \ncache ... WordPress can do quite a bit of work creating a page, from making a .... \nWhen I'm setting up a site for a new client that isn't familiar with ..."
1,"Nov 26, 2018 ... The meta description is a snippet of up to about 155 characters – a tag in HTML – \nwhich summarizes a page's content. Search engines show the ... “Hello, we have \nsuch and such new product, and you want it. Find out more!"
2,"Before publishing your new piece of content, reach out to an influencer or \ninfluencers in your industry. ... Create 20+ Snippets for Mega Sharing on Social \nMedia ... How to Build Credibility as a Start-Up SEO Agency – http://t.co/\nHoiqDf0S4b ... Or, if they have a website, contact them via email or via a contact \nform and ask them ..."
3,"Feb 23, 2018 ... Artificial intelligence (AI) is getting much better at identifying the content of \nimages and providing labels. So-called “generative” algorithms go ..."
4,"Aug 16, 2016 ... ST3 makes it super easy to create a snippet. ... ST3 will typically place you in the \nproper directory when you save your new snippet, ... we no longer have to scroll \nup the page to see where the selector is ... Hit tab a second time to put your \ncursor into a beautifuly tabbed spot within the bracket --> <content><!"


Unnamed: 0,Google search snippets for search terms : Webpage will earn new featured snippets from Google if it includes the same keywords like competitors high ranked featured snippets. : Total Result count : 52000
0,"On your site, you've done keyword research, written tons of blog posts, and ... But \nthose high rankings don't mean as much if another competitor owns a ... Here's \nhow you can steal your competitor's featured snippets to earn even better \nrankings. ... It contains a summary of the content on the featured web page \nrelated to ..."
1,"Aug 23, 2017 ... Featured snippets are selected search results that are featured on top of Google's \n... It helps if you use a keyword research tool that shows immediately whether a \nquery ... those for which you or your competitors are already ranking high. But ... \nas to how long each answer should be in order to get featured:."
2,"Apr 19, 2018 ... Do you want more traffic from Google, without creating new content ... If you're \nwondering what featured snippets are, here's an example: ... featured snippet (\nbut a competitor does): only 19.6% of all clicks will go to your page. ..... the same \nSERP filters (as I mentioned earlier) directly in Keywords Explorer?"
3,"Sep 4, 2018 ... Most companies with a presence on Google will have an interest in appearing at \nthe ... some featured snippets, this blog post will give you some tips and \nprocesses to get you started. ... When a page owns a featured snippet, it will sit at \nthe top of the Search Engine Results Page (SERP), and look like this:."
4,"May 29, 2017 ... Google is actively rolling out featured snippets for a wide variety of ... It looks like \nwe're now in the middle of the featured‐snippet‐geddon that ... one of the top‐\nranking pages for that search query and includes that ... Performing a study on 14 \nmillion keywords would be rather ..... Get notified of new articles."


Unnamed: 0,Google search snippets for search terms : Most of the regions in Finland will enjoy white snow and temperatures below freezing point on Christmas Eve. : Total Result count : 72600
0,"In Finland the weather can vary greatly during one day, first sunshine, then rain. \n... A layer of clean white snow increases the brightness by as much as 80%, .... for \nthe start of the permanent snow cover in the Helsinki region is Christmas Day, ... \ntemperature from 15 degrees below zero in the morning to above zero later in ..."
1,"When to go to Rovaniemi? ... for you to travel to the Official Hometown of Santa \nClaus in Lapland, Finland! ... Winter - Christmas, the Polar Night and snowy \nspring from December to March ... At the end of May, nights are white. ... \nTemperatures start dropping below zero in October, and first snow usually .... \nRead more here."
2,"Weather averages, seasons, and tips on the best time to visit. ... In winter, a light \nsnow often falls, which may not even be counted in the statistics (if it ... Here, the \ntemperature remains almost constantly around or below freezing (0 °C or 32 ... In \nLapland, the northernmost part of Finland, the climate is cold for most of the year,\n ..."
3,"As such, the climate of much of the Arctic is moderated by the ocean water, which \ncan never have a temperature below −2 °C (28 °F). In winter, this relatively ..."
4,"But that's no reason to stay indoors - just gear up right and make the most of the \nfresh white season. ... We can't argue below –30 isn't freezing, but enjoying the \nFinnish winter is all about dressing right – and dressing right is all ... Well, the fact \nis there's no real winter without snow and no snow without sub-zero temperatures\n."


In [6]:
import requests

subscription_key = "ec8557b875a046eb8f036276a87cd9b0"
assert subscription_key

search_url = "https://api.cognitive.microsoft.com/bing/v7.0/search"
search_term = "build and featur and snippet and paragraph and question and new and content"

headers = {"Ocp-Apim-Subscription-Key" : subscription_key}
params  = {"q": search_term, "textDecorations":True, "textFormat":"HTML"}
response = requests.get(search_url, headers=headers, params=params)
response.raise_for_status()
search_results = response.json()

bingSearchSnippetlist = []
bingjsonResult = json.dumps(search_results)
bingjsonDict = json.loads(bingjsonResult)

for bingKey, bingValue in bingjsonDict.items():        
        if bingKey == "webPages":
            for webKey, webValueItems in bingValue.items():
                if webKey == "value":
                    for valueItems in webValueItems:
                        for valueKey, valueItem in valueItems.items():
                            if valueKey == "snippet":
                                bingSearchSnippetlist.append(valueItem)
                                
bingdf = pd.DataFrame(bingSearchSnippetlist, columns=['Bing search snippets for search terms : ' + search_term])
display(bingdf.head()) 

HTTPError: 401 Client Error: Access Denied for url: https://api.cognitive.microsoft.com/bing/v7.0/search?q=build+and+featur+and+snippet+and+paragraph+and+question+and+new+and+content&textDecorations=True&textFormat=HTML

3.	Design and implement a similarity measure that computes the number of overlapping words between the total terms of the ten snippets associated to the first sentence S1 and the second sentence S2. 

    Hint: use loop for S1 snippets and S2 snippets similarity measurement. The measurement should be conducted between each snippets for each sentence S1 and S2.

In [None]:
#your code here.

 5. Compare the result with sentence semantic similarity that you have seen in Lab2.
    
    Hint: in lab2, WordNet was used to calculate sentence semantic similarity.

In [1]:
from nltk import word_tokenize, pos_tag
from nltk.corpus import wordnet as wn

#example
def penn_to_wn(tag):
    """ Convert between a Penn Treebank tag to a simplified Wordnet tag """
    if tag.startswith('N'):
        return 'n'
 
    if tag.startswith('V'):
        return 'v'
 
    if tag.startswith('J'):
        return 'a'
 
    if tag.startswith('R'):
        return 'r'
 
    return None
 
def tagged_to_synset(word, tag):
    wn_tag = penn_to_wn(tag)
    if wn_tag is None:
        return None
 
    try:
        return wn.synsets(word, wn_tag)[0]
    except:
        return None
 
def sentence_similarity(sentence1, sentence2):
    """ compute the sentence similarity using Wordnet """
    # Tokenize and tag
    sentence1 = pos_tag(word_tokenize(sentence1))
    sentence2 = pos_tag(word_tokenize(sentence2))
 
    # Get the synsets for the tagged words
    synsets1 = [tagged_to_synset(*tagged_word) for tagged_word in sentence1]
    synsets2 = [tagged_to_synset(*tagged_word) for tagged_word in sentence2]
 
    # Filter out the Nones
    synsets1 = [ss for ss in synsets1 if ss]
    synsets2 = [ss for ss in synsets2 if ss]
 
    score, count = 0.0, 0
 
    # For each word in the first sentence
    best_score = [0.0]
    for ss1 in synsets1:
        for ss2 in synsets2:
            best1_score=ss1.path_similarity(ss2)
        if best1_score is not None:
            best_score.append(best1_score)
        max1=max(best_score)
        if best_score is not None:
            score += max1
        if max1 is not 0.0:
            count += 1
        best_score=[0.0]
    print(score/count)      
   
    # Average the values
    score /= count
    return score
 
sentences = [
    "Dogs are awesome.",
    "Some gorgeous creatures are felines.",
    "Dolphins are swimming mammals.",
    "Cats are beautiful animals.",
]
 
focus_sentence = "Cats are beautiful animals."
 
for sentence in sentences:
    print ("Similarity(\"%s\", \"%s\") = %s" % (focus_sentence, sentence, sentence_similarity(focus_sentence, sentence)))
    print ("Similarity(\"%s\", \"%s\") = %s" % (sentence, focus_sentence, sentence_similarity(sentence, focus_sentence)))
    print 

0.3333333333333333
Similarity("Cats are beautiful animals.", "Dogs are awesome.") = 0.3333333333333333
0.2222222222222222
Similarity("Dogs are awesome.", "Cats are beautiful animals.") = 0.2222222222222222
0.23650793650793647
Similarity("Cats are beautiful animals.", "Some gorgeous creatures are felines.") = 0.23650793650793647
0.41798941798941797
Similarity("Some gorgeous creatures are felines.", "Cats are beautiful animals.") = 0.41798941798941797
0.17777777777777778
Similarity("Cats are beautiful animals.", "Dolphins are swimming mammals.") = 0.17777777777777778
0.14027777777777778
Similarity("Dolphins are swimming mammals.", "Cats are beautiful animals.") = 0.14027777777777778
0.41203703703703703
Similarity("Cats are beautiful animals.", "Cats are beautiful animals.") = 0.41203703703703703
0.41203703703703703
Similarity("Cats are beautiful animals.", "Cats are beautiful animals.") = 0.41203703703703703


6. Refine your code in order to expand the terms of each snippets to include all the hyponyms and hypernyms of the associated words by quering the WordNet database, and repeat the overlapping process.

In [None]:
#

7. Wikipedia based similarity.
   Similarly, use Wikipedia dump files in order to design a program that search the Wikipedia documents for each Sentence. The similarity between the sentences is therefore measured as the number of common Wikipedia documents outputted by the queries (S1 and S2) over the total number of documents outputted by the two queries. Repeat the process of calculating the semantic similarity for your set of chosen academic examples.

In [None]:
#your code here.

8. Use a publicly available database of your choice in order to test the usefulness of this similarity measure (Snippets and Wikipedia based similarity) and compare the results with some state of art measures mentioned in the literature employing your chosen publicly database.

In [49]:
# LET'S TRY YOUTUBE SNIPPETS

# Please check in this code
# python search.py / --q = surfing / --max-results = 10 / totalResults

import argparse
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
import json

# NOTE: To use the sample, you must provide a developer key obtained in the Google APIs Console. 
# Search for DEVELOPER_KEY in this code to find the correct place to provide that key..
# Please ensure that you have enabled the YouTube Data API for your project from your Google account.

DEVELOPER_KEY = 'AIzaSyBN0zRiSDC_IdQrYWQaTcbCheyKLRopqOA'
YOUTUBE_API_SERVICE_NAME = 'youtube'
YOUTUBE_API_VERSION = 'v3'

# This code collects only ids and snippets of videos 
# (as chennels and playlists didn't include any suitable information for us)

def youtube_search(searchItem, maxResults):
    youtube = build(YOUTUBE_API_SERVICE_NAME, YOUTUBE_API_VERSION, developerKey=DEVELOPER_KEY)

    # Call the search.list method to retrieve results matching the specified query term (at the end of the code).
    search_response = youtube.search().list(q=searchItem, part='id,snippet', maxResults=maxResults).execute()

    videos = []     #  channels = []  #  playlists = []

    # Add each result to the appropriate list, and then display the lists of matching videos. (channels, and playlists)
    
#print(str(search_response.get('pageInfo')), '\n\n')
#print(search_response)
    
    info = search_response.get('pageInfo', [])
    total = info.get('totalResults')
    print(total)
    for search_result in search_response.get("items", []):
        if search_result['id']['kind'] == 'youtube#video':
              videos.append('%s' % (search_result['snippet']['title']))

        # Code below saved for later use
        #                              search_result['id']['videoId']))
        #      elif search_result['id']['kind'] == 'youtube#channel':
        #            channels.append('%s (%s)' % (search_result['snippet']['title'],
        #                                 search_result['id']['channelId']))
        #      elif search_result['id']['kind'] == 'youtube#playlist':
        #            playlists.append('%s (%s)' % (search_result['snippet']['title'],
        #                                  search_result['id']['playlistId']))

    print ('Videos:\n', '\n'.join(videos), '\n')
         #   print ('Channels:\n', '\n'.join(channels), '\n')
         #   print ('Playlists:\n', '\n'.join(playlists), '\n')


for youtube_doc in all_documents:
    try:
                        # youtube_search(args)
                        # searchItem = "How to build up super snippets for new web content?"
        maxResults = 10
        youtube_search(youtube_doc, maxResults)
    except HttpError as e:
        print ('An HTTP error %d occurred:\n%s' % (e.resp.status, e.content))

    

263
Videos:
 How to Optimize for Google's Featured Snippet Box
How to Find Pages That Send Your Competitors Organic Search Traffic
How to Do Keyword Research in 2018: Go Beyond Search Volume
13 SEO Tips That ACTUALLY Work in 2018 and Beyond
Google Analytics Individual Qualification Exam Answers 2018📊Live Exam Pass📊100% correct✅
How does Google Search work?
How to Advertise on Google For Beginners | Complete Google AdWords Tutorial for 2018!
How to Get Website Traffic With Evergreen Content and Social Media Marketing
How to Dominate Half of the Google Search Results Page - Google Featured Snippets
SEO 2018 - What you need to know about Content, Schema, Semantically Related Keywords etc  [28:07] 

22507
Videos:
 Stand out in search results with Rich Cards
How to Optimize Google Snippets - MUST DO SEO
Super Junior D&E, Can We See a Snippet of the Dance? [Yu Huiyeol’s Sketchbook Ep 407]
Bootstrap 3 Tutorial 4 - Using Bootsnipp to Copy & Paste HTML5 Web Elements
How-to Add A Custom Function

9. Design a simple GUI interface that allows you to demonstrate your findings

In [90]:

# Be sure you've unzipped the wordnet corpus at nltk_data/corpora/wordnet. 
# This will allow WordNetCorpusReader to access it.
# nltk_data/corpora/wordnet

from nltk.corpus import wordnet
# we use stop_filtered_tokes -list of word lists that is defined above
# as seed for wordnet
print('---Source word lists---')
print(stop_filtered_tokens)
listOfSentences = []


for sft in stop_filtered_tokens:
    listOfTokenHyponymDictionaries = []
    for t in sft:
        synSetList = []
        #print('--- word to Process --- ')
        #print(t)
        mySynSets = wordnet.synsets(t)
        if mySynSets:
            hyponymsForTokensDict = {}
            for mySynset in mySynSets:
                #print (mySynset)
                if(mySynset.hypernyms()):
                    #for aHyponym in mySynset.hypernyms()[0].hyponyms():
                    #    print(aHyponym.name())
                    hyponymsForTokensDict = {'word':t,'hyponym':mySynset.hypernyms()[0].hyponyms()}                    
            if any(hyponymsForTokensDict):        
                listOfTokenHyponymDictionaries.append(hyponymsForTokensDict)        
    listOfSentences.append(listOfTokenHyponymDictionaries)
print(listOfSentences)
            #print(mySynset)
            #print('----hypernyms----')
            #print(mySynset.hypernyms())
            #if mySynset 
                
        #print(syn.name())
        #print(syn.definition())
#wordnet.synset('cookbook.n.01')

#print(wordnet.synsets('cooking')[0].examples())        # wordnet.synsets(word)
#syn.name()                                  # 'cookbook.n.01'
#syn.definition()                            #'a book of recipes and cooking directions'
#wordnet.synsets('snippet')[0].examples()

#  ['cooking can be a great art', 'people are needed who have experience in cookery', 'he left the preparation of meals to his wife']

# Synsets are organized in a structure similar to that of an inheritance tree. 
# More abstract terms are known as hypernyms and more specific terms are hyponyms. 
# This tree can be traced all the way up to a root hypernym
#print(syn.hypernyms())
#print(syn.hypernyms()[0].hyponyms())
#print(syn.hypernym_paths())

#“The hypernym_paths() method returns a list of lists, 
# where each list starts at the root hypernym and ends with the original Synset. 
# Most of the time, you'll only get one nested list of Synsets.”



---Source word lists---
[['keywords', 'placing', 'versus', 'highly', 'defined', 'featured', 'snippets', 'google', 'important', 'getting', 'traffic', 'webpage'], ['build', 'super', 'snippets', 'new', 'web', 'content'], ['webpage', 'earn', 'new', 'featured', 'snippets', 'google', 'includes', 'keywords', 'like', 'competitors', 'high', 'ranked', 'featured', 'snippets'], ['regions', 'finland', 'enjoy', 'white', 'snow', 'temperatures', 'freezing', 'point', 'christmas', 'eve']]
[[{'word': 'placing', 'hyponym': [Synset('belt_out.v.01'), Synset('chant.v.01'), Synset('choir.v.01'), Synset('croon.v.01'), Synset('descant.v.01'), Synset('descant_on.v.01'), Synset('harmonize.v.03'), Synset('hum.v.01'), Synset('minstrel.v.01'), Synset('place.v.16'), Synset('psalm.v.01'), Synset('sing.v.01'), Synset('sing_along.v.01'), Synset('solmizate.v.02'), Synset('treble.v.01'), Synset('troll.v.05'), Synset('tweedle.v.01'), Synset('vocalize.v.02'), Synset('warble.v.01'), Synset('yodel.v.01')]}, {'word': 'defined'