<a href="https://colab.research.google.com/github/gitmystuff/INFO4080/blob/main/Week_14-Wrap_Up/WikiScrape_Text_Embedding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# WikiScrape Summarizer



## Wikipedia API

If you intend to do any scraping projects or automated requests, consider alternatives such as Pywikipediabot or the MediaWiki API, which have other superior features.

* wikipedia.search('keywords', results=2)
* wikipedia.suggest('keyword')
* wikipedia.summary('keywords', sentences=2)
* wikipedia.page('keywords')
* wikipedia.page('keywords').content
* wikipedia.page('keywords').references
* wikipedia.page('keywords').title
* wikipedia.page('keywords').url
* wikipedia.page('keywords').categories
* wikipedia.page('keywords').content
* wikipedia.page('keywords').links
* wikipedia.geosearch(33.2075, 97.1526)
* wikipedia.set_lang('hi')
* wikipedia.languages()
* wikipedia.page('keywords').images[0]
* wikipedia.page('keywords').html()

## Beautiful Soup

In [None]:
pip install wikipedia

Collecting wikipedia
  Downloading wikipedia-1.4.0.tar.gz (27 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: wikipedia
  Building wheel for wikipedia (setup.py) ... [?25l[?25hdone
  Created wheel for wikipedia: filename=wikipedia-1.4.0-py3-none-any.whl size=11680 sha256=e5f5d213cd11af474e737d04c1be0b8f2eebe356ad12383dfce8a2ab3bd1b633
  Stored in directory: /root/.cache/pip/wheels/5e/b6/c5/93f3dec388ae76edc830cb42901bb0232504dfc0df02fc50de
Successfully built wikipedia
Installing collected packages: wikipedia
Successfully installed wikipedia-1.4.0


In [None]:
# https://kleiber.me/blog/2017/07/22/tutorial-lda-wikipedia/
import pandas as pd
import random
import wikipedia

# rtitles = wikipedia.random(5)

# Resolve each keyword to the title of its top Wikipedia search hit.
titles = []
keywords = ['ultranationalism', 'religion', 'religious facism', 'state religion', 'deifying rulers']
for key in keywords:
    hits = wikipedia.search(key, results=5)
    if not hits:
        # An empty result list used to raise IndexError on hits[0].
        print(f'No Wikipedia search results for {key!r}; skipping')
        continue
    titles.append(hits[0])

# Build the (title, url) rows. The URL is derived from the title alone, so no
# page is fetched here and no DisambiguationError can occur; the former
# except-branch was unreachable and appended rows that did not match the two
# declared columns.
# To also pull article text, use wikipedia.page(title, auto_suggest=False).content
# and handle wikipedia.exceptions.DisambiguationError around that call.
data = []
for title in titles:
    url_title = title.strip().replace(' ', '_')
    url = f'https://en.wikipedia.org/wiki/{url_title}'
    data.append([title, url])

pages = pd.DataFrame(data, columns=['title', 'url'])
pages.head()

Unnamed: 0,title,url
0,Ultranationalism,https://en.wikipedia.org/wiki/Ultranationalism
1,Religion,https://en.wikipedia.org/wiki/Religion
2,Fascism,https://en.wikipedia.org/wiki/Fascism
3,State religion,https://en.wikipedia.org/wiki/State_religion
4,Apotheosis,https://en.wikipedia.org/wiki/Apotheosis


In [None]:
# wikiscrape
from bs4 import BeautifulSoup
import pandas as pd
import requests

data = []

def make_soup(page):
  """Append every <h2> heading of one Wikipedia page to the module-level `data` list.

  Args:
    page: a row of `pages`; its `title` and `url` fields are read.

  Returns:
    None. One [title, url, heading] entry is appended to `data` per <h2>,
    with the literal '[edit]' marker removed from the heading text.

  Raises:
    requests.HTTPError: if the page request returns an error status.
  """
  response = requests.get(page.url, timeout=30)
  # Fail loudly rather than scraping the headings of an error page.
  response.raise_for_status()
  # Name the parser explicitly so the result does not depend on which optional
  # parsers happen to be installed in the environment.
  soup = BeautifulSoup(response.text, 'html.parser')
  for h2 in soup.find_all('h2'):
    data.append([page.title, page.url, h2.get_text().replace('[edit]', '')])

# make_soup fills `data` as a side effect; the Series returned by apply is not used.
x = pages.apply(make_soup, axis=1)
headings = pd.DataFrame(data=data, columns=['title', 'url', 'heading'])

# Headings to exclude from the table.
drop_list = [
    'Contents', 'See also', 'References', 'External links',
    'Notes', 'Sources', 'Further reading', 'Bibliography',
]
headings = headings.loc[~headings['heading'].isin(drop_list)]
print(headings.shape)
headings.head()

(33, 3)


Unnamed: 0,title,url,heading
1,Ultranationalism,https://en.wikipedia.org/wiki/Ultranationalism,Background concepts and broader context
2,Ultranationalism,https://en.wikipedia.org/wiki/Ultranationalism,Historical movements and analysis
3,Ultranationalism,https://en.wikipedia.org/wiki/Ultranationalism,Ultranationalist political parties
4,Ultranationalism,https://en.wikipedia.org/wiki/Ultranationalism,Ultranationalist organizations
5,Ultranationalism,https://en.wikipedia.org/wiki/Ultranationalism,Ultranationalist terrorism


In [None]:
headings['title'].value_counts()

title
Religion            8
Apotheosis          8
Ultranationalism    6
Fascism             6
State religion      5
Name: count, dtype: int64

In [None]:
import re

# Matches any '<...>' tag. DOTALL lets '.' cross newlines so a tag that wraps
# onto the next line is removed as well.
CLEANR = re.compile('<.*?>', re.DOTALL)
def cleanhtml(raw_html):
  """Return `raw_html` with every '<...>' tag removed; text between tags is kept."""
  return CLEANR.sub('', raw_html)

data = []
_soup_cache = {}  # url -> parsed page, so each article is downloaded once


def _get_soup(url):
  """Download and parse `url`; repeated calls for the same url reuse the parsed page."""
  if url not in _soup_cache:
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    _soup_cache[url] = BeautifulSoup(response.text, 'html.parser')
  return _soup_cache[url]


def _heading_tag(soup, heading):
  """Return the parent of the <span> whose id is `heading` with spaces as underscores, or None."""
  # assumes each heading is rendered as <hN><span id="Heading_text">…</span></hN> — TODO confirm
  # against the current Wikipedia markup
  span = soup.find('span', attrs={'id': heading.replace(' ', '_')})
  return span.parent if span is not None else None


def get_subs(row):
  """Extract the paragraph text under one <h2> heading, split by its <h3> subheadings.

  Args:
    row: a row of `headings`; reads 'heading', 'title' and 'url'.

  Returns:
    None. Rows of [title, url, heading, subheading, text] are appended to the
    module-level `data` list: one per <h3> found in the section, or a single
    row with subheading 'None' when the section has no <h3>. Headings that
    cannot be located in the page are reported and skipped.
  """
  heading1 = row['heading']
  title = row['title']
  url = row['url']
  soup = _get_soup(url)

  target = _heading_tag(soup, heading1)
  if target is None:
    # Previously this raised AttributeError on `.parent`.
    print(f'Heading {heading1!r} not found in {url}; skipping')
    return

  # Everything between this <h2> and the next one.
  section_html = ''
  section_paragraphs = ''
  for sib in target.find_next_siblings():
    if sib.name == 'h2':
      break
    section_html += str(sib)
    if sib.name == 'p':
      section_paragraphs += str(sib)

  subheadings = [
      h3.get_text().replace('[edit]', '')
      for h3 in BeautifulSoup(section_html, 'html.parser').find_all('h3')
  ]
  if not subheadings:
    data.append([title, url, heading1, 'None', cleanhtml(section_paragraphs)])
    return

  for heading2 in subheadings:
    target2 = _heading_tag(soup, heading2)
    if target2 is None:
      print(f'Subheading {heading2!r} not found in {url}; skipping')
      continue
    txt = ''
    for sib in target2.find_next_siblings():
      # Stop at the next subheading OR the next top-level heading; stopping
      # only at <h3> let the last subsection absorb paragraphs from the
      # sections that follow it.
      if sib.name in ('h2', 'h3'):
        break
      if sib.name == 'p':
        txt += sib.text
    data.append([title, url, heading1, heading2, cleanhtml(txt)])

# get_subs pushes its rows onto `data`; the Series returned by apply is not used.
x = headings.apply(get_subs, axis=1)
df = pd.DataFrame(
    data=data,
    columns=['title', 'url', 'heading', 'subheading', 'txt'],
)
print(df.shape)
df.head()

(92, 5)


Unnamed: 0,title,url,heading,subheading,txt
0,Ultranationalism,https://en.wikipedia.org/wiki/Ultranationalism,Background concepts and broader context,,British political theorist Roger Griffin has s...
1,Ultranationalism,https://en.wikipedia.org/wiki/Ultranationalism,Historical movements and analysis,,American historian Walter Skya has written in ...
2,Ultranationalism,https://en.wikipedia.org/wiki/Ultranationalism,Ultranationalist political parties,Currently represented in national governments ...,The following political parties have been char...
3,Ultranationalism,https://en.wikipedia.org/wiki/Ultranationalism,Ultranationalist political parties,Represented parties with former ultranationali...,The following political parties historically h...
4,Ultranationalism,https://en.wikipedia.org/wiki/Ultranationalism,Ultranationalist political parties,Formerly represented in national governments o...,Arising out of strident Sri Lankan Tamil natio...


## LDA (Latent Dirichlet Allocation)

In natural language processing, latent Dirichlet allocation (LDA) is a Bayesian network (and, therefore, a generative statistical model) for modeling automatically extracted topics in textual corpora. The LDA is an example of a Bayesian topic model. In this, observations (e.g., words) are collected into documents, and each word's presence is attributable to one of the document's topics. Each document will contain a small number of topics.

Sources:
 * https://en.wikipedia.org/wiki/Latent_Dirichlet_allocation
 * https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.LatentDirichletAllocation.html

In [None]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer

results = 10     # NOTE(review): not referenced anywhere else in the visible notebook
components = 10  # number of LDA topics to fit
topics = 10      # number of top words kept per topic

# TF-IDF matrix over the scraped section texts (cast to unicode strings first).
vectorizer = TfidfVectorizer(stop_words='english')
vectors = vectorizer.fit_transform(df['txt'].values.astype('U'))

# Fixed seed: without it the topic numbering and top words change on every
# run, so stored topic_idx values stop matching the printed topics.
model = LatentDirichletAllocation(n_components=components, random_state=42)
model.fit(vectors)

# Look the vocabulary up once instead of once per printed word.
feature_names = vectorizer.get_feature_names_out()

topics_dictionary = {}
for index, topic in enumerate(model.components_):
    # argsort is ascending, so the last `topics` positions are the heaviest words.
    top_words = [feature_names[i] for i in topic.argsort()[-topics:]]
    print(f'Topic {index} top words: {top_words}')
    topics_dictionary[index] = top_words



Topic 0 top words: ['science', 'religions', 'religiō', 'violence', 'factions', 'religion', 'following', 'ultranationalist', 'parties', 'political']
Topic 1 top words: ['germany', 'war', 'islam', 'religion', 'italy', 'state', 'mussolini', 'italian', 'fascist', 'fascism']
Topic 2 top words: ['rights', 'legal', 'middle', 'scholars', 'focused', 'schools', 'understanding', 'field', 'comparative', 'law']
Topic 3 top words: ['transplanting', 'grounded', 'kill', 'students', 'ce', 'glorify', 'ilkhanate', 'soil', 'mongol', 'culture']
Topic 4 top words: ['poem', 'state', 'practice', 'wrote', 'health', 'world', 'criticism', 'religions', 'religious', 'religion']
Topic 5 top words: ['religions', 'mazda', 'includes', 'state', 'good', 'abolished', 'disappeared', 'fell', 'conquered', 'states']
Topic 6 top words: ['countries', 'publication', 'roman', 'morality', 'superstition', 'confucianism', 'religious', 'ultranationalism', 'religion', 'mythology']
Topic 7 top words: ['avatar', 'west', 'anthropologist

In [None]:
def get_topics(row):
  """Return the top words of the row's topic (looked up by `row.topic_idx`) as one comma-separated string."""
  top_words = topics_dictionary[row.topic_idx]
  return ', '.join(top_words)

# Topic weights for every vectorised section text.
topic_results = model.transform(vectors)
# Each row's topic is the column holding its largest weight.
df['topic_idx'] = topic_results.argmax(1)

df['topics'] = df.apply(get_topics, axis=1)
df.head()

Unnamed: 0,title,url,heading,subheading,txt,topic_idx,topics
0,Ultranationalism,https://en.wikipedia.org/wiki/Ultranationalism,Background concepts and broader context,,British political theorist Roger Griffin has s...,5,"religions, mazda, includes, state, good, aboli..."
1,Ultranationalism,https://en.wikipedia.org/wiki/Ultranationalism,Historical movements and analysis,,American historian Walter Skya has written in ...,6,"countries, publication, roman, morality, super..."
2,Ultranationalism,https://en.wikipedia.org/wiki/Ultranationalism,Ultranationalist political parties,Currently represented in national governments ...,The following political parties have been char...,0,"science, religions, religiō, violence, faction..."
3,Ultranationalism,https://en.wikipedia.org/wiki/Ultranationalism,Ultranationalist political parties,Represented parties with former ultranationali...,The following political parties historically h...,0,"science, religions, religiō, violence, faction..."
4,Ultranationalism,https://en.wikipedia.org/wiki/Ultranationalism,Ultranationalist political parties,Formerly represented in national governments o...,Arising out of strident Sri Lankan Tamil natio...,6,"countries, publication, roman, morality, super..."


## SpaCy

* https://spacy.io/
* https://medium.com/analytics-vidhya/text-summarization-using-spacy-ca4867c6b744

In [None]:
# Download the 'en_core_web_sm' spaCy model. The download output advises
# restarting the runtime afterwards so the new package can be loaded.
import spacy.cli

spacy.cli.download('en_core_web_sm')

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


## Language Model and Pipelines

en_core_web_sm

* https://www.kdnuggets.com/2021/03/natural-language-processing-pipelines-explained.html
* https://spacy.io/usage/spacy-101
* https://en.wikipedia.org/wiki/Language_model
* https://builtin.com/data-science/beginners-guide-language-models

In [None]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
from collections import Counter
from heapq import nlargest

nlp = spacy.load('en_core_web_sm')

In [None]:
# get example text
import textwrap

textwrap.fill(df.iloc[0]['txt'])

'British political theorist Roger Griffin has stated that\nultranationalism is essentially founded on xenophobia in a way that\nfinds supposed legitimacy "through deeply mythicized narratives of\npast cultural or political periods of historical greatness or of old\nscores to settle against alleged enemies". It can also draw on\n"vulgarized forms" of different aspects of the natural sciences such\nas anthropology and genetics, eugenics specifically playing a role, in\norder "to rationalize ideas of national superiority and destiny, of\ndegeneracy and subhumanness" in Griffin\'s opinion. Ultranationalists\nview the modern nation-state as, according to Griffin, a living\norganism directly akin to a physical person such that it can decay,\ngrow, die, and additionally experience rebirth. He has highlighted\nNazi Germany as a regime which was founded on ultranationalism.[3]\nUltranationalist activism can adopt varying attitudes towards\nhistorical traditions within the populace. For instance

In [None]:
import textwrap
import re

# Score every sentence of the first scraped section by the summed, normalised
# frequency of the keywords it contains.

# Fresh list for [strength, sentence] rows; previously these were appended
# onto whatever `data` an earlier cell had left behind.
data = []

# Remove bracketed markers such as "[3]" before parsing.
# (To score the whole corpus instead, join every df.txt entry first.)
summary_text = re.sub(r"\[.*?\]", "", df.iloc[0]['txt'])
doc = nlp(summary_text)

keyword = []
stopwords = set(STOP_WORDS)
pos_tag = ['PROPN', 'ADJ', 'NOUN', 'VERB']
for token in doc:
    if token.text in stopwords or token.text in punctuation:
        continue
    if token.pos_ in pos_tag:
        keyword.append(token.text)

# Normalise counts by the most frequent keyword; fall back to 1 when the text
# yields no keywords so the lookup below cannot raise IndexError.
freq_word = Counter(keyword)
max_freq = freq_word.most_common(1)[0][1] if freq_word else 1
for word in freq_word.keys():
    freq_word[word] = (freq_word[word]/max_freq)

sent_strength = {}
for sent in doc.sents:
    for word in sent:
        if word.text in freq_word:
            # Only sentences containing at least one keyword get an entry, so
            # len(sent_strength) keeps counting scored sentences only.
            sent_strength[sent] = sent_strength.get(sent, 0) + freq_word[word.text]

    # A sentence without keywords has no entry; report it as 0.0 instead of
    # raising KeyError (the old bare `except: pass` hid the append failure but
    # the print right after it still crashed).
    strength = sent_strength.get(sent, 0.0)
    data.append([strength, str(sent)])
    print(strength)
    print(textwrap.fill(str(sent)))
    print()

# df2 = pd.DataFrame(data, columns=['strength', 'txt'])
# df2.sort_values(by=['strength'], ascending=False).head()

12.0
British political theorist Roger Griffin has stated that
ultranationalism is essentially founded on xenophobia in a way that
finds supposed legitimacy "through deeply mythicized narratives of
past cultural or political periods of historical greatness or of old
scores to settle against alleged enemies".

8.999999999999998
It can also draw on "vulgarized forms" of different aspects of the
natural sciences such as anthropology and genetics, eugenics
specifically playing a role, in order "to rationalize ideas of
national superiority and destiny, of degeneracy and subhumanness" in
Griffin's opinion.

6.999999999999997
Ultranationalists view the modern nation-state as, according to
Griffin, a living organism directly akin to a physical person such
that it can decay, grow, die, and additionally experience rebirth.

2.6666666666666665
He has highlighted Nazi Germany as a regime which was founded on
ultranationalism.

3.0
Ultranationalist activism can adopt varying attitudes towards
histor

In [None]:
len(sent_strength)

11

In [None]:
# Keep the top-scoring half of the scored sentences (highest score first).
summary = nlargest(len(sent_strength) // 2, sent_strength, key=sent_strength.get)
summary = ' '.join(w.text for w in summary)
# Raw string: "\[" in a normal string literal is an invalid escape sequence.
summary = re.sub(r"\[.*?\]", "", summary)
print(textwrap.fill(summary))

According to American scholar Janusz Bugajski, summing up the doctrine
in practical terms, "in its most extreme or developed forms, ultra-
nationalism resembles fascism, marked by a xenophobic disdain of other
nations, support for authoritarian political arrangements verging on
totalitarianism, and a mythical emphasis on the 'organic unity'
between a charismatic leader, an organizationally amorphous movement-
type party, and the nation". British political theorist Roger Griffin
has stated that ultranationalism is essentially founded on xenophobia
in a way that finds supposed legitimacy "through deeply mythicized
narratives of past cultural or political periods of historical
greatness or of old scores to settle against alleged enemies". It can
also draw on "vulgarized forms" of different aspects of the natural
sciences such as anthropology and genetics, eugenics specifically
playing a role, in order "to rationalize ideas of national superiority
and destiny, of degeneracy and subhumannes

In [None]:
# pip install spacy-llm

In [None]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation

# NOTE(review): this rebinds the notebook-wide `nlp`, replacing the
# 'en_core_web_sm' pipeline loaded in an earlier cell with a blank English
# pipeline whose only added component is the sentencizer. Cells that read
# `token.pos_` will see this blank pipeline if they are re-run after this one.
nlp = spacy.blank('en')
nlp.add_pipe('sentencizer')

# https://www.educative.io/answers/text-summarization-in-spacy-and-nltk
# df.iloc[0]['txt']
def summarizer(row, n_sentences=2):
  """Build an extractive summary of `row['txt']` from its highest-scoring sentences.

  A sentence's score is the sum of the text-wide frequencies of its tokens,
  ignoring tokens that are in STOP_WORDS or in `string.punctuation`.

  Args:
    row: mapping/Series with a string 'txt' entry.
    n_sentences: how many top-scoring sentences to keep (default 2, the
      previously hard-coded value).

  Returns:
    The selected sentences joined by single spaces, highest score first
    (not in document order); '' when the text has no sentences.
  """
  from collections import Counter

  # Drop bracketed markers such as "[3]" and every double quote.
  text = re.sub(r'\[.*?\]|"', '', row['txt'])
  doc = nlp(text)

  word_frequencies = Counter(
      token.text for token in doc
      if token.text not in STOP_WORDS and token.text not in punctuation
  )

  def sentence_score(sent):
    # Counter yields 0 for tokens that were filtered out above.
    return sum(word_frequencies[token.text] for token in sent)

  # sorted() is stable, so equally scored sentences keep document order.
  ranked = sorted(doc.sents, key=sentence_score, reverse=True)
  return ' '.join(sent.text for sent in ranked[:n_sentences]).strip()

# print(textwrap.fill(summarizer(df.iloc[0]['txt'])))

In [None]:
df['summary']= df.apply(summarizer, axis=1)
df.head()

Unnamed: 0,title,url,heading,subheading,txt,topic_idx,topics,summary
0,Ultranationalism,https://en.wikipedia.org/wiki/Ultranationalism,Background concepts and broader context,,British political theorist Roger Griffin has s...,5,"religions, mazda, includes, state, good, aboli...","According to American scholar Janusz Bugajski,..."
1,Ultranationalism,https://en.wikipedia.org/wiki/Ultranationalism,Historical movements and analysis,,American historian Walter Skya has written in ...,6,"countries, publication, roman, morality, super...","In late 2015, the Israeli political journalist..."
2,Ultranationalism,https://en.wikipedia.org/wiki/Ultranationalism,Ultranationalist political parties,Currently represented in national governments ...,The following political parties have been char...,0,"science, religions, religiō, violence, faction...",The following political parties have been desc...
3,Ultranationalism,https://en.wikipedia.org/wiki/Ultranationalism,Ultranationalist political parties,Represented parties with former ultranationali...,The following political parties historically h...,0,"science, religions, religiō, violence, faction...",The following political parties historically h...
4,Ultranationalism,https://en.wikipedia.org/wiki/Ultranationalism,Ultranationalist political parties,Formerly represented in national governments o...,Arising out of strident Sri Lankan Tamil natio...,6,"countries, publication, roman, morality, super...",The assassination of Pavlos Fyssas in Septembe...


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 92 entries, 0 to 91
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   title       92 non-null     object
 1   url         92 non-null     object
 2   heading     92 non-null     object
 3   subheading  92 non-null     object
 4   txt         92 non-null     object
 5   topic_idx   92 non-null     int64 
 6   topics      92 non-null     object
 7   summary     92 non-null     object
dtypes: int64(1), object(7)
memory usage: 5.9+ KB


In [None]:
# df.to_csv('wikiscrape.csv')

In [None]:
print(textwrap.fill(df.iloc[4].summary))

The assassination of Pavlos Fyssas in September 2013, a hip-hop
musician with left-wing views, from stabbing wounds to the heart and
ribs that occurred after his surrounding by multiple dozen Golden Dawn
militants triggered widespread outrage at the Greek political
organization. Arising out of strident Sri Lankan Tamil nationalism,
with differing ethnic and religious groups placed at odds, the
militant faction known as the Liberation Tigers of Tamil Eelam (LTTE)
orchestrated a decades long campaign of terrorism in the country of
Sri Lanka, which is inside of the Indian Ocean and has been influenced
by broader socio-political trends.


In [None]:
print(textwrap.fill(summarizer(df.iloc[0])))

According to American scholar Janusz Bugajski, summing up the doctrine
in practical terms, in its most extreme or developed forms, ultra-
nationalism resembles fascism, marked by a xenophobic disdain of other
nations, support for authoritarian political arrangements verging on
totalitarianism, and a mythical emphasis on the 'organic unity'
between a charismatic leader, an organizationally amorphous movement-
type party, and the nation. British political theorist Roger Griffin
has stated that ultranationalism is essentially founded on xenophobia
in a way that finds supposed legitimacy through deeply mythicized
narratives of past cultural or political periods of historical
greatness or of old scores to settle against alleged enemies.


### Supervised Fine Tuning

In [None]:
def llm_sft_format(row):
  """Format one row as a '### Human: ... ### Assistant: ...' training string.

  The human turn is the row's title and heading followed by a period; the
  assistant turn is the row's summary.
  """
  human_turn = f'### Human: {row.title} {row.heading}.'
  assistant_turn = f'### Assistant: {row.summary}'
  return human_turn + assistant_turn

df['text']= df.apply(llm_sft_format, axis=1)
df['text'].head()

0    ### Human: Ultranationalism Background concept...
1    ### Human: Ultranationalism Historical movemen...
2    ### Human: Ultranationalism Ultranationalist p...
3    ### Human: Ultranationalism Ultranationalist p...
4    ### Human: Ultranationalism Ultranationalist p...
Name: text, dtype: object

In [None]:
df['text'].sample(20)

80    ### Human: State religion Former confessional ...
49    ### Human: Fascism History.### Assistant: Laru...
44    ### Human: Fascism History.### Assistant: The ...
71    ### Human: State religion Former state religio...
53    ### Human: Fascism Tenets.### Assistant: Fasci...
23    ### Human: Religion Specific religions.### Ass...
4     ### Human: Ultranationalism Ultranationalist p...
42    ### Human: Fascism History.### Assistant: Geor...
43    ### Human: Fascism History.### Assistant: Mari...
72    ### Human: State religion Former state religio...
40    ### Human: Fascism Definitions.### Assistant: ...
79    ### Human: State religion Former confessional ...
2     ### Human: Ultranationalism Ultranationalist p...
66    ### Human: State religion Current states with ...
20    ### Human: Religion Specific religions.### Ass...
74    ### Human: State religion Former state religio...
32    ### Human: Religion Related aspects.### Assist...
0     ### Human: Ultranationalism Background con

In [None]:
df.iloc[40]

title                                                   Fascism
url                       https://en.wikipedia.org/wiki/Fascism
heading                                             Definitions
subheading                   Position on the political spectrum
txt           Scholars place fascism on the far-right of the...
topic_idx                                                     2
topics        party, world, germany, war, italy, political, ...
summary       In the 1920s, Mussolini and Giovanni Gentile d...
text          ### Human: Fascism Definitions.### Assistant: ...
Name: 40, dtype: object

In [None]:
# df['text'].to_csv('llm_sft_format.csv')

In [None]:
print(df.shape)
df.info()

(92, 8)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 92 entries, 0 to 91
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   title       92 non-null     object
 1   url         92 non-null     object
 2   heading     92 non-null     object
 3   subheading  92 non-null     object
 4   txt         92 non-null     object
 5   topic_idx   92 non-null     int64 
 6   topics      92 non-null     object
 7   summary     92 non-null     object
dtypes: int64(1), object(7)
memory usage: 5.9+ KB


### Question Answering

{'answers': {'answer_start': [515], 'text': ['Saint Bernadette Soubirous']},
 'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.',
 'id': '5733be284776f41900661182',
 'question': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?',
 'title': 'University_of_Notre_Dame'}

In [None]:
def ans_ques(row):
  """Return '### Human: <title> <heading>.### Assistant: <summary>' for one row.

  NOTE(review): this produces the same string as llm_sft_format; it does not
  yet build the question/context/answers structure shown above this cell.
  """
  template = '### Human: {} {}.### Assistant: {}'
  return template.format(row.title, row.heading, row.summary)

df['text']= df.apply(ans_ques, axis=1)
df['text'].head()

### Text Embedding

* https://huggingface.co/blog/getting-started-with-embeddings

In [None]:
import requests

# Read the Hugging Face token from a local file so it is never hard-coded in
# the notebook. strip() removes surrounding whitespace (for example a trailing
# newline added by an editor), which would otherwise end up inside the
# Authorization header. The token is read straight into `hf_token` rather
# than through the notebook-wide `data` name used by the scraping cells.
with open('hf_write_token.txt', 'r') as file:
  hf_token = file.read().strip()

model_id = "sentence-transformers/all-MiniLM-L6-v2"

api_url = f"https://api-inference.huggingface.co/pipeline/feature-extraction/{model_id}"
headers = {"Authorization": f"Bearer {hf_token}"}

In [None]:
def query(texts):
    """POST `texts` to the feature-extraction endpoint and return the decoded JSON reply.

    Args:
        texts: value sent as the request's "inputs" field.

    Returns:
        The parsed JSON body of the response.

    Raises:
        requests.HTTPError: if the endpoint answers with an error status, so an
            error payload is never mistaken for an embedding downstream.
    """
    response = requests.post(api_url, headers=headers, json={"inputs": texts, "options":{"wait_for_model":True}})
    response.raise_for_status()
    return response.json()

In [None]:
def get_embeddings(row):
  """Return the endpoint's reply for `row.summary` (converted to str).

  Delegates to `query` so the request payload and options are defined in one
  place instead of being duplicated here.
  """
  return query(str(row.summary))

In [None]:
# One endpoint request per row; the reply is stored in the 'embeddings' column.
df['embeddings']= df.apply(get_embeddings, axis=1)
print(df.shape)
# info() prints its report itself and returns None; wrapping it in print()
# added a stray "None" line to the output.
df.info()
df.head()

(92, 9)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 92 entries, 0 to 91
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   title       92 non-null     object
 1   url         92 non-null     object
 2   heading     92 non-null     object
 3   subheading  92 non-null     object
 4   txt         92 non-null     object
 5   topic_idx   92 non-null     int64 
 6   topics      92 non-null     object
 7   summary     92 non-null     object
 8   embeddings  92 non-null     object
dtypes: int64(1), object(8)
memory usage: 6.6+ KB
None


Unnamed: 0,title,url,heading,subheading,txt,topic_idx,topics,summary,embeddings
0,Ultranationalism,https://en.wikipedia.org/wiki/Ultranationalism,Background concepts and broader context,,British political theorist Roger Griffin has s...,5,"religions, mazda, includes, state, good, aboli...","According to American scholar Janusz Bugajski,...","[0.053972408175468445, -0.05422463268041611, -..."
1,Ultranationalism,https://en.wikipedia.org/wiki/Ultranationalism,Historical movements and analysis,,American historian Walter Skya has written in ...,6,"countries, publication, roman, morality, super...","In late 2015, the Israeli political journalist...","[-0.008491200394928455, -0.010670002549886703,..."
2,Ultranationalism,https://en.wikipedia.org/wiki/Ultranationalism,Ultranationalist political parties,Currently represented in national governments ...,The following political parties have been char...,0,"science, religions, religiō, violence, faction...",The following political parties have been desc...,"[0.04193244129419327, -0.1433926522731781, -0...."
3,Ultranationalism,https://en.wikipedia.org/wiki/Ultranationalism,Ultranationalist political parties,Represented parties with former ultranationali...,The following political parties historically h...,0,"science, religions, religiō, violence, faction...",The following political parties historically h...,"[0.040011122822761536, -0.12799780070781708, -..."
4,Ultranationalism,https://en.wikipedia.org/wiki/Ultranationalism,Ultranationalist political parties,Formerly represented in national governments o...,Arising out of strident Sri Lankan Tamil natio...,6,"countries, publication, roman, morality, super...",The assassination of Pavlos Fyssas in Septembe...,"[-0.04810608923435211, -0.011620273813605309, ..."


In [None]:
prompt = 'What is the connection between ultranationalism and religion and what countries have shown ultranational and religious ties'
embedded_prompt = query(prompt)
embedded_prompt

[0.08784614503383636,
 -0.05754084512591362,
 -0.08186469227075577,
 0.022541724145412445,
 0.016043677926063538,
 -0.008074524812400341,
 -0.057457007467746735,
 -0.09698593616485596,
 0.09277341514825821,
 -0.06427501142024994,
 0.012533015571534634,
 0.031779661774635315,
 -0.04627034068107605,
 0.05866655334830284,
 -0.01006525568664074,
 -0.01038946770131588,
 -0.11564647406339645,
 0.005305199418216944,
 -0.05669048801064491,
 0.03841792047023773,
 0.01037653535604477,
 -0.13828091323375702,
 0.010164120234549046,
 0.027875978499650955,
 0.054574839770793915,
 -0.030997145920991898,
 0.11594699323177338,
 -0.04020339623093605,
 -0.00851795170456171,
 0.04129486903548241,
 -0.00687482301145792,
 -0.027097664773464203,
 -0.09241983294487,
 -0.007062586490064859,
 -0.0071807727217674255,
 0.03697142377495766,
 0.03743596374988556,
 0.04703846946358681,
 -0.005878327880054712,
 -0.048082444816827774,
 0.0895836353302002,
 -0.040838997811079025,
 0.052690520882606506,
 -0.098739132285

In [None]:
import numpy as np

def vector_similarity(vec1, vec2):
  """Return the dot product of `vec1` and `vec2` after converting both to NumPy arrays."""
  left = np.array(vec1)
  right = np.array(vec2)
  return left.dot(right)

In [None]:
df['similarity'] = df['embeddings'].apply(lambda vector: vector_similarity(vector, embedded_prompt))
df.nlargest(1, 'similarity')

67

In [None]:
context = df.nlargest(1, 'similarity').iloc[0]['summary']
print(textwrap.fill(context))

In some countries, there is a political ideology sponsored by the
government that may be called political religion.


In [None]:
context = ' '.join(df.nlargest(3, 'similarity')['summary'])
print(textwrap.fill(context))

In some countries, there is a political ideology sponsored by the
government that may be called political religion. In the field of
comparative religion, a common geographical classification of the main
world religions includes Middle Eastern religions (including
Zoroastrianism and Iranian religions), Indian religions, East Asian
religions, African religions, American religions, Oceanic religions,
and classical Hellenistic religions. Some academics studying the
subject have divided religions into three broad categories: Some
recent scholarship has argued that not all types of religion are
necessarily separated by mutually exclusive philosophies, and
furthermore that the utility of ascribing a practice to a certain
philosophy, or even calling a given practice religious, rather than
cultural, political, or social in nature, is limited. For instance,
India is still one of the most religious countries and religion still
has a strong impact on politics, given that Hindu nationalists have
be

In [None]:
llm_prompt = f'''
Only answer the question below if you have 100% certainty of the facts.
Context: {context}
Q: {prompt}
A:
'''
llm_prompt

'\nOnly answer the question below if you have 100% certainty of the facts.\nContext: In some countries, there is a political ideology sponsored by the government that may be called political religion. In the field of comparative religion, a common geographical classification of the main world religions includes Middle Eastern religions (including Zoroastrianism and Iranian religions), Indian religions, East Asian religions, African religions, American religions, Oceanic religions, and classical Hellenistic religions. Some academics studying the subject have divided religions into three broad categories:\nSome recent scholarship has argued that not all types of religion are necessarily separated by mutually exclusive philosophies, and furthermore that the utility of ascribing a practice to a certain philosophy, or even calling a given practice religious, rather than cultural, political, or social in nature, is limited. For instance, India is still one of the most religious countries and

In [None]:
from transformers import pipeline

# Pin the checkpoint and revision explicitly (the same ones the pipeline falls
# back to when no model is given) so a re-run does not silently pick up a
# different default model.
qa = pipeline(
    "question-answering",
    model="distilbert/distilbert-base-cased-distilled-squad",
    revision="626af31",
)

No model was supplied, defaulted to distilbert/distilbert-base-cased-distilled-squad and revision 626af31 (https://huggingface.co/distilbert/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/473 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/261M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [None]:
# Run the question-answering pipeline with `prompt` as the question over `context`.
# The returned dict holds the answer text, its start/end offsets and a score.
qa(context=context, question=prompt)

{'score': 0.026854772120714188,
 'start': 19,
 'end': 114,
 'answer': 'there is a political ideology sponsored by the government that may be called political religion'}

In [None]:
from transformers import pipeline

# Pin the checkpoint and revision explicitly (the same ones the pipeline falls
# back to when no model is given) so a re-run does not silently pick up a
# different default model.
gen = pipeline(
    "text-generation",
    model="openai-community/gpt2",
    revision="6c0e608",
    max_length=100,
)

No model was supplied, defaulted to openai-community/gpt2 and revision 6c0e608 (https://huggingface.co/openai-community/gpt2).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [None]:
def wrap(x):
  """Re-flow `x` into wrapped lines for display.

  Existing whitespace characters (including newlines) are kept as they are,
  and sentence endings are normalised to be followed by two spaces.
  """
  wrapper = textwrap.TextWrapper(replace_whitespace=False, fix_sentence_endings=True)
  return wrapper.fill(x)

# Feed `context` to the text-generation pipeline (max_length=500) and print
# the text of the first returned sequence, re-wrapped for readability.
out = gen(context, max_length=500)
generated_text = out[0]['generated_text']
print(wrap(generated_text))

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In some countries, there is a political ideology sponsored by the
government that may be called political religion.  In the field of
comparative religion, a common geographical classification of the main
world religions includes Middle Eastern religions (including
Zoroastrianism and Iranian religions), Indian religions, East Asian
religions, African religions, American religions, Oceanic religions,
and classical Hellenistic religions.  Some academics studying the
subject have divided religions into three broad categories:
Some
recent scholarship has argued that not all types of religion are
necessarily separated by mutually exclusive philosophies, and
furthermore that the utility of ascribing a practice to a certain
philosophy, or even calling a given practice religious, rather than
cultural, political, or social in nature, is limited.  For instance,
India is still one of the most religious countries and religion still
has a strong impact on politics, given that Hindu nationalists have

### Final Version

In [None]:
from transformers import pipeline

# Pin the checkpoint and revision explicitly (the same ones the pipeline falls
# back to when no model is given) so a re-run does not silently pick up a
# different default model.
gen = pipeline(
    "text-generation",
    model="openai-community/gpt2",
    revision="6c0e608",
    max_length=500,
)

No model was supplied, defaulted to openai-community/gpt2 and revision 6c0e608 (https://huggingface.co/openai-community/gpt2).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [None]:
import requests
import numpy as np
from transformers import pipeline

# Pin the checkpoint and revision explicitly (the same ones the pipeline falls
# back to when no model is given) so a re-run does not silently pick up a
# different default model.
gen = pipeline(
    "text-generation",
    model="openai-community/gpt2",
    revision="6c0e608",
    max_length=500,
)

# Read the Hugging Face access token from a local file so it is not
# hard-coded in the notebook.
with open('hf_write_token.txt', 'r') as file:
  data = file.read()

model_id = 'sentence-transformers/all-MiniLM-L6-v2'
# Strip surrounding whitespace: a trailing newline in the token file would
# otherwise become part of the Authorization header value built below.
hf_token = data.strip()

# Hosted inference endpoints: one for sentence embeddings, one for gpt2 text generation.
api_embedding = f'https://api-inference.huggingface.co/pipeline/feature-extraction/{model_id}'
api_gen = 'https://api-inference.huggingface.co/models/gpt2'
headers = {'Authorization': f'Bearer {hf_token}'}

def wrap(x):
  """Re-flow `x` into wrapped lines for display.

  Existing whitespace characters (including newlines) are kept as they are,
  and sentence endings are normalised to be followed by two spaces.
  """
  wrapper = textwrap.TextWrapper(replace_whitespace=False, fix_sentence_endings=True)
  return wrapper.fill(x)

def vector_similarity(vec1, vec2):
  """Return the dot product of `vec1` and `vec2` after converting both to NumPy arrays."""
  left, right = np.array(vec1), np.array(vec2)
  return left.dot(right)

def query(payload):
    """POST `payload` as JSON to `api_gen` using `headers` and return the decoded JSON reply."""
    reply = requests.post(api_gen, json=payload, headers=headers)
    return reply.json()

def embed(texts):
    """POST `texts` to `api_embedding` (with wait_for_model enabled) and return the decoded JSON reply."""
    body = {"inputs": texts, "options": {"wait_for_model": True}}
    return requests.post(api_embedding, headers=headers, json=body).json()

def ask_me_something(question=None, top_k=3, max_length=500):
  """Answer a question with the local `gen` pipeline, using the most similar
  summaries in `df` as context.

  Args:
    question: Question text. When None (the default) the user is prompted
      with input(), which is the original interactive behaviour.
    top_k: Number of highest-similarity rows whose 'summary' values are
      joined into the context (default 3).
    max_length: Value forwarded to `gen` as `max_length` (default 500).

  Side effects:
    Overwrites df['similarity'] with the scores for this question and
    prints the wrapped generated text. Returns None.
  """
  if question is None:
    question = input('Ask me something: ')
  embedded_prompt = embed(question)
  # Score every stored embedding against the embedded question.
  df['similarity'] = df['embeddings'].apply(lambda vector: vector_similarity(vector, embedded_prompt))
  # Concatenate the summaries of the top_k best-scoring rows into one context string.
  context = ' '.join(df.nlargest(top_k, 'similarity')['summary'])
  prompt = f'''
    Only answer the question below if you have 100% certainty of the facts.
    Context: {context}
    Q: {question}
    A:
    '''
  out = gen(prompt, max_length=max_length)
  print(wrap(out[0]['generated_text']))


No model was supplied, defaulted to openai-community/gpt2 and revision 6c0e608 (https://huggingface.co/openai-community/gpt2).
Using a pipeline without specifying a model name and revision in production is not recommended.


Example question to enter at the prompt below: What is the connection between ultranationalism and religion, and what countries have implemented ultranational and religious policies?

In [None]:
# Interactive demo: asks for a question via input() and prints the generated text.
ask_me_something()

Ask me something: What is the connection between ultranationalism and religion and what countries have implemented ultranational and religious policies


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



    Only answer the question below if you have 100% certainty of the
facts.
    Context: In some countries, there is a political ideology
sponsored by the government that may be called political religion.
For instance, India is still one of the most religious countries and
religion still has a strong impact on politics, given that Hindu
nationalists have been targeting minorities like the Muslims and the
Christians, who historically belonged to the lower castes.  By
contrast, countries such as China or Japan are largely secular and
thus religion has a much smaller impact on politics.  In the field of
comparative religion, a common geographical classification of the main
world religions includes Middle Eastern religions (including
Zoroastrianism and Iranian religions), Indian religions, East Asian
religions, African religions, American religions, Oceanic religions,
and classical Hellenistic religions.  Some academics studying the
subject have divided religions into three broad categori

In [None]:
import requests
import numpy as np
from transformers import pipeline

# Read the Hugging Face access token from a local file so it is not
# hard-coded in the notebook.
with open('hf_write_token.txt', 'r') as file:
  data = file.read()

model_id = 'sentence-transformers/all-MiniLM-L6-v2'
# Strip surrounding whitespace: a trailing newline in the token file would
# otherwise become part of the Authorization header value built below.
hf_token = data.strip()

# Hosted inference endpoints: one for sentence embeddings, one for gpt2 text generation.
api_embedding = f'https://api-inference.huggingface.co/pipeline/feature-extraction/{model_id}'
api_gen = 'https://api-inference.huggingface.co/models/gpt2'
headers = {'Authorization': f'Bearer {hf_token}'}

def wrap(x):
  """Re-flow `x` into wrapped lines for display.

  Existing whitespace characters (including newlines) are kept as they are,
  and sentence endings are normalised to be followed by two spaces.
  """
  wrapper = textwrap.TextWrapper(replace_whitespace=False, fix_sentence_endings=True)
  return wrapper.fill(x)

def vector_similarity(vec1, vec2):
  """Return the dot product of `vec1` and `vec2` after converting both to NumPy arrays."""
  left, right = np.array(vec1), np.array(vec2)
  return left.dot(right)

def query(payload):
    """POST `payload` as JSON to `api_gen` using `headers` and return the decoded JSON reply."""
    reply = requests.post(api_gen, json=payload, headers=headers)
    return reply.json()

def embed(texts):
    """POST `texts` to `api_embedding` (with wait_for_model enabled) and return the decoded JSON reply."""
    body = {"inputs": texts, "options": {"wait_for_model": True}}
    return requests.post(api_embedding, headers=headers, json=body).json()

def ask_me_something_else(question=None, top_k=2, max_length=500):
  """Answer a question through the hosted generation endpoint (via `query`),
  using the most similar summaries in `df` as context.

  Args:
    question: Question text. When None (the default) the user is prompted
      with input(), which is the original interactive behaviour.
    top_k: Number of highest-similarity rows whose 'summary' values are
      joined into the context (default 2).
    max_length: Value sent as the `max_length` generation parameter
      (default 500).

  Raises:
    RuntimeError: If the endpoint replies with a JSON object instead of a
      list of generations.

  Side effects:
    Overwrites df['similarity'] with the scores for this question and
    prints the wrapped generated text. Returns None.
  """
  if question is None:
    question = input('Ask me something: ')
  embedded_prompt = embed(question)
  # Score every stored embedding against the embedded question.
  df['similarity'] = df['embeddings'].apply(lambda vector: vector_similarity(vector, embedded_prompt))
  # Concatenate the summaries of the top_k best-scoring rows into one context string.
  context = ' '.join(df.nlargest(top_k, 'similarity')['summary'])

  prompt = f'''
    Only answer the question below if you have 100% certainty of the facts.
    Context: {context}
    Q: {question}
    A:
    '''

  result = query(
      {
          "inputs": prompt,
          "parameters": {"max_length": max_length},
      }
  )

  # A dict reply cannot be indexed with [0]; report its content instead of
  # failing with an opaque KeyError.
  if isinstance(result, dict):
    raise RuntimeError(f"Text-generation request failed: {result.get('error', result)}")

  print(wrap(result[0]['generated_text']))


In [None]:
# Interactive demo: asks for a question via input() and prints the text returned by the hosted endpoint.
ask_me_something_else()

Ask me something: What is the connection between ultranationalism and religion and what countries have implemented ultranational and religious policies

    Only answer the question below if you have 100% certainty of the
facts.
    Context: In some countries, there is a political ideology
sponsored by the government that may be called political religion.
For instance, India is still one of the most religious countries and
religion still has a strong impact on politics, given that Hindu
nationalists have been targeting minorities like the Muslims and the
Christians, who historically belonged to the lower castes.  By
contrast, countries such as China or Japan are largely secular and
thus religion has a much smaller impact on politics.
    Q: What is
the connection between ultranationalism and religion and what
countries have implemented ultranational and religious policies
    A:
                     
     \    
_________________________________________
A:   In China, there is
still an 

### HF Example Continued

In [None]:
# Sample FAQ questions to embed.
texts = ["How do I get a replacement Medicare card?",
        "What is the monthly premium for Medicare Part B?",
        "How do I terminate my Medicare Part B (medical insurance)?",
        "How do I sign up for Medicare?",
        "Can I sign up for Medicare Part B if I am working and have health insurance through an employer?",
        "How do I sign up for Medicare Part B if I already have Part A?",
        "What are Medicare late enrollment penalties?",
        "What is Medicare and who can get it?",
        "How can I get help with my Medicare Part A and Part B premiums?",
        "What are the different parts of Medicare?",
        "Will my Medicare premiums be higher because of my higher income?",
        "What is TRICARE ?",
        "Should I sign up for Medicare Part B if I have Veterans' Benefits?"]

# Use `embed` (feature-extraction endpoint) rather than `query`: the `query`
# defined above posts its payload to the gpt2 generation endpoint, so on a
# fresh top-to-bottom run it would not return sentence embeddings.
output = embed(texts)

In [None]:
import pandas as pd

# Load the API reply into a DataFrame for inspection; in the captured output
# this gives one row per input text and 384 numbered columns.
embeddings = pd.DataFrame(output)
embeddings.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,374,375,376,377,378,379,380,381,382,383
0,-0.023889,0.055259,-0.011655,-0.033414,-0.012261,-0.024873,-0.012663,0.025346,0.018508,-0.083508,...,-0.161688,-0.046426,0.006004,0.005281,-0.003342,0.027754,0.020411,0.005778,0.034098,-0.006889
1,-0.012688,0.046874,-0.010502,-0.020384,-0.013361,0.042322,0.016628,-0.004099,-0.002607,-0.010188,...,-0.061594,-0.020717,-0.009082,-0.02926,-0.066253,0.065257,0.013229,-0.023103,-0.002785,0.010474
2,0.000494,0.119412,0.005229,-0.092734,0.007773,-0.005325,0.034506,-0.051981,-0.006265,-0.006111,...,-0.108326,-0.049646,-0.073399,-0.029898,-0.102734,0.062121,0.034606,0.016877,-0.023861,0.005264
3,-0.029711,0.023298,-0.057041,-0.012183,-0.01371,0.029796,0.063739,0.001101,-0.045124,-0.040748,...,-0.117682,0.031924,0.000854,0.0202,-0.020666,-0.005167,0.03837,0.003617,0.033993,-0.010255
4,-0.025628,0.070389,-0.01738,-0.056567,0.028577,0.052823,0.067063,-0.052618,-0.054702,-0.11623,...,-0.118145,0.013343,-0.055188,-0.032723,0.008436,0.019169,0.048212,-0.040412,0.083346,0.026855


In [None]:
# Pull the row labelled 0 out of `embeddings` as a plain Python list and
# show how many values it holds.
first_row = embeddings.loc[0, :]
txt_embed1 = first_row.values.tolist()
len(txt_embed1)

[-0.023889480158686638,
 0.055258527398109436,
 -0.0116548677906394,
 -0.0334143303334713,
 -0.012260551564395428,
 -0.024872807785868645,
 -0.01266338862478733,
 0.025345906615257263,
 0.018508488312363625,
 -0.08350811153650284,
 -0.09301997721195221,
 0.014486283995211124,
 -0.01741090416908264,
 -0.08834366500377655,
 -0.0044790650717914104,
 -0.046325840055942535,
 -0.013193867169320583,
 0.03538176044821739,
 0.06231117621064186,
 0.048589665442705154,
 -0.05911843478679657,
 0.05413534492254257,
 -0.0643969178199768,
 0.03402400389313698,
 0.006636396050453186,
 0.035917047411203384,
 -0.06783764064311981,
 -0.017735272645950317,
 -0.012721833772957325,
 0.046462420374155045,
 0.10864363610744476,
 0.02382138930261135,
 -0.02699640952050686,
 0.037173956632614136,
 0.097598135471344,
 -0.027030102908611298,
 -0.04542989656329155,
 0.031817320734262466,
 -0.033746276050806046,
 -0.015198479406535625,
 -0.02153564803302288,
 0.014811238273978233,
 -0.02089184895157814,
 0.06885715