- Models
    - distilbert
        - https://huggingface.co/distilbert-base-uncased-distilled-squad
        - word count restricted to 512
        - appropriate for page summaries
    - look into
        - longform
            - https://medium.com/dair-ai/longformer-what-bert-should-have-been-78f4cd595be9
    - model open directory
        - https://huggingface.co/

# Setup

In [1]:
# for distilbert - answer questions
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch

# for choosing the correct article to answer question
from sentence_transformers import SentenceTransformer, util


# for getting wikipedia articles
import wikipediaapi
wiki_wiki = wikipediaapi.Wikipedia('en')

# data
import pandas as pd


# utils
import tqdm
import gradio as gr

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=1)

# Data input

## Curated collection

In [None]:
# compile corpus
curated_pages = '''
Artificial intelligence
Natural language processing
Deep learning
Supervised learning
Semi-supervised learning
Unsupervised learning
Statistical classification
Regression analysis
Federated learning
k-anonymity
Data anonymization
k-means clustering
DBSCAN
Dimensionality reduction
Silhouette (clustering)
Davies–Bouldin index
Multidimensional scaling
Cluster analysis
Principal component analysis
Isolation forest
Unsupervised learning
Hierarchical clustering
Local outlier factor
Kaiser–Meyer–Olkin test
Bartlett's test

Affinity propagation
Automatic clustering algorithms
BFR algorithm
BIRCH
Canopy clustering algorithm
Chinese whispers (clustering method)
Cluster-weighted modeling
Cobweb (clustering)
Complete-linkage clustering
Constrained clustering
CURE algorithm
Data stream clustering
DBSCAN
Expectation–maximization algorithm
FLAME clustering
Fuzzy clustering
Hierarchical clustering
Hoshen–Kopelman algorithm
Information bottleneck method
Jenks natural breaks optimization
K q-flats
K-means clustering
K-means++
K-medians clustering
K-medoids
K-SVD
Linde–Buzo–Gray algorithm
Low-energy adaptive clustering hierarchy
Mean shift
Nearest-neighbor chain algorithm
Neighbor joining
OPTICS algorithm
Pitman–Yor process
Quantum clustering
Self-organizing map
SimRank
Single-linkage clustering
Spectral clustering
SUBCLU
UPGMA
Ward's method
WPGMA



Support-vector machine
Boosting (machine learning)
Random forest
Linear regression
Logistic regression
Naive Bayes classifier
Artificial neural network
Perceptron
k-nearest neighbors algorithm
Semi-supervised learning
Ensemble learning
Bootstrap aggregating



'''

curated_pages = curated_pages.strip().splitlines()  # string to list of strings
curated_pages = [p for p in curated_pages if p.strip() != '']  # remove blank lines

## Collect all pages under Machine learning

In [None]:
cats_open = ['Category:Machine learning']
cats_close = []
all_pages = []
while len(cats_open) > 0:
    c = cats_open.pop()
    if c in cats_close:
        continue
    cats_close.append(c)
    cat = wiki_wiki.page(c)
    members = list(cat.categorymembers.keys())
    subcats = filter(lambda m: 'Category:' in m, members)
    pages = filter(lambda m: 'Category:' not in m, members)
    all_pages.extend(pages)
    cats_open.extend(subcats)
    
# remove duplicates
auto_pages = []
for p in all_pages:
    if p not in auto_pages:
        auto_pages.append(p)


## Prepare dataset

In [None]:
CORPUS_TO_USE = 'auto'

In [None]:
corpus_types = {'curated': curated_pages,
                'auto': auto_pages}

In [None]:
wikipedia_pages = corpus_types[CORPUS_TO_USE]

In [None]:
df = pd.DataFrame({'title_input': wikipedia_pages})
df['title'] = ''
df['summary'] = ''
df['text'] = ''

In [None]:
for idx, line in tqdm.tqdm(df.iterrows(), total=df.shape[0]):
    page_py = wiki_wiki.page(line['title_input'])
    df.at[idx, 'title'] = page_py.title
    df.at[idx, 'text'] = page_py.text
    df.at[idx, 'summary'] = page_py.summary
    
df

In [None]:
df.to_csv(f'corpus_wikipedia_{CORPUS_TO_USE}.csv')

# Match article to input

## Train sentence transformer on corpus and store model

In [None]:
embedder = SentenceTransformer('all-MiniLM-L6-v2', device=device)
corpus = df["title"]+df["text"]
corpus_embeddings = embedder.encode(corpus, convert_to_tensor=True)
corpus_embeddings.shape

torch.save(corpus_embeddings, f'corpus_embeddings_{CORPUS_TO_USE}.pt')


## Load model

In [None]:
corpus_embeddingsLoaded = torch.load(f'corpus_embeddings_{CORPUS_TO_USE}.pt')

In [None]:
query = 'what is the metric used in k means'
query_embedding = embedder.encode(query, convert_to_tensor=True)

top_k = 10

hits = util.semantic_search(query_embedding, corpus_embeddingsLoaded, top_k=top_k)
hits_idx = list(map(lambda x: x['corpus_id'], hits[0]))

for hit in hits:
    hit_id = hit ['corpus_id']
    article_data = df.iloc[hit_id]
    title = article_data ['title']
    print ("-", title, hit ['score'], hit_id)

In [3]:
def get_related_articles_top_k(query: str, corpus: pd.DataFrame, embedder, model, top_k: int = 10):
    corpus_embeddings_loaded = model
    query_embedding = embedder.encode(query, convert_to_tensor=True)
    
    hits = util.semantic_search(query_embedding, corpus_embeddings_loaded, top_k=top_k)
    hits_idx = list(map(lambda x: x['corpus_id'], hits[0]))
    
    return corpus.iloc[hits_idx]

In [None]:
query_test = 'what is the metric used in k means'
get_related_articles_top_k(query=query_test,
                           corpus=df,
                           model=corpus_embeddingsLoaded,
                           embedder=embedder,
                           top_k=7
                          )

# Ask Questions on specific text

## Distilbert - max 512 words

In [None]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-distilled-squad")

model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased-distilled-squad")

In [4]:
def distilbert_ask(question, text, tokenizer, model):
    inputs = tokenizer(question, text, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)

    answer_start_index = outputs.start_logits.argmax()
    answer_end_index = outputs.end_logits.argmax()

    predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]
    return tokenizer.decode(predict_answer_tokens)


## Distilbert - Demo

In [None]:
page_title = input('wikipedia page title:')
page_py = wiki_wiki.page(page_title)
if not page_py.exists():
    print('page does not exist')
else:
    print('\npage title:', page_py.title)
    print('\nsummary:\n')
    print(page_py.summary)


In [None]:
page_py.summary

In [None]:
question = input('question:\n')
distilbert_ask(question, page_py.summary, tokenizer=tokenizer, model=model)

# Asking any question

In [5]:
from typing import Iterable

In [6]:
CORPUS_TO_USE = 'auto'

# related articles
corpus_df = pd.read_csv(f'corpus_wikipedia_{CORPUS_TO_USE}.csv')
corpus_embeddingsLoaded = torch.load(f'corpus_embeddings_{CORPUS_TO_USE}.pt')
embedder = SentenceTransformer('all-MiniLM-L6-v2', device=device)

# question asking
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-distilled-squad")
question_model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased-distilled-squad")

question_models = {
    'distillbert': {'model': question_model, 'max_words': 512},
}

In [23]:
def ask_question(question: str,
                 related_articles_model,
                 related_articles_embedder,
                 related_articles_corpus: pd.DataFrame,
                 question_models: dict,
                 question_tokenizer,
                 related_articles_top_k: int = 5,
                 ):
    # get all relevant articles
    related = get_related_articles_top_k(query=question,
                                         corpus=related_articles_corpus,
                                         embedder=related_articles_embedder,
                                         model=related_articles_model,
                                         top_k=related_articles_top_k
                                        )
    results = pd.DataFrame()
    # for each relevant article
    for idx, page in related.iterrows():        
        results.at[idx, 'article title'] = page['title']
        results.at[idx, 'article summary'] = page['summary']
        
        # for each question asking model
        for model_name, model_data in question_models.items():
            model = model_data['model']
            max_words = model_data['max_words']
            
            # run model on questin with text
            # TODO validate text has less than max_words (crop)
            
            text = page['summary']
            assert len(text.split()) < max_words
            assert text.strip() != ''  # context must be provided
            answer = distilbert_ask(question=question,
                                    text=text,
                                    tokenizer=question_tokenizer,
                                    model=model)
            
            results.at[idx, model_name] = answer
        
    
    return related, results
    

In [9]:
related, results = \
ask_question('what is k-means metric?',
             related_articles_model=corpus_embeddingsLoaded,
             related_articles_embedder=embedder,
             related_articles_corpus=corpus_df,
             question_models=question_models,
             question_tokenizer=tokenizer,
             related_articles_top_k= 10,
            )

260
219
236
89
236
86
41
316
60
110


In [None]:
distilbert_ask(question='what is k-means metric?',
               text=results.loc[1245]['article summary'],
               tokenizer=tokenizer,
               model=model)

In [103]:
question_answer('kmeans metric?')

Unnamed: 0,article title,distillbert
1247,K-medians clustering,[CLS]
1248,K-medoids,[CLS]
1245,K-means clustering,[CLS]
1203,Determining the number of clusters in a data set,[CLS]
1215,Dunn index,the dunn index ( di ) ( introduced by j. c. du...
864,Principal geodesic analysis,principal geodesic analysis
725,IDistance,
1236,Data stream clustering,data stream clustering
1211,Automatic clustering algorithms,kmeans metric? [SEP] automatic clustering algo...
573,Hans-Peter Kriegel,german computer scientist and professor at the...


## Model served with Gradio

In [104]:
def question_answer(question):
    related, results = \
        ask_question(question,
                     related_articles_model=corpus_embeddingsLoaded,
                     related_articles_embedder=embedder,
                     related_articles_corpus=corpus_df,
                     question_models=question_models,
                     question_tokenizer=tokenizer,
                     related_articles_top_k= 10,
                    )
    #return results.iloc[0]['answer']
    return results[['article title', 'distillbert']]

gr_interface = gr.Interface(fn=question_answer, inputs=[ "text"], outputs=["dataframe"])

In [85]:
del gr_interface

In [105]:
gr.close_all()
gr_interface.close()

Closing server running on port: 7860
Closing server running on port: 7860


In [106]:
gr_interface.launch(server_port=7860)

Running on local URL:  http://127.0.0.1:7860/

To create a public link, set `share=True` in `launch()`.


(<gradio.routes.App at 0x7fae5b2bb580>, 'http://127.0.0.1:7860/', None)

Traceback (most recent call last):
  File "/home/ciafa/mnt_point_3/dasilva/deep_learning_iseg/venv/lib/python3.8/site-packages/gradio/routes.py", line 255, in run_predict
    output = await app.blocks.process_api(
  File "/home/ciafa/mnt_point_3/dasilva/deep_learning_iseg/venv/lib/python3.8/site-packages/gradio/blocks.py", line 599, in process_api
    predictions, duration = await self.call_function(fn_index, processed_input)
  File "/home/ciafa/mnt_point_3/dasilva/deep_learning_iseg/venv/lib/python3.8/site-packages/gradio/blocks.py", line 514, in call_function
    prediction = await anyio.to_thread.run_sync(
  File "/home/ciafa/mnt_point_3/dasilva/deep_learning_iseg/venv/lib/python3.8/site-packages/anyio/to_thread.py", line 31, in run_sync
    return await get_asynclib().run_sync_in_worker_thread(
  File "/home/ciafa/mnt_point_3/dasilva/deep_learning_iseg/venv/lib/python3.8/site-packages/anyio/_backends/_asyncio.py", line 937, in run_sync_in_worker_thread
    return await future
  Fil