In [2]:
from serpapi import GoogleSearch

def get_query(query):
    """Run a Google search through SerpApi and return its organic results.

    The SerpApi key is read from the ``SERPAPI_API_KEY`` environment variable
    so that the credential is never stored in the notebook itself.

    Args:
        query: Search string sent as the ``q`` parameter.

    Returns:
        The value stored under ``'organic_results'`` in the response dict.

    Raises:
        RuntimeError: if ``SERPAPI_API_KEY`` is not set.
        KeyError: if the response has no ``'organic_results'`` entry.
    """
    import os

    api_key = os.environ.get("SERPAPI_API_KEY")
    if not api_key:
        raise RuntimeError(
            "SERPAPI_API_KEY environment variable is not set; "
            "export your SerpApi key before running this notebook."
        )

    params = {
        "q": query,
        "engine": "google",
        "api_key": api_key,
    }

    search = GoogleSearch(params)
    results = search.get_dict()
    organic_results = results['organic_results']
    return organic_results

# Run the search once; the resulting list is iterated by the scraping loop in a later cell.
organic_results = get_query("professor Lawrence Angrave")



In [4]:
import trafilatura as tr
import pandas as pd

def parse_results(link):
    """Download ``link`` with trafilatura and return its extracted text.

    Args:
        link: URL of the page to fetch.

    Returns:
        The extracted text, or an empty string when the download or the
        extraction yields nothing.
    """
    downloaded = tr.fetch_url(link)
    # A failed download yields None; there is nothing to extract in that case.
    if downloaded is None:
        return ''
    text = tr.extract(downloaded)
    if text is None:
        return ''
    return text

def get_scholar_results(author_id):
    """Fetch a Google Scholar author profile through SerpApi.

    The SerpApi key is read from the ``SERPAPI_API_KEY`` environment variable
    so that the credential is never stored in the notebook itself.

    Args:
        author_id: Value sent as the ``author_id`` parameter of the
            ``google_scholar_author`` engine.

    Returns:
        The full response dict returned by ``GoogleSearch.get_dict()``.

    Raises:
        RuntimeError: if ``SERPAPI_API_KEY`` is not set.
    """
    import os

    api_key = os.environ.get("SERPAPI_API_KEY")
    if not api_key:
        raise RuntimeError(
            "SERPAPI_API_KEY environment variable is not set; "
            "export your SerpApi key before running this notebook."
        )

    params = {
        'engine': 'google_scholar_author',
        'author_id': author_id,
        'api_key': api_key,
    }
    search = GoogleSearch(params)
    results = search.get_dict()
    return results

import os
import re

dataset = {'titles': [], 'text': []}

# open() does not create missing folders, so make sure the output directory exists.
os.makedirs("Documents", exist_ok=True)

for result in organic_results:
    link = result['link']
    if 'scholar.google.com' in link:
        # Only Scholar URLs that carry a `user=` parameter identify an author profile.
        if 'user=' not in link:
            continue
        # get user id from link
        user_id = link.split('user=')[1].split('&')[0]
        # get scholar results
        scholar_results = get_scholar_results(user_id)
        print(scholar_results["author"])
    else:
        # Fetch and extract each page once, then reuse the text for both outputs.
        page_text = parse_results(link)
        # Titles can contain path separators and other characters that are
        # invalid in file names; replace them before building the path.
        safe_title = re.sub(r'[\\/:*?"<>|]', '_', result['title'])
        out_path = os.path.join("Documents", safe_title + '.txt')
        with open(out_path, 'w', encoding='utf-8') as f:
            f.write(page_text)
        dataset['titles'].append(result['title'])
        dataset['text'].append(page_text)

# One row per scraped page, written out as a tab-separated file.
df = pd.DataFrame(dataset)
df.to_csv('Dataset.csv', index=False, sep='\t')


{'name': 'Lawrence Angrave', 'affiliations': 'Teaching Professor of Computer Science, University of Illinois at Urbana Champaign', 'email': 'Verified email at illinois.edu', 'thumbnail': 'https://scholar.google.com/citations/images/avatar_scholar_128.png'}


In [5]:
from InstructorEmbedding import INSTRUCTOR
# Load the 'hkunlp/instructor-base' embedding model; later cells call model.encode
# on both the passages and the question.
model = INSTRUCTOR('hkunlp/instructor-base')

  from tqdm.autonotebook import trange


load INSTRUCTOR_Transformer
max_seq_length  512


In [6]:
def split_text(text: str, n=100, character=" "):
    """Cut ``text`` into chunks holding at most ``n`` ``character``-separated pieces.

    Each chunk is re-joined with ``character`` and stripped of surrounding
    whitespace. An empty ``text`` produces a single empty chunk.
    """
    pieces = text.split(character)
    chunks = []
    for start in range(0, len(pieces), n):
        window = pieces[start : start + n]
        chunks.append(character.join(window).strip())
    return chunks


def split_documents(documents: dict) -> dict:
    """Expand whole documents into passage-level records.

    Reads the parallel lists ``documents["titles"]`` and ``documents["text"]``,
    skips entries whose text is ``None``, and splits every remaining text with
    ``split_text``. Returns a dict with parallel lists ``"title"`` and
    ``"text"``, one entry per passage; a ``None`` title is stored as ``""``.
    """
    passage_titles = []
    passage_texts = []
    for doc_title, doc_text in zip(documents["titles"], documents["text"]):
        if doc_text is None:
            continue
        label = "" if doc_title is None else doc_title
        for chunk in split_text(doc_text):
            passage_titles.append(label)
            passage_texts.append(chunk)
    return {"title": passage_titles, "text": passage_texts}

# Turn the scraped frame into column lists and break every page into passages.
documents = split_documents(df.to_dict('list'))

# Embed the passages one at a time and keep the vectors alongside the text.
passage_embeddings = []
for passage in documents['text']:
    passage_embeddings.append(model.encode(passage))
documents['embeddings'] = passage_embeddings

new_df_embed = pd.DataFrame(documents)
print(new_df_embed.shape)



(43, 3)


In [7]:
# Give every distinct passage title an integer class id, numbered in the
# order that unique() returns them.
unique_titles = new_df_embed['title'].unique()
label_to_id = dict(zip(unique_titles, range(len(unique_titles))))
print(label_to_id)
# Reverse lookup: class id back to title.
id_to_label = dict(enumerate(unique_titles))

{'Lawrence Angrave - Computer Science | UIUC': 0, 'Lawrence Angrave - Illinois Experts': 1, 'Lawrence Angrave at University Of Illinois at Urbana': 2, 'Lawrence Angrave': 3, 'My name is Lawrence Angrave and I think we can do better': 4, 'ClassTranscribe Demonstration by Lawrence Angrave': 5, 'Lawrence Angrave Professor of Computer Science': 6, 'education at the University of Illinois - ERIC': 7, 'CS 241 Lawrence Angrave - University of Illinois': 8}


In [8]:
# Display (rows, columns) of the passage-level frame as the cell output.
new_df_embed.shape

(43, 3)

In [9]:
import numpy as np


# Collect the per-passage embedding vectors into a single array.
train_features = np.array(list(new_df_embed['embeddings']))
# Translate each passage's title into its integer class id.
train_labels = np.array([label_to_id[title] for title in new_df_embed['title']])
print("train_features shape = ", train_features.shape)
print("train_labels shape = ", train_labels.shape)

train_features shape =  (43, 768)
train_labels shape =  (43,)


In [10]:
# KNN: classify a passage embedding by the title ids of its nearest neighbours.
from sklearn.neighbors import KNeighborsClassifier
# Each prediction looks at the 5 closest training embeddings.
knn = KNeighborsClassifier(n_neighbors=5)
# Fit on the passage embeddings (features) and their title ids (labels);
# as the last expression of the cell, the fitted estimator is also displayed.
knn.fit(train_features, train_labels)

In [11]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# The tokenizer and the model are both loaded from the same checkpoint name.
FLAN_CHECKPOINT = "google/flan-t5-base"

tokenizer = T5Tokenizer.from_pretrained(FLAN_CHECKPOINT)
# NOTE(review): max_length=2048 is forwarded as an extra from_pretrained kwarg;
# presumably it overrides the generation length limit in the loaded config —
# confirm against the installed transformers version.
model_flan = T5ForConditionalGeneration.from_pretrained(FLAN_CHECKPOINT, max_length=2048)

You are using the legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This means that tokens that come after special tokens will not be properly handled. We recommend you to read the related pull request available at https://github.com/huggingface/transformers/pull/24565


In [21]:
def get_answer(question, context):
    """Generate an answer to ``question`` from ``context`` with ``model_flan``.

    Args:
        question: Question text placed in the prompt's question slot.
        context: Supporting text placed in the prompt's context slot.

    Returns:
        The decoded text of the first generated sequence, with special tokens
        (such as ``<pad>`` and ``</s>``) removed.
    """
    # Build the prompt by concatenation instead of chained str.replace:
    # replacing the 'v2' placeholder after the context was inserted would also
    # rewrite any literal "v2" that happens to occur inside the context.
    input_text = (
        "Answer based on context \n\n context: " + context
        + " \n\n question: " + question
    )
    input_ids = tokenizer(input_text, return_tensors="pt").input_ids
    outputs = model_flan.generate(input_ids)
    # Drop tokenizer control tokens so callers receive only the answer text.
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

question = "Who is professor Lawrence Angrave?"
question_instruction = "represent the question for retrieving supporting documents:"

# Encode the question as an [instruction, text] pair and take the single resulting vector.
q_embedding = model.encode([[question_instruction, question]])[0]


# get the top k most relevant documents
k = 2
class_probs = knn.predict_proba([q_embedding])[0]
# predict_proba columns follow knn.classes_, so translate the best column
# positions back into label ids (highest probability first).
top_k = knn.classes_[class_probs.argsort()[-k:][::-1]]

for i in top_k:
    print(id_to_label[i])



# concatenate the text from the top k documents
# Label ids identify titles, not row positions in df, so select the rows by
# title instead of indexing df positionally with the ids.
top_titles = [id_to_label[i] for i in top_k]
context = ' '.join(
    df.loc[df['titles'] == title, 'text'].str.cat(sep=' ') for title in top_titles
)
# print(context)



# get the answer
answer = get_answer(question, context)
print(answer)


Lawrence Angrave Professor of Computer Science
Lawrence Angrave
<pad> a Teaching Professor</s>


