In [2]:
from serpapi import GoogleSearch

def get_query(query):
    """Run a Google search through SerpApi and return its organic results.

    Args:
        query: Search string sent as the ``q`` parameter.

    Returns:
        The ``organic_results`` list from the response dict, or an empty
        list when the response contains no such key.

    Raises:
        RuntimeError: If the ``SERPAPI_API_KEY`` environment variable is not
            set, or if the response dict carries an ``error`` entry.
    """
    # Local import: keeps this cell runnable without relying on another cell.
    import os

    # The key is read from the environment so the secret is never stored in
    # (or committed with) the notebook itself.
    api_key = os.environ.get("SERPAPI_API_KEY")
    if not api_key:
        raise RuntimeError(
            "SERPAPI_API_KEY environment variable is not set; "
            "export your SerpApi key before running this cell."
        )

    params = {
        "q": query,
        "engine": "google",
        "api_key": api_key,
    }

    results = GoogleSearch(params).get_dict()

    # NOTE(review): SerpApi appears to report failures through an 'error'
    # field in the returned dict -- surface it instead of a bare KeyError.
    if "error" in results:
        raise RuntimeError("SerpApi request failed: {}".format(results["error"]))

    # A response without organic hits simply yields an empty list.
    return results.get("organic_results", [])

# Run the web search once; the result list is consumed by the scraping loop further down.
organic_results = get_query(query="professor Lawrence Angrave")



In [3]:
import trafilatura as tr
import pandas as pd

def parse_results(link):
    """Download ``link`` and return the text trafilatura extracts from it.

    Args:
        link: URL of the page to fetch.

    Returns:
        The extracted text, or an empty string when either the download or
        the extraction produces ``None``.
    """
    downloaded = tr.fetch_url(link)
    if downloaded is None:
        # Nothing was fetched, so there is nothing to hand to the extractor.
        return ''

    text = tr.extract(downloaded)
    # Identity comparison is the correct way to test for None.
    return '' if text is None else text

def get_scholar_results(author_id):
    """Fetch a Google Scholar author profile through SerpApi.

    Args:
        author_id: Scholar author identifier, sent as ``author_id``.

    Returns:
        The full response dict returned by ``GoogleSearch.get_dict()``.

    Raises:
        RuntimeError: If the ``SERPAPI_API_KEY`` environment variable is not
            set.
    """
    # Local import: keeps this cell runnable without relying on another cell.
    import os

    # The key is read from the environment so the secret is never stored in
    # (or committed with) the notebook itself.
    api_key = os.environ.get("SERPAPI_API_KEY")
    if not api_key:
        raise RuntimeError(
            "SERPAPI_API_KEY environment variable is not set; "
            "export your SerpApi key before running this cell."
        )

    params = {
        'engine': 'google_scholar_author',
        'author_id': author_id,
        'api_key': api_key,
    }
    search = GoogleSearch(params)
    results = search.get_dict()
    return results

import os

# Column-oriented accumulator: one title and one extracted text per scraped page.
dataset = {'titles': [], 'text': []}

# Per-page text files are written here; create the folder up front so open()
# cannot fail on a missing directory.
os.makedirs("Documents", exist_ok=True)

for result in organic_results:
    link = result['link']
    if 'scholar.google.com' in link:
        # The author id is whatever follows 'user=' up to the next '&'.
        link_parts = link.split('user=', 1)
        if len(link_parts) < 2:
            # Scholar link without an author id: nothing to look up.
            continue
        user_id = link_parts[1].split('&')[0]
        # get scholar results
        scholar_results = get_scholar_results(user_id)
        print(scholar_results["author"])
    else:
        title = result['title']
        # Download and extract once, then reuse the text for both the file
        # and the dataset (previously every page was fetched twice).
        page_text = parse_results(link)
        # A path separator inside a title would be read as a directory.
        file_stem = title.replace('/', '_').replace('\\', '_')
        # Explicit UTF-8 so non-ASCII page text does not depend on the
        # platform's default encoding.
        with open(os.path.join("Documents", file_stem + '.txt'), 'w', encoding='utf-8') as f:
            f.write(page_text)
        dataset['titles'].append(title)
        dataset['text'].append(page_text)

df = pd.DataFrame(dataset)
# Written tab-separated despite the .csv extension.
df.to_csv('Dataset.csv', index=False, sep='\t')


{'name': 'Lawrence Angrave', 'affiliations': 'Teaching Professor of Computer Science, University of Illinois at Urbana Champaign', 'email': 'Verified email at illinois.edu', 'thumbnail': 'https://scholar.google.com/citations/images/avatar_scholar_128.png'}


In [4]:
from InstructorEmbedding import INSTRUCTOR
# Load the 'hkunlp/instructor-base' INSTRUCTOR checkpoint; other cells call
# `model.encode` on [instruction, text] pairs to obtain embeddings.
model = INSTRUCTOR('hkunlp/instructor-base')

  from tqdm.autonotebook import trange


load INSTRUCTOR_Transformer
max_seq_length  512


In [17]:
# Instruction prefix handed to the encoder together with every chunk.
instruction = "represent the document for retrieval"

# Merge all scraped documents into one long string, separated by single spaces.
texts = ' '.join(df['text'].tolist())

embeddings = {'text' : [] ,'embeddings': []}

# Walk over the merged string in consecutive 100-character windows and
# encode each window on its own (the last window may be shorter).
start = 0
while start < len(texts):
    chunk = texts[start:start + 100]
    # encode() receives a one-item batch, so the single vector is at index 0.
    vector = model.encode([[instruction, chunk]])[0]
    embeddings['text'].append(chunk)
    embeddings['embeddings'].append(vector)
    start += 100

df_embed = pd.DataFrame(embeddings, index=None)
# Persist the chunk/embedding table (tab-separated) and preview the first rows.
df_embed.to_csv('Embeddings.csv', index=False, sep='\t')
df_embed.head()



Unnamed: 0,text,embeddings
0,"Lawrence Angrave\nTeaching Professor, Gies RC ...","[-0.032172404, -0.03031627, 0.021431386, 0.079..."
1,7 Siebel Center for Comp Sci\nUndergraduate Re...,"[-0.016080884, -0.003913253, 0.023245882, 0.03..."
2,search Opportunities through ClassTranscribe p...,"[-0.044072248, -0.00553426, 0.049387984, 0.046..."
3,"ternships, independent study, senior thesis an...","[-0.031310268, -0.005674911, 0.0009308431, 0.0..."
4,ch Interests\n- Accessible Education\n- Effect...,"[-0.02270496, -0.0022362072, 0.037036084, 0.04..."


In [18]:
import numpy as np


# Turn the column of per-chunk vectors into a single array, one row per chunk.
train_features = np.asarray(df_embed['embeddings'].to_list())
# Labels are just the row positions 0..n-1, i.e. every chunk is its own class.
train_labels = np.array(list(range(len(train_features))))
# Report both shapes as a quick sanity check.
print("train_features shape = ", train_features.shape)
print("train_labels shape = ", train_labels.shape)

train_features shape =  (321, 768)
train_labels shape =  (321,)


In [19]:
# KNN
from sklearn.neighbors import KNeighborsClassifier
# Fit a 5-neighbour model on the chunk embeddings; the labels are the chunk
# row positions, and the fitted object is queried via `kneighbors` elsewhere.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X=train_features, y=train_labels)

In [8]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Tokenizer and T5 conditional-generation model are both loaded from the
# same "google/flan-t5-base" checkpoint name.
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")
# NOTE(review): max_length=2048 is passed straight to from_pretrained;
# presumably it is meant to raise the generation length limit -- confirm it
# has the intended effect for this model.
model_flan = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base",max_length=2048)

You are using the legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This means that tokens that come after special tokens will not be properly handled. We recommend you to read the related pull request available at https://github.com/huggingface/transformers/pull/24565


In [26]:
def get_answer(question, context):
    """Ask the FLAN-T5 model to answer ``question`` using ``context``.

    Uses the module-level ``tokenizer`` and ``model_flan`` objects.

    Args:
        question: The question text placed after ``question:`` in the prompt.
        context: Supporting text placed after ``context:`` in the prompt.

    Returns:
        The decoded first generated sequence as a string, with special
        tokens (such as ``<pad>`` and ``</s>``) stripped.
    """
    # Build the prompt in one step. Chained str.replace on 'v1'/'v2'
    # placeholders would also rewrite any literal "v2" that happens to occur
    # inside the context, corrupting the prompt.
    input_text = f"Answer based on context \n\n context: {context} \n\n question: {question}"
    # Echo the exact prompt so the notebook shows what the model was given.
    print(input_text)
    input_ids = tokenizer(input_text, return_tensors="pt").input_ids
    outputs = model_flan.generate(input_ids)
    # skip_special_tokens drops the '<pad>' / '</s>' markers from the answer.
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Query text and the instruction prefix the encoder receives with it.
question = "Who is professor Lawrence Angrave?"
question_instruction = "represent the question for retrieving supporting documents:"

# encode() is given a one-item batch, so pull the single vector back out.
q_embedding = model.encode([[question_instruction, question]])[0]

# Row positions of the five stored chunks returned for the question embedding.
top_5 = knn.kneighbors(
    [q_embedding],
    n_neighbors=5,
    return_distance=False,
)[0]

# Stitch the retrieved chunks together, space-separated, to form the context.
context = ' '.join(df_embed['text'].iloc[top_5].tolist())

# Only the first 500 characters of the context are handed to the generator.
answer = get_answer(question, context[:500])
print(answer)




Answer based on context 

 context: grave
Lawrence Angrave is a well-known professor in the UIUC Computer Science department. Having tau 
Lawrence Angrave is a Teaching Professor at the Computer Science department of the University of Il ality Based on
111 ratings
Lawrence
Angrave
Professor in the
Computer Science department
at
Universi LLC Lawrence Angrave was a highly successful science student at RLS during the 1980’s. His Mum tells The amazing class! Angrave is really a enthusiastic teacher and willing to tell you everything ! 

 question: Who is professor Lawrence Angrave?




<pad> a well-known professor in the UIUC Computer Science department</s>


In [22]:
# Show the text of the first five chunks to see where the fixed-width cuts fall.
df_embed['text'].iloc[[0, 1, 2, 3, 4]].to_list()

['Lawrence Angrave\nTeaching Professor, Gies RC Evans Innovation Fellow, CITL Fellow\n(217) 333-1424\n221',
 '7 Siebel Center for Comp Sci\nUndergraduate Research Opportunities\nLawrence provides Undergraduate Re',
 'search Opportunities through ClassTranscribe project and Scribe Augmented Reality Project. Summer in',
 'ternships, independent study, senior thesis and other credit earning activities are possible.\nResear',
 'ch Interests\n- Accessible Education\n- Effective computer science instruction at scale\nResearch Areas']