In [1]:
from serpapi import GoogleSearch

def get_query(query):
    params = {
        "q": query,
        "engine": "google",
        "api_key": "d5283d8a6c6640c36e5228ae57e8baa9170859f8b5fa73e3c941cdb51afa9e0f"
    }

    search = GoogleSearch(params)
    results = search.get_dict()
    organic_results = results['organic_results']
    return organic_results

organic_results = get_query("professor Lawrence Angrave")



In [2]:
import trafilatura as tr
import pandas as pd

def parse_results(link):
    downloaded = tr.fetch_url(link)
    text = tr.extract(downloaded, include_formatting=True)
    if text == None:
        return ''
    return text

def get_scholar_results(author_id):
    params = {
        'engine': 'google_scholar_author',
        'author_id': author_id,
        'api_key' : 'd5283d8a6c6640c36e5228ae57e8baa9170859f8b5fa73e3c941cdb51afa9e0f'
    }
    search = GoogleSearch(params)
    results = search.get_dict()
    return results

dataset = {'titles': [], 'text': []}

for result in organic_results:
    if 'scholar.google.com' in result['link']:
        # get user id from link
        user_id = result['link'].split('user=')[1].split('&')[0]
        # get scholar results
        scholar_results = get_scholar_results(user_id)
        print(scholar_results["author"])
    else:
        with open("Documents/" + result['title'] + '.txt', 'w') as f:
            f.write(parse_results(result['link']))
        dataset['titles'].append(result['title'])
        dataset['text'].append(parse_results(result['link']))

df = pd.DataFrame(dataset)
# df.to_csv('Dataset.csv', index=False, sep='\t')


{'name': 'Lawrence Angrave', 'affiliations': 'Teaching Professor of Computer Science, University of Illinois at Urbana Champaign', 'email': 'Verified email at illinois.edu', 'thumbnail': 'https://scholar.google.com/citations/images/avatar_scholar_128.png'}


In [3]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")
model_flan = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base",max_length=2048)

  from .autonotebook import tqdm as notebook_tqdm
You are using the legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This means that tokens that come after special tokens will not be properly handled. We recommend you to read the related pull request available at https://github.com/huggingface/transformers/pull/24565


In [4]:
# split the text into passages

def split_text(text: str, n=100, character=" "):
    """Split the text every ``n``-th occurrence of ``character``"""
    text = text.split(character)
    return [character.join(text[i : i + n]).strip() for i in range(0, len(text), n)]


def split_documents(documents: dict) -> dict:
    """Split documents into passages"""
    titles, texts = [], []
    for title, text in zip(documents["titles"], documents["text"]):
        if text is not None:
            for passage in split_text(text):
                titles.append(title if title is not None else "")
                texts.append(passage)
    return {"title": titles, "text": texts}

documents = df.to_dict('list')
documents = split_documents(documents)


In [5]:
import torch
from transformers import BertTokenizer, BertModel
import numpy as np

def embed_documents(documents):
    # Load pre-trained BERT tokenizer and model
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased')

    # Initialize an empty list to store the document embeddings
    document_embeddings = []

    # Loop through each document and embed it
    for doc in documents:
        # Tokenize the document and convert it to tensors
        inputs = tokenizer(doc, return_tensors='pt', truncation=True, padding=True)
        
        # Forward pass through the BERT model
        with torch.no_grad():
            outputs = model(**inputs)
            # Extract the last layer embeddings for each token
            embeddings = outputs.last_hidden_state
        
        # Calculate the mean of token embeddings to get document-level embedding
        doc_embedding = torch.mean(embeddings, dim=1).squeeze().numpy()
        document_embeddings.append(doc_embedding)

    return document_embeddings

documents["embedding"] = embed_documents(documents["text"])
print(len(documents["embedding"]))


47


In [6]:
# documents['embeddings'] = [model.encode([["represent this document for retrieval",i]])[0] for i in documents['text']]
new_df_embed = pd.DataFrame(documents)
print(new_df_embed.shape)

(47, 3)


In [7]:
label_to_id = {label: i for i, label in enumerate(new_df_embed['title'].unique())}
print(label_to_id)
id_to_label = {i: label for label, i in label_to_id.items()}

{'Lawrence Angrave - Computer Science | UIUC': 0, 'Lawrence Angrave - Illinois Experts': 1, 'Lawrence Angrave at University Of Illinois at Urbana': 2, 'Lawrence Angrave': 3, 'My name is Lawrence Angrave and I think we can do better': 4, 'ClassTranscribe Demonstration by Lawrence Angrave': 5, 'Shravan Goli x Lawrence Angrave: CS Education in Industry ...': 6, 'Lawrence Angrave Professor of Computer Science': 7, 'Explore CS: November Sessions - IllinoisWCS - Medium': 8}


In [8]:
new_df_embed.head()

Unnamed: 0,title,text,embedding
0,Lawrence Angrave - Computer Science | UIUC,"# Lawrence Angrave\n**Teaching Professor, Gies...","[-0.32503524, 0.10525391, 0.31873307, -0.01484..."
1,Lawrence Angrave - Computer Science | UIUC,"(2022). ""Exploring collaborative caption editi...","[-0.50963396, -0.08074478, 0.32562825, 0.01393..."
2,Lawrence Angrave - Computer Science | UIUC,(2021). c substantive responses from faking: S...,"[-0.2840345, 0.018994091, 0.28550598, -0.12809..."
3,Lawrence Angrave - Computer Science | UIUC,"Diarization, Captions, and Visualization"" Pape...","[-0.25274646, 0.07164032, 0.3489835, -0.067136..."
4,Lawrence Angrave - Computer Science | UIUC,Learning in Undergraduate Engineering and othe...,"[-0.38943624, -0.03058345, 0.38179272, -0.1804..."


In [9]:
import numpy as np


train_features = np.array(new_df_embed['embedding'].tolist())
train_labels = np.array([label_to_id[i] for i in new_df_embed['title'].tolist()])
print("train_features shape = ", train_features.shape)
print("train_labels shape = ", train_labels.shape)

train_features shape =  (47, 768)
train_labels shape =  (47,)


In [10]:
import numpy as np
from sklearn.neighbors import KNeighborsClassifier

# Example usage
# Assuming you have document embeddings and a question embedding as NumPy arrays
# doc_embeddings: (num_documents, embedding_dimension)
# question_embedding: (1, embedding_dimension)

# Create a KNN classifier with cosine similarity as the metric
knn = KNeighborsClassifier(n_neighbors=5, metric='cosine')

# Fit the KNN model with document embeddings
knn.fit(train_features, train_labels) 

In [11]:
def encode_question(question, max_length=128):
    # Load pre-trained BERT tokenizer and model
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased')

    # Tokenize the question
    tokens = tokenizer.encode(question, add_special_tokens=True, max_length=max_length, truncation=True)

    # Convert tokens to tensors
    input_ids = torch.tensor([tokens])
    
    # Forward pass through the BERT model
    with torch.no_grad():
        outputs = model(input_ids)
        # Extract the last layer embeddings for the [CLS] token (the first token)
        question_embedding = outputs.last_hidden_state[:, 0, :]

    # Convert the PyTorch tensor to a NumPy array
    question_embedding_np = question_embedding.numpy()

    return question_embedding_np

# Example usage
question = "List all recent courses taught by Professor Lawrence Angrave"
question_embedding = encode_question(question)[0]
print(question_embedding.shape)  


(768,)


In [12]:
def get_answer(question, context):
    input_text = "Answer based on context \n\n context: v1 \n\n question: v2" 
    input_text = input_text.replace('v1', context)
    input_text = input_text.replace('v2', question)
    input_ids = tokenizer(input_text, return_tensors="pt").input_ids
    outputs = model_flan.generate(input_ids)
    return tokenizer.decode(outputs[0])





# get the top k most relevant documents
k = 2
top_k = knn.predict_proba([question_embedding])[0].argsort()[-k:][::-1]

for i in top_k:
    print(id_to_label[i])



# concatenate the text from the top 5 documents
context = df.iloc[top_k]['text'].str.cat(sep=' ')
# print(context)



# get the answer
answer = get_answer(question, context)
print(answer)


Token indices sequence length is longer than the specified maximum sequence length for this model (4061 > 512). Running this sequence through the model will result in indexing errors


Lawrence Angrave - Computer Science | UIUC
Shravan Goli x Lawrence Angrave: CS Education in Industry ...




<pad> CS 125 AL1 (CS 125 AL2, CS 125 AL3) - Intro to Computer Science - CS 199 241 (CS 199 341, CS 199 DYB) - De-Bug your Brain - CS 241 (CS 341) - System Programming - CS 296 41 - Honors Course - CS 498 RK1 (CS 498 RK2, CS 498 VR3, CS 498 VR4)</s>


In [13]:
# clean the documents folder
import os

for file in os.listdir('Documents'):
    os.remove('Documents/' + file)
