In [1]:
import torch
from transformers import BertTokenizer, BertModel, AutoModel, AutoTokenizer
import numpy as np


class DocumentEmbedder:
    def __init__(self, model_name='bert-large-uncased'):
        self.model_name = model_name
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.model = BertModel.from_pretrained(model_name)
    def embed_doc(self, documents):
        # Initialize an empty list to store the document embeddings
        document_embeddings = []

        # Loop through each document and embed it
        for doc in documents:
            # Tokenize the document and convert it to tensors
            inputs = self.tokenizer(doc, return_tensors='pt', truncation=True, padding=True)
            
            # Forward pass through the BERT model
            with torch.no_grad():
                outputs = self.model(**inputs)
                # Extract the last layer embeddings for each token
                embeddings = outputs.last_hidden_state
            
            # Calculate the mean of token embeddings to get document-level embedding
            doc_embedding = torch.mean(embeddings, dim=1).squeeze().numpy()
            document_embeddings.append(doc_embedding)
        return document_embeddings
    
    def encode_question(self, question, max_length=128):
        tokens = self.tokenizer.encode(question, add_special_tokens=True, max_length=max_length, truncation=True)

        # Convert tokens to tensors
        input_ids = torch.tensor([tokens])
        
        with torch.no_grad():
            outputs = self.model(input_ids)
            question_embedding = outputs.last_hidden_state[:, 0, :]

        # Convert the PyTorch tensor to a NumPy array
        question_embedding_np = question_embedding.numpy()

        return question_embedding_np

bert = DocumentEmbedder()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from serpapi import GoogleSearch

def get_query(query):
    params = {
        "q": query,
        "engine": "google",
        "api_key": "d5283d8a6c6640c36e5228ae57e8baa9170859f8b5fa73e3c941cdb51afa9e0f"
    }

    search = GoogleSearch(params)
    results = search.get_dict()
    organic_results = results['organic_results']
    return organic_results

Query = "Professor Kevin Chang uiuc"
organic_results = get_query(Query)



In [3]:
import trafilatura as tr
import pandas as pd
import wikipedia as wiki

def parse_results(link):
    downloaded = tr.fetch_url(link)
    text = tr.extract(downloaded, include_formatting=True)
    if text == None:
        return ''
    return text

def get_scholar_results(author_id):
    params = {
        'engine': 'google_scholar_author',
        'author_id': author_id,
        'api_key' : 'd5283d8a6c6640c36e5228ae57e8baa9170859f8b5fa73e3c941cdb51afa9e0f'
    }
    search = GoogleSearch(params)
    results = search.get_dict()
    return results

dataset = {'titles': [], 'text': []}

named_entity = ""
for result in organic_results:
    if 'wikipedia.org' in result['link']:
        with open("Documents/wiki.txt", 'w') as f:
            wiki_results = wiki.search(result['title'])
            page = wiki.page(wiki_results[0], auto_suggest=False)
            f.write(page.content)
        dataset['titles'].append("wikiresult")
        dataset['text'].append(page.content)
            
    # if 'scholar.google.com' in result['link']:
    #     # get user id from link
    #     user_id = result['link'].split('user=')[1].split('&')[0]
    #     # get scholar results
    #     scholar_results = get_scholar_results(user_id)
    #     print(scholar_results["author"])
    else:
        text = parse_results(result['link'])
        if named_entity.lower() in text.lower():
            with open("Documents/" + result['title'] + '.txt', 'w') as f:
                f.write(parse_results(result['link']))
            dataset['titles'].append(result['title'])
            dataset['text'].append(parse_results(result['link']))

df = pd.DataFrame(dataset)
# df.to_csv('Dataset.csv', index=False, sep='\t')


In [4]:
# split the text into passages

def split_text(text: str, n=100, character=" "):
    """Split the text every ``n``-th occurrence of ``character``"""
    text = text.split(character)
    return [character.join(text[i : i + n]).strip() for i in range(0, len(text), n)]


def split_documents(documents: dict) -> dict:
    """Split documents into passages"""
    titles, texts = [], []
    for title, text in zip(documents["titles"], documents["text"]):
        if text is not None:
            for passage in split_text(text):
                titles.append(title if title is not None else "")
                texts.append(passage)
    return {"title": titles, "text": texts}

documents = df.to_dict('list')
documents = split_documents(documents)


In [5]:
# 
documents["embedding"] = bert.embed_doc(documents["text"])
print(len(documents["embedding"]))


38


In [6]:
new_df_embed = pd.DataFrame(documents)
print(new_df_embed.shape)

(38, 3)


In [7]:
label_to_id = {label: i for i, label in enumerate(new_df_embed['title'].unique())}
print(label_to_id)
id_to_label = {i: label for label, i in label_to_id.items()}

{'Kevin Chenchuan Chang - Computer Science | UIUC': 0, 'Kevin C.C. Chang - Professor - University of Illinois at ...': 1, 'Kevin Chen-Chuan Chang': 2, 'Kevin Chang at University Of Illinois at Urbana - Champaign': 3, 'Kevin Chang - Professor @ University of Illinois Urbana ...': 4, 'Kevin Chen-Chuan Chang | IEEE Xplore Author Details': 5}


In [8]:
new_df_embed.head(-1)

Unnamed: 0,title,text,embedding
0,Kevin Chenchuan Chang - Computer Science | UIUC,# Kevin Chenchuan Chang\n## For More Informati...,"[-0.3566605, -0.19457227, -0.0086028995, -0.19..."
1,Kevin Chenchuan Chang - Computer Science | UIUC,"2002, NCSA Faculty Fellow Award in 2003, IBM F...","[-0.3238592, -0.20354173, -0.20277858, -0.1660..."
2,Kevin Chenchuan Chang - Computer Science | UIUC,National Institutes of Health.\n## Professiona...,"[-0.13917094, -0.09737851, -0.010339749, -0.10..."
3,Kevin Chenchuan Chang - Computer Science | UIUC,"Award Committee, ACM International Conference ...","[-0.22010683, -0.1606884, -0.054008003, -0.221..."
4,Kevin Chenchuan Chang - Computer Science | UIUC,"Retrieval and Integration (WIRI 2006) at ICDE,...","[-0.3644765, -0.22623837, -0.014934646, -0.180..."
5,Kevin Chenchuan Chang - Computer Science | UIUC,"*machine learning*. As our objectives, we aim ...","[-0.17899835, -0.3444038, -0.21057726, -0.1656..."
6,Kevin Chenchuan Chang - Computer Science | UIUC,Conference Proceedings\n- Are Large Pre-Traine...,"[-0.3591724, -0.22857207, -0.026724678, -0.151..."
7,Kevin Chenchuan Chang - Computer Science | UIUC,"in Natural Language Processing, EMNLP 2022, Ab...","[-0.35645, -0.18544164, 0.05279978, -0.0866148..."
8,Kevin Chenchuan Chang - Computer Science | UIUC,"2022, 2022.\n- Open Relation Modeling: Learnin...","[-0.27726692, -0.20850885, -0.040968973, -0.15..."
9,Kevin Chenchuan Chang - Computer Science | UIUC,Association for Computational Linguistics and ...,"[-0.28778446, -0.16929694, 0.01353539, -0.1753..."


In [9]:
import numpy as np


train_features = np.array(new_df_embed['embedding'].tolist())
train_labels = np.array([label_to_id[i] for i in new_df_embed['title'].tolist()])
print("train_features shape = ", train_features.shape)
print("train_labels shape = ", train_labels.shape)

train_features shape =  (38, 1024)
train_labels shape =  (38,)


In [10]:
import numpy as np
from sklearn.neighbors import KNeighborsClassifier

# Create a KNN classifier with cosine similarity as the metric
knn = KNeighborsClassifier(metric='cosine')

# Fit the KNN model with document embeddings
knn.fit(train_features, train_labels) 

In [15]:
questions = ["List the recent courses taught?", "Where did Professor get his PhD?", "Where did professor study from?", "What does the professor teach?", "What is the professor's research interest?"]
question_embedding = [bert.encode_question(question)[0] for question in questions]
  


In [16]:
# get the top k most relevant documents
k = 1
top_k = knn.predict(question_embedding)

for i in top_k:
    print(id_to_label[i])

# concatenate the text from the top k documents
context = df.iloc[top_k]['text'].str.cat(sep=' ')
# print(context)

Kevin Chenchuan Chang - Computer Science | UIUC
Kevin Chenchuan Chang - Computer Science | UIUC
Kevin Chenchuan Chang - Computer Science | UIUC
Kevin Chenchuan Chang - Computer Science | UIUC
Kevin Chenchuan Chang - Computer Science | UIUC


In [18]:
import re
def split_sections(text):
    ''' given a text split the text into sections where each section starts with some a certain number of hashtags'''

    sections = re.split(r'#+', text)
    sections = [section.strip() for section in sections if section.strip() != '']
    return sections

contexts = split_sections(context)
contexts

['Kevin Chenchuan Chang',
 'For More Information',
 'Education\n- Ph.D. Electrial Engineering, Stanford University, 2001',
 'Biography\nKevin Chen-Chuan Chang is a Professor in Computer Science, University of Illinois at Urbana-Champaign. He received a BS from National Taiwan University and PhD from Stanford University in Electrical Engineering. His research addresses large-scale information access and knowledge acquisition, for search, mining, and integration across structured and unstructured big data, with current focuses on Web search/mining and social media analytics. He received ICDE 10-Year Test of Time Award in 2022 and Best Paper Selection/Awards in VLDB 2000 and 2013 and ASONAM 2019, NSF CAREER Award in 2002, NCSA Faculty Fellow Award in 2003, IBM Faculty Awards in 2004 and 2005, Academy for Entrepreneurial Leadership Faculty Fellow Award in 2008, and the Incomplete List of Excellent Teachers at University of Illinois in 2001, 2004, 2005, 2006, 2010, 2011, 2019, 2022, 2023. H

In [19]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
def get_answer(question, context):
    tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")
    model_flan = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base",max_length=2048)
    input_text = "Answer the question as precisely as possible based on the context and say unanswerable if you can't find it \n\n context: v1 \n\n question: v2" 
    input_text = input_text.replace('v1', context)
    input_text = input_text.replace('v2', question)
    input_ids = tokenizer(input_text, return_tensors="pt").input_ids
    outputs = model_flan.generate(input_ids)
    return tokenizer.decode(outputs[0])


# get the answer
for i in contexts:
    if len(questions) == 0:
        break
    for question in questions:
        answer = get_answer(question, i)
        if answer != "<pad> unanswerable</s>":
            print("Question: ", question)
            print("Answer: ", answer)
            print("\n")
            questions.remove(question)




Question:  Where did Professor get his PhD?
Answer:  <pad> Stanford University</s>


Question:  What does the professor teach?
Answer:  <pad> Electrial Engineering</s>


Question:  Where did professor study from?
Answer:  <pad> National Taiwan University</s>


Question:  List the recent courses taught?
Answer:  <pad> ICDE, APWeb 2007, 2007 - 2009.</s>


Question:  What is the professor's research interest?
Answer:  <pad> bridging *structured* and *unstructured data*— to bring structured/semantic-rich access to the myriad and massive unstructured data which accounts for most of the world’s information</s>




In [20]:
# clean the documents folder
import os

for file in os.listdir('Documents'):
    os.remove('Documents/' + file)
