In [20]:
import duckdb
import pandas as pd
import numpy as np
import os
import re
import groq
from IPython.display import Markdown

In [21]:
def query_llama(query_string):
    client = groq.Groq(
        api_key= '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'
    )
    
    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": query_string
            }
        ],
        model="llama3-8b-8192",
        #temperature=0 # Try uncommenting this and running the call to query_llama below many times. What do you notice? Recomment it out afterwards.
    )

    return chat_completion.choices[0].message.content


In [22]:
def tokenize(string):
    string1 = string.lower()
    return re.findall(r'\b\w+\b', string1)

In [23]:
def helper(path):
    return [f for f in os.listdir(path) if f.endswith('.txt')]

def count_tokens(tokens):
    Numtoken = {}
    for token in tokens:
        if token in Numtoken:
            Numtoken[token] += 1
        else:
            Numtoken[token] = 1
    return Numtoken

def files_to_bow(path):
    bow_data = {}
    files = helper(path)
    all_tokens = set()
    
    for file in files:
        with open(os.path.join(path, file), 'r') as f:
            tokens = tokenize(f.read())
            token_counts = count_tokens(tokens)
            bow_data[file] = token_counts
            all_tokens.update(tokens)
    sorted_tokens = sorted(all_tokens)
    df = pd.DataFrame(index=sorted(bow_data.keys()), columns=sorted_tokens).fillna(0)
    for file_name, counts in bow_data.items():
        for token, count in counts.items():
            df.at[file_name, token] = count
    
    return df

In [26]:
def compute_idfs(bow):
    d = bow.shape[0]
    a = bow.map(lambda x: 1 if x != 0 else x)
    b = a.sum(axis=0)
    return np.log(d / b) 

def bow_to_tfidf(bow):
    tf = bow.div(bow.sum(axis=1), axis=0)
    idf = compute_idfs(bow)
    tfidf = tf.multiply(idf, axis=1)
    return tfidf


In [27]:
def new_query_to_tfidf(query_string, bow=bow):
    tokens = tokenize(query_string)
    query_bow = pd.Series(count_tokens(tokens))
    missing_words = [word for word in query_bow.index if word not in bow.columns]
    
    if missing_words:
        extended_columns = bow.columns.tolist() + missing_words
    else:
        extended_columns = bow.columns.tolist()
    query_bow = query_bow.reindex(extended_columns, fill_value=0)

    query_tf = query_bow / query_bow.sum()
    
    idfs = compute_idfs(bow)

    for word in missing_words:
        idfs[word] = 0
    
    idfs = idfs.reindex(extended_columns, fill_value=0)

    query_tfidf = query_tf * idfs
    
    return query_tfidf.reindex(bow.columns, fill_value=0)

NameError: name 'bow' is not defined

In [28]:

def cosine_similarity(vec1, vec2):
    dot_product = np.dot(vec1, vec2)
    b = np.linalg.norm(vec1)
    a = np.linalg.norm(vec2)
    
    if a == 0 or b == 0:
        return 0
    return dot_product / (b * a)

def top_n_similar_documents(query_string, n, bow=bow):
    query_tfidf = new_query_to_tfidf(query_string, bow)

    doc_tfidf_matrix = bow_to_tfidf(bow) 
    
    similarities = {}
    for doc_name in doc_tfidf_matrix.index:
        doc_vector = doc_tfidf_matrix.loc[doc_name]
        similarity = cosine_similarity(query_tfidf, doc_vector)
        similarities[doc_name] = similarity

    sorted_docs = sorted(similarities, key=similarities.get, reverse=True)

    return sorted_docs[:n]

NameError: name 'bow' is not defined

In [29]:
def ask_gpteecs(query_string, n=3, bow=bow):
    top_docs = top_n_similar_documents(query_string, n, bow)
    syllabi_contents = []
    for doc in top_docs:
        try:
            with open(f'data/syllabi/{doc}', 'r') as file:
                syllabus_text = file.read()
                syllabi_contents.append(f"Here is the syllabus for course {doc}:\n{syllabus_text}")
        except FileNotFoundError:
            syllabi_contents.append(f"Here is {doc}: [Syllabus not found]")

    combined_query = f"""
    Hi! I'm looking to answer this query about EECS courses at the University of Michigan:
    

    {query_string}

    Here are some relevant courses from the syllabi:
    Include the Proffesors who teach the classes
    And conclude which the course the fits the query best
    """
    
    random.shuffle(syllabi_contents)
    combined_query += "\n\n".join(syllabi_contents)
    combined_query += f"\n\nQuery Variation Seed: {random.randint(1000, 9999)}"
    try:
        response = query_llama(combined_query)
        return response
    except groq.RateLimitError:
        return "Rate limit reached. Please try again later."


NameError: name 'bow' is not defined