In [1]:
# `os` gives later cells access to API keys through os.environ.
import os
import dotenv

# Load variables from a local .env file into the process environment so the
# os.environ[...] lookups in later cells can find the API keys.
dotenv.load_dotenv()

True

In [11]:
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
from llama_parse import LlamaParse
import nest_asyncio; nest_asyncio.apply()

# Read the API keys from the environment; a missing variable raises KeyError here.
llama_cloud_api_key = os.environ["LLAMA_CLOUD_API_KEY"]
openai_api_key = os.environ["OPENAI_API_KEY"]

# Configure the LlamaParse client. It is used by the PDF readers that are
# commented out below and by the `parser.get_json_result(...)` cells further down.
parser = LlamaParse(
    result_type="markdown",  # "markdown" and "text" are available
    api_key=llama_cloud_api_key,
    gpt4o_mode=True,
    gpt4o_api_key=openai_api_key,
)

# Map the .pdf extension to the LlamaParse parser for SimpleDirectoryReader.
file_extractor = {".pdf": parser}
# Alternative inputs (PDF directory / single PDF) that route through `file_extractor`:
# reader = SimpleDirectoryReader(input_dir='./datasets/cds/pdf', file_extractor=file_extractor)
# reader = SimpleDirectoryReader(input_files=['./datasets/cds/pdf/Brigham Young University CDS_2023-2024.pdf'], file_extractor=file_extractor)
# Active input: a markdown file, read without `file_extractor`, so LlamaParse
# is not involved in this load.
reader = SimpleDirectoryReader(input_files=['./datasets/cds/md/mississippi-state.md'])
# Top-level await: relies on the notebook's running event loop.
documents = await reader.aload_data()

# Bare expression so the notebook displays the full list of loaded Document objects.
documents
# print("documents loaded:", len(documents))

[Document(id_='09a93438-991e-4020-9a42-db09f7d3997c', embedding=None, metadata={'file_path': 'datasets/cds/md/mississippi-state.md', 'file_name': 'mississippi-state.md', 'file_size': 67952, 'creation_date': '2024-06-12', 'last_modified_date': '2024-06-12'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, text='\n\nMississippi State University\n', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n'),
 Document(id_='a28b3117-cdec-4c1d-b1fa-42edfc7a7cfa', embedding=None, metadata={'file_path': 'datasets/cds/md/mississippi-state.md', 'file_name': 'mississippi-state.md', 'file_size': 67952, 'creation_date': '2024-06-12', 'last_modified_date': '2024-06-12'}, excluded_embed_m

In [71]:
# Run the LlamaParse `parser` on the Brigham Young University CDS PDF and display the JSON result.
# NOTE(review): this cell's execution count (71) is out of sequence with the surrounding cells.
parser.get_json_result('./datasets/cds/pdf/Brigham Young University CDS_2023-2024.pdf')

Started parsing the file under job_id cac11eca-937d-4da2-a80d-c283183746f6


[{'pages': [{'page': 1,
    'text': "Common Data Set 2023-2024\n\n\n                                                A. General Information\n\n\nA0     Respondent Information (Not for Publication)\n       Name:                                                         Scott Briggs\n       Title:                                                        Data and Reporting Manager\n       Office:                                                       BYU Office of Assessment and Planning\n       Mailing Address:                                              ASB B-356, Brigham Young University\n       City/State/Zip/Country:                                       Provo, UT 84602\n       Phone:                                                        801-422-7987\n       Fax:\n       E-mail Address:                                               scott.briggs@byu.edu\n\n\n       Are your responses to the CDS posted for                  x Yes\n       reference on your institution's Web site?            

In [73]:
# Run the LlamaParse `parser` on the Penn State CDS PDF and display the JSON result.
# NOTE(review): this cell's execution count (73) is out of sequence with the surrounding cells.
parser.get_json_result('./datasets/cds/pdf/Penn State CDS_2023-2024.pdf')

Started parsing the file under job_id cac11eca-30f4-457e-8cdc-bc2f8a3d6117


[{'pages': [{'page': 1,
    'text': "                                                                          CDS_2023_2024_FINAL\nUP                                                                    A. General Information\nSELECT CAMPUS IN A1\nA0. Respondent Information (not for publication)\n\n\n                First Name:                                          Matthew\n                Last Name:                                           Gueguen\n                Title:                                               Analysis and Planning Consultant\n                Office:                                              OPAIR\n                Address:                                             203 Rider Building\n                City:                                                University Park\n                State:                                               PA\n                Zip:                                                 16802\n                Country:                

In [12]:
# extract features
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.node_parser import  MarkdownNodeParser
from llama_index.core.schema import TransformComponent
from llama_index.core import PromptTemplate
from llama_index.llms.cohere import Cohere
from llama_index.llms.openai import OpenAI
from llama_index.core.extractors import SummaryExtractor

# API keys for the two LLM providers; a missing variable raises KeyError.
cohere_api_key = os.environ["COHERE_API_KEY"]
openai_api_key = os.environ["OPENAI_API_KEY"]

# Notebook-level LLM. TableCleaner and later cells read this global `llm`.
# Cohere alternative kept for reference:
# llm = Cohere(model="command-r-plus", api_key=cohere_api_key)
llm = OpenAI(api_key=openai_api_key)

class TableCleaner(TransformComponent):
    """Pipeline transformation that has the notebook-level ``llm`` rewrite each node.

    For every node, the node text is wrapped in a prompt asking for the same
    text back with valid markdown tables. The completion is printed and then
    written back to ``node.text`` in place.
    """

    def _format_prompt(self, text: str):
        """Return the table-repair prompt with ``text`` substituted for ``{node}``."""
        prompt_source = (
            "This is a node from a markdown file:\n"
            "---------------------\n"
            "{node}"
            "\n---------------------\n"
            "Return the node from the document and ensure any markdown tables are valid. Reformat the tables if necessary.\n"
            "Only return the text from the document. Do not respond to the prompt.\n"
        )
        return PromptTemplate(prompt_source).format(node=text)

    def __call__(self, nodes, **kwargs):
        """Synchronously rewrite every node's text and return the same ``nodes`` object."""
        for current in nodes:
            prompt = self._format_prompt(current.text)
            result = llm.complete(prompt, formatted=True)
            print("completion:", result)
            current.text = result.text
        return nodes

    async def acall(self, nodes, **kwargs):
        """Async counterpart of ``__call__``; nodes are awaited one after another."""
        for current in nodes:
            prompt = self._format_prompt(current.text)
            result = await llm.acomplete(prompt, formatted=True)
            print("completion:", result)
            current.text = result.text
        return nodes

# pipeline = IngestionPipeline(transformations=[MarkdownNodeParser(), TableCleaner()])
# pipeline = IngestionPipeline(transformations=[MarkdownNodeParser(), SummaryExtractor(llm=llm)])
pipeline = IngestionPipeline(transformations=[MarkdownNodeParser()])

nodes = await pipeline.arun(documents=documents)

for i, node in enumerate(nodes):
    print(f"Node {i+1}:")
    print(node.text)

Node 1:
Mississippi State University
Node 2:
Office of Institutional Research and Effectiveness
Node 3:
A. General Information
Node 4:
A0. Respondent Information

| **Office:** | Office of Institutional Research and Effectiveness |
|-------------|----------------------------------------------------|
| **Address:** | P.O. Drawer EY |
| **City:** | Mississippi State |
| **State:** | Mississippi |
| **Zip:** | 39762 |
| **Country:** | United States |
| **Phone Number:** | 662-325-3920 |
| **Email Address:** | oir@ir.msstate.edu |

Are your responses to the CDS posted for reference on your institution's website?  
**Yes**

If yes, please provide a direct link to the posted CDS responses:  
https://ir.msstate.edu/cdsets.php
Node 5:
A1. Address Information

**Please enter general institution information below:**

| **Name of College or University** | Mississippi State University |
|-----------------------------------|------------------------------|
| **Street Address:** | 75 B.S. Hood Road |

In [16]:
import time

from llama_index.embeddings.cohere import CohereEmbedding
from llama_index.embeddings.openai import OpenAIEmbedding, OpenAIEmbeddingModelType
from pinecone import Pinecone, ServerlessSpec

# Embedding model used to vectorise the nodes when the index is built.
# `input_type` is an option of the Cohere embedding config (kept commented out
# below); it is not an OpenAIEmbedding setting, so it is not passed here.
embed_model = OpenAIEmbedding(
    api_key=openai_api_key,
    model=OpenAIEmbeddingModelType.TEXT_EMBED_3_LARGE,
)

# Cohere alternative kept for reference:
# embed_model = CohereEmbedding(
#     cohere_api_key=cohere_api_key,
#     model_name="embed-english-v3.0",
#     input_type="search_query",
# )


# Build the in-memory vector index over all parsed nodes in one call.
index = VectorStoreIndex(nodes=nodes, embed_model=embed_model)
# Batched alternative: start from an empty index and insert nodes in groups of
# `batch_size`, sleeping between batches.
# batch_size = 40
# index = VectorStoreIndex(nodes=[], embed_model=embed_model)

# NOTE: to load vector store
# for i in range(0, len(nodes), batch_size):
#     batch = nodes[i:i+batch_size]
#     index.insert_nodes(batch)
#     time.sleep(5)

In [22]:
from llama_index.postprocessor.cohere_rerank import CohereRerank
from llama_index.llms.cohere import Cohere
from llama_index.llms.openai import OpenAI

# Re-create the notebook-level LLM (same construction as in the ingestion cell).
# Cohere alternative kept for reference:
# llm = Cohere(model="command-r-plus", api_key=cohere_api_key)
llm = OpenAI(api_key=openai_api_key)

# Chat engine over the vector index; verbose=True prints the tool calls it makes.
chat_engine = index.as_chat_engine(
    llm=llm,
    verbose=True,
    # Optional retrieval tuning, currently disabled:
    # similarity_top_k=10,
    # node_postprocessors=[CohereRerank(api_key=cohere_api_key, top_n=4)],
)

# NOTE(review): the saved output for this cell answers "Does this school have a
# waitlist?", not the question asked here — the output looks stale; re-run to refresh.
chat_engine.chat("What is the breakdown of application fees for applying to this school?")

Added user message to memory: Does this school have a waitlist?
=== Calling Function ===
Calling function: query_engine_tool with args: {"input":"Does this school have a waitlist?"}
Got output: This school does not have a waitlist.



AgentChatResponse(response='This school does not have a waitlist.', sources=[ToolOutput(content='This school does not have a waitlist.', tool_name='query_engine_tool', raw_input={'input': 'Does this school have a waitlist?'}, raw_output=Response(response='This school does not have a waitlist.', source_nodes=[NodeWithScore(node=TextNode(id_='2e281eb8-ffd8-4cfc-a59d-899d02d18add', embedding=None, metadata={'file_path': 'datasets/cds/md/mississippi-state.md', 'file_name': 'mississippi-state.md', 'file_size': 67952, 'creation_date': '2024-06-12', 'last_modified_date': '2024-06-12'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='083481a6-6e3a-431d-a965-d4bd9ae668d6', node_type=<ObjectType.DOCUMENT: '4'>, metad

In [21]:
from enum import Enum

from pydantic import BaseModel, Field
from llama_index.core.program import LLMTextCompletionProgram

def convert_response_to_class(response_str, output_cls):
    """Convert response text into an ``output_cls`` object via an LLM program.

    Builds an ``LLMTextCompletionProgram`` around the notebook-level ``llm``
    and runs it with the class name and the response text filled into the
    prompt.

    Args:
        response_str: Text to convert, e.g. ``str(response)`` from a query engine.
        output_cls: Target class; its ``__name__`` is inserted into the prompt
            and it is passed to the program as ``output_cls``.

    Returns:
        The result of calling the program (an ``output_cls`` instance in the
        notebook's saved run).
    """
    # Built from adjacent literals so that no source indentation or trailing
    # padding ends up inside the prompt text sent to the LLM.
    prompt_template_str = (
        "Generate an {cls_name}, from this string: "
        "{cls_str}"
    )
    program = LLMTextCompletionProgram.from_defaults(
        llm=llm,
        output_cls=output_cls,
        prompt_template_str=prompt_template_str,
        verbose=True,
    )

    return program(cls_name=output_cls.__name__, cls_str=response_str)

# total first year full time applied male
# total first year full time applied female
# total first year part time applied male
# total first year part time applied female
# total first year full time accepted male
# total first year full time accepted female
# total first year part time accepted male
# total first year part time accepted female
# total first year full time enrolled male
# total first year full time enrolled female
# total first year part time enrolled male
# total first year part time enrolled female
# Structured-output schema passed as `output_cls` to the applicant query engine.
# Only full-time first-year counts are modelled; the part-time figures listed in
# the notes above have no corresponding fields.
class AcceptedApplicantInfo(BaseModel):
    """Represents male and female applicant information"""

    # Applicants
    first_year_applied_male: int = Field(description="Total first year full time male applicants")
    first_year_applied_female: int = Field(description="Total first year full time female applicants")
    # Applicants who were accepted
    first_year_accepted_male: int = Field(description="Total first year full time male applicants who were accepted")
    first_year_accepted_female: int = Field(description="Total first year full time female applicants who were accepted")
    # Applicants who enrolled
    first_year_enrolled_male: int = Field(description="Total first year full time male applicants who enrolled")
    first_year_enrolled_female: int = Field(description="Total first year full time female applicants who enrolled")

# Query engine over the vector index that is asked to return its answer as an
# AcceptedApplicantInfo object (via `output_cls`).
applicant_query_engine = index.as_query_engine(
    llm=llm,
    verbose=True,
    response_mode="compact_accumulate",
    output_cls=AcceptedApplicantInfo,
)

# NOTE: `response` is reassigned by the admission-factor query later in this cell.
response = applicant_query_engine.query("What is the breakdown of male and female first time, first year applicants, how many were accepted, and how many enrolled?")
print(response)

# first year applied in state
# first year applied out of state
# first year applied international
# first year accepted in state
# first year accepted out of state
# first year accepted international
# first year enrolled in state
# first year enrolled out of state
# first year enrolled international

# wait list size
# wait list size admitted
# wait list size ranked

# high school diploma or ged required
# ged accepted

# high school english credits recommended
# high school math credits recommended
# high school science credits recommended
# high school social studies credits recommended
# high school history credits recommended
# high school elective credits recommended
# high school performing arts credits recommended
# high school computer science credits recommended
# high school foreign language credits recommended

# has open admissions policy
# has open admissions policy for out of state

# class rank weight
# gpa weight
# standardized test score weight
# essay weight
# alumni relation weight
# interview weight
# extracurricular, work or volunteer weight
# first gen weight
# residence weight
# religion weight
# Importance levels used as the type of every field in AdmissionsFactorWeights.
# Subclasses `str`, so each member is also its string value.
class AdmissionFactorWeightClass(str, Enum):
    very_important = "very important"
    important = "important"
    considered = "considered"
    not_considered = "not considered"

# Structured-output schema passed as `output_cls` to the admission-factor query
# engine and to convert_response_to_class. The docstring and Field descriptions
# are part of the schema, so they must describe each factor accurately.
class AdmissionsFactorWeights(BaseModel):
    """Represents the relative importance of academic and nonacademic admission factors"""

    class_rank: AdmissionFactorWeightClass = Field(description="Importance weighted on class rank")
    gpa: AdmissionFactorWeightClass = Field(description="Importance weighted on gpa")
    standardized_test_score: AdmissionFactorWeightClass = Field(description="Importance weighted on standardized test scores")
    essay: AdmissionFactorWeightClass = Field(description="Importance weighted on quality of essay")
    # This description must name the interview, not the essay, so the two
    # fields stay distinguishable in the schema.
    interview: AdmissionFactorWeightClass = Field(description="Importance weighted on quality of interview")
    extracurriculars: AdmissionFactorWeightClass = Field(description="Importance weighted on extracurriculars, volunteer, or work experience")
    first_gen: AdmissionFactorWeightClass = Field(description="Importance weighted on being a first generation student")
    alumni: AdmissionFactorWeightClass = Field(description="Importance weighted on if the student's parents are alumni")
    residence: AdmissionFactorWeightClass = Field(description="Importance weighted on if the student lives out of state")
    religion: AdmissionFactorWeightClass = Field(description="Importance weighted on what the student's religion is")

# Query engine over the vector index that is asked to return its answer as an
# AdmissionsFactorWeights object (via `output_cls`).
admission_factor_weight_query_engine = index.as_query_engine(
    llm=llm,
    verbose=True,
    response_mode="compact_accumulate",
    output_cls=AdmissionsFactorWeights,
)

# Overwrites the `response` produced by the applicant query earlier in this cell.
response = admission_factor_weight_query_engine.query("What is the breakdown of relative importance of academic and nonacademic factors for admitted students?")
print(response)

# Feed the printed form of the response back through the LLM program to obtain
# an AdmissionsFactorWeights object; as the last expression it is displayed.
convert_response_to_class(str(response), AdmissionsFactorWeights)

# sat or act required
# sat accepted
# act accepted

# sat composite 25%
# sat composite 50%
# sat composite 75%
# sat math 25%
# sat math 50%
# sat math 75%
# sat reading 25%
# sat reading 50%
# sat reading 75%
# act composite 25%
# act composite 50%
# act composite 75%
# act math 25%
# act math 50%
# act math 75%
# act english 25%
# act english 50%
# act english 75%
# act reading 25%
# act reading 50%
# act reading 75%
# act science 25%
# act science 50%
# act science 75%
# act writing 25%
# act writing 50%
# act writing 75%

# average GPA
# top tenth percent
# top quarter percent
# top half percent
# bottom half percent
# bottom quarter percent

# application fee
# application fee waivable
# application due
# application priority date
# early action
# early action due date
# early decision
# early decision due date

# private or public
# in state tuition
# reciprocity tuition
# religious affiliation tuition
# states with reciprocity
# out of state tuition
# campus housing fees
# campus food fees
# books and supplies fees

# percent of first time freshmen awarded financial aid
# average aid package total
# average scholarship total
# average self help award total
# average need based loan award total

# aid types
# aid deadlines

Response 1: first_year_applied_male=8895 first_year_applied_female=11937 first_year_accepted_male=6857 first_year_accepted_female=9049 first_year_enrolled_male=1682 first_year_enrolled_female=2069
Response 1: class_rank=<AdmissionFactorWeightClass.important: 'important'> gpa=<AdmissionFactorWeightClass.very_important: 'very important'> standardized_test_score=<AdmissionFactorWeightClass.considered: 'considered'> essay=<AdmissionFactorWeightClass.not_considered: 'not considered'> interview=<AdmissionFactorWeightClass.not_considered: 'not considered'> extracurriculars=<AdmissionFactorWeightClass.not_considered: 'not considered'> first_gen=<AdmissionFactorWeightClass.not_considered: 'not considered'> alumni=<AdmissionFactorWeightClass.not_considered: 'not considered'> residence=<AdmissionFactorWeightClass.not_considered: 'not considered'> religion=<AdmissionFactorWeightClass.not_considered: 'not considered'>


AdmissionsFactorWeights(class_rank=<AdmissionFactorWeightClass.important: 'important'>, gpa=<AdmissionFactorWeightClass.very_important: 'very important'>, standardized_test_score=<AdmissionFactorWeightClass.considered: 'considered'>, essay=<AdmissionFactorWeightClass.not_considered: 'not considered'>, interview=<AdmissionFactorWeightClass.not_considered: 'not considered'>, extracurriculars=<AdmissionFactorWeightClass.not_considered: 'not considered'>, first_gen=<AdmissionFactorWeightClass.not_considered: 'not considered'>, alumni=<AdmissionFactorWeightClass.not_considered: 'not considered'>, residence=<AdmissionFactorWeightClass.not_considered: 'not considered'>, religion=<AdmissionFactorWeightClass.not_considered: 'not considered'>)