In [1]:
import pandas as pd
from bs4 import BeautifulSoup

Process WSJ Info

In [2]:
def html_to_df(dir_path, name_convention, college_name_class):
    """Collect college names from a numbered series of saved pages.

    Reads ``{dir_path}/{name_convention}1.xml``, ``...2.xml``, and so on,
    stopping at the first page number whose file does not exist.

    Args:
        dir_path: Directory that holds the saved pages.
        name_convention: File-name prefix that precedes the 1-based page number.
        college_name_class: CSS class of the ``<p>`` elements whose text is a
            college name.

    Returns:
        A DataFrame with a single 'College Name' column, in page order and
        then document order. It is empty when page 1 does not exist.
    """
    all_colleges = []
    file_index = 1

    while True:
        full_path = f"{dir_path}/{name_convention}{file_index}.xml"

        try:
            # Explicit encoding: the platform default (e.g. cp1252 on Windows)
            # would mangle accented names.
            # NOTE(review): assumes the saved pages are UTF-8 - confirm.
            with open(full_path, 'r', encoding='utf-8') as file:
                file_content = file.read()
        except FileNotFoundError:
            # A missing page number marks the end of the series.
            break

        # The pages are parsed leniently with the HTML parser.
        soup = BeautifulSoup(file_content, 'html.parser')
        colleges = soup.find_all("p", class_=college_name_class)
        all_colleges.extend(name.text.strip() for name in colleges)

        file_index += 1

    return pd.DataFrame(all_colleges, columns=['College Name'])

In [None]:
# Where the saved WSJ pages live, how their files are named, and the CSS
# class of the <p> elements that hold each college name.
dir_path = "Data/WSJ2025"
name_convention = "WSJpage"
college_name_class = "css-95l4o0"

# The 1-based row position is stored as the WSJ rank.
# NOTE(review): presumably the pages list colleges in rank order - confirm.
df = html_to_df(
    dir_path=dir_path,
    name_convention=name_convention,
    college_name_class=college_name_class,
)
df['WSJ Rank'] = df.index + 1

Sets up Gemini

In [2]:
import os
from dotenv import load_dotenv
import google.generativeai as genai
from google.ai.generativelanguage_v1beta.types import content
import json
import time

# Load environment variables from .env file
load_dotenv()

# Get the API key (None when the variable is not set)
GEMINI_API_KEY = os.getenv('GEMINI_API_KEY')

# An empty or whitespace-only value is as unusable as a missing one, so both
# are rejected here instead of failing later inside the API client.
if GEMINI_API_KEY is None or not GEMINI_API_KEY.strip():
    raise ValueError("GEMINI_API_KEY not found in .env file")

genai.configure(api_key=GEMINI_API_KEY)

Program to retrieve acceptance rate and save to pickle

In [5]:
# Create the model
acceptance_rate_generation_config = {
  "temperature": 0,
  "top_p": 0.95,
  "top_k": 40,
  "max_output_tokens": 8192,
  "response_schema": content.Schema(
    type = content.Type.OBJECT,
    properties = {
      "acceptance rate": content.Schema(
        type = content.Type.STRING,
      ),
      "confidence": content.Schema(
        type = content.Type.STRING,
      ),
    },
  ),
  "response_mime_type": "application/json",
}

acceptance_rate_model = genai.GenerativeModel(
  model_name="gemini-2.0-flash-exp",
  generation_config=gacceptance_rate_generation_config,
  system_instruction="Seartch the web for the acceptance rate in 2024 for the provided university undergraduate name. Provide the response as a JSON.",
)

acceptance_rate_chat_session = acceptance_rate_model.start_chat() 

In [6]:
def get_acceptance_rate(college_name):
    """Ask the acceptance-rate chat session about one college.

    Args:
        college_name: Text sent as the chat message.

    Returns:
        The acceptance rate as a fraction (``0.25`` for a reply of "25%"),
        or ``None`` when the reply is not JSON, is not a JSON object, lacks
        the 'acceptance rate' key, or holds a value that is not a numeric
        string.
    """
    # NOTE(review): every college goes through the same chat session;
    # presumably earlier turns stay in its history - confirm that is intended.
    response = acceptance_rate_chat_session.send_message(college_name)

    try:
        result = json.loads(response.text)

        # Tolerate surrounding whitespace and an optional percent sign,
        # e.g. "25%", "25" or " 25 % ".
        rate_str = result['acceptance rate'].strip().strip('%').strip()

        # Percent -> fraction
        return float(rate_str) / 100
    except (KeyError, ValueError, TypeError, AttributeError):
        # ValueError also covers json.JSONDecodeError. TypeError and
        # AttributeError cover a reply that is not a JSON object, or a value
        # that is null or not a string.
        return None
    
    # Add a new column 'Acceptance Rate' by applying the function to each college name with rate limiting
def get_acceptance_rates_with_limit(df, batch_size=8, pause_seconds=60):
    """Look up the acceptance rate for every college in ``df``, in batches.

    Args:
        df: DataFrame with a 'College Name' column.
        batch_size: Number of requests sent before each pause.
        pause_seconds: Length of the pause, in seconds, after each full batch.

    Returns:
        A list with one entry per row, in row order, holding whatever
        ``get_acceptance_rate`` returned for that college.
    """
    rates = []
    for position, college in enumerate(df['College Name'], start=1):
        rate = get_acceptance_rate(college)
        rates.append(rate)

        print (college,  " acceptance rate is ", rate)

        # Pause after every `batch_size` requests to stay under the rate limit.
        if position % batch_size == 0:
            time.sleep(pause_seconds)

    return rates


In [None]:
# Apply the rate-limited function
df['Acceptance Rate'] = get_acceptance_rates_with_limit(df)
df.to_pickle("Data/ar_df.pkl")

Program to retrieve lacrosse information and save to pickle

In [3]:
# NOTE(review): the acceptance-rate section writes "Data/ar_df.pkl", but this
# cell reads "Data/my_df.pkl"; no visible cell creates that file - confirm
# which file is intended.
df = pd.read_pickle("Data/my_df.pkl")

In [4]:

# Generation settings for the lacrosse lookup: temperature 0 and a JSON reply
# with a boolean "has_lacrosse" and string "division" / "conference" fields.
generation_config_lacrosse = {
  "temperature": 0,
  "top_p": 0.95,
  "top_k": 40,
  "max_output_tokens": 8192,
  "response_schema": content.Schema(
    type = content.Type.OBJECT,
    properties = {
      "has_lacrosse": content.Schema(
        type = content.Type.BOOLEAN,
      ),
      "division": content.Schema(
        type = content.Type.STRING,
      ),
      "conference": content.Schema(
        type = content.Type.STRING,
      ),
    },
  ),
  "response_mime_type": "application/json",
}

# The system instruction is sent to the model as-is, so its spelling matters.
lacrosse_model = genai.GenerativeModel(
  model_name="gemini-2.0-flash-exp",
  generation_config=generation_config_lacrosse,
  system_instruction="Search the web to determine if the school provided has a women's lacrosse program.  If it does, provide the NCAA division and conference.",
)

# One chat session is shared by every lookup made through this model.
lacrosse_chat_session = lacrosse_model.start_chat()


In [17]:
def get_lacrosse_info(college_name):
    """Ask the lacrosse chat session about one college.

    Args:
        college_name: Text sent as the chat message.

    Returns:
        A one-row DataFrame with columns 'Has Lacrosse', 'Division' and
        'Conference'. Fields missing from the reply default to ``False``,
        ``''`` and ``''``; the same defaults fill the whole row when the reply
        is not JSON or is not a JSON object.
    """
    response = lacrosse_chat_session.send_message(college_name)

    try:
        result = json.loads(response.text)
    except (KeyError, ValueError, json.JSONDecodeError):
        result = None

    # A valid JSON reply that is not an object (list, null, number, ...) has
    # no fields to read, so it is treated like an unparsable reply.
    if not isinstance(result, dict):
        result = {}

    return pd.DataFrame({
        'Has Lacrosse': [result.get('has_lacrosse', False)],
        'Division': [result.get('division', '')],
        'Conference': [result.get('conference', '')]
    })
    
def get_lacrosse_info_with_limit(df, request_quota=10):
    """Look up lacrosse info for every college in ``df``, pausing after each request.

    Args:
        df: DataFrame with a 'College Name' column.
        request_quota: Allowed requests per minute; the pause after each
            request is ``60 / request_quota`` seconds.

    Returns:
        A DataFrame with a fresh 0..n-1 index that stacks, in row order, the
        frames returned by ``get_lacrosse_info``. It is an empty DataFrame
        when ``df`` has no rows.
    """
    sleep_time = 60 / request_quota

    # Collect the per-college frames and concatenate once at the end, rather
    # than re-copying a growing frame on every iteration.
    frames = []
    for college in df['College Name']:
        info_df = get_lacrosse_info(college)
        frames.append(info_df)

        print(f"{college} - Has lacrosse: {info_df['Has Lacrosse'].iloc[0]}, Division: {info_df['Division'].iloc[0]}, Conference: {info_df['Conference'].iloc[0]}")

        # Pause after every request to avoid hitting the rate limit.
        time.sleep(sleep_time)

    # pd.concat rejects an empty list, so the no-rows case is handled here.
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

In [18]:
# Apply the rate-limited function and add the columns to df
lacrosse_info = get_lacrosse_info_with_limit(df)
df[['Has Lacrosse', 'Division', 'Conference']] = lacrosse_info
df.to_pickle("Data/lac_df.pkl")

Princeton University - Has lacrosse: True, Division: NCAA Division I, Conference: Ivy League
Babson College - Has lacrosse: True, Division: NCAA Division III, Conference: New England Women's and Men's Athletic Conference
Stanford University - Has lacrosse: True, Division: NCAA Division I, Conference: Pac-12 Conference
Yale University - Has lacrosse: True, Division: NCAA Division I, Conference: Ivy League
Claremont McKenna College - Has lacrosse: True, Division: NCAA Division III, Conference: Southern California Intercollegiate Athletic Conference
Massachusetts Institute of Technology - Has lacrosse: True, Division: NCAA Division III, Conference: New England Women's and Men's Athletic Conference
Harvard University - Has lacrosse: True, Division: NCAA Division I, Conference: Ivy League
University of California, Berkeley - Has lacrosse: True, Division: NCAA Division I, Conference: Pac-12 Conference
Georgia Institute of Technology, Main Campus - Has lacrosse: True, Division: NCAA Division 

US News Rank   

In [3]:
# Reload the frame that the lacrosse section saved to "Data/lac_df.pkl".
df = pd.read_pickle("Data/lac_df.pkl")

In [7]:
# Generation settings for the US News lookup: a single candidate, returned as
# JSON with one string field named "US News Rank".
generation_config_usnews = dict(
    temperature=0,
    top_p=0.95,
    top_k=40,
    max_output_tokens=8192,
    response_schema=content.Schema(
        type=content.Type.OBJECT,
        properties={
            "US News Rank": content.Schema(type=content.Type.STRING),
        },
    ),
    response_mime_type="application/json",
    candidate_count=1,
)

# Model configured with the settings above and a rank-lookup system prompt.
usnews_model = genai.GenerativeModel(
    system_instruction="Search the web to determine the national ranking of the college according to US News Overall Score rank. Do not consider other rankings like in National Liberal Arts schools",
    generation_config=generation_config_usnews,
    model_name="gemini-2.0-flash-exp",
)

# One chat session is shared by every lookup made through this model.
usnews_chat_session = usnews_model.start_chat()


In [8]:
def get_usnews_info(college_name):
    """Ask the US News chat session for one college's rank.

    Args:
        college_name: Text sent as the chat message.

    Returns:
        The 'US News Rank' value from the reply as a string, with surrounding
        whitespace and any '%' characters at either end removed. Returns
        ``None`` when the reply is not JSON, is not a JSON object, lacks the
        key, or holds a value that is not a string.
    """
    response = usnews_chat_session.send_message(college_name)

    try:
        result = json.loads(response.text)

        # The '%' stripping is kept so existing results do not change;
        # surrounding whitespace is removed as well.
        return result['US News Rank'].strip().strip('%')

    except (KeyError, ValueError, TypeError, AttributeError):
        # ValueError also covers json.JSONDecodeError. TypeError and
        # AttributeError cover a reply that is not a JSON object, or a value
        # that is null or not a string.
        return None
    
def get_usnews_info_with_limit(df):
    """Fetch the US News rank for each college in ``df``, pausing after each request.

    Args:
        df: DataFrame with a 'College Name' column.

    Returns:
        A list with one entry per row, in row order, holding whatever
        ``get_usnews_info`` returned for that college.
    """
    requests_per_minute = 10
    pause = 60 / requests_per_minute

    ranks = []
    for college in df['College Name']:
        rank = get_usnews_info(college)
        ranks.append(rank)
        print(f"{college} rank is {rank}")

        # Wait after each request so the per-minute quota is not exceeded.
        time.sleep(pause)

    return ranks

In [9]:
# One lookup per row of df; the results go into a new 'US News Rank' column.
df['US News Rank'] = get_usnews_info_with_limit(df)
# Save the augmented frame to disk.
df.to_pickle("Data/un_df.pkl")

Princeton University rank is 1
Babson College rank is 34
Stanford University rank is 3
Yale University rank is 5
Claremont McKenna College rank is 8
Massachusetts Institute of Technology rank is 2
Harvard University rank is 3
University of California, Berkeley rank is 15
Georgia Institute of Technology, Main Campus rank is 33
Davidson College rank is 17
Bentley University rank is 68
University of California, Davis rank is 28
University of Pennsylvania rank is 7
Columbia University rank is 12
Lehigh University rank is 51
San José State University rank is 187
University of Notre Dame rank is 20
University of California, Merced rank is 97
Virginia Tech rank is 47
Harvey Mudd College rank is 36
California State Polytechnic University, Pomona rank is 151
University of Michigan, Ann Arbor rank is 21
Loyola University Maryland rank is 115
California State University, Stanislaus rank is 187
Colgate University rank is 25
University of Delaware rank is 76
Cornell University rank is 12
Texas A & 

Data Exploration

In [10]:
# Filter df to only include schools with lacrosse programs
lacrosse_schools = df[df['Has Lacrosse'] == True]
print(f"Number of schools with lacrosse programs: {len(lacrosse_schools)}")
display(lacrosse_schools)

Number of schools with lacrosse programs: 167


Unnamed: 0,College Name,Acceptance Rate,WSJ Rank,Has Lacrosse,Division,Conference,US News Rank
0,Princeton University,0.057,1,True,NCAA Division I,Ivy League,1
1,Babson College,0.250,2,True,NCAA Division III,New England Women's and Men's Athletic Conference,34
2,Stanford University,0.039,3,True,NCAA Division I,Pac-12 Conference,3
3,Yale University,0.045,4,True,NCAA Division I,Ivy League,5
4,Claremont McKenna College,0.100,5,True,NCAA Division III,Southern California Intercollegiate Athletic C...,8
...,...,...,...,...,...,...,...
195,Temple University,0.710,196,True,NCAA Division I,American Athletic Conference,89
196,St. Lawrence University,0.500,197,True,NCAA Division III,Liberty League,67
197,"University at Buffalo, SUNY",0.710,198,True,NCAA Division I,Mid-American Conference,89
198,Florida State University,0.250,199,True,NCAA Division I,Atlantic Coast Conference,53


In [11]:
# Minimum acceptance rate (as a fraction) a school must have to be kept.
# The printed message is derived from this constant so the filter and the
# text cannot drift apart when the threshold is tuned.
MIN_ACCEPTANCE_RATE = 0.07
filtered_schools = lacrosse_schools[lacrosse_schools['Acceptance Rate'] >= MIN_ACCEPTANCE_RATE]
print(f"Number of schools with lacrosse programs and acceptance rate >= {MIN_ACCEPTANCE_RATE:.0%}: {len(filtered_schools)}")
display(filtered_schools)

Number of schools with lacrosse programs and acceptance rate >= 5%: 154


Unnamed: 0,College Name,Acceptance Rate,WSJ Rank,Has Lacrosse,Division,Conference,US News Rank
1,Babson College,0.250,2,True,NCAA Division III,New England Women's and Men's Athletic Conference,34
4,Claremont McKenna College,0.100,5,True,NCAA Division III,Southern California Intercollegiate Athletic C...,8
7,"University of California, Berkeley",0.113,8,True,NCAA Division I,Pac-12 Conference,15
8,"Georgia Institute of Technology, Main Campus",0.160,9,True,NCAA Division I,Atlantic Coast Conference,33
9,Davidson College,0.130,10,True,NCAA Division I,Atlantic 10 Conference,17
...,...,...,...,...,...,...,...
195,Temple University,0.710,196,True,NCAA Division I,American Athletic Conference,89
196,St. Lawrence University,0.500,197,True,NCAA Division III,Liberty League,67
197,"University at Buffalo, SUNY",0.710,198,True,NCAA Division I,Mid-American Conference,89
198,Florida State University,0.250,199,True,NCAA Division I,Atlantic Coast Conference,53


In [13]:
# Keep only NCAA Division II and Division III schools (Division I is excluded)
d2_d3_schools = filtered_schools[filtered_schools['Division'].isin(['NCAA Division II', 'NCAA Division III'])]
print(f"Number of Division II and III schools with lacrosse programs and acceptance rate >= 7%: {len(d2_d3_schools)}")
display(d2_d3_schools)

Number of Division II and III schools with lacrosse programs and acceptance rate >= 7%: 52


Unnamed: 0,College Name,Acceptance Rate,WSJ Rank,Has Lacrosse,Division,Conference,US News Rank
1,Babson College,0.25,2,True,NCAA Division III,New England Women's and Men's Athletic Conference,34
4,Claremont McKenna College,0.1,5,True,NCAA Division III,Southern California Intercollegiate Athletic C...,8
10,Bentley University,0.45,11,True,NCAA Division II,Northeast-10 Conference,68
19,Harvey Mudd College,0.13,20,True,NCAA Division III,Southern California Intercollegiate Athletic C...,36
31,Washington University in St. Louis,0.11,32,True,NCAA Division III,University Athletic Association,24
34,Swarthmore College,0.08,35,True,NCAA Division III,Centennial Conference,4
36,Augustana College,0.68,37,True,NCAA Division III,College Conference of Illinois and Wisconsin,105
49,Washington and Lee University,0.16,50,True,NCAA Division III,Old Dominion Athletic Conference,17
53,Albion College,0.68,54,True,NCAA Division III,Michigan Intercollegiate Athletic Association,127
55,Carnegie Mellon University,0.11,56,True,NCAA Division III,University Athletic Association,22


In [28]:
# Show every row and column for this one display only. option_context puts the
# previous option values back on exit, even if display() raises, whereas
# reset_option would revert to the pandas defaults.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(d2_d3_schools)

Unnamed: 0,College Name,Acceptance Rate,WSJ Rank,Has Lacrosse,Division,Conference
1,Babson College,0.25,2,True,NCAA Division III,New England Women's and Men's Athletic Conference
4,Claremont McKenna College,0.1,5,True,NCAA Division III,Southern California Intercollegiate Athletic C...
10,Bentley University,0.45,11,True,NCAA Division II,Northeast-10 Conference
19,Harvey Mudd College,0.13,20,True,NCAA Division III,Southern California Intercollegiate Athletic C...
31,Washington University in St. Louis,0.11,32,True,NCAA Division III,University Athletic Association
34,Swarthmore College,0.08,35,True,NCAA Division III,Centennial Conference
36,Augustana College,0.68,37,True,NCAA Division III,College Conference of Illinois and Wisconsin
49,Washington and Lee University,0.16,50,True,NCAA Division III,Old Dominion Athletic Conference
53,Albion College,0.68,54,True,NCAA Division III,Michigan Intercollegiate Athletic Association
55,Carnegie Mellon University,0.11,56,True,NCAA Division III,University Athletic Association


In [14]:
# Show the working frame; as the cell's last expression it is rendered as output.
df

Unnamed: 0,College Name,Acceptance Rate,WSJ Rank,Has Lacrosse,Division,Conference,US News Rank
0,Princeton University,0.057,1,True,NCAA Division I,Ivy League,1
1,Babson College,0.250,2,True,NCAA Division III,New England Women's and Men's Athletic Conference,34
2,Stanford University,0.039,3,True,NCAA Division I,Pac-12 Conference,3
3,Yale University,0.045,4,True,NCAA Division I,Ivy League,5
4,Claremont McKenna College,0.100,5,True,NCAA Division III,Southern California Intercollegiate Athletic C...,8
...,...,...,...,...,...,...,...
195,Temple University,0.710,196,True,NCAA Division I,American Athletic Conference,89
196,St. Lawrence University,0.500,197,True,NCAA Division III,Liberty League,67
197,"University at Buffalo, SUNY",0.710,198,True,NCAA Division I,Mid-American Conference,89
198,Florida State University,0.250,199,True,NCAA Division I,Atlantic Coast Conference,53
