In [2]:
import pandas as pd
import re
import requests
from bs4 import BeautifulSoup
import json
import os

def get_university_names(file_path, num_universities=1002):
    """
    Reads university names out of an Excel sheet and strips parenthetical notes.

    Parameters:
    - file_path (str): Location of the Excel workbook to read.
    - num_universities (int): Upper bound on how many rows are taken. Defaults to 1002.

    Returns:
    - list: Name strings with any "(...)" segment and surrounding whitespace removed.

    The workbook is loaded with its first two rows skipped. Names come from the
    column at position 2 (the third column), taking frame rows 1 through
    num_universities; frame row 0 is left out. Empty cells are dropped, so the
    result can hold fewer than num_universities entries.
    """
    # Matches a parenthesised note together with any whitespace in front of it
    parenthetical = re.compile(r"\s*\([^)]*\)")

    sheet = pd.read_excel(file_path, skiprows=2)
    # Positional slice: skip frame row 0, keep the next num_universities rows
    name_column = sheet.iloc[1:num_universities + 1, 2]

    cleaned_names = []
    for raw_name in name_column.dropna().tolist():
        # str() guards against non-text cells before the pattern is applied
        without_notes = parenthetical.sub("", str(raw_name))
        cleaned_names.append(without_notes.strip())
    return cleaned_names

def get_first_paragraph_wikipedia(university_name, timeout=10):
    """
    Retrieves the first non-empty paragraph from a university's Wikipedia page.

    Parameters:
    - university_name (str): The name of the university.
    - timeout (float): Seconds to wait for the HTTP request before giving up. Defaults to 10.

    Returns:
    - str: The stripped text of the first non-blank <p> element on the page, or one of
      the status strings "Valid paragraph not found", "Page not found or error",
      or "Request error: <details>".

    The name is turned into a Wikipedia title by joining its words with underscores and
    percent-encoding the result, so punctuation that is part of the title (apostrophes,
    commas, ampersands, hyphens) is preserved rather than deleted.
    The page is fetched over HTTP and parsed with BeautifulSoup.
    """
    # Local import keeps this function self-contained without touching the file's import block
    from urllib.parse import quote

    # split() collapses every kind of whitespace (including non-breaking spaces
    # that spreadsheets often contain) before joining with underscores
    title = "_".join(university_name.split())
    if not title:
        # A blank name would resolve to /wiki/ (the Main Page) and return unrelated text
        return "Page not found or error"
    url = f"https://en.wikipedia.org/wiki/{quote(title)}"

    try:
        # Without a timeout a single stalled connection blocks the caller indefinitely
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as e:
        return f"Request error: {e}"

    if response.status_code != 200:
        return "Page not found or error"

    soup = BeautifulSoup(response.text, 'html.parser')
    for paragraph in soup.find_all('p'):
        text = paragraph.text.strip()
        # Skip <p> elements that are empty or whitespace-only
        if text:
            return text
    return "Valid paragraph not found"

def save_corpus_individual_files(corpus, directory):
    """
    Writes one JSON file per corpus entry into the given directory.

    Parameters:
    - corpus (dict): Maps each university name to its paragraph text.
    - directory (str): Folder that receives the JSON files; created (with parents) if missing.

    Each file is named after the university with every character that is neither a
    word character nor whitespace removed and spaces turned into underscores, plus
    a ".json" suffix. The file holds a single-key object {name: paragraph}, written
    as UTF-8 with a 4-space indent and non-ASCII characters left unescaped.
    A "Saved: <path>" line is printed for every file written.
    """
    os.makedirs(directory, exist_ok=True)

    for university in corpus:
        # Build a filesystem-friendly stem from the university name
        stem = re.sub(r'[^\w\s]', '', university)
        file_path = os.path.join(directory, stem.replace(" ", "_") + ".json")

        record = {university: corpus[university]}
        with open(file_path, 'w', encoding='utf-8') as handle:
            json.dump(record, handle, ensure_ascii=False, indent=4)

        print(f"Saved: {file_path}")

# Script entry point: read the ranking sheet, fetch one paragraph per university, write JSON files
if __name__ == "__main__":
    # Input workbook and output folder
    file_path = '/COLX_523_zhiyang_yushun_huiyin_trang/2024 QS World University Rankings.xlsx'
    corpus_directory = '/COLX_523_zhiyang_yushun_huiyin_trang/complete_university_corpus'

    # Cleaned university names taken from the workbook
    university_names_list = get_university_names(file_path)

    # Map every name to whatever the Wikipedia lookup returns for it
    # (a paragraph or one of the lookup's status strings)
    corpus = {
        university: get_first_paragraph_wikipedia(university)
        for university in university_names_list
    }

    # One JSON file per university
    save_corpus_individual_files(corpus, corpus_directory)

Saved: /Users/billchou/Desktop/COLX_523_zhiyang_yushun_huiyin_trang/complete_university_corpus/Massachusetts_Institute_of_Technology.json
Saved: /Users/billchou/Desktop/COLX_523_zhiyang_yushun_huiyin_trang/complete_university_corpus/University_of_Cambridge.json
Saved: /Users/billchou/Desktop/COLX_523_zhiyang_yushun_huiyin_trang/complete_university_corpus/University_of_Oxford.json
Saved: /Users/billchou/Desktop/COLX_523_zhiyang_yushun_huiyin_trang/complete_university_corpus/Harvard_University.json
Saved: /Users/billchou/Desktop/COLX_523_zhiyang_yushun_huiyin_trang/complete_university_corpus/Stanford_University.json
Saved: /Users/billchou/Desktop/COLX_523_zhiyang_yushun_huiyin_trang/complete_university_corpus/Imperial_College_London.json
Saved: /Users/billchou/Desktop/COLX_523_zhiyang_yushun_huiyin_trang/complete_university_corpus/ETH_Zurich.json
Saved: /Users/billchou/Desktop/COLX_523_zhiyang_yushun_huiyin_trang/complete_university_corpus/National_University_of_Singapore.json
Saved: /Us