In [1]:
import json
import os
from urllib.parse import quote

import pandas as pd
import requests
import spacy
from bs4 import BeautifulSoup

# Load spaCy's small English pipeline once, at module level, so every call to
# tokenize_text_spacy reuses the same model for entities, noun chunks and
# per-token attributes instead of reloading it.
nlp = spacy.load("en_core_web_sm")

def get_university_names(file_path, num_universities=1000):
    """Read university names from the third column of an Excel sheet.

    The first two rows of the sheet are skipped on load, so the third row
    becomes the column header. From the remaining data, rows at positions
    1 through ``num_universities`` (inclusive) are taken, i.e. data row 0 is
    left out as well.
    NOTE(review): presumably row 0 is a secondary header line -- confirm
    against the spreadsheet layout.

    Args:
        file_path: Path to the Excel workbook.
        num_universities: Maximum number of rows to take from the column.
            Empty cells are dropped afterwards, so fewer names may come back.

    Returns:
        A list of names converted to ``str`` with surrounding whitespace
        removed.
    """
    sheet = pd.read_excel(file_path, skiprows=2)
    # Positional slice: rows 1..num_universities of the third column.
    name_column = sheet.iloc[1:num_universities + 1, 2]
    raw_names = name_column.dropna().tolist()
    return [str(raw).strip() for raw in raw_names]

def get_first_paragraph_wikipedia(university_name, timeout=10):
    """Return the first non-empty paragraph of a university's Wikipedia page.

    Args:
        university_name: Page title as plain text, e.g. "University of Oxford".
        timeout: Seconds to wait for the HTTP request before giving up.
            A timed-out request is reported like any other request error.

    Returns:
        The stripped text of the first ``<p>`` element that is not empty or
        whitespace-only. On failure one of these placeholder strings is
        returned instead of raising:

        * ``"Valid paragraph not found"`` -- page loaded but had no usable
          paragraph;
        * ``"Page not found or error"`` -- any HTTP status other than 200;
        * ``"Request error: <details>"`` -- the request itself failed.
    """
    # Wikipedia titles use underscores for spaces. Everything else that is not
    # URL-safe (parentheses, '?', '#', '%', ...) is percent-encoded so that a
    # title can never be misread as a query string or fragment.
    safe_name = quote(university_name.replace(" ", "_"))
    url = f"https://en.wikipedia.org/wiki/{safe_name}"

    try:
        # Without a timeout a single stalled connection would block the
        # whole crawl indefinitely.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as e:
        return f"Request error: {e}"

    if response.status_code != 200:
        return "Page not found or error"

    soup = BeautifulSoup(response.text, 'html.parser')
    for paragraph in soup.find_all('p'):
        text = paragraph.text.strip()
        if text:
            return text
    return "Valid paragraph not found"

def tokenize_text_spacy(text):
    """Split *text* into entity, noun-chunk and leftover word tokens.

    The result is assembled in three passes over the spaCy parse:

    1. every named entity that contains no punctuation token;
    2. every noun chunk that does not lie inside a named entity and contains
       no punctuation token;
    3. every remaining token (neither punctuation nor whitespace) that is not
       part of any entity or noun chunk, with "bachelor" followed by "'s"
       joined into one token.

    Whether something is "covered" is decided by token position in the
    document rather than by substring matching on the text, so a short word
    such as "it" is not dropped merely because some entity (e.g. "United
    States") happens to contain the same letters.

    Args:
        text: Raw text to tokenize.

    Returns:
        A list of strings: entities first, then noun chunks, then leftover
        tokens in document order.
    """
    doc = nlp(text)
    # Materialize once: doc.noun_chunks is a generator and doc.ents builds a
    # new tuple on each access, so re-reading them per token is wasteful.
    entities = list(doc.ents)
    noun_chunks = list(doc.noun_chunks)

    # Pass 1: named entities without punctuation.
    tokens = [
        ent.text for ent in entities
        if not any(tok.is_punct for tok in ent)
    ]

    # Pass 2: noun chunks not contained in an entity's token range.
    for chunk in noun_chunks:
        inside_entity = any(
            ent.start <= chunk.start and chunk.end <= ent.end
            for ent in entities
        )
        if not inside_entity and not any(tok.is_punct for tok in chunk):
            tokens.append(chunk.text)

    # Token indices that belong to any entity or noun chunk. Spans rejected
    # above for containing punctuation still count as covering their tokens,
    # so those tokens are not emitted individually either.
    covered = set()
    for span in entities + noun_chunks:
        covered.update(range(span.start, span.end))

    # Pass 3: leftover tokens.
    len_doc = len(doc)
    i = 0
    while i < len_doc:
        token = doc[i]
        # spaCy splits "bachelor's" into "bachelor" + "'s"; re-join the pair.
        # This is emitted even when the pair sits inside an entity or chunk.
        if (
            i + 1 < len_doc
            and token.text.lower() == "bachelor"
            and doc[i + 1].text == "'s"
        ):
            tokens.append(token.text + doc[i + 1].text)
            i += 2  # The "'s" token has been consumed as well.
            continue
        if i not in covered and not token.is_punct and not token.is_space:
            tokens.append(token.text)
        i += 1
    return tokens

def save_corpus_individual_files(corpus, directory):
    """Tokenize each paragraph and write it to its own JSON file.

    Args:
        corpus: Mapping of university name to paragraph text.
        directory: Output directory; created (with parents) if missing.

    Each file is named after the university with every character other than
    letters, digits and spaces removed and runs of spaces turned into single
    underscores. It holds an object with the keys ``university`` and
    ``tokens``. The path of every written file is printed.
    """
    os.makedirs(directory, exist_ok=True)

    for university in corpus:
        token_list = tokenize_text_spacy(corpus[university])

        # Keep only letters, digits and spaces, then collapse whitespace
        # into underscores to get a filesystem-friendly name.
        kept = "".join(
            ch for ch in university
            if ch.isalpha() or ch.isdigit() or ch == ' '
        )
        filename = "_".join(kept.split()) + ".json"
        file_path = os.path.join(directory, filename)

        payload = {'university': university, 'tokens': token_list}
        with open(file_path, 'w', encoding='utf-8') as out_file:
            json.dump(payload, out_file, ensure_ascii=False, indent=4)
        print(f"Saved: {file_path}")

if __name__ == "__main__":
    # Input spreadsheet with the university names.
    file_path = '/COLX_523_zhiyang_yushun_huiyin_trang/2024 QS World University Rankings.xlsx'
    # Output directory; one JSON file of tokens is written per university.
    corpus_directory = '/COLX_523_zhiyang_yushun_huiyin_trang/tokenized_corpus'

    # Step 1: read the university names from the spreadsheet.
    university_names_list = get_university_names(file_path)

    # Step 2: fetch the first Wikipedia paragraph for every name, keyed by
    # the university name.
    corpus = {}
    for university in university_names_list:
        paragraph = get_first_paragraph_wikipedia(university)
        corpus[university] = paragraph

    # Step 3: tokenize each paragraph and save it as an individual file.
    save_corpus_individual_files(corpus, corpus_directory)

Saved: /Users/billchou/Desktop/COLX_523_zhiyang_yushun_huiyin_trang/tokenized_corpus/Massachusetts_Institute_of_Technology.json
Saved: /Users/billchou/Desktop/COLX_523_zhiyang_yushun_huiyin_trang/tokenized_corpus/University_of_Cambridge.json
Saved: /Users/billchou/Desktop/COLX_523_zhiyang_yushun_huiyin_trang/tokenized_corpus/University_of_Oxford.json
Saved: /Users/billchou/Desktop/COLX_523_zhiyang_yushun_huiyin_trang/tokenized_corpus/Harvard_University.json
Saved: /Users/billchou/Desktop/COLX_523_zhiyang_yushun_huiyin_trang/tokenized_corpus/Stanford_University.json
Saved: /Users/billchou/Desktop/COLX_523_zhiyang_yushun_huiyin_trang/tokenized_corpus/Imperial_College_London.json
Saved: /Users/billchou/Desktop/COLX_523_zhiyang_yushun_huiyin_trang/tokenized_corpus/ETH_Zurich.json
Saved: /Users/billchou/Desktop/COLX_523_zhiyang_yushun_huiyin_trang/tokenized_corpus/National_University_of_Singapore.json
Saved: /Users/billchou/Desktop/COLX_523_zhiyang_yushun_huiyin_trang/tokenized_corpus/Unive