# In [None]:
import json
import random
import re
import urllib.parse
import urllib.request

try:
    import httplib  # Python 2 module name
except ImportError:
    import http.client as httplib  # same API under its Python 3 name

import gensim
import numpy as np
import pandas as pd

# NOTE(review): `tqdm` and `BeautifulSoup` (bs4) are used below but are not
# imported in this cell -- confirm an earlier notebook cell provides them.

# Load categories from JSON file
# JSON text is UTF-8; without an explicit encoding, open() falls back to the
# platform locale and can mis-decode non-ASCII category names.
with open("categories.json", "r", encoding="utf-8") as f:
    categories = json.load(f)

# Shuffle categories and select the first 20
random.shuffle(categories)
categories = categories[:20]

# Build URL for Toolforge API request
# One "category<i>=<name>" pair per selected category, percent-encoded and
# joined with "&" so the query string does not start with a stray "&".
category_params = "&".join(
    f"category{i}={urllib.parse.quote(cat.lower())}"
    for i, cat in enumerate(categories)
)
URL = (
    "https://randomincategory.toolforge.org/Random_page_in_category?"
    f"{category_params}"
    "&server=en.wikipedia.org&cmnamespace=0&cmtype=page&returntype="
)

# Load URLs from CSV file
# `on_bad_lines="skip"` (pandas >= 1.3) replaces `error_bad_lines=False`,
# which was removed in pandas 2.0: malformed rows are dropped, not raised.
# NOTE(review): read_csv treats the first row as a header -- confirm the file
# has one, otherwise the first link is consumed as a column name.
urls_df = pd.read_csv("./peoplelinks.csv", on_bad_lines="skip")

# Iterating `urls_df.values` yields whole rows (arrays), so the substring test
# has to be applied per cell. Non-string cells (e.g. NaN) are skipped.
urls = [
    cell
    for cell in urls_df.to_numpy().ravel()
    if isinstance(cell, str) and "wikipedia" in cell
]

# Load word2vec model and create a dictionary for caching results
model = gensim.models.KeyedVectors.load_word2vec_format(
    'GoogleNews-vectors-negative300.bin', binary=True
)

# Start from the cache file written at the end of a previous run, if there is
# one, so pages that were already summarized are not processed again. A
# missing, unreadable or non-dict cache simply means starting empty.
try:
    with open("cached_results.json", "r") as cache_file:
        cached_results = json.load(cache_file)
except (FileNotFoundError, json.JSONDecodeError):
    cached_results = {}
if not isinstance(cached_results, dict):
    cached_results = {}

# Iterate over URLs and summarize each page

# Sent with every request so the server can identify this script.
# NOTE(review): Wikimedia's User-Agent policy asks for a descriptive agent --
# replace with real project/contact details before running at scale.
_USER_AGENT = "wikipedia-page-summarizer/0.1 (research script)"


def _fetch_page_html(url, timeout=30):
    """Download *url* with a GET request and return the response body as bytes.

    Raises:
        ValueError: if *url* is not an http(s) URL.
        OSError: on network failures, timeouts and HTTP error statuses
            (``urllib.error.URLError``/``HTTPError`` are ``OSError`` subclasses).
    """
    # The links come from a CSV file; refuse schemes such as file:// or ftp://
    # that urlopen would otherwise happily open.
    if not url.lower().startswith(("http://", "https://")):
        raise ValueError(f"unsupported URL: {url!r}")
    request = urllib.request.Request(url, headers={"User-Agent": _USER_AGENT})
    with urllib.request.urlopen(request, timeout=timeout) as response:
        return response.read()


def _summarize_text(text, vectors, ratio=0.1):
    """Return an extractive summary made of the most central sentences of *text*.

    Each sentence is represented by the mean vector of its in-vocabulary
    words. Sentences are ranked by cosine similarity to the mean of all
    sentence vectors, and the top ``ratio`` share of them (at least one) is
    returned in original order, joined by single spaces.

    Args:
        text: Plain text to summarize.
        vectors: Word-to-vector lookup supporting ``word in vectors`` and
            ``vectors[word]`` (for example a gensim ``KeyedVectors``).
        ratio: Fraction of the scorable sentences to keep.

    Returns:
        The summary string; empty when no sentence contains a known word.
    """
    # Split after sentence-ending punctuation and on line breaks.
    pieces = re.split(r"(?<=[.!?])\s+|\n+", text)
    sentences = [piece.strip() for piece in pieces if piece.strip()]

    embedded = []  # (sentence position, mean word vector)
    for position, sentence in enumerate(sentences):
        word_vectors = [
            vectors[word]
            for word in re.findall(r"\w+", sentence)
            if word in vectors
        ]
        if word_vectors:
            embedded.append((position, np.mean(word_vectors, axis=0)))
    if not embedded:
        return ""

    centroid = np.mean([vector for _, vector in embedded], axis=0)
    centroid_norm = np.linalg.norm(centroid)

    def similarity(vector):
        # Cosine similarity to the centroid; 0.0 guards against zero vectors.
        scale = np.linalg.norm(vector) * centroid_norm
        return float(np.dot(vector, centroid) / scale) if scale else 0.0

    keep = max(1, int(len(embedded) * ratio))
    ranked = sorted(embedded, key=lambda item: similarity(item[1]), reverse=True)
    chosen = sorted(position for position, _ in ranked[:keep])
    return " ".join(sentences[position] for position in chosen)


for url in tqdm(urls):
    # Check if the URL is in the cache
    if url in cached_results:
        summary = cached_results[url]
        continue

    # Fetch the page. A single unreachable or malformed link is reported and
    # skipped so it does not abort the whole batch.
    try:
        page_html = _fetch_page_html(url)
    except (OSError, ValueError, httplib.HTTPException) as err:
        print(f"Skipping {url}: {err}")
        continue

    # Use BeautifulSoup to parse the HTML and extract the page text
    soup = BeautifulSoup(page_html, "html.parser")
    page_text = soup.get_text()

    # KeyedVectors has no summarize() method, so rank the sentences with the
    # word vectors themselves and keep the top 10%.
    summary = _summarize_text(page_text, model, ratio=0.1)

    # Cache the result
    cached_results[url] = summary

# Write the accumulated summaries to disk as a single JSON document
with open("cached_results.json", mode="w") as out_file:
    out_file.write(json.dumps(cached_results))
