In [None]:
import json

# Load the paper data that the pipeline below consumes (main(llm_papers)).
# JSON text is UTF-8; decode it explicitly so non-ASCII names are not mangled
# on platforms whose default locale encoding is something else.
with open("llm_papers.json", encoding="utf-8") as json_file:
    llm_papers = json.load(json_file)

In [None]:
import collections
from typing import Dict, Tuple, Optional
from geopy.geocoders import Nominatim, GoogleV3
from geopy.exc import GeocoderTimedOut, GeocoderServiceError
import time
import json
import os
import folium
from folium.plugins import MarkerCluster
import urllib.request
import urllib.parse
from bs4 import BeautifulSoup
import ollama
import pickle
from tqdm import tqdm

# Seconds geocode_with_fallback sleeps after each uncached lookup attempt.
RATE_LIMIT = 1.1
# NOTE(review): the contact part of the user agent looks truncated — confirm
# it carries a real contact address before running against Nominatim.
USER_AGENT = "geocode_cache_llm_papers/1.0 (@gmail.com)"
# JSON file holding the per-institution geocoding cache.
CACHE_FILE = "geocode_cache_llm_papers.json"
# Output path of the rendered folium map.
MAP_FILE = "papers_map.html"

geolocator = Nominatim(user_agent=USER_AGENT)
# Take the Google key from the environment instead of hard-coding it in the
# notebook; the fallback value keeps the previous behaviour when it is unset.
google_geolocator = GoogleV3(api_key=os.environ.get("GOOGLE_API_KEY", ""))


def load_cache(path: Optional[str] = None) -> Dict:
    """Read the geocoding cache from disk.

    Args:
        path: JSON file to read. Defaults to ``CACHE_FILE`` when omitted.

    Returns:
        The decoded cache dictionary, or an empty dict when the file does
        not exist yet.
    """
    cache_path = CACHE_FILE if path is None else path
    try:
        # Open directly instead of exists()-then-open so there is no window in
        # which the file can disappear between the check and the read.
        with open(cache_path, "r", encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        return {}


def save_cache(cache: Dict, path: Optional[str] = None) -> None:
    """Write the geocoding cache to disk as indented JSON.

    The data is serialised to a temporary sibling file and then moved over
    the target, so an interrupt or a serialisation error never leaves a
    truncated cache behind.

    Args:
        cache: Cache dictionary to persist.
        path: Destination file. Defaults to ``CACHE_FILE`` when omitted.

    Raises:
        TypeError: If ``cache`` contains values JSON cannot encode.
        OSError: If the file cannot be written or replaced.
    """
    cache_path = CACHE_FILE if path is None else path
    tmp_path = f"{cache_path}.tmp"
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(cache, f, indent=4)
        os.replace(tmp_path, cache_path)
    except BaseException:
        # Drop the partial temp file; the previous cache file stays intact.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise


def geocode_with_fallback(
    query: str, cache: Dict, address: Optional[str] = None
) -> Tuple[Optional[Tuple[float, float]], Optional[str]]:
    """Resolve ``query`` to coordinates, consulting and updating ``cache``.

    When ``address`` is given it is geocoded with Nominatim first and Google
    second. Otherwise ``query`` itself is geocoded with Nominatim and the hit
    is checked with ``verify_location``; if the check fails, an address is
    looked up with ``find_correct_address`` and geocoded instead.

    Args:
        query: Institution name; also the cache key.
        cache: Mutable cache mapping query -> {"address", "location"}.
        address: Optional explicit address to geocode instead of ``query``.

    Returns:
        ``(location, address)`` as stored in the cache entry; either may be
        ``None``.
    """
    entry = cache.setdefault(query, {"address": None, "location": None})

    if entry["location"] is None:
        try:
            if address:
                location = geolocator.geocode(address) or google_geolocator.geocode(
                    address
                )
            else:
                location = geolocator.geocode(query)
                if location is not None and not verify_location(
                    query, (location.latitude, location.longitude)
                ):
                    print(
                        f"Location verification failed for {query}. Searching for correct address..."
                    )
                    # Retry inline rather than recursing: the recursive call
                    # returned a plain tuple that was then treated as a geopy
                    # Location, and an empty address recursed without bound.
                    address = find_correct_address(query)
                    location = None
                    if address:
                        location = geolocator.geocode(
                            address
                        ) or google_geolocator.geocode(address)

            if location:
                entry["location"] = (location.latitude, location.longitude)
                if address:
                    entry["address"] = address
            else:
                entry["location"] = None

            save_cache(cache)
        except (GeocoderTimedOut, GeocoderServiceError, AttributeError) as e:
            print(f"Error geocoding {query}: {e}")
        # Pause after every uncached attempt, successful or not.
        time.sleep(RATE_LIMIT)

    return entry["location"], entry["address"]


def search_google(query: str, timeout: float = 10.0) -> str:
    """Fetch a Google results page for ``query`` and return its visible text.

    Args:
        query: Search terms; URL-quoted before being placed in the URL.
        timeout: Seconds to wait for the HTTP request before giving up, so a
            stalled connection cannot hang the whole run.

    Returns:
        The page text with everything up to the "g.co/privacytools" marker
        removed and lines starting with URL or navigation prefixes dropped.
        On any failure a string of the form "An error occurred: ..." is
        returned instead of raising.
    """
    encoded_query = urllib.parse.quote(query)
    url = f"https://www.google.com/search?q={encoded_query}"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    req = urllib.request.Request(url, headers=headers)

    try:
        with urllib.request.urlopen(req, timeout=timeout) as response:
            html = response.read()
        soup = BeautifulSoup(html, "html.parser")
        text_content = soup.get_text(separator=" ", strip=True)
        # Skip everything up to and including the privacy-tools marker.
        kw = "g.co/privacytools"
        privacy_tools_index = text_content.find(kw)
        if privacy_tools_index != -1:
            text_content = text_content[privacy_tools_index + len(kw) :]
        lines = [line.strip() for line in text_content.split("\n") if line.strip()]
        filtered_lines = [
            line
            for line in lines
            if not line.startswith(
                ("https://", "http://", "www.", "Images", "Videos", "Maps", "News")
            )
        ]
        clean_text = " ".join(filtered_lines)
        return clean_text
    except Exception as e:
        # Best effort: callers receive the error text in place of results.
        return f"An error occurred: {str(e)}"


def verify_location(query: str, location: Tuple[float, float]) -> bool:
    """Ask the LLM whether the reverse-geocoded address matches ``query``.

    Args:
        query: The text that was geocoded.
        location: ``(latitude, longitude)`` to reverse-geocode and check.

    Returns:
        ``True`` when the model's reply begins with "yes" (ignoring case and
        surrounding punctuation or markdown), ``False`` otherwise or when
        reverse geocoding fails.
    """
    try:
        reverse_location = geolocator.reverse(f"{location[0]}, {location[1]}")
        prompt = f"""Verify if the following address matches the query. Respond with 'yes' or 'no' only.
        Query: {query}
        Address: {str(reverse_location)}
        """
        response = ollama.generate(model="gemma2", prompt=prompt)["response"]
        # Models often answer "Yes." or "**Yes**"; judge only the first word
        # after trimming punctuation instead of demanding an exact "yes".
        words = response.strip().lower().split()
        verdict = words[0].strip(".,;:!*'\"`") if words else ""
        return verdict == "yes"
    except (GeocoderTimedOut, GeocoderServiceError) as e:
        print(f"Error verifying location for {query}: {e}")
        return False


def find_correct_address(query: str) -> str:
    """Search the web for ``query`` and let the LLM extract an address.

    Args:
        query: Text to look up; " address" is appended for the search.

    Returns:
        The model's reply with newlines removed and outer whitespace stripped.
    """
    search_results = search_google(f"{query} address")
    llm_prompt = f"""Find the address for the query from the search output below. Be concise, only output the most important parts of the address. 
    Query: {query}
    Search results: {search_results}
    """
    reply = ollama.generate(model="gemma2", prompt=llm_prompt)
    single_line = reply["response"].replace("\n", "")
    return single_line.strip()


def process_single_institution(inst: str, cache: Dict) -> Optional[Tuple[float, float]]:
    """Geocode one institution, retrying once with a looked-up address.

    Args:
        inst: Institution name used as the geocoding query.
        cache: Geocoding cache passed through to ``geocode_with_fallback``.

    Returns:
        The location from ``geocode_with_fallback``, or a falsy value when
        both attempts fail.
    """
    coords, _ = geocode_with_fallback(inst, cache)
    if coords:
        return coords

    # First attempt found nothing: look up an address and try again with it.
    print(f"Location not found for {inst}. Searching for correct address...")
    coords, _ = geocode_with_fallback(inst, cache, find_correct_address(inst))
    return coords


def process_single_paper(paper_id: str, authors_data: Dict, cache: Dict) -> Dict:
    """Geocode every institution listed for one paper.

    Args:
        paper_id: Key under which located institutions are stored.
        authors_data: Mapping whose values are lists of institution names.
        cache: Geocoding cache passed through to the institution lookup.

    Returns:
        ``{"papers": defaultdict(list), "unknown": set}``. ``papers`` gets a
        ``paper_id`` entry only when at least one institution was located;
        ``unknown`` holds the names that were not.
    """
    located = collections.defaultdict(list)
    unresolved = set()

    for affiliations in authors_data.values():
        for affiliation in affiliations:
            coords = process_single_institution(affiliation, cache)
            if not coords:
                unresolved.add(affiliation)
                continue
            located[paper_id].append({"name": affiliation, "location": coords})

    return {"papers": located, "unknown": unresolved}


def create_map(processed: Dict) -> "folium.Map":
    """Render one clustered marker per located institution and save the map.

    Args:
        processed: Dict whose ``"papers"`` entry maps a paper id to a list of
            ``{"name", "location"}`` dicts.

    Returns:
        The folium map, after it has been written to ``MAP_FILE``.
    """
    # Local import keeps this block self-contained; html is standard library.
    import html

    m = folium.Map(location=[0, 0], zoom_start=2)
    marker_cluster = MarkerCluster().add_to(m)

    for submission_id, locations in processed["papers"].items():
        # Names and ids come from external data and are placed into HTML, so
        # escape them ("A&M", "<...>") to keep the popup markup well-formed.
        safe_id = html.escape(str(submission_id))
        for loc in locations:
            safe_name = html.escape(str(loc["name"]))
            folium.Marker(
                loc["location"],
                popup=f"<b>{safe_name}</b><br>Paper: {safe_id}",
                tooltip=f"Paper: {safe_id}",
            ).add_to(marker_cluster)

    m.save(MAP_FILE)
    return m


def print_summary(processed: Dict) -> None:
    """Print paper, mapped-location and unknown-location counts.

    Args:
        processed: Dict with a ``"papers"`` mapping of paper id -> list of
            locations and an ``"unknown"`` collection.
    """
    papers = processed["papers"]
    print(f"Papers: {len(papers)}")

    mapped_total = sum(map(len, papers.values()))
    print(f"Mapped locations: {mapped_total}")

    unknown_total = len(processed["unknown"])
    print(f"Unknown locations: {unknown_total}")
    print(f"Map saved as {MAP_FILE}")


def load_processed_data(path: str = "processed_papers_cache.pkl") -> Dict:
    """Load the pickled results checkpoint, or start with an empty one.

    Args:
        path: Pickle file to read.

    Returns:
        The unpickled object, or ``{"papers": defaultdict(list), "unknown":
        set()}`` when the file is missing, empty, or not a valid pickle.
    """
    try:
        # SECURITY: pickle.load can execute arbitrary code; only point this at
        # checkpoint files written by this notebook.
        with open(path, "rb") as f:
            return pickle.load(f)
    except (FileNotFoundError, EOFError, pickle.PickleError):
        # EOFError is what an empty file (e.g. an interrupted save) raises; it
        # is not a PickleError subclass, so it has to be listed explicitly.
        return {"papers": collections.defaultdict(list), "unknown": set()}


def save_processed_data(data: Dict, path: str = "processed_papers_cache.pkl") -> None:
    """Pickle the results checkpoint to disk without risking the old copy.

    The data is written to a temporary sibling file that then replaces the
    target, so an interrupt or pickling error mid-write cannot leave a
    truncated checkpoint.

    Args:
        data: Object to pickle.
        path: Destination file.
    """
    tmp_path = f"{path}.tmp"
    try:
        with open(tmp_path, "wb") as f:
            pickle.dump(data, f)
        os.replace(tmp_path, path)
    except BaseException:
        # Remove the partial temp file; the previous checkpoint is untouched.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise


def process_all_papers(res: Dict) -> Dict:
    """Process every paper in ``res`` that is not yet in the saved results.

    Args:
        res: Mapping of paper id -> per-paper data for ``process_single_paper``.

    Returns:
        The accumulated results dict, saved after each processed paper.
    """
    all_processed = load_processed_data()

    # A paper counts as done only if it is a key of the stored "papers" map.
    # NOTE(review): process_single_paper adds that key only when an institution
    # was located, so fully-unresolved papers appear to be retried every run —
    # confirm this is intended.
    pending = set(res.keys()) - set(all_processed["papers"].keys())
    cache = load_cache()

    for paper_id in tqdm(pending):
        outcome = process_single_paper(paper_id, res[paper_id], cache)
        all_processed["papers"].update(outcome["papers"])
        all_processed["unknown"].update(outcome["unknown"])
        # Checkpoint after every paper.
        save_processed_data(all_processed)

    return all_processed


def main(res: Dict) -> None:
    """Run the pipeline: process papers, draw the map, print the summary.

    Args:
        res: Paper data handed to ``process_all_papers``.
    """
    results = process_all_papers(res)
    create_map(results)
    print_summary(results)


# Run the whole pipeline on the paper data loaded into llm_papers above.
main(llm_papers)

In [None]:
import json
from fuzzywuzzy import fuzz
from tqdm import tqdm


def find_best_match(institution, locations_data, threshold=80):
    """Return the entry of ``locations_data`` most similar to ``institution``.

    Similarity is ``fuzz.ratio`` on lower-cased strings. The first entry with
    the highest score wins; it is returned only if that score is positive and
    at least ``threshold``. Otherwise ``None`` is returned.
    """
    candidates = list(locations_data)
    if not candidates:
        return None

    needle = institution.lower()
    scored = [
        (fuzz.ratio(needle, candidate.lower()), candidate) for candidate in candidates
    ]
    # max() keeps the first of equal scores, so ties go to the earliest entry.
    top_score, top_candidate = max(scored, key=lambda pair: pair[0])

    if top_score > 0 and top_score >= threshold:
        return top_candidate
    return None


def process_data(papers_file, locations_file):
    """Group papers by institution and attach fuzzy-matched locations.

    Args:
        papers_file: JSON file mapping paper id -> {author: [institution, ...]}.
        locations_file: JSON file mapping a name -> dict with a "location" key.

    Returns:
        ``(result, unmatched)``. ``result`` maps each matched location name to
        ``{"papers": [...], "location": ...}``; ``unmatched`` lists the
        institutions for which ``find_best_match`` found nothing.
    """
    with open(papers_file, "r") as papers_fh:
        papers_data = json.load(papers_fh)
    with open(locations_file, "r") as locations_fh:
        locations_data = json.load(locations_fh)

    # institution -> paper ids, one entry per (author, institution) pair, so a
    # paper id can appear more than once.
    institution_papers = {}
    for paper_id, authors in papers_data.items():
        for affiliations in authors.values():
            for affiliation in affiliations:
                institution_papers.setdefault(affiliation, []).append(paper_id)

    result = {}
    unmatched = []
    for institution, papers in tqdm(institution_papers.items()):
        matched_key = find_best_match(institution, locations_data)
        if not matched_key:
            unmatched.append(institution)
            continue

        entry = result.get(matched_key)
        if entry is None:
            result[matched_key] = {
                "papers": papers,
                "location": locations_data[matched_key]["location"],
            }
        else:
            # Another spelling matched the same location: merge its papers.
            entry["papers"].extend(papers)

    if unmatched:
        print(f"Unmatched institutions: ({len(unmatched)})")
        for name in unmatched:
            print(f"- {name}")

    return result, unmatched


# Example usage
# NOTE(review): these file names differ from "llm_papers.json" and CACHE_FILE
# used in the earlier cells — confirm they point at the intended inputs.
# process_data reads locations_data[name]["location"] from the locations file.
papers_file = "papers.json"
locations_file = "locations.json"
processed_data, unmatched = process_data(papers_file, locations_file)

In [None]:
# Keep institutions that have coordinates, de-duplicating each paper list;
# entries whose location is None are collected in wrong_loc.
markers = {}
wrong_loc = []
for k, v in processed_data.items():
    if v["location"] is not None:
        markers[k] = {
            # sorted() rather than list(): set order changes between runs,
            # which made markers.json differ on every export.
            k1: sorted(set(v1)) if k1 == "papers" else v1
            for k1, v1 in v.items()
        }
    else:
        wrong_loc.append([k, v])
# Save the processed data to a new JSON file
with open("markers.json", "w") as f:
    json.dump(markers, f, indent=2)