## Download the dataset (title + abstract)

The titles are in the paper folder while the abstracts are in the abstract folder.


In [None]:
import requests
import json
import os
import gzip
import pandas as pd

# Set your S2 API key
api_key = "09CuB8BwpZ8j8vzF3WtEK9aMSLHM98HM77BzJNym"

# Headers for authentication
headers = {
    "x-api-key": api_key
}

# Base output directory locally
base_output_dir = "./s2ag_dataset_title_abstracts"
os.makedirs(base_output_dir, exist_ok=True)


# Step 1: Get the latest release ID
latest_release_url = "https://api.semanticscholar.org/datasets/v1/release/latest"
response = requests.get(latest_release_url, headers=headers)
if response.status_code != 200:
    print(f"Failed to fetch latest release: {response.status_code} - {response.text}")
    exit()

latest_release = response.json()
release_id = latest_release["release_id"]
print("Latest release ID:", release_id)

# Step 2: Datasets to download
datasets = ["papers", "abstracts"]

# Dictionary to store data
data = {"title": [], "abstract": []}

for dataset_name in datasets:
    # Create dataset-specific directory
    output_dir = os.path.join(base_output_dir, dataset_name)
    os.makedirs(output_dir, exist_ok=True)

    # Step 3: Get metadata for the dataset
    dataset_url = f"https://api.semanticscholar.org/datasets/v1/release/{release_id}/dataset/{dataset_name}"
    response = requests.get(dataset_url, headers=headers)
    if response.status_code != 200:
        print(f"Failed to fetch metadata for {dataset_name}: {response.status_code} - {response.text}")
        continue

    dataset_info = response.json()
    print(f"\n{dataset_name.capitalize()} dataset metadata:")
    print(json.dumps(dataset_info, indent=2))

    # Step 4: Download all dataset files with simplified file names
    for index, file_url in enumerate(dataset_info["files"]):  # Process all files
        # Generate a shorter file name (e.g., papers_0.gz, abstracts_1.gz)
        file_name = f"{dataset_name}_{index}.gz"
        output_path = os.path.join(output_dir, file_name)

        print(f"Downloading {dataset_name}/{file_name}...")
        file_response = requests.get(file_url, headers=headers, stream=True)

        if file_response.status_code == 200:
            with open(output_path, "wb") as f:
                for chunk in file_response.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
            print(f"Saved {file_name} to {output_path}")
        else:
            print(f"Failed to download {file_name}: {file_response.status_code} - {file_response.text}")

# Step 5: Process the datasets to extract titles and abstracts for Computer Science papers
print("\nProcessing datasets to extract titles and abstracts for Computer Science papers...")

# Dictionary to store titles and abstracts by corpusid
title_dict = {}
abstract_dict = {}

# Process papers dataset (for titles and field of study)
papers_dir = os.path.join(base_output_dir, "papers")
if os.path.exists(papers_dir):
    for file_name in os.listdir(papers_dir):
        if file_name.endswith(".gz"):
            file_path = os.path.join(papers_dir, file_name)
            try:
                with gzip.open(file_path, "rt", encoding="utf-8") as f:
                    for line in f:
                        try:
                            record = json.loads(line.strip())
                            corpusid = record.get("corpusid")
                            title = record.get("title")
                            fields_of_study = record.get("fieldsOfStudy", [])
                            # Filter for Computer Science
                            if corpusid and title and "Computer Science" in fields_of_study:
                                title_dict[corpusid] = title
                        except json.JSONDecodeError as e:
                            print(f"Error decoding JSON in {file_name}: {e}")
            except Exception as e:
                print(f"Error reading {file_name}: {e}")
else:
    print("Papers directory not found. Skipping paper processing.")

# Process abstracts dataset
abstracts_dir = os.path.join(base_output_dir, "abstracts")
if os.path.exists(abstracts_dir):
    for file_name in os.listdir(abstracts_dir):
        if file_name.endswith(".gz"):
            file_path = os.path.join(abstracts_dir, file_name)
            try:
                with gzip.open(file_path, "rt", encoding="utf-8") as f:
                    for line in f:
                        try:
                            record = json.loads(line.strip())
                            corpusid = record.get("corpusid")
                            abstract = record.get("abstract")
                            if corpusid and abstract:
                                abstract_dict[corpusid] = abstract
                        except json.JSONDecodeError as e:
                            print(f"Error decoding JSON in {file_name}: {e}")
            except Exception as e:
                print(f"Error reading {file_name}: {e}")
else:
    print("Abstracts directory not found. Skipping abstract processing.")

# Step 6: Combine titles and abstracts by corpusid for Computer Science papers
for corpusid in title_dict:
    if corpusid in abstract_dict:
        data["title"].append(title_dict[corpusid])
        data["abstract"].append(abstract_dict[corpusid])

# Step 7: Create DataFrame and save to CSV
if data["title"]:
    df = pd.DataFrame(data)
    print("\nSample of DataFrame (Computer Science papers):")
    print(df.head())

    # Save DataFrame to CSV
    df.to_csv(csv_output_path, index=False, encoding="utf-8")
    print(f"\nSaved DataFrame to {csv_output_path}")
else:
    print("\nNo matching title and abstract pairs found for Computer Science papers. CSV not created.")

print("\nProcessing complete!")

## Combine the datasets and filter English Only papers

In [None]:
import pandas as pd
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException
import os
import logging

# Setup logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
logger = logging.getLogger(__name__)

def is_english(text):
    """
    Detect if the text is in English using langdetect.
    """
    try:
        return detect(text.strip()) == 'en'
    except LangDetectException:
        return False

def combine_english_title_abstracts(titles_csv, abstracts_csv, output_csv):
    """
    Combine English title-abstract pairs from CSV files into a single output CSV,
    processing data in chunks to handle large datasets.
    """
    english_pairs = []
    chunk_size = 50000  # Adjust based on memory; process 10,000 rows at a time

    logger.info(f"Starting processing of {titles_csv} and {abstracts_csv}")
    for chunk_titles in pd.read_csv(titles_csv, usecols=[1], chunksize=chunk_size):
        # Align abstracts chunk by skipping rows and matching length
        start_row = chunk_titles.index[0]
        chunk_abstracts = pd.read_csv(abstracts_csv, usecols=[1], skiprows=range(1, start_row + 1), nrows=chunk_size).squeeze()
        min_len = min(len(chunk_titles), len(chunk_abstracts))
        titles = chunk_titles.iloc[:min_len].squeeze()
        abstracts = chunk_abstracts.iloc[:min_len]

        # Process each pair in the chunk
        for title, abstract in zip(titles, abstracts):
            if pd.notna(title) and pd.notna(abstract):
                if is_english(title) and is_english(abstract):
                    english_pairs.append({'title': title.strip(), 'abstract': abstract.strip()})

        logger.info(f"Processed chunk starting at row {start_row} with {min_len} pairs")

    # Write to CSV file
    if english_pairs:
        df_output = pd.DataFrame(english_pairs)
        df_output.to_csv(output_csv, index=False, encoding='utf-8')
        logger.info(f"English-only combined file written to: {output_csv}")
        print(f"English-only combined file written to: {output_csv}")
    else:
        logger.warning(f"No English entries found. No file written to: {output_csv}")
        print(f"No English entries found. No file written to: {output_csv}")

if __name__ == "__main__":
    # Example usage with file paths
    titles_csv = 's2ag_dataset_title_abstracts/output/papers_csv/papers_0.csv'
    abstracts_csv = 's2ag_dataset_title_abstracts/output/abstracts_csv/abstracts_0.csv'
    output_csv = 'paper_dataset/paper_dataset_v0.csv'

    # Validate input files exist
    if not (os.path.exists(titles_csv) and os.path.exists(abstracts_csv)):
        logger.error("One or both input CSV files not found")
        print("Error: One or both input CSV files not found")
    else:
        combine_english_title_abstracts(titles_csv, abstracts_csv, output_csv)

In [None]:
# for to complete whole process, 10000 rows took 185,3 seconds

# Saved Model Experiments

In [28]:
#pip install matplotlib numpy scikit-learn

In [62]:
from gensim.models import KeyedVectors
model_path = '9M[256-10]_sg.bin'
model = KeyedVectors.load_word2vec_format(model_path, binary=True)

In [63]:
test_word = 'semantic_web'# image_segmentation, web, learning, science, semantic_web
if test_word in model:
    similar_words = model.most_similar(test_word, topn=10)
    print(f"Test word: {test_word}")
    for word, score in similar_words:
        print(f"{word}: {score:.4f}")
else:
    print(f"'{test_word}' not in vocabulary")

Test word: semantic_web
semantic_web_technologies: 0.8505
web_ontology_language: 0.8075
mashups: 0.8045
semantic_web_services: 0.8000
query_languages: 0.7976
world_wide_web: 0.7876
owl: 0.7862
service-based: 0.7839
domain_ontologies: 0.7814
knowledge_representation: 0.7806


## concepts from CSO

In [19]:
import sys

# Add path to ontology.py
sys.path.append(r"C:\Users\Faisal Ramzan\Desktop\kmi_project_cso\cso-reader-main\cso_reader")

from ontology import Ontology

# Initialize
ontology = Ontology()

# Target word to find related topics
target = "semantic web"

# Find related topics by keyword match
related_by_name = [topic for topic in ontology.topics if target.lower() in topic.lower()]

# Get descendants (children, subtopics)
descendants = ontology.get_all_descendants_of_topics([target]) if target in ontology.topics else []

# Combine and deduplicate
similar_terms = list(set(related_by_name + descendants))

# Show top N
top_n = 10
print(f"Top similar terms to '{target}':")
for i, term in enumerate(similar_terms[:top_n]):
    print(f"{i+1}. {term}")


Computer Science Ontology loaded.
Top similar terms to 'semantic web':
1. rdf graph
2. ontology alignment
3. rdf data
4. ontology pattern
5. semantic web rule languages
6. graphrag
7. semantic web applications
8. semantic search engine
9. sparql queries
10. ontology creation


In [None]:
import sys
import json
from rapidfuzz.distance import Levenshtein
from gensim.models import KeyedVectors

# Add ontology path
sys.path.append(r"C:\Users\Faisal Ramzan\Desktop\kmi_project_cso\cso-reader-main\cso_reader")
from ontology import Ontology


def load_word2vec_model(path: str):
    try:
        model = KeyedVectors.load_word2vec_format(path, binary=True)
        print(f"✅ Loaded Word2Vec model from: {path}")
        return model
    except FileNotFoundError:
        raise FileNotFoundError(f"❌ Model file not found: {path}")


def load_ontology():
    try:
        ontology = Ontology()
        print("✅ CSO Ontology loaded.")
        return ontology
    except Exception as e:
        raise RuntimeError(f"❌ Failed to load ontology: {e}")


def get_related_topics(wet2: str, ontology: Ontology):
    related_by_name = [topic for topic in ontology.topics if wet2.lower() in topic.lower()]
    descendants = ontology.get_all_descendants_of_topics([wet2]) if wet2 in ontology.topics else []
    return list(set(related_by_name + descendants))


def match_terms(model, ontology, top_n=10, word_similarity=0.7, min_similarity=0.90):
    output = {}
    total_vocab = len(model.key_to_index)

    for i, wet in enumerate(model.key_to_index):
        if i % 1000 == 0:
            print(f"🔁 Processing {i}/{total_vocab} | Matches so far: {len(output)}")

        output[wet] = []
        similar_words = [(wet, 1.0)]

        try:
            similar_words.extend(model.most_similar(wet, topn=top_n))
        except KeyError:
            continue

        for wet2, sim_w in similar_words:
            if sim_w < word_similarity:
                continue

            related_topics = get_related_topics(wet2, ontology)

            for topic in related_topics:
                sim_t = Levenshtein.normalized_similarity(topic, wet2)
                if sim_t >= min_similarity:
                    output[wet].append({
                        "topic": topic,
                        "sim_t": round(sim_t, 4),
                        "wet": wet2,
                        "sim_w": round(sim_w, 4)
                    })
    return output


def save_output(output: dict, filename: str):
    try:
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(output, f, indent=4)
        print(f"✅ Results saved to {filename}")
    except Exception as e:
        print(f"❌ Failed to save results: {e}")


if __name__ == "__main__":
    # === CONFIGURATION ===
    model_path = '9M[256-10]_sg.bin'
    top_n_similar_words = 10
    word2vec_threshold = 0.7
    levenshtein_threshold = 0.90
    output_file = 'token-to-cso-combined_production.json'

    # === EXECUTION ===
    model = load_word2vec_model(model_path)
    ontology = load_ontology()
    results = match_terms(
        model,
        ontology,
        top_n=top_n_similar_words,
        word_similarity=word2vec_threshold,
        min_similarity=levenshtein_threshold
    )
    save_output(results, output_file)


In [12]:
"""
Updated script to combine Word2Vec and CSO ontology
"""

import sys
from rapidfuzz.distance import Levenshtein  # For Levenshtein similarity
import json
from gensim.models import KeyedVectors

# Add path to ontology.py
sys.path.append(r"C:\Users\Faisal Ramzan\Desktop\kmi_project_cso\cso-reader-main\cso_reader")

from ontology import Ontology

# Load Word2Vec model
model_path = '9M[256-10]_sg.bin'
try:
    model = KeyedVectors.load_word2vec_format(model_path, binary=True)
except FileNotFoundError:
    print(f"Error: Model file '{model_path}' not found.")
    raise

# Load ontology
try:
    ontology = Ontology()
    print("Computer Science Ontology loaded.")
except Exception as e:
    print(f"Error loading ontology: {e}")
    raise

# Parameters
min_similarity = 0.90  # Levenshtein similarity threshold (0 to 1)
word_similarity = 0.7  # Word2Vec similarity threshold (0 to 1)
top_amount_of_words = 10  # Number of similar words to retrieve

output = {}
i = 0

for wet in model.key_to_index:  # Iterate over Word2Vec vocabulary
    i += 1
    if (i % 1000 == 0):
        print(f"Processed {i} words, output size: {len(output)}")
    
    output[wet] = []
    
    # Get similar words from Word2Vec
    similar_words = []
    similar_words.append((wet, 1.0))  # Include the word itself
    try:
        similarities = model.most_similar(wet, topn=top_amount_of_words)
        similar_words.extend(similarities)
    except KeyError:
        print(f"Word '{wet}' not in model vocabulary, skipping.")
        continue
    
    for wet2, sim in similar_words:
        if sim >= word_similarity:
            # Find ontology topics related to wet2 (substring matching and descendants)
            related_by_name = [topic for topic in ontology.topics if wet2.lower() in topic.lower()]
            descendants = ontology.get_all_descendants_of_topics([wet2]) if wet2 in ontology.topics else []
            topics = list(set(related_by_name + descendants))
            
            for topic in topics:
                # Compute Levenshtein similarity using rapidfuzz (normalized to 0-1)
                m = Levenshtein.normalized_similarity(topic, wet2)  # Using Levenshtein normalized similarity
                if m >= min_similarity:
                    output[wet].append({
                        "topic": topic,
                        "sim_t": m,
                        "wet": wet2,
                        "sim_w": sim
                    })

# Save the cached model
try:
    with open('token-to-cso-combined_v1.json', 'w') as outfile:
        json.dump(output, outfile, indent=4)
    print("Saved mappings to 'token-to-cso-combined_v1.json'")
except Exception as e:
    print(f"Error saving JSON file: {e}")

Computer Science Ontology loaded.
Computer Science Ontology loaded.
Processed 1000 words, output size: 999
Processed 2000 words, output size: 1999
Processed 3000 words, output size: 2999
Processed 4000 words, output size: 3999
Processed 5000 words, output size: 4999
Processed 6000 words, output size: 5999
Processed 7000 words, output size: 6999
Processed 8000 words, output size: 7999
Processed 9000 words, output size: 8999
Processed 10000 words, output size: 9999
Processed 11000 words, output size: 10999
Processed 12000 words, output size: 11999
Processed 13000 words, output size: 12999
Processed 14000 words, output size: 13999
Processed 15000 words, output size: 14999
Processed 16000 words, output size: 15999
Processed 17000 words, output size: 16999
Saved mappings to 'token-to-cso-combined_v1.json'


In [13]:

"""
Updated script to combine Word2Vec and CSO ontology
Running in main.ipynb
"""

import sys
from rapidfuzz import fuzz  # For Levenshtein similarity
import json
from gensim.models import KeyedVectors

# Add path to ontology.py
sys.path.append(r"C:\Users\Faisal Ramzan\Desktop\kmi_project_cso\cso-reader-main\cso_reader")

from ontology import Ontology

# Load Word2Vec model
model_path = '9M[256-10]_sg.bin'
try:
    model = KeyedVectors.load_word2vec_format(model_path, binary=True)
except FileNotFoundError:
    print(f"Error: Model file '{model_path}' not found.")
    raise

# Load ontology
try:
    ontology = Ontology()
    print("Computer Science Ontology loaded.")
except Exception as e:
    print(f"Error loading ontology: {e}")
    raise

# Parameters
min_similarity = 0.90  # Levenshtein similarity threshold (0 to 1)
word_similarity = 0.7  # Word2Vec similarity threshold (0 to 1)
top_amount_of_words = 10  # Number of similar words to retrieve

output = {}
i = 0

for wet in model.key_to_index:  # Iterate over Word2Vec vocabulary
    i += 1
    if (i % 1000 == 0):
        print(f"Processed {i} words, output size: {len(output)}")
    
    output[wet] = []
    
    # Get similar words from Word2Vec
    similar_words = []
    similar_words.append((wet, 1.0))  # Include the word itself
    try:
        similarities = model.most_similar(wet, topn=top_amount_of_words)
        similar_words.extend(similarities)
    except KeyError:
        print(f"Word '{wet}' not in model vocabulary, skipping.")
        continue
    
    for wet2, sim in similar_words:
        if sim >= word_similarity:
            # Find ontology topics related to wet2 (substring matching and descendants)
            related_by_name = [topic for topic in ontology.topics if wet2.lower() in topic.lower()]
            descendants = ontology.get_all_descendants_of_topics([wet2]) if wet2 in ontology.topics else []
            topics = list(set(related_by_name + descendants))
            
            for topic in topics:
                # Compute Levenshtein similarity using rapidfuzz (normalized to 0-1)
                m = fuzz.ratio(topic, wet2) / 100.0  # rapidfuzz returns 0-100, normalize to 0-1
                if m >= min_similarity:
                    output[wet].append({
                        "topic": topic,
                        "sim_t": m,
                        "wet": wet2,
                        "sim_w": sim
                    })

# Save the cached model
try:
    with open('token-to-cso-combined_v2.json', 'w') as outfile:
        json.dump(output, outfile, indent=4)
    print("Saved mappings to 'token-to-cso-combined_v2.json'")
except Exception as e:
    print(f"Error saving JSON file: {e}")

Computer Science Ontology loaded.
Computer Science Ontology loaded.
Processed 1000 words, output size: 999
Processed 2000 words, output size: 1999
Processed 3000 words, output size: 2999
Processed 4000 words, output size: 3999
Processed 5000 words, output size: 4999
Processed 6000 words, output size: 5999
Processed 7000 words, output size: 6999
Processed 8000 words, output size: 7999
Processed 9000 words, output size: 8999
Processed 10000 words, output size: 9999
Processed 11000 words, output size: 10999
Processed 12000 words, output size: 11999
Processed 13000 words, output size: 12999
Processed 14000 words, output size: 13999
Processed 15000 words, output size: 14999
Processed 16000 words, output size: 15999
Processed 17000 words, output size: 16999
Saved mappings to 'token-to-cso-combined_v2.json'


## 4 char

In [14]:
"""
Updated script to combine Word2Vec and CSO ontology with first 4-char matching
"""

import sys
from rapidfuzz.distance import Levenshtein  # For Levenshtein similarity
import json
from gensim.models import KeyedVectors

# Add path to ontology.py
sys.path.append(r"C:\Users\Faisal Ramzan\Desktop\kmi_project_cso\cso-reader-main\cso_reader")

from ontology import Ontology

# Load Word2Vec model
model_path = '9M[256-10]_sg.bin'
try:
    model = KeyedVectors.load_word2vec_format(model_path, binary=True)
except FileNotFoundError:
    print(f"Error: Model file '{model_path}' not found.")
    raise

# Load ontology
try:
    ontology = Ontology()
    print("Computer Science Ontology loaded.")
except Exception as e:
    print(f"Error loading ontology: {e}")
    raise

# Parameters
min_similarity = 0.9  # Levenshtein similarity threshold (0 to 1)
word_similarity = 0.7  # Word2Vec similarity threshold (0 to 1)
top_amount_of_words = 10  # Number of similar words to retrieve

output = {}
i = 0

for wet in model.key_to_index:  # Iterate over Word2Vec vocabulary
    i += 1
    if (i % 1000 == 0):
        print(f"Processed {i} words, output size: {len(output)}")
    
    output[wet] = []
    
    # Get similar words from Word2Vec
    similar_words = []
    similar_words.append((wet, 1.0))  # Include the word itself
    try:
        similarities = model.most_similar(wet, topn=top_amount_of_words)
        similar_words.extend(similarities)
    except KeyError:
        print(f"Word '{wet}' not in model vocabulary, skipping.")
        continue
    
    for wet2, sim in similar_words:
        if sim >= word_similarity:
            # Find ontology topics starting with first 4 characters of wet2
            topics = [topic for topic in ontology.topics if topic.lower().startswith(wet2.lower()[:4])]
            for topic in topics:
                # Compute Levenshtein similarity using rapidfuzz (normalized to 0-1)
                m = Levenshtein.normalized_similarity(topic, wet2)
                if m >= min_similarity:
                    try:
                        output[wet].append({
                            "topic": topic,
                            "sim_t": m,
                            "wet": wet2,
                            "sim_w": sim
                        })
                    except KeyError as e:
                        print(f"KeyError for word: {wet}, error: {e}")

# Save the cached model
try:
    with open('token-to-cso-combined_v3.json', 'w') as outfile:
        json.dump(output, outfile, indent=4)
    print("Saved mappings to 'token-to-cso-combined_v3.json'")
except Exception as e:
    print(f"Error saving JSON file: {e}")

Computer Science Ontology loaded.
Computer Science Ontology loaded.
Processed 1000 words, output size: 999
Processed 2000 words, output size: 1999
Processed 3000 words, output size: 2999
Processed 4000 words, output size: 3999
Processed 5000 words, output size: 4999
Processed 6000 words, output size: 5999
Processed 7000 words, output size: 6999
Processed 8000 words, output size: 7999
Processed 9000 words, output size: 8999
Processed 10000 words, output size: 9999
Processed 11000 words, output size: 10999
Processed 12000 words, output size: 11999
Processed 13000 words, output size: 12999
Processed 14000 words, output size: 13999
Processed 15000 words, output size: 14999
Processed 16000 words, output size: 15999
Processed 17000 words, output size: 16999
Saved mappings to 'token-to-cso-combined_v3.json'


In [None]:
# tomorrow, I have to merge this cashing model.py with pipline, and then it can start from step one till 8 and then pipeline can work accordingly

In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Updated script to combine Word2Vec and CSO ontology for specific test words
Running in main.ipynb
"""

import sys
from rapidfuzz import fuzz  # For Levenshtein similarity
import json
from gensim.models import KeyedVectors

# Add path to ontology.py
sys.path.append(r"C:\Users\Faisal Ramzan\Desktop\kmi_project_cso\cso-reader-main\cso_reader")

from ontology import Ontology

# Load Word2Vec model
model_path = '9M[256-10]_sg.bin'
try:
    model = KeyedVectors.load_word2vec_format(model_path, binary=True)
except FileNotFoundError:
    print(f"Error: Model file '{model_path}' not found.")
    raise

# Load ontology
try:
    ontology = Ontology()
    print("Computer Science Ontology loaded.")
except Exception as e:
    print(f"Error loading ontology: {e}")
    raise

# Parameters
min_similarity = 0.94  # Levenshtein similarity threshold (0 to 1)
word_similarity = 0.7  # Word2Vec similarity threshold (0 to 1)
top_amount_of_words = 10  # Number of similar words to retrieve

# Test words
test_words = ['web', 'semantic_web', 'learning', 'ontology']  # Add or modify words here

output = {}
i = 0

for wet in test_words:  # Iterate over test words only
    i += 1
    print(f"\nProcessing word {i}: '{wet}'")
    
    if wet not in model.key_to_index:
        print(f"Word '{wet}' not in model vocabulary, skipping.")
        continue
    
    output[wet] = []
    
    # Get similar words from Word2Vec
    similar_words = []
    similar_words.append((wet, 1.0))  # Include the word itself
    try:
        similarities = model.most_similar(wet, topn=top_amount_of_words)
        similar_words.extend(similarities)
    except KeyError:
        print(f"Error retrieving similar words for '{wet}', skipping.")
        continue
    
    # Process similar words
    for wet2, sim in similar_words:
        if sim >= word_similarity:
            # Find ontology topics related to wet2 (substring matching and descendants)
            related_by_name = [topic for topic in ontology.topics if wet2.lower() in topic.lower()]
            descendants = ontology.get_all_descendants_of_topics([wet2]) if wet2 in ontology.topics else []
            topics = list(set(related_by_name + descendants))
            
            for topic in topics:
                # Compute Levenshtein similarity using rapidfuzz (normalized to 0-1)
                m = fuzz.ratio(topic, wet2) / 100.0  # rapidfuzz returns 0-100, normalize to 0-1
                if m >= min_similarity:
                    output[wet].append({
                        "topic": topic,
                        "sim_t": m,
                        "wet": wet2,
                        "sim_w": sim
                    })
    
    # Print mappings for the current word
    if output[wet]:
        print(f"Mappings for '{wet}':")
        for mapping in output[wet]:
            print(f"  Topic: {mapping['topic']}, Levenshtein Sim: {mapping['sim_t']:.4f}, "
                  f"Word2Vec Word: {mapping['wet']}, Word2Vec Sim: {mapping['sim_w']:.4f}")
    else:
        print(f"No mappings found for '{wet}' with given thresholds.")

# Save the cached model
try:
    with open('token-to-cso-combined.json', 'w') as outfile:
        json.dump(output, outfile, indent=4)
    print("\nSaved mappings to 'token-to-cso-combined_v2.json'")
except Exception as e:
    print(f"\nError saving JSON file: {e}")

Computer Science Ontology loaded.
Computer Science Ontology loaded.

Processing word 1: 'web'
No mappings found for 'web' with given thresholds.

Processing word 2: 'semantic_web'
Mappings for 'semantic_web':
  Topic: owl, Levenshtein Sim: 1.0000, Word2Vec Word: owl, Word2Vec Sim: 0.7862

Processing word 3: 'learning'
Mappings for 'learning':
  Topic: elearning, Levenshtein Sim: 0.9412, Word2Vec Word: learning, Word2Vec Sim: 1.0000

Processing word 4: 'ontology'
Mappings for 'ontology':
  Topic: ontology, Levenshtein Sim: 1.0000, Word2Vec Word: ontology, Word2Vec Sim: 1.0000
  Topic: ontologies, Levenshtein Sim: 1.0000, Word2Vec Word: ontologies, Word2Vec Sim: 0.8011
  Topic: owl, Levenshtein Sim: 1.0000, Word2Vec Word: owl, Word2Vec Sim: 0.7391

Saved mappings to 'token-to-cso-combined.json'


In [67]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Updated script to combine Word2Vec and CSO ontology for specific test words
Using rapidfuzz.distance.Levenshtein
Running in main.ipynb
"""

import sys
from rapidfuzz.distance import Levenshtein  # For Levenshtein similarity
import json
from gensim.models import KeyedVectors

# Add path to ontology.py
sys.path.append(r"C:\Users\Faisal Ramzan\Desktop\kmi_project_cso\cso-reader-main\cso_reader")

from ontology import Ontology

# Load Word2Vec model
model_path = '9M[256-10]_sg.bin'
try:
    model = KeyedVectors.load_word2vec_format(model_path, binary=True)
except FileNotFoundError:
    print(f"Error: Model file '{model_path}' not found.")
    raise

# Load ontology
try:
    ontology = Ontology()
    print("Computer Science Ontology loaded.")
except Exception as e:
    print(f"Error loading ontology: {e}")
    raise

# Parameters
min_similarity = 0.94  # Levenshtein similarity threshold (0 to 1)
word_similarity = 0.7  # Word2Vec similarity threshold (0 to 1)
top_amount_of_words = 10  # Number of similar words to retrieve

# Test words
test_words = ['web', 'semantic_web', 'learning', 'ontology']  # Add or modify words here

output = {}
i = 0

for wet in test_words:  # Iterate over test words only
    i += 1
    print(f"\nProcessing word {i}: '{wet}'")
    
    if wet not in model.key_to_index:
        print(f"Word '{wet}' not in model vocabulary, skipping.")
        continue
    
    output[wet] = []
    
    # Get similar words from Word2Vec
    similar_words = []
    similar_words.append((wet, 1.0))  # Include the word itself
    try:
        similarities = model.most_similar(wet, topn=top_amount_of_words)
        similar_words.extend(similarities)
    except KeyError:
        print(f"Error retrieving similar words for '{wet}', skipping.")
        continue
    
    for wet2, sim in similar_words:
        if sim >= word_similarity:
            # Find ontology topics related to wet2 (substring matching and descendants)
            related_by_name = [topic for topic in ontology.topics if wet2.lower() in topic.lower()]
            descendants = ontology.get_all_descendants_of_topics([wet2]) if wet2 in ontology.topics else []
            topics = list(set(related_by_name + descendants))
            
            for topic in topics:
                # Compute Levenshtein similarity using rapidfuzz.distance.Levenshtein
                m = Levenshtein.normalized_similarity(topic, wet2)  # Returns 0-1
                if m >= min_similarity:
                    output[wet].append({
                        "topic": topic,
                        "sim_t": m,
                        "wet": wet2,
                        "sim_w": sim
                    })
    
    # Print mappings for the current word
    if output[wet]:
        print(f"Mappings for '{wet}':")
        for mapping in output[wet]:
            print(f"  Topic: {mapping['topic']}, Levenshtein Sim: {mapping['sim_t']:.4f}, "
                  f"Word2Vec Word: {mapping['wet']}, Word2Vec Sim: {mapping['sim_w']:.4f}")
    else:
        print(f"No mappings found for '{wet}' with given thresholds.")

# Save the cached model
try:
    with open('token-to-cso-combined.json', 'w') as outfile:
        json.dump(output, outfile, indent=4)
    print("\nSaved mappings to 'token-to-cso-combined.json'")
except Exception as e:
    print(f"\nError saving JSON file: {e}")

Computer Science Ontology loaded.
Computer Science Ontology loaded.

Processing word 1: 'web'
No mappings found for 'web' with given thresholds.

Processing word 2: 'semantic_web'
Mappings for 'semantic_web':
  Topic: owl, Levenshtein Sim: 1.0000, Word2Vec Word: owl, Word2Vec Sim: 0.7862

Processing word 3: 'learning'
No mappings found for 'learning' with given thresholds.

Processing word 4: 'ontology'
Mappings for 'ontology':
  Topic: ontology, Levenshtein Sim: 1.0000, Word2Vec Word: ontology, Word2Vec Sim: 1.0000
  Topic: ontologies, Levenshtein Sim: 1.0000, Word2Vec Word: ontologies, Word2Vec Sim: 0.8011
  Topic: owl, Levenshtein Sim: 1.0000, Word2Vec Word: owl, Word2Vec Sim: 0.7391

Saved mappings to 'token-to-cso-combined.json'
