## Extract CSO concepts from the ontology file

In [10]:
# Step 1: Import required libraries
import sys
import os

# Add the current directory to Python path
sys.path.append(os.getcwd())

# Step 2: Import the cso_script function
from cso_script import extract_cso_concepts

# Steps 3-4: locate the input TTL file and the CSV that receives the labels.
# Both are resolved against the directory the kernel is currently running in.
project_root = os.getcwd()
ttl_path = os.path.join(project_root, "cso_label", "CSO", "CSO.3.4.1.ttl")
output_csv = os.path.join(project_root, "cso_label", "cso_label_counts.csv")

# Steps 5-6: run the extraction (asking it to save a CSV at output_csv),
# then show how many concepts came back and list the leading ones.
try:
    cso_concepts = extract_cso_concepts(ttl_path, save_csv=True, csv_path=output_csv)

    total_concepts = len(cso_concepts)
    print(f"Total concepts extracted: {total_concepts}")

    print("\nSample Concepts (first 10):")
    for label in cso_concepts[:10]:
        print(f"- {label}")

except Exception as err:
    # Any failure in extraction or preview is reduced to a one-line message
    # so the notebook keeps running; no traceback is shown.
    print(f"Error occurred: {str(err)}")

Saved 14612 labels to CSV: c:\Users\Faisal Ramzan\Desktop\kmi_project_cso\cso_label\cso_label_counts.csv
Extracted 14612 unique CSO concepts.
Total concepts extracted: 14612

Sample Concepts (first 10):
- computer science
- automated pattern recognition
- subtraction technique
- nonrigid registration
- non-rigid registration
- manifold learning
- nonlinear dimensionality reduction
- locality preserving projections
- locality preserving projection
- gait recognition


## Running main script

In [31]:
# Import required libraries
import sys
import os
import time
import pandas as pd
from concept_find_replace import process_dataset
import nltk
nltk.download('punkt_tab')

# Run the concept find/replace pipeline on the first `sample_size` rows of the
# paper dataset (or on every row when `sample_size` is None) and preview the result.

# Set paths for input files
dataset_path = os.path.join(os.getcwd(), "paper_dataset", "paper_dataset.csv")
cso_path = os.path.join(os.getcwd(), "cso_label", "cso_label_counts.csv")

# Set path for output file
save_path = os.path.join(os.getcwd(), "paper_dataset", "processed_title_abstract_v2.csv")

# Set sample size (use None for full dataset)
sample_size = 1000  # Change this to any number you want, or None for full dataset

try:
    start_time = time.time()

    # Read the dataset with sampling
    if sample_size:
        df = pd.read_csv(dataset_path, nrows=sample_size)
        print(f"Processing first {sample_size} rows of the dataset")
    else:
        df = pd.read_csv(dataset_path)
        print("Processing entire dataset")

    # process_dataset is handed a file path here, so the (possibly sampled)
    # frame is written to a scratch CSV in the working directory first.
    temp_dataset = "temp_dataset.csv"
    try:
        df.to_csv(temp_dataset, index=False)

        # Process the sampled dataset
        processed_df = process_dataset(
            dataset_path=temp_dataset,
            cso_path=cso_path,
            save_path=save_path
        )
    finally:
        # Remove the scratch file even when writing or processing fails, so a
        # failed run does not leave temp_dataset.csv behind.
        if os.path.exists(temp_dataset):
            os.remove(temp_dataset)

    end_time = time.time()
    processing_time = end_time - start_time

    # Display results
    print(f"\nProcessing completed in {processing_time:.2f} seconds")
    print("\nSample of processed titles and abstracts:")
    print("\nFirst 2 processed titles:")
    print(processed_df['title_processed'].head(2))
    print("\nFirst 2 processed abstracts:")
    print(processed_df['abstract_processed'].head(2))

    print(f"\nProcessed data saved to: {save_path}")

except Exception as e:
    # Failures are reported as a one-line message; the scratch file has
    # already been cleaned up by the inner `finally`.
    print(f"Error occurred: {str(e)}")

[nltk_data] Downloading package punkt_tab to C:\Users\Faisal
[nltk_data]     Ramzan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Processing first 1000 rows of the dataset

Processing completed in 1891.58 seconds

Sample of processed titles and abstracts:

First 2 processed titles:
0    a new approach of 3d watermarking based on ima...
1    attractor neural_networks with activitydepende...
Name: title_processed, dtype: object

First 2 processed abstracts:
0    in this paper a robust 3d triangular mesh wate...
1    we studied an autoassociative neural_network w...
Name: abstract_processed, dtype: object

Processed data saved to: c:\Users\Faisal Ramzan\Desktop\kmi_project_cso\paper_dataset\processed_title_abstract_v2.csv


## Tokenization

In [None]:
# Import required libraries
import sys
import os
import time
import pandas as pd
from concept_find_replace import process_dataset


# Run the pipeline on a very small sample and, in addition to the processed
# text, preview the `tokens` and `trigrams` columns when the result has them.

# Set paths for input files
dataset_path = os.path.join(os.getcwd(), "paper_dataset", "paper_dataset.csv")
cso_path = os.path.join(os.getcwd(), "cso_label", "cso_label_counts.csv")

# Set path for output file
save_path = os.path.join(os.getcwd(), "paper_dataset", "processed_title_abstract_v2.csv")

# Set sample size (use None for full dataset)
sample_size = 2  # Change this to any number you want, or None for full dataset

try:
    start_time = time.time()

    # Read the dataset with sampling
    if sample_size:
        df = pd.read_csv(dataset_path, nrows=sample_size)
        print(f"Processing first {sample_size} rows of the dataset")
    else:
        df = pd.read_csv(dataset_path)
        print("Processing entire dataset")

    # process_dataset is handed a file path here, so the (possibly sampled)
    # frame is written to a scratch CSV in the working directory first.
    temp_dataset = "temp_dataset.csv"
    try:
        df.to_csv(temp_dataset, index=False)

        # Process the sampled dataset
        processed_df = process_dataset(
            dataset_path=temp_dataset,
            cso_path=cso_path,
            save_path=save_path
        )
    finally:
        # Remove the scratch file even when writing or processing fails, so a
        # failed run does not leave temp_dataset.csv behind.
        if os.path.exists(temp_dataset):
            os.remove(temp_dataset)

    end_time = time.time()
    processing_time = end_time - start_time

    # Display results
    print(f"\nProcessing completed in {processing_time:.2f} seconds")

    print("\n=== Processed Text Results ===")
    print("\nFirst 2 processed titles:")
    print(processed_df['title_processed'].head(2))
    print("\nFirst 2 processed abstracts:")
    print(processed_df['abstract_processed'].head(2))

    print("\n=== N-gram Results ===")
    # These columns are optional: each preview is skipped when the column is absent.
    if 'tokens' in processed_df.columns:
        print("\nSample tokenized text:")
        print(processed_df['tokens'].head(1))

    if 'trigrams' in processed_df.columns:
        print("\nSample trigrams:")
        for idx, trigram in enumerate(processed_df['trigrams'].head(2)):
            print(f"\nDocument {idx + 1} trigrams:")
            print(trigram)

    print(f"\nProcessed data saved to: {save_path}")

except Exception as e:
    # Failures are reported as a one-line message; the scratch file has
    # already been cleaned up by the inner `finally`.
    print(f"Error occurred: {str(e)}")

Processing first 2 rows of the dataset

Processing completed in 2.05 seconds

=== Processed Text Results ===

First 2 processed titles:
0    a new approach of 3d watermarking based on ima...
1    attractor neural_networks with activitydepende...
Name: title_processed, dtype: object

First 2 processed abstracts:
0    in this paper a robust 3d triangular mesh wate...
1    we studied an autoassociative neural_network w...
Name: abstract_processed, dtype: object

=== N-gram Results ===

Sample tokenized text:
0    [a, new, approach, of, 3d, watermarking, based...
Name: tokens, dtype: object

Sample trigrams:

Document 1 trigrams:
['a', 'new', 'approach', 'of', '3d', 'watermarking', 'based_on', 'image_segmentation', 'in', 'this', 'paper', 'a', 'robust', '3d', 'triangular', 'mesh', 'watermarking', 'algorithm', 'based_on', '3d', 'segmentation', 'is', 'proposed', 'in', 'this', 'algorithm', 'three', 'classes', 'of', 'watermarking', 'are', 'combined', 'first', 'we', 'segment', 'the', 'original',

## Train the Word2Vec model

In [54]:
# Process the full paper dataset, train a Word2Vec model on the processed
# output, and print a short preview of the first few papers.
import os  # used below for getcwd()/path.join; imported here so the cell does not rely on earlier cells
import sys

# Make modules in the working directory importable; the guard keeps re-runs of
# this cell from appending the same entry to sys.path again.
if os.getcwd() not in sys.path:
    sys.path.append(os.getcwd())
from concept_find_replace import process_dataset
from train_word2vec import train_and_save_word2vec

# Set file paths
dataset_path = os.path.join(os.getcwd(), "paper_dataset", "paper_dataset.csv")
cso_path = os.path.join(os.getcwd(), "cso_label", "cso_label_counts.csv")

# Set path for output file
processed_output = os.path.join(os.getcwd(), "paper_dataset", "processed_title_abstract_v2.csv")


# Relative path: the model file is resolved against the working directory.
model_output = "word2vec_cso.model"

# Step 1: Process and get trigrams
# processed_output is passed as save_path and is then handed to the trainer below.
# NOTE(review): the recorded run failed with PermissionError on this file —
# possibly it was open in another program; confirm before re-running.
df = process_dataset(dataset_path, cso_path, save_path=processed_output)

# Step 2: Train Word2Vec
model = train_and_save_word2vec(processed_output, model_output)

# Step 3: Preview (optional)
print("\n Sample Output:")
for i, row in df.head(5).iterrows():
    print(f"\n Paper {i+1} Title:\n{row['title_processed']}")
    print(f"\n Abstract:\n{row['abstract_processed']}")
    print(f"\n Trigrams:\n{row['trigrams']}")
    print('-'*100)


PermissionError: [Errno 13] Permission denied: 'c:\\Users\\Faisal Ramzan\\Desktop\\kmi_project_cso\\paper_dataset\\processed_title_abstract_v2.csv'

In [42]:
# Train a Word2Vec model from an already-processed CSV, without re-running
# the dataset processing step.
import sys
# '.' is a relative entry, so the import below searches whatever directory is
# the working directory at import time.
sys.path.append('.')  # Ensure current directory is in path
# NOTE(review): the recorded run of this cell failed with ModuleNotFoundError
# for train_word2vec — confirm train_word2vec.py is in the working directory.
from train_word2vec import train_and_save_word2vec

# Both paths are relative and therefore resolved against the working directory.
trigrams_csv_path = 'paper_dataset/processed_title_abstract_v2.csv'  # Update if your path is different
model_save_path = 'word2vec_cso.model'

# Train and save the Word2Vec model
model = train_and_save_word2vec(trigrams_csv_path, model_save_path)

ModuleNotFoundError: No module named 'train_word2vec'