# Optimized ConceptNet Preprocessing
This notebook demonstrates the usage of the optimized ConceptNet preprocessor that handles large files efficiently.

In [1]:
import sys
import time
from pathlib import Path

import numpy as np
import pandas as pd

sys.path.append('../Py Scripts')
from conceptnet_processor_v3 import ConceptNetStreamProcessor, preprocess_conceptnet

## 1. Initial Preprocessing
First, we'll preprocess the raw ConceptNet data. This only needs to be done once.

In [2]:
# Preprocess both English and German ConceptNet data.
# This step is expensive and only has to happen once, so it is skipped when the
# processed parquet file for every language is already present. Delete those
# files to force a rebuild (e.g. after changing min_weight or the raw input).
INPUT_DIR = '../Data/Input'
OUTPUT_DIR = '../Data/Processed'
LANGUAGES = ['en', 'de']

# File names mirror the paths preprocess_conceptnet reports when it saves its results.
expected_outputs = [
    Path(OUTPUT_DIR) / f'conceptnet_{lang}_processed.parquet'
    for lang in LANGUAGES
]

if all(path.exists() for path in expected_outputs):
    print("Processed files already exist; skipping preprocessing.")
else:
    preprocess_conceptnet(
        input_dir=INPUT_DIR,
        output_dir=OUTPUT_DIR,
        languages=LANGUAGES,
        min_weight=1.0
    )

Processing en ConceptNet data...
Saved processed en data to ..\Data\Processed\conceptnet_en_processed.parquet
Processing de ConceptNet data...
Saved processed de data to ..\Data\Processed\conceptnet_de_processed.parquet
Preprocessing complete. Use load_processed_data() to load the results.


## 2. Loading Processed Data
Now we can load samples of the processed data efficiently.

In [3]:
# Create the processor, pointing it at the raw-input and processed-data directories
processor = ConceptNetStreamProcessor(input_dir='../Data/Input', output_dir='../Data/Processed')

In [4]:
# Measure loading time for a 10% sample of the English data.
# perf_counter() is a monotonic, high-resolution clock, so a sub-second load is
# timed reliably; time.time() can jump with system clock changes and is coarse
# on some platforms.
start_time = time.perf_counter()

english_sample = processor.load_processed_data(
    lang='en',
    sample_size=0.1  # 10% sample
)

load_time = time.perf_counter() - start_time
print(f"Loaded {len(english_sample)} English assertions in {load_time:.2f} seconds")
# deep=True also counts the contents of object-dtype columns rather than only
# their pointers, so the figure is not under-reported for string data.
print(f"Memory usage: {english_sample.memory_usage(deep=True).sum() / (1024*1024):.2f} MB")

# Display sample of the data
english_sample.head()

Loaded 314339 English assertions in 0.13 seconds
Memory usage: 24.23 MB


Unnamed: 0,clean_start,clean_end,relation,weight,source_lang,target_lang
34536,silverware,kitchen,AtLocation,3.464,en,en
155482,demonstrative_pronoun,a,DerivedFrom,1.0,en,en
130980,chairperson,chair,DerivedFrom,1.0,en,en
17957,cat,fight,AtLocation,1.0,en,en
102641,askt,ask,DerivedFrom,1.0,en,en


In [5]:
# Measure loading time for a 10% sample of the German data.
# perf_counter() is a monotonic, high-resolution clock, so a sub-second load is
# timed reliably; time.time() can jump with system clock changes and is coarse
# on some platforms.
start_time = time.perf_counter()

german_sample = processor.load_processed_data(
    lang='de',
    sample_size=0.1  # 10% sample
)

load_time = time.perf_counter() - start_time
print(f"Loaded {len(german_sample)} German assertions in {load_time:.2f} seconds")
# deep=True also counts the contents of object-dtype columns rather than only
# their pointers, so the figure is not under-reported for string data.
print(f"Memory usage: {german_sample.memory_usage(deep=True).sum() / (1024*1024):.2f} MB")

# Display sample of the data
german_sample.head()

Loaded 107503 German assertions in 0.08 seconds
Memory usage: 8.22 MB


Unnamed: 0,clean_start,clean_end,relation,weight,source_lang,target_lang
199003,n,pflichtethik,DistinctFrom,1.0,de,de
1332,v,preisen,Antonym,1.0,de,de
43281,fernsehen,v,DerivedFrom,1.0,de,de
128079,stierend,v,DerivedFrom,1.0,de,de
183696,n,schlichtwohnung,DistinctFrom,1.0,de,de


## 3. Flexible Data Loading
Let's demonstrate different sampling options.

In [6]:
# Cap the load at a fixed number of rows, independent of the dataset's total size
english_top_10k = processor.load_processed_data(lang='en', max_rows=10_000)

print(f"Loaded top {len(english_top_10k)} English assertions")
english_top_10k.head()

Dask loading failed with error: 'DataFrame' object has no attribute 'compute'
Falling back to pandas for data loading
Loaded top 10000 English assertions


Unnamed: 0,clean_start,clean_end,relation,weight,source_lang,target_lang
0,n,1,Antonym,1.0,en,en
1,n,24_hour_clock,Antonym,1.0,en,en
2,n,12_hour_clock,Antonym,1.0,en,en
3,n,3,Antonym,1.0,en,en
4,n,d.c,Antonym,1.0,en,en


In [7]:
# Pull only a 1% sample -- small enough for quick experimentation
english_mini = processor.load_processed_data(lang='en', sample_size=0.01)

print(f"Loaded {len(english_mini)} English assertions (1% sample)")
english_mini.head()

Loaded 31434 English assertions (1% sample)


Unnamed: 0,clean_start,clean_end,relation,weight,source_lang,target_lang
34536,silverware,kitchen,AtLocation,3.464,en,en
155482,demonstrative_pronoun,a,DerivedFrom,1.0,en,en
130980,chairperson,chair,DerivedFrom,1.0,en,en
17957,cat,fight,AtLocation,1.0,en,en
102641,askt,ask,DerivedFrom,1.0,en,en


## 4. Data Statistics
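Let's compute summary statistics for the 10% English and German samples loaded above (the counts below describe the samples, not the full processed datasets).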

In [8]:
def analyze_dataset(df, name):
    """Print a short statistical report for a ConceptNet assertion table.

    The report contains the row count, the number of distinct values in the
    ``clean_start``, ``clean_end`` and ``relation`` columns, the minimum and
    maximum of ``weight`` (two decimals), and the five most frequent values
    of ``relation``, ``clean_start`` and ``clean_end``.

    Args:
        df: Table with ``clean_start``, ``clean_end``, ``relation`` and
            ``weight`` columns (a pandas DataFrame in this notebook).
        name: Label shown in the report header.

    Returns:
        None. Everything is written to standard output.
    """
    print(f"\n=== {name} Dataset Analysis ===")

    row_count = len(df)
    print(f"Total assertions: {row_count}")

    source_count = df['clean_start'].nunique()
    print(f"Unique source concepts: {source_count}")

    target_count = df['clean_end'].nunique()
    print(f"Unique target concepts: {target_count}")

    relation_count = df['relation'].nunique()
    print(f"Unique relation types: {relation_count}")

    weights = df['weight']
    lightest, heaviest = weights.min(), weights.max()
    print(f"Weight range: {lightest:.2f} - {heaviest:.2f}")

    # Top-5 frequency listings, in the order: relations, sources, targets
    top_listings = (
        ("\nTop relation types:", 'relation'),
        ("\nTop source concepts:", 'clean_start'),
        ("\nTop target concepts:", 'clean_end'),
    )
    for heading, column in top_listings:
        print(heading)
        print(df[column].value_counts().head(5))

# Report on each language sample in turn: English first, then German
for sample, label in ((english_sample, "English"), (german_sample, "German")):
    analyze_dataset(sample, label)


=== English Dataset Analysis ===
Total assertions: 314339
Unique source concepts: 48192
Unique target concepts: 113043
Unique relation types: 34
Weight range: 1.00 - 12.34

Top relation types:
relation
RelatedTo      155389
FormOf          37944
DerivedFrom     32436
HasContext      23304
IsA             19812
Name: count, dtype: int64[pyarrow]

Top source concepts:
clean_start
n       161273
a        27978
v        24373
en_1      5752
wn        3709
Name: count, dtype: int64[pyarrow]

Top target concepts:
clean_end
n           14535
wn           3824
plant        2042
artifact     1856
a            1694
Name: count, dtype: int64[pyarrow]

=== German Dataset Analysis ===
Total assertions: 107503
Unique source concepts: 43891
Unique target concepts: 35445
Unique relation types: 9
Weight range: 1.00 - 2.83

Top relation types:
relation
FormOf         30982
RelatedTo      25407
IsA            18621
DerivedFrom    15525
Synonym        11871
Name: count, dtype: int64[pyarrow]

Top source 