In [3]:
URL_FILE_1 = "https://raw.githubusercontent.com/giankev/wikidata_cultural_classifier/refs/heads/main/dataset_parser.py"
URL_FILE_2 = "https://raw.githubusercontent.com/giankev/wikidata_cultural_classifier/refs/heads/main/wiki_extractor.py"
URL_FILE_3 = "https://raw.githubusercontent.com/giankev/wikidata_cultural_classifier/refs/heads/main/custom_dataset.py"

print("Scaricando il file 1...")
!wget {URL_FILE_1}

print("\nScaricando il file 2...")
!wget {URL_FILE_2}

print("\nScaricando il file 3...")
!wget {URL_FILE_3}

print("\nDownload completati. Contenuto della directory corrente:")

Scaricando il file 1...
--2025-05-03 19:41:12--  https://raw.githubusercontent.com/giankev/wikidata_cultural_classifier/refs/heads/main/dataset_parser.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.108.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 9788 (9.6K) [text/plain]
Saving to: ‘dataset_parser.py.4’


2025-05-03 19:41:12 (22.4 MB/s) - ‘dataset_parser.py.4’ saved [9788/9788]


Scaricando il file 2...
--2025-05-03 19:41:12--  https://raw.githubusercontent.com/giankev/wikidata_cultural_classifier/refs/heads/main/wiki_extractor.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.108.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Leng

In [4]:
!pip install xgboost pandas scikit-learn wikidata


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [5]:
# Imports
import os
import xgboost as xgb
import pandas as pd
import numpy as np

from custom_dataset import CustomData

In [22]:
# Percorsi file
RAW_TEST_CSV        = '~/content/valid.csv'
PROCESSED_TEST_CSV  = '~/content/valid_processed.csv'

# Flag per forzare il ricalcolo
FORCE_REPROCESS = False

df_processed = None

# --- Try to load the already processed file, if it exists ---
if not FORCE_REPROCESS and os.path.exists(PROCESSED_TEST_CSV):
    print(f"Loading preprocessed data: {PROCESSED_TEST_CSV}...")
    try:
        df_processed = pd.read_csv(PROCESSED_TEST_CSV)
        print("Loading data done.")
    except Exception as e:
        print(f"Loading error {PROCESSED_TEST_CSV}, starting recomputing: {e}")
        df_processed = None  # Force recomputing

# --- If not existing or forced to, recompute the features ---
if df_processed is None:
    print("Processing test data non labeled...")
    try:
        # 1. Load the initial CSV file
        df_test_raw = pd.read_csv(RAW_TEST_CSV)
        
        # 2. Instantiate preprocessor and add features
        processor_test = CustomData(df_test_raw)
        df_test_featured = processor_test.add_feature()
        
        if df_test_featured is not None:
            # 3. Apply preprocess (encoding, scaling, ecc.)
            df_processed = processor_test.preprocess_data(df_test_featured)
            
            if df_processed is not None:
                # 4. Save result on an output CSV
                df_processed.to_csv(PROCESSED_TEST_CSV, index=False)
                print(f"Test data processed and saved in {PROCESSED_TEST_CSV}.")
            else:
                print("WARNING: Test data is None or empty, saving failed.")
        else:
            print("ERROR: Failed to add preprocessed features.")
        
    except Exception as e:
        print(f"ERROR during data preprocessing: {e}")
        df_processed = None

# --- final check ---
if df_processed is not None:
    print("\n--- Test processed ---")
    print(f"Shape of preprocessed data: {df_processed.shape}")
else:
    print("\nERROR: could not load preprocessed data.")

Processing test data non labeled...
Adding feature...

Fetch Summary:
  Attempted: 300
  Successful Fetches (Extractor created): 300
  Failed Fetches (Extractor is None): 0
Feature added...

--- Preprocessing DataFrame (Initial rows: 300) ---
Handling Missing Values: Nessuna riga con NaN trovata.
--- Preprocessing Completo (Final rows: 300, Final columns: 33) ---
Test data processed and saved in ~/content/valid_processed.csv.

--- Test processed ---
Shape of preprocessed data: (300, 33)


In [25]:
# Load the test data
try:
    csv_path = '~/content/valid_augmented.csv'
    df_aug = pd.read_csv(csv_path)
    print(f"Augmented data shape: {df_aug.shape}")
except Exception as e:
    print(f"ERROR - could not load augmented data: {e}")

# Display the first few rows
df_aug.head()

Augmented data shape: (300, 12)


Unnamed: 0,item,name,type,category,subcategory,label,title,page_length,num_links,mean_sitelinks_count,median_sitelinks_count,std_sitelinks_count
0,http://www.wikidata.org/entity/Q15786,1. FC Nürnberg,entity,sports,sports club,cultural representative,1. FC Nürnberg,14951,92,48.6,33.0,53.699644
1,http://www.wikidata.org/entity/Q268530,77 Records,entity,music,record label,cultural exclusive,77 Records,1254,33,26.875,12.0,38.68442
2,http://www.wikidata.org/entity/Q216153,A Bug's Life,entity,comics and anime,animated film,cultural representative,A Bug's Life,32226,159,46.348387,38.0,40.983017
3,http://www.wikidata.org/entity/Q593,A Gang Story,entity,films,film,cultural exclusive,A Gang Story,324,6,96.8,21.0,111.348821
4,http://www.wikidata.org/entity/Q192185,Aaron Copland,entity,performing arts,choreographer,cultural representative,Aaron Copland,53274,227,34.306667,25.0,36.933805


In [26]:
# augmented data with Custom Data
df_processed.columns.tolist()

['type',
 'label',
 'number_sitelinks',
 'sitelinks_translation_entropy',
 'number_claims',
 'po_P495',
 'po_P1343',
 'po_P2596',
 'po_P17',
 'number_of_P31',
 'sum_cultural_claims',
 'po_P172',
 'po_P1268',
 'po_P136',
 'category_architecture',
 'category_biology',
 'category_books',
 'category_comics and anime',
 'category_fashion',
 'category_films',
 'category_food',
 'category_geography',
 'category_gestures and habits',
 'category_history',
 'category_literature',
 'category_media',
 'category_music',
 'category_performing arts',
 'category_philosophy and religion',
 'category_politics',
 'category_sports',
 'category_transportation',
 'category_visual arts']

In [27]:
# loaded augmented data from wikipedia scraping
df_aug.columns.tolist()

['item',
 'name',
 'type',
 'category',
 'subcategory',
 'label',
 'title',
 'page_length',
 'num_links',
 'mean_sitelinks_count',
 'median_sitelinks_count',
 'std_sitelinks_count']

In [58]:
data_aug_aligned = df_aug.reindex(df_processed.index)
print(f"\nShape train_aug_aligned dopo reindex: {data_aug_aligned.shape}")

if df_processed.index.equals(df_aug.index):
    print("Features merged successfully")
else:
    print("ERROR: could not merge features by index")

data_aug_aligned = data_aug_aligned.drop(columns=['item', 'name', 'type', 'category', 'subcategory', 'title'])
df_val_aug_concat = pd.concat([df_processed, data_aug_aligned], axis=1)
y_val = df_val_aug_concat['label']
df_val_aug_concat = df_val_aug_concat.drop(columns=['label'])

df_val_aug_concat.columns = df_val_aug_concat.columns.str.lower()

features = df_val_aug_concat.columns.tolist()

for name in features:
    print(name)


Shape train_aug_aligned dopo reindex: (300, 12)
Features merged successfully
type
number_sitelinks
sitelinks_translation_entropy
number_claims
po_p495
po_p1343
po_p2596
po_p17
number_of_p31
sum_cultural_claims
po_p172
po_p1268
po_p136
category_architecture
category_biology
category_books
category_comics and anime
category_fashion
category_films
category_food
category_geography
category_gestures and habits
category_history
category_literature
category_media
category_music
category_performing arts
category_philosophy and religion
category_politics
category_sports
category_transportation
category_visual arts
page_length
num_links
mean_sitelinks_count
median_sitelinks_count
std_sitelinks_count


In [59]:
df = df_val_aug_concat.apply(pd.to_numeric, errors='coerce')

print("Shape before scaling:")
print(f"  Data: {df.shape}")

Shape before scaling:
  Data: (300, 37)


In [60]:
# Load the XGBoost model
model_path = '~/content/xgb_best_model_77.json'
booster = xgb.Booster()
booster.load_model(model_path)
booster.feature_names = [f.lower() for f in booster.feature_names]

df.columns = booster.feature_names

# Convert test data to DMatrix for inference
dtest = xgb.DMatrix(df)

# Run inference
y_pred_raw = booster.predict(dtest)
y_pred = np.argmax(y_pred_raw, axis=1)

In [61]:
# Map numeric predictions to class labels
label_map = {
    0: "cultural representative",
    1: "cultural exclusive",
    2: "cultural agnostic"
}

df['label'] = [label_map[i] for i in y_pred]

In [62]:
# Save to a new CSV
output_path = '~/content/test_out_noLLM.csv'
df.to_csv(output_path, index=False)

print(f"Saved predictions to {output_path}")

Saved predictions to ~/content/test_out_noLLM.csv


In [None]:
import os
import json
import pandas as pd
import numpy as np  # Import numpy

label_mapping = {
    'cultural agnostic': 0,
    'cultural representative': 1,
    'cultural exclusive': 2
}

y_pred = y_pred.map(label_mapping)
y_val= y_val.map(label_mapping)

from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    precision_recall_fscore_support,
    classification_report
)
import seaborn as sns
import matplotlib.pyplot as plt

LABELS = ["cultural agnostic", "cultural representative", "cultural exclusive"]

def evaluate(preds, labels):
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="macro", zero_division=0)
    acc = np.mean(preds == labels)
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

# === Plotting ===
os.makedirs("nonlm_based", exist_ok=True)
cm = confusion_matrix(y_val, y_pred, labels=[0, 1, 2], normalize="true")
df_cm = pd.DataFrame(cm, index=LABELS, columns=LABELS)

plt.figure(figsize=(8, 6))
sns.heatmap(df_cm, annot=True, fmt=".2f", cmap="Blues", cbar=True)
plt.title("Confusion Matrix (Ultra Non-LM Based + Stacking)")
plt.ylabel("True Label")
plt.xlabel("Predicted Label")
plt.tight_layout()
plt.savefig("nonlm_based/confusion_matrix_ultra_stacking.png")

# Classification Report
report = classification_report(y_val, y_pred, target_names=LABELS, zero_division=0)
print(report)
with open("nonlm_based/classification_report_ultra_stacking.json", "w") as f:
    json.dump(classification_report(y_val, y_pred, target_names=LABELS, output_dict=True), f, indent=2)

print("\n\u2705 Ultra Stacking Mode Completed!")

TypeError: the first argument must be callable