In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
def prepare_data(X, y, test_size=0.2, random_state=42):
    """
    Split the data into train/test partitions and standardize the features.

    Parameters:
    - X: Feature matrix (embeddings)
    - y: Target labels
    - test_size: Proportion of the data held out for testing
    - random_state: Seed that makes the split reproducible

    Returns:
    - Tuple of (scaled train features, scaled test features,
      train labels, test labels, fitted scaler)
    """
    # Stratifying on y keeps the class proportions the same in both partitions.
    split_options = dict(test_size=test_size, random_state=random_state, stratify=y)
    X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

    # Fit the scaler on the training partition only, then apply it to both,
    # so no statistics from the test partition leak into preprocessing.
    scaler = StandardScaler().fit(X_train)
    return (
        scaler.transform(X_train),
        scaler.transform(X_test),
        y_train,
        y_test,
        scaler,
    )

In [3]:
def train_svm(X_train, y_train, kernel='rbf', C=1.0, random_state=42):
    """
    Fit a support vector classifier with probability estimates enabled.

    Parameters:
    - X_train: Scaled training features
    - y_train: Training labels
    - kernel: SVM kernel type ('linear', 'rbf', 'poly', 'sigmoid')
    - C: Regularization parameter
    - random_state: Seed forwarded to the classifier

    Returns:
    - Trained SVM classifier
    """
    # probability=True is required so predict_proba is available later.
    model = SVC(kernel=kernel, C=C, probability=True, random_state=random_state)
    return model.fit(X_train, y_train)


In [4]:
def evaluate_svm(classifier, X_test, y_test, cv=5):
    """
    Evaluate an SVM classifier on held-out data, print the results and return them.

    Parameters:
    - classifier: Trained SVM model
    - X_test: Scaled test features
    - y_test: Test labels
    - cv: Cross-validation setting forwarded to cross_val_score (default 5).
      Pass None to skip the cross-validation step entirely.

    Returns:
    - Dict with keys:
        'y_pred'                - predictions of `classifier` on X_test
        'classification_report' - text report as printed
        'confusion_matrix'      - confusion matrix array as printed
        'cv_scores'             - per-fold scores, or None when cv is None
    """
    # Predictions of the already-trained classifier
    y_pred = classifier.predict(X_test)

    # Detailed classification report
    report = classification_report(y_test, y_pred)
    print("Classification Report:")
    print(report)

    # Confusion Matrix
    matrix = confusion_matrix(y_test, y_pred)
    print("\nConfusion Matrix:")
    print(matrix)

    # Cross-validation score.
    # NOTE: cross_val_score refits fresh clones of `classifier` on folds of the
    # data passed here (the test split). The scores therefore describe how this
    # model configuration performs when trained on the test split; they do not
    # measure the fitted `classifier` itself.
    cv_scores = None
    if cv is not None:
        cv_scores = cross_val_score(classifier, X_test, y_test, cv=cv)
        print(f"\nCross-validation Scores: {cv_scores}")
        print(f"Mean CV Score: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

    return {
        'y_pred': y_pred,
        'classification_report': report,
        'confusion_matrix': matrix,
        'cv_scores': cv_scores,
    }

In [5]:
def tune_svm_hyperparameters(X_train, y_train, param_grid=None, cv=5, random_state=42):
    """
    Perform grid search for SVM hyperparameter tuning

    Parameters:
    - X_train: Scaled training features
    - y_train: Training labels
    - param_grid: Optional grid (dict or list of dicts) for GridSearchCV.
      When None, a default grid over C, kernel and gamma is used.
    - cv: Cross-validation setting forwarded to GridSearchCV
    - random_state: Seed forwarded to the SVC instances

    Returns:
    - SVC fitted on the full training data with the best parameters found and
      probability estimates enabled (so predict_proba is available)
    """
    from sklearn.model_selection import GridSearchCV

    if param_grid is None:
        c_values = [0.1, 1, 10, 100]
        gamma_values = ['scale', 'auto', 0.1, 1]
        # gamma has no effect on the linear kernel, so it gets its own sub-grid;
        # otherwise every linear candidate would be fitted once per gamma value.
        param_grid = [
            {'kernel': ['linear'], 'C': c_values},
            {'kernel': ['rbf', 'poly'], 'C': c_values, 'gamma': gamma_values},
        ]

    # refit=False: the winning configuration is refitted below with
    # probability=True, which avoids paying for probability calibration
    # on every candidate during the search.
    grid_search = GridSearchCV(
        SVC(random_state=random_state),
        param_grid,
        cv=cv,
        scoring='accuracy',
        refit=False
    )

    grid_search.fit(X_train, y_train)

    print("Best Parameters:", grid_search.best_params_)
    print("Best Cross-validation Score:", grid_search.best_score_)

    # Searched parameters take precedence over the defaults set here.
    final_params = {
        'probability': True,
        'random_state': random_state,
        **grid_search.best_params_,
    }
    best_classifier = SVC(**final_params)
    best_classifier.fit(X_train, y_train)
    return best_classifier

In [6]:
import joblib
def save_svm_model(classifier, scaler, model_path='svm_model.joblib', scaler_path='scaler.joblib'):
    """
    Persist a trained SVM classifier and its feature scaler with joblib.

    Parameters:
    - classifier: Trained SVM classifier
    - scaler: Feature scaler used for preprocessing
    - model_path: Destination file for the SVM model
    - scaler_path: Destination file for the scaler
    """
    # Write the model first, then the scaler.
    for artifact, destination in ((classifier, model_path), (scaler, scaler_path)):
        joblib.dump(artifact, destination)

    print(f"Model saved to {model_path}")
    print(f"Scaler saved to {scaler_path}")

In [7]:
def load_svm_model(model_path='svm_model.joblib', scaler_path='scaler.joblib'):
    """
    Restore a previously saved SVM classifier and its scaler from disk.

    Parameters:
    - model_path: File containing the saved SVM model
    - scaler_path: File containing the saved scaler

    Returns:
    - Loaded SVM classifier
    - Loaded scaler
    """
    # NOTE(review): joblib files are pickle-based; only load files from a trusted source.
    # The model is read first, then the scaler.
    classifier, scaler = (joblib.load(path) for path in (model_path, scaler_path))

    print(f"Model loaded from {model_path}")
    print(f"Scaler loaded from {scaler_path}")

    return classifier, scaler

In [8]:
# Prediction Function
def predict_with_saved_model(new_embeddings, classifier, scaler):
    """
    Make predictions using the saved model

    Parameters:
    - new_embeddings: New data to predict; a single embedding (1D) or a batch
      of embeddings (2D), given as an array or any array-like such as a list.
      The feature dimension must match the data the scaler was fitted on.
    - classifier: Loaded SVM classifier
    - scaler: Loaded scaler

    Returns:
    - Predictions
    - Prediction probabilities, or None when the classifier does not provide
      predict_proba (e.g. an SVC trained without probability=True)
    """
    # Accept lists/tuples as well as arrays
    new_embeddings = np.asarray(new_embeddings)

    # Ensure input is 2D array: a single embedding becomes a batch of one
    if new_embeddings.ndim == 1:
        new_embeddings = new_embeddings.reshape(1, -1)

    # Scale the new data using the saved scaler
    new_embeddings_scaled = scaler.transform(new_embeddings)

    # Predict classes
    predictions = classifier.predict(new_embeddings_scaled)

    # Predict probabilities only when the classifier supports them, so models
    # trained without probability estimates still yield class predictions.
    prediction_probs = None
    if hasattr(classifier, "predict_proba"):
        prediction_probs = classifier.predict_proba(new_embeddings_scaled)

    return predictions, prediction_probs

In [9]:
def main(X, y):
    """
    Run the end-to-end pipeline: split and scale, train, then evaluate.

    Parameters:
    - X: Feature matrix (embeddings)
    - y: Target labels

    Returns:
    - Trained SVM classifier
    - Scaler fitted on the training split
    """
    # Split into train/test partitions and standardize the features
    X_tr, X_te, y_tr, y_te, fitted_scaler = prepare_data(X, y)

    # Hyperparameter tuning is optional and not part of the default run;
    # tune_svm_hyperparameters(X_tr, y_tr) can be used instead of train_svm.

    # Fit with the default settings, then report metrics on the test partition
    model = train_svm(X_tr, y_tr)
    evaluate_svm(model, X_te, y_te)

    return model, fitted_scaler

In [10]:
def map_category(text):
    """
    Translate a category name into its integer code.

    Parameters:
    - text: Category name ('CORE', 'USER', 'PLATFORM' or 'HARDWARE')

    Returns:
    - Integer code for the category; any unrecognized value maps to 1 (CORE)
    """
    codes = dict(CORE=1, USER=2, PLATFORM=3, HARDWARE=4)

    if text in codes:
        return codes[text]
    # Fallback for anything outside the known categories
    return 1

def unmap_category(text):
    """
    Translate an integer category code back into its category name.

    Parameters:
    - text: Integer category code (1-4), despite the parameter name

    Returns:
    - Category name for the code; any unrecognized value maps to 'CORE'
    """
    names = {
        code: name
        for code, name in enumerate(('CORE', 'USER', 'PLATFORM', 'HARDWARE'), start=1)
    }

    if text in names:
        return names[text]
    # Fallback for anything outside the known codes
    return 'CORE'

In [11]:

from sentence_transformers import SentenceTransformer

def sentence_transformer_embedding(text, model_name='sentence-transformers/all-MiniLM-L6-v2'):
    """
    Encode text with a SentenceTransformer model.

    Parameters:
    - text: Input forwarded to the model's encode() (a string or a list of strings)
    - model_name: Name of the SentenceTransformer model to use

    Returns:
    - Whatever the model's encode() returns for the given input
    """
    # Loading a model is expensive, so each model is instantiated once and
    # reused by later calls instead of being reloaded every time.
    loaded_models = sentence_transformer_embedding.__dict__.setdefault('_loaded_models', {})
    if model_name not in loaded_models:
        loaded_models[model_name] = SentenceTransformer(model_name)

    return loaded_models[model_name].encode(text)

  from .autonotebook import tqdm as notebook_tqdm


In [12]:
import pandas as pd

# Load the labelled entities produced by the NER step.
llm_ner = pd.read_csv('data/all_ner_result.csv')

# Read the columns directly instead of looping over rows with iterrows(),
# which builds a new Series object for every row.
entities = llm_ner['entity'].tolist()
label = [map_category(category) for category in llm_ner['category']]

# Embed every entity string in a single batch.
X = sentence_transformer_embedding(entities)




In [13]:
# Run the pipeline on the embedded entities: split/scale, train, evaluate.
# Keeps the fitted classifier and scaler for saving below.
classifier, scaler = main(X, label)

Classification Report:
              precision    recall  f1-score   support

           1       0.95      0.98      0.96       277
           2       0.98      0.96      0.97       112
           3       0.84      0.73      0.78        22
           4       0.75      0.60      0.67        10

    accuracy                           0.95       421
   macro avg       0.88      0.82      0.84       421
weighted avg       0.95      0.95      0.95       421


Confusion Matrix:
[[271   2   2   2]
 [  5 107   0   0]
 [  6   0  16   0]
 [  3   0   1   6]]

Cross-validation Scores: [0.91764706 0.92857143 0.9047619  0.89285714 0.94047619]
Mean CV Score: 0.9169 (+/- 0.0337)


In [14]:
# Persist the classifier and scaler to the default paths
# (svm_model.joblib / scaler.joblib).
save_svm_model(classifier, scaler)

Model saved to svm_model.joblib
Scaler saved to scaler.joblib


In [15]:
import os
from getpass import getpass

import neo4j

# Connection settings are read from the environment so that no credentials
# are stored in the notebook. Host and username fall back to the local
# defaults; the password is prompted for when NEO4J_PASSWORD is not set.
host = os.environ.get("NEO4J_HOST", "localhost")
username = os.environ.get("NEO4J_USERNAME", "neo4j")
password = os.environ.get("NEO4J_PASSWORD") or getpass("Neo4j password: ")

# Bolt connection on the default port 7687; `session` is shared by later cells.
driver = neo4j.GraphDatabase.driver(f'bolt://{host}:7687', auth=(username, password))
session = driver.session()

def create_graph(query, **parameters):
    """
    Run a Cypher query on the shared module-level session.

    Parameters:
    - query: Cypher query text
    - parameters: Optional query parameters, forwarded to session.run as
      keyword arguments (referenced in the query as $name)

    Returns:
    - The result summary returned by consuming the query result
    """
    # Consume the result so the query is fully executed and any server-side
    # error is raised here rather than on a later, unrelated call.
    return session.run(query, **parameters).consume()

In [16]:
# Fetch the name of every node labelled CORE.
records, summary, key = driver.execute_query("""
MATCH (n:CORE) RETURN n.name""")

# Collect the names in result order.
core_entities = [record['n.name'] for record in records]

# Embed all names in one batch; row i corresponds to core_entities[i].
embeddings = sentence_transformer_embedding(core_entities)



In [17]:
# Reload the classifier and scaler from the default paths
# (svm_model.joblib / scaler.joblib).
loaded_classifier, loaded_scaler = load_svm_model()

Model loaded from svm_model.joblib
Scaler loaded from scaler.joblib


In [18]:
# Classify all CORE entities in one batched call instead of one call per
# entity, then add the predicted category as an extra label on each node.
if len(core_entities) > 0:
    predictions, _ = predict_with_saved_model(
        np.asarray(embeddings), loaded_classifier, loaded_scaler
    )

    for name, predicted_code in zip(core_entities, predictions):
        # Labels cannot be passed as Cypher parameters, so the label is
        # interpolated; unmap_category only returns one of four fixed names.
        category = unmap_category(predicted_code)
        session.run(f"""
    MATCH (n:CORE)
    WHERE n.name = $name
    SET n:{category} RETURN n.name, labels(n) as labels
    """, name=name).consume()  # consume so the write completes and errors surface here
