In [21]:
### MODEL ###
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.cluster import KMeans
from sklearn.cross_decomposition import CCA
from sklearn.linear_model import Ridge
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer
import lightgbm as lgb

from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
### EDA ###
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import euclidean_distances
from scipy.spatial import procrustes

In [4]:
### SCALER ###
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

In [25]:
### METRIC ###
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import ndcg_score

In [6]:
### ETC ###
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import os
import time
from tqdm import tqdm
from box import Box

In [67]:
# Notebook-wide settings, wrapped in a Box so keys are reachable as attributes
# (e.g. ``config.sim_path``).
config = Box(dict(
    # Input files: the two CSVs whose "embedding" column MakeLGBDataSet loads.
    data_path="./data",
    X_file_name="tag_name_openai_emb_df.csv",
    y_file_name="tag_node_emb_origin.csv",
    # Preprocessing switches.
    norm=False,
    scaling=True,
    # Fractions of queries held out for validation / test, and the seed.
    valid_size=0.1,
    test_size=0.1,
    random_state=42,
    # Candidate generation: topic count for the LDA variant, path of the saved
    # similarity matrix, and positives / negatives sampled per query.
    n_topics=5,
    sim_path="./cosine_similarity_matrix.npy",
    candidate_k=50,
    negative_k=100,
))

In [38]:
class MakeLGBDataSet:
    """Build LightGBM ranking datasets from embedding CSVs and a similarity matrix.

    Constructing an instance runs the whole pipeline:

    1. load the ``embedding`` column of the X and y CSV files,
    2. build one candidate group per row of the matrix stored at
       ``config.sim_path`` (top-k positives plus random negatives),
    3. flatten the groups into feature rows, integer labels and group sizes,
    4. split by query into train / valid ``lgb.Dataset`` objects and a flat
       held-out test set.

    Attributes set along the way: ``X``, ``y``, ``total_size``, ``groups``,
    ``group_score``, ``X_flattened``, ``y_flattened``, ``group_sizes``,
    ``train_lgbdataset``, ``valid_lgbdataset``, ``X_test``, ``y_test`` and
    ``test_group_sizes`` (group sizes matching the rows of ``X_test``).
    """

    def __init__(self, config):
        """Run the full pipeline described on the class using ``config``."""
        self.config = config
        # Honour the configured data directory; fall back to the historic default.
        data_path = getattr(config, "data_path", "./data")
        self.X = self.load_data(config.X_file_name, data_path)
        self.total_size = len(self.X)
        self.y = self.load_data(config.y_file_name, data_path)
        self.groups, self.group_score = self.create_sim_candidates(
            config.candidate_k, config.negative_k
        )
        self.X_flattened, self.y_flattened, self.group_sizes = self.make_flatten(
            self.X, self.groups, self.group_score, self.total_size
        )
        (
            self.train_lgbdataset,
            self.valid_lgbdataset,
            self.X_test,
            self.y_test,
        ) = self.random_split(
            self.X_flattened,
            self.y_flattened,
            self.group_sizes,
            config.test_size,
            config.valid_size,
            getattr(config, "random_state", 42),
        )

    def load_data(self, file_name, data_path="./data"):
        """Read ``data_path/file_name`` and return its parsed ``embedding`` column.

        Each cell of the ``embedding`` column is a string that is evaluated and
        converted to a NumPy array.

        Returns:
            pandas.Series whose elements are NumPy arrays.
        """
        df = pd.read_csv(os.path.join(data_path, file_name))
        # Registers ``progress_apply`` on pandas objects so parsing shows a bar.
        tqdm.pandas(desc="Processing embeddings")
        # SECURITY NOTE(review): eval() executes whatever text the CSV contains.
        # Only load trusted files; consider ast.literal_eval / json.loads once the
        # on-disk format is confirmed to be a plain list literal.
        df["embedding"] = df["embedding"].progress_apply(lambda x: np.array(eval(x)))
        return df["embedding"]

    def create_sim_candidates(self, candidate_k=30, negative_k=60, random_state=None):
        """Build one candidate group per row of the saved similarity matrix.

        For every row the ``candidate_k`` highest-scoring columns become positives
        (score = the matrix value) and up to ``negative_k`` of the remaining columns
        are sampled without replacement as negatives (score = 0).

        Args:
            candidate_k: number of top-scoring columns kept per row.
            negative_k: number of negatives sampled per row; clamped to the number
                of columns that are not already positives.
            random_state: seed for negative sampling. ``None`` uses
                ``config.random_state`` when present, otherwise NumPy's global RNG.

        Returns:
            ``(groups, group_score)``: two lists with one array per row, holding
            candidate column indices and their scores in the same order.
        """
        cosine_similarity_matrix = np.load(self.config.sim_path)
        if random_state is None:
            random_state = getattr(self.config, "random_state", None)
        rng = np.random if random_state is None else np.random.RandomState(random_state)

        groups = []
        group_score = []
        for row in cosine_similarity_matrix:
            # Positive candidates
            top_k_indices = np.argsort(-row)[:candidate_k]
            top_k_scores = row[top_k_indices]

            # Negative candidates: everything that is not a positive. A boolean
            # mask avoids a Python-level membership scan for every column.
            is_negative = np.ones(len(row), dtype=bool)
            is_negative[top_k_indices] = False
            negative_pool = np.flatnonzero(is_negative)
            negative_indices = rng.choice(
                negative_pool,
                size=min(negative_k, len(negative_pool)),
                replace=False,
            )
            negative_scores = np.zeros(len(negative_indices))

            groups.append(np.concatenate([top_k_indices, negative_indices]))
            group_score.append(np.concatenate([top_k_scores, negative_scores]))

        return groups, group_score

    def create_hard_negatives(self, candidate_k=30, hard_negative_k=60):
        """Build groups of top-k positives plus the next-ranked "hard" negatives.

        Unlike ``create_sim_candidates`` the negatives keep their matrix value as
        score and are the ``hard_negative_k`` columns ranked right after the
        positives.

        Returns:
            ``(groups, group_scores)``: one index array and one score array per row.
        """
        cosine_similarity_matrix = np.load(self.config.sim_path)
        groups = []
        group_scores = []

        for row in cosine_similarity_matrix:
            ranking = np.argsort(-row)  # sort once, slice twice

            # Positive candidates
            top_k_indices = ranking[:candidate_k]
            # Hard negative candidates (middle-ranked)
            hard_neg_indices = ranking[candidate_k:candidate_k + hard_negative_k]

            all_indices = np.concatenate([top_k_indices, hard_neg_indices])
            groups.append(all_indices)
            group_scores.append(row[all_indices])

        return groups, group_scores

    def create_lda_candidates(self, n_topics=None, candidate_k=30, negative_k=60):
        """Group matrix columns by LDA topic.

        Args:
            n_topics: number of LDA components; ``None`` reads ``config.n_topics``
                at call time.
            candidate_k: how many of each column's highest-weight topics are kept.
            negative_k: accepted for signature compatibility; not used.

        Returns:
            List with one index array per topic.

        NOTE(review): with ``candidate_k`` >= ``n_topics`` every topic is kept for
        every column, so all groups are identical - confirm the intended selection.
        """
        if n_topics is None:
            n_topics = self.config.n_topics
        cosine_similarity_matrix = np.load(self.config.sim_path)
        lda = LatentDirichletAllocation(n_components=n_topics, random_state=42)
        lda.fit(cosine_similarity_matrix)
        node_topic_distribution = lda.components_.T
        node_topics_k = np.argsort(node_topic_distribution, axis=1)[:, -candidate_k:]

        groups = []
        for topic in range(n_topics):
            group_indices = np.where(node_topics_k == topic)[0]
            groups.append(group_indices)
        return groups

    def make_flatten(self, X, groups, group_score, total_size):
        """Flatten per-query candidate groups into LightGBM ranking inputs.

        Args:
            X: pandas Series of feature vectors, indexed positionally.
            groups: per-query arrays of positions into ``X``.
            group_score: per-query arrays of scores aligned with ``groups``.
            total_size: number of queries to flatten.

        Returns:
            ``(X_flattened, y_flattened, group_sizes)`` where labels are the scores
            multiplied by 100, truncated to int and floored at 0.
        """
        values = X.values
        X_flattened = []
        y_flattened = []
        group_sizes = []

        for datapoint in range(total_size):
            candidates = groups[datapoint]
            scores = group_score[datapoint]
            for idx, score in zip(candidates, scores):
                X_flattened.append(values[idx])
                y_flattened.append(score)
            group_sizes.append(len(candidates))

        y_flattened = (np.array(y_flattened) * 100).astype(int)
        # LightGBM ranking objectives reject negative labels; a negative
        # similarity is treated like an unrelated item (label 0).
        y_flattened = np.clip(y_flattened, 0, None)
        print(f"Scaled and flattened scores: {y_flattened}")
        return np.array(X_flattened), y_flattened, np.array(group_sizes)

    def create_lgbdataset(self, X_flattened, y_flattened, group_sizes, reference=None):
        """Wrap flattened features, labels and group sizes in an ``lgb.Dataset``."""
        lgbdataset = lgb.Dataset(X_flattened, label=y_flattened,
                                 group=group_sizes, reference=reference)
        return lgbdataset

    @staticmethod
    def _split_counts(num_queries, test_size, valid_size):
        """Return ``(train_count, valid_count)``; remaining queries form the test set."""
        train_count = int((1 - test_size - valid_size) * num_queries)
        valid_count = int(valid_size * num_queries)
        return train_count, valid_count

    @staticmethod
    def _group_bounds(group_sizes):
        """Return row offsets: query ``q`` owns rows ``bounds[q]:bounds[q + 1]``."""
        return np.concatenate(([0], np.cumsum(group_sizes))).astype(int)

    def split(self, X_flattened, y_flattened, group_sizes, test_size=0.2, valid_size=0.1):
        """Split by query in the original order: train, then valid, then test.

        Returns:
            ``(train_lgbdataset, valid_lgbdataset, X_test, y_test)``. The group sizes
            of the test rows are stored on ``self.test_group_sizes``.
        """
        group_sizes = np.asarray(group_sizes)
        num_queries = len(group_sizes)
        train_count, valid_count = self._split_counts(num_queries, test_size, valid_size)

        # Offsets make empty partitions harmless (no indexing of ``[-1]``).
        bounds = self._group_bounds(group_sizes)
        train_end = bounds[train_count]
        valid_end = bounds[train_count + valid_count]
        total_end = bounds[-1]

        X_train, y_train = X_flattened[:train_end], y_flattened[:train_end]
        X_valid, y_valid = X_flattened[train_end:valid_end], y_flattened[train_end:valid_end]
        X_test, y_test = X_flattened[valid_end:total_end], y_flattened[valid_end:total_end]

        group_train = group_sizes[:train_count]
        group_valid = group_sizes[train_count:train_count + valid_count]
        self.test_group_sizes = group_sizes[train_count + valid_count:]

        train_lgbdataset = self.create_lgbdataset(X_train, y_train, group_train)
        valid_lgbdataset = self.create_lgbdataset(X_valid, y_valid, group_valid, train_lgbdataset)

        return train_lgbdataset, valid_lgbdataset, X_test, y_test

    def random_split(self, X_flattened, y_flattened, group_sizes, test_size=0.2, valid_size=0.1, random_state=42):
        """Split by query after shuffling the query order.

        Whole groups are kept together. ``group_sizes`` itself is never shuffled:
        the flattened rows are stored in original query order, so row offsets must
        be derived from the original sizes and looked up by original query id.

        Returns:
            ``(train_lgbdataset, valid_lgbdataset, X_test, y_test)``. The group sizes
            of the test rows are stored on ``self.test_group_sizes``.
        """
        group_sizes = np.asarray(group_sizes)
        num_queries = len(group_sizes)

        # Permute query ids only (NumPy instead of sklearn.utils.shuffle, which
        # was also reordering group_sizes and desynchronising it from the rows).
        query_order = np.random.RandomState(random_state).permutation(num_queries)

        train_count, valid_count = self._split_counts(num_queries, test_size, valid_size)
        train_indices = query_order[:train_count]
        valid_indices = query_order[train_count:train_count + valid_count]
        test_indices = query_order[train_count + valid_count:]

        bounds = self._group_bounds(group_sizes)

        def get_flat_indices(indices):
            # Row positions of the given queries, group by group, in split order.
            if len(indices) == 0:
                return np.array([], dtype=int)
            return np.concatenate([np.arange(bounds[q], bounds[q + 1]) for q in indices])

        train_flat_indices = get_flat_indices(train_indices)
        valid_flat_indices = get_flat_indices(valid_indices)
        test_flat_indices = get_flat_indices(test_indices)

        X_train = X_flattened[train_flat_indices]
        X_valid = X_flattened[valid_flat_indices]
        X_test = X_flattened[test_flat_indices]

        y_train = y_flattened[train_flat_indices]
        y_valid = y_flattened[valid_flat_indices]
        y_test = y_flattened[test_flat_indices]

        group_train = group_sizes[train_indices]
        group_valid = group_sizes[valid_indices]
        self.test_group_sizes = group_sizes[test_indices]

        train_lgbdataset = self.create_lgbdataset(X_train, y_train, group_train)
        valid_lgbdataset = self.create_lgbdataset(X_valid, y_valid, group_valid, train_lgbdataset)

        return train_lgbdataset, valid_lgbdataset, X_test, y_test

    def get_lgbdataset(self):
        """Return ``(train_lgbdataset, valid_lgbdataset)``."""
        return self.train_lgbdataset, self.valid_lgbdataset

    def get_test(self):
        """Return the flat held-out ``(X_test, y_test)`` arrays."""
        return self.X_test, self.y_test

In [14]:
def metric(y_true, y_pred):
    """Print and return ranking / regression metrics for one list of items.

    All items are treated as a single ranked list.

    Args:
        y_true: 1-D array of graded relevance labels (larger = more relevant).
        y_pred: 1-D array of predicted scores, aligned with ``y_true``.

    Returns:
        dict with keys ``"ndcg@10"``, ``"map"``, ``"mrr"``, ``"accuracy@10"``,
        ``"mse"`` and ``"rmse"``.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    ndcg_at_k = ndcg_score([y_true], [y_pred], k=10)
    print(f"NDCG@10: {ndcg_at_k:.4f}")

    # average_precision_score expects binary targets, so graded labels are
    # reduced to relevant (> 0) / not relevant - the same rule the MRR uses.
    is_relevant = y_true > 0
    map_score = average_precision_score(is_relevant.astype(int), y_pred)
    print(f"MAP: {map_score:.4f}")

    # Reciprocal rank of the first relevant item in predicted order (0 if none).
    ranking = np.argsort(-y_pred)
    relevant_positions = np.flatnonzero(is_relevant[ranking])
    mrr = 1 / (relevant_positions[0] + 1) if relevant_positions.size else 0.0
    print(f"MRR: {mrr:.4f}")

    # Overlap between the predicted top-k and the true top-k, as a fraction of k.
    k = 10
    predicted_top_k = set(ranking[:k])
    true_top_k = set(np.argsort(-y_true)[:k])
    acc_at_10 = len(predicted_top_k & true_top_k) / k
    print(f"Accuracy@10: {acc_at_10:.4f}")

    mse = mean_squared_error(y_true, y_pred)
    rmse = mse ** 0.5
    print(f"MSE: {mse:.4f}, RMSE: {rmse:.4f}")

    return {
        "ndcg@10": ndcg_at_k,
        "map": map_score,
        "mrr": mrr,
        "accuracy@10": acc_at_10,
        "mse": mse,
        "rmse": rmse,
    }

In [87]:
def main(config):
    """Train a LambdaRank booster from ``config`` and report test metrics.

    Builds the datasets with ``MakeLGBDataSet``, trains for 100 boosting rounds
    while evaluating NDCG on the validation set, scores the held-out test rows,
    prints the metrics via ``metric`` and returns the trained model.
    """
    data = MakeLGBDataSet(config)
    train_set, valid_set = data.get_lgbdataset()
    X_test, y_true = data.get_test()

    lambdarank_params = dict(
        objective="lambdarank",
        metric="ndcg",
        ndcg_eval_at=[1, 5, 10],
        learning_rate=0.05,
        num_leaves=10,
        # One gain value per possible integer label 0..100.
        label_gain=[gain for gain in range(101)],
        max_depth=15,
        bagging_fraction=0.5,
        bagging_freq=10,
    )

    model = lgb.train(
        lambdarank_params,
        train_set,
        num_boost_round=100,
        valid_sets=[valid_set],
    )

    test_scores = model.predict(X_test)
    metric(y_true, test_scores)
    return model

In [None]:
# Run the pipeline with the notebook-level config and keep the model that main() returns.
model = main(config)