In [None]:
import pandas as pd
import warnings

path = r'/home/dwd/proj/Diversity-Improvement-in-CBR/CleanedDATA V12-05-2021.csv'
df = pd.read_csv(path, sep=';', encoding='windows-1252')

# NOTE(review): this silences every warning for the rest of the session, not
# only pandas ones -- consider narrowing it to a specific category.
warnings.filterwarnings('ignore')

# The header of the identifier column ends in a run of commas, and its values
# are cut at their first comma (presumably the same export artefact -- confirm
# against the raw file).
raw_id_column = 'Publication identifier,,,,,,,,,,,,,,,,,,'
series = df[raw_id_column]
comma_pos = series.str.find(',')
# Truncate only when a comma is found after the first character. Values with
# no comma (-1), with a leading comma (0) or that are missing (NaN) are kept
# untouched.
has_suffix = (comma_pos > 0).fillna(False).astype(bool)
series = series.mask(has_suffix, series.str.split(',', n=1).str[0])
# Write the cleaned column back through the frame itself: element-wise
# assignment into a column pulled out of `df` is chained assignment, which is
# not guaranteed to update `df` (and never does under pandas Copy-on-Write).
df[raw_id_column] = series

df = df.rename(columns={raw_id_column: 'Publication identifier'})
# df.to_csv(path, sep=';', index=False, encoding='windows-1252')

In [None]:
from typing import List, Dict, Tuple
from utils.case import Query, Case

def build_cb_from_dataframe(df: pd.DataFrame, attribute_names: List[str] | None = None) -> List[Case]:
    '''Build a case base from a dataframe.

    Each row of the (optionally column-restricted) dataframe is converted into
    one case with ``Case.from_series``. The class-level counter
    ``Case.case_num`` is reset to 0 before the cases are created.

    Args:
        df: The dataframe; one row per case.
        attribute_names [optional]: The names of the attributes (columns) to include in the
        case base. Any sequence of column names is accepted (list, tuple, pandas Index, ...);
        a single string is treated as a one-element selection. If None, include all
        attributes. Default is None.
    Returns:
        A list of cases, in the row order of the dataframe.
    Raises:
        KeyError: If a requested attribute is not a column of ``df``.
    '''
    if attribute_names is not None:
        # A bare string would otherwise be iterated character by character.
        if isinstance(attribute_names, str):
            attribute_names = [attribute_names]
        # Honour every explicit selection, not only ``list`` instances; work on
        # a copy so the caller's frame is never touched.
        df = df[list(attribute_names)].copy()

    Case.case_num = 0 # initialize the case base
    return [Case.from_series(df.iloc[i]) for i in range(len(df))]

# Attributes (dataframe columns) that describe a case.
attribute_names = [
    'Task',
    'Case study type',
    'Case study',
    'Online/Off-line',
    'Input for the model',
    'Model Approach',
    'Model Type',
    'Models',
    'Data Pre-processing',
    'Complementary notes',
    'Publication identifier',
    'Performance indicator',
    'Performance',
    'Publication Year',
]

# Keep only those columns, then turn every remaining row into a case.
df = df[attribute_names]
cb = build_cb_from_dataframe(df, attribute_names)

# The query uses the first five attributes only; the first row of the
# dataframe serves as the example query.
df_qry = df[attribute_names[:5]]
query = Query.from_series(df_qry.iloc[0])

def retrieve_topk(query: Query, case_base: List[Case], weights: List[float] | Dict[str, float], k: int = None) -> List[Tuple[Case, float]]:
    '''Rank the cases of a case base by their similarity to a query.

    Args:
        query: The query case; its ``cal_sim`` method scores each case.
        case_base: The cases to rank.
        weights: The attribute weights, passed unchanged to ``query.cal_sim``.
        k [optional]: How many of the best-ranked cases to return. If None, the
        whole ranking is returned. Default is None.
    Returns:
        A list of ``(case, similarity)`` tuples ordered from most to least similar.
    '''
    # Pair every case with its similarity to the query.
    ranked = [(case, query.cal_sim(case, weights)) for case in case_base]
    # Most similar first; the sort is stable, so ties keep case-base order.
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked if k is None else ranked[:k]

# Retrieve the top-5 most similar cases to the query.
# Five weights are supplied (presumably one per query attribute -- confirm
# against `Query.cal_sim`). `k=5` limits the result to the five best matches;
# without it the whole ranked case base is returned and displayed.
retrieve_topk(query, cb, weights=[1, 1, 1, 1, 1], k=5)

In [None]:
# TODO: What kind of case base (CB) is easy to maintain?

class CaseBase:
    # attribute: sim_func 
    def __init__(self, cases: List[Case]):
        '''Create a case base holding the given cases.

        Args:
            cases: The cases to store. The list is kept by reference, not copied.
        '''
        self.cases = cases

    def retrieve_topk(self, query: Query, weights: List[float] | Dict[str, float], k: int = None) -> List[Tuple[Case, float]]:
        '''Rank the stored cases by their similarity to a query.

        Args:
            query: The query case; its ``cal_sim`` method scores each stored case.
            weights: The attribute weights, passed unchanged to ``query.cal_sim``.
            k [optional]: How many of the best-ranked cases to return. If None, the
            whole ranking is returned. Default is None.
        Returns:
            A list of ``(case, similarity)`` tuples ordered from most to least similar.
        '''
        # Pair every stored case with its similarity to the query.
        ranked = [(case, query.cal_sim(case, weights)) for case in self.cases]
        # Most similar first; the sort is stable, so ties keep storage order.
        ranked.sort(key=lambda pair: pair[1], reverse=True)
        return ranked if k is None else ranked[:k]