In [1]:
import pandas as pd
import warnings

# Source CSV: read below as semicolon-separated, windows-1252 encoded text.
# NOTE(review): absolute, machine-specific path — consider a configurable data dir.
path = r'/home/dwd/proj/Diversity-Improvement-in-CBR/CleanedDATA V12-05-2021.csv'
df = pd.read_csv(path, sep=';', encoding='windows-1252')

# Silences every warning for the rest of the session (kept because later
# cells were run with this filter in place).
warnings.filterwarnings('ignore')

# The last column header arrives with a run of trailing commas attached
# (presumably left over from a comma-delimited export — TODO confirm).
raw_id_col = 'Publication identifier,,,,,,,,,,,,,,,,,,'


def _strip_after_comma(value):
    """Return ``value`` truncated just before its first comma.

    The value is returned unchanged when it contains no comma, when the
    first comma is its very first character, or when it is not a string
    (e.g. a missing cell read as NaN).
    """
    if not isinstance(value, str):
        return value
    cut = value.find(',')
    return value[:cut] if cut > 0 else value


# Assign the cleaned column back explicitly. Writing element by element into
# a Series pulled out of `df` only reaches `df` when that Series is a view,
# which is not the case under pandas Copy-on-Write.
series = df[raw_id_col].map(_strip_after_comma)
df[raw_id_col] = series

df.rename(columns={raw_id_col: 'Publication identifier'}, inplace=True)
# df.to_csv(path, sep=';', index=False, encoding='windows-1252')

In [2]:
from utils.case import Query, Case, Description, GC, Solution
from utils.casebase import CaseBase, MCNN_CaseBase
from utils.utils import retrieve_topk

# Columns kept for the case base, listed in two groups: the five attributes
# the query is built from, followed by the remaining attributes of each case.
description_attrs = [
    'Task',
    'Case study type',
    'Case study',
    'Online/Off-line',
    'Input for the model',
]
solution_attrs = [
    'Model Approach',
    'Model Type',
    'Models',
    'Data Pre-processing',
    'Complementary notes',
    'Publication identifier',
    'Performance indicator',
    'Performance',
    'Publication Year',
]
attr_names = description_attrs + solution_attrs

# Restrict the frame to those columns and turn it into a case base.
df = df.loc[:, attr_names]
cb = CaseBase.from_dataframe(df)

# The query is the first row, reduced to the first five attributes.
df_qry = df.loc[:, attr_names[:5]]
query = Query.from_series(df_qry.iloc[0])

# Top-5 retrieval against the plain case base, one unit weight per query attribute.
retrieve_topk(query, cb, weights=[1] * 5, k=5)


[(Case 0, 1.0),
 (Case 11, 1.0),
 (Case 1, 0.9800000000000001),
 (Case 47, 0.9800000000000001),
 (Case 48, 0.9800000000000001)]

In [11]:
# Organize the case base with the MCNN case-base-maintenance (CBM) method.
# The first five attributes are passed as description attributes, the rest
# as solution attributes.
description_fields, solution_fields = attr_names[:5], attr_names[5:]
mcnn_cb = MCNN_CaseBase(
    cb.cases,
    desc_attrs=description_fields,
    sol_attrs=solution_fields,
    thr_desc=0.7,
    thr_sol=0.7,
    _seed=42,
)

# Report how many descriptions and solutions the MCNN case base holds.
n_descriptions = len(mcnn_cb.descriptions)
print("Number of descriptions:", n_descriptions)
n_solutions = len(mcnn_cb.solutions)
print("Number of solutions:", n_solutions)

# Top-5 retrieval through the case base's own method.
mcnn_cb.retrieve_topk(query, k=5)



[Case 111, Case 256, Case 239, Case 176, Case 5, Case 219, Case 243, Case 230, Case 9, Case 123, Case 211, Case 51, Case 262, Case 216, Case 240, Case 220, Case 85, Case 121, Case 221, Case 245, Case 134, Case 202, Case 112, Case 72, Case 66, Case 29, Case 120, Case 65, Case 235, Case 78, Case 238, Case 3, Case 247, Case 128, Case 146, Case 164, Case 170, Case 105, Case 167, Case 196, Case 4, Case 89, Case 15, Case 73, Case 61, Case 84, Case 131, Case 133, Case 242, Case 42, Case 214, Case 236, Case 45, Case 132, Case 255, Case 215, Case 259, Case 119, Case 181, Case 27, Case 99, Case 104, Case 244, Case 260, Case 21, Case 60, Case 161, Case 149, Case 200, Case 79, Case 209, Case 10, Case 191, Case 251, Case 30, Case 135, Case 46, Case 106, Case 2, Case 193, Case 168, Case 124, Case 148, Case 122, Case 175, Case 103, Case 47, Case 19, Case 188, Case 77, Case 95, Case 81, Case 38, Case 254, Case 13, Case 241, Case 22, Case 64, Case 33, Case 100, Case 185, Case 0, Case 159, Case 190, Cas

[(GC 111, Solution 111, 0.8406335052300437),
 (GC 111, Solution 176, 0.8406335052300437),
 (GC 111, Solution 5, 0.8406335052300437),
 (GC 111, Solution 239, 0.8406335052300437),
 (GC 111, Solution 211, 0.8406335052300437)]

In [13]:
# Same top-5 retrieval, this time through the generic helper with one unit
# weight per query attribute.
uniform_weights = [1] * 5
retrieve_topk(query, mcnn_cb, weights=uniform_weights, k=5)

[(GC 111, Solution 111, 0.8406335052300437),
 (GC 111, Solution 176, 0.8406335052300437),
 (GC 111, Solution 5, 0.8406335052300437),
 (GC 111, Solution 239, 0.8406335052300437),
 (GC 111, Solution 211, 0.8406335052300437)]

In [5]:
import random
# Sanity check: seeding the global generator makes the shuffle repeatable.
random.seed(42)

l = [*range(10)]   # 0..9 in order
random.shuffle(l)  # permuted in place

print(l)

[7, 3, 2, 8, 5, 6, 9, 4, 0, 1]


### TODO: test & evaluate (similarity, diversity, etc.)