In [1]:
import pandas as pd
import warnings

# NOTE(review): absolute, machine-specific path -- consider deriving it from a
# configurable data directory so the notebook runs on other machines.
path = r'/home/dwd/proj/Diversity-Improvement-in-CBR/CleanedDATA V12-05-2021.csv'
df = pd.read_csv(path, sep=';', encoding='windows-1252')

# NOTE(review): this silences *every* warning for the rest of the session.
# It is kept because later cells may rely on it, but the cleaning below no
# longer triggers pandas' chained-assignment warning.
warnings.filterwarnings('ignore')

# The last column header (and, it seems, its values) carries a run of trailing
# commas -- presumably an artefact of how the file was exported.
RAW_ID_COLUMN = 'Publication identifier,,,,,,,,,,,,,,,,,,'
ID_COLUMN = 'Publication identifier'


def _strip_after_first_comma(value):
    """Return ``value`` cut at its first comma.

    Strings that contain no comma, or whose first character is a comma, are
    returned unchanged. Non-string values (e.g. NaN for an empty cell) are
    passed through untouched instead of raising on ``.find``.
    """
    if not isinstance(value, str):
        return value
    j_comma = value.find(',')
    return value[:j_comma] if j_comma > 0 else value


# Write the cleaned values back through the DataFrame itself. Assigning into a
# Series previously pulled out of ``df`` is chained assignment: pandas does not
# guarantee it updates ``df`` (and it does not under Copy-on-Write).
df[RAW_ID_COLUMN] = df[RAW_ID_COLUMN].map(_strip_after_first_comma)

df = df.rename(columns={RAW_ID_COLUMN: ID_COLUMN})
# df.to_csv(path, sep=';', index=False, encoding='windows-1252')

In [2]:
from utils.case import Query, Case, Description, GC, Solution
from utils.casebase import CaseBase, MCNN_CaseBase
from utils.utils import retrieve_topk

# Columns kept for the case base. The first five are the ones the query is
# built from below; the rest stay available on each case.
attr_names = [
    'Task',
    'Case study type',
    'Case study',
    'Online/Off-line',
    'Input for the model',
    'Model Approach',
    'Model Type',
    'Models',
    'Data Pre-processing',
    'Complementary notes',
    'Publication identifier',
    'Performance indicator',
    'Performance',
    'Publication Year',
]

# Restrict the dataframe to those columns and turn it into a case base
df = df[attr_names]
cb = CaseBase.from_dataframe(df)

# The query is the first row, limited to the first five attributes
df_qry = df[attr_names[:5]]
first_row = df_qry.iloc[0]
query = Query.from_series(first_row)

# Top-5 cases most similar to the query, with all five attributes weighted equally.
retrieve_topk(query, cb, weights=[1] * 5, k=5)


[(Case 0, 1.0),
 (Case 11, 1.0),
 (Case 1, 0.9800000000000001),
 (Case 47, 0.9800000000000001),
 (Case 48, 0.9800000000000001)]

In [3]:
# Split every case into a (description, solution) pair: the first five
# attributes form the description, the remaining ones the solution.
descriptions, solutions = [], []
for case in cb.cases:
    case_desc, case_sol = case.to_desc_sol_pair(
        desc_attrs=attr_names[:5],
        sol_attrs=attr_names[5:],
    )
    descriptions.append(case_desc)
    solutions.append(case_sol)

# Generalize
def add_description(desc: Description, cb: MCNN_CaseBase):
    """Insert ``desc`` into ``cb.descriptions``, generalizing when similar enough.

    The closest stored entry is looked up with ``cb.retrieve_topk(desc, k=1)``.
    When its similarity is strictly greater than ``cb.thr_desc``:

    * if that entry is already a ``GC``, ``desc`` is added to it;
    * otherwise the entry is replaced in ``cb.descriptions`` by a new ``GC``
      holding both descriptions and reusing the stored entry's ``_id``.

    When the base is empty, or the similarity does not exceed the threshold,
    ``desc`` is appended unchanged.
    """
    # Nothing to compare against yet
    if len(cb.descriptions) == 0:
        cb.descriptions.append(desc)
        return

    nearest, similarity = cb.retrieve_topk(desc, k=1)[0]

    # Not similar enough: keep it as a separate entry
    if not similarity > cb.thr_desc:
        cb.descriptions.append(desc)
        return

    # Similar to an existing generalized case: let it absorb the description
    if isinstance(nearest, GC):
        nearest.add_description(desc)
        return

    # Similar to a plain description: swap it for a GC built from the pair
    merged = GC(descriptions=[nearest, desc], _id=nearest._id)
    cb.descriptions.remove(nearest)
    cb.descriptions.append(merged)

# TODO: relink to description
def add_solution(sol: Solution, cb: MCNN_CaseBase):
    """Insert ``sol`` into ``cb.solutions``, grouping it with a similar one.

    The closest stored solution is looked up with
    ``cb.retrieve_topk_sol(sol, k=1)``. When its similarity is strictly
    greater than ``cb.thr_sol``, the two are merged and the one with the
    higher ``perf`` becomes the parent (the stored one wins ties):

    * stored solution wins: ``sol`` is appended to its ``children`` and the
      entries of ``sol.desc`` are moved onto it;
    * ``sol`` wins: it takes the stored solution and all of that solution's
      children as its own children, takes over its ``desc`` entries, and
      replaces it in ``cb.solutions``.

    When the base is empty, or the similarity does not exceed the threshold,
    ``sol`` is appended unchanged.
    """
    # Nothing to compare against yet
    if len(cb.solutions) == 0:
        cb.solutions.append(sol)
        return

    nearest, similarity = cb.retrieve_topk_sol(sol, k=1)[0]

    # Not similar enough: keep it as a separate top-level solution
    if not similarity > cb.thr_sol:
        cb.solutions.append(sol)
        return

    if nearest.perf >= sol.perf:
        # The stored solution stays the parent; hand over the descriptions
        nearest.children.append(sol)
        nearest.desc += sol.desc
        sol.desc.clear()
        return

    # The new solution becomes the parent: it adopts the stored solution
    # first, then that solution's children, in their existing order.
    for absorbed in (nearest, *nearest.children):
        sol.children.append(absorbed)
    nearest.children.clear()

    # Hand the descriptions over to the new parent
    sol.desc += nearest.desc
    nearest.desc.clear()

    # Swap the parents in the solution base
    cb.solutions.remove(nearest)
    cb.solutions.append(sol)

# Empty MCNN case base; both similarity thresholds are set to 0.6
mcnn_cb = MCNN_CaseBase(thr_desc=0.6, thr_sol=0.6)

# Feed in all descriptions first, then all solutions, in case-base order.
# TODO: randomize
for description in descriptions:
    add_description(description, mcnn_cb)

for solution in solutions:
    add_solution(solution, mcnn_cb)


In [7]:
# One line per top-level solution kept in the MCNN case base
for top_level_solution in mcnn_cb.solutions:
    print(top_level_solution)

Solution 0: {Solution 1, Solution 2, Solution 3, Solution 4, Solution 5, Solution 11}
Solution 6: {Solution 7, Solution 9, Solution 10, Solution 16, Solution 17, Solution 60, Solution 61, Solution 63, Solution 64, Solution 65, Solution 79, Solution 80, Solution 81, Solution 82, Solution 83, Solution 84, Solution 85, Solution 105, Solution 106, Solution 107, Solution 128, Solution 129, Solution 159, Solution 160, Solution 176, Solution 177, Solution 178, Solution 183, Solution 187, Solution 194, Solution 195, Solution 199, Solution 210, Solution 211, Solution 212, Solution 218, Solution 219, Solution 223, Solution 225, Solution 234, Solution 254, Solution 261, Solution 262}
Solution 8: {Solution 18, Solution 19, Solution 87, Solution 88, Solution 97, Solution 98}
Solution 12: {Solution 13, Solution 54, Solution 55, Solution 120, Solution 179, Solution 189, Solution 196, Solution 204, Solution 205, Solution 208, Solution 209, Solution 217}
Solution 14: {Solution 15, Solution 180, Solutio

In [5]:
# Totals seen from the description side: a GC contributes as many
# descriptions/solutions as it holds, a plain description counts as one of each.
desc_num = sum(
    len(entry.descriptions) if isinstance(entry, GC) else 1
    for entry in mcnn_cb.descriptions
)
sol_num = sum(
    len(entry.solutions) if isinstance(entry, GC) else 1
    for entry in mcnn_cb.descriptions
)

print(f'Number of Descriptions: {desc_num}')
print(f'Number of Solutions: {sol_num}')

Number of Descriptions: 263
Number of Solutions: 263


In [8]:
# Totals seen from the solution side: each top-level solution counts itself
# plus its children, and contributes the descriptions linked to it.
desc_num = sum(len(parent.desc) for parent in mcnn_cb.solutions)
sol_num = sum(len(parent.children) + 1 for parent in mcnn_cb.solutions)

print(f'Number of Descriptions: {desc_num}')
print(f'Number of Solutions: {sol_num}')

Number of Descriptions: 263
Number of Solutions: 263
