# SABBO
Version: 2.0<br>
Contents:<br>
Kaplan-Meier-Curve


###### tsolve = survivalTime

In [1]:
import csv
import pandas as pd
import math
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter


In [2]:
# Load the raw benchmark results from the CSV file into the notebook-wide frame.
csv_file = 'QUBOClusteringData.csv'
originDf = pd.read_csv(filepath_or_buffer=csv_file)

# Preview the first rows to check that the columns were parsed as expected.
originDf.head()
    

Unnamed: 0,tsolve,alg,nvars,seed,runseed,Unnamed: 5
0,8.0,UEDA,5,1,1,
1,7.0,GR,5,1,1,
2,13.0,RS,5,1,1,
3,73.0,UEDA,5,1,2,
4,4.0,GR,5,1,2,


#### Functions:

### Kaplan-Meier-Curve
Algorithms may end only after an unknown time due to unknown issues.
The seed might or might not affect an algorithm's performance, too. Thus, there are multiple ways to determine q, the number of censored runs.

The first Kaplan-Meier curves take the different seeds into account and assign the last observed time depending on the seed.<br>
The second Kaplan-Meier curves use only the very last observed time for the given number of variables, regardless of the seed.

In [3]:
# Collect, per algorithm, the distinct problem sizes (nvars) and seeds present in originDf.
def _distinctInts(alg, column):
    """Return the distinct values of `column`, cast to int, over all rows of algorithm `alg`."""
    return {int(value) for value in originDf.loc[originDf["alg"] == alg, column]}


# Every name gets its own set object. A chained `a = b = c = set()` would bind all
# three names to ONE shared set, so each algorithm would end up with the union of
# the values of all algorithms.
nvarsUEDAset = _distinctInts("UEDA", "nvars")
nvarsGRset = _distinctInts("GR", "nvars")
nvarsRSset = _distinctInts("RS", "nvars")
nseedsUEDAset = _distinctInts("UEDA", "seed")
nseedsGRset = _distinctInts("GR", "seed")
nseedsRSset = _distinctInts("RS", "seed")

# Sorted lists: set iteration order is not guaranteed, and later cells that loop
# over these lists should do so in a reproducible, ascending order.
nvarsUEDA = sorted(nvarsUEDAset)
nvarsGR = sorted(nvarsGRset)
nvarsRS = sorted(nvarsRSset)
nseedsUEDA = sorted(nseedsUEDAset)
nseedsGR = sorted(nseedsGRset)
nseedsRS = sorted(nseedsRSset)

In [30]:
# Make the m-n-q Dataframe with assignment of each seeds highest runtime
def makeMNQDf(nvars, alg, df=None):
    """Build the Kaplan-Meier life table for one algorithm and one problem size.

    A run with a non-missing ``tsolve`` counts as an event ("died") at that time.
    A run with a missing ``tsolve`` is censored at the largest ``tsolve`` seen
    for the same seed; if that seed has no solved run, the largest ``tsolve``
    of the whole selection is used. If the selection has no solved run at all,
    the unsolved runs get no censoring time and only count towards the total.

    Parameters
    ----------
    nvars : int
        Value of the ``nvars`` column to select.
    alg : str
        Value of the ``alg`` column to select.
    df : pandas.DataFrame, optional
        Frame with the columns ``tsolve``, ``alg``, ``nvars`` and ``seed``.
        Defaults to the notebook-level ``originDf``.

    Returns
    -------
    pandas.DataFrame
        Indexed by time (ascending, always containing 0) with the columns
        ``"m Died"`` (events at that time), ``"q Censored"`` (runs censored at
        that time) and ``"n at Risk"`` (runs neither solved nor censored
        before that time).
    """
    if df is None:
        df = originDf

    runs = df[(df["alg"] == alg) & (df["nvars"] == nvars)]
    totalRuns = len(runs)
    isSolved = runs["tsolve"].notna()
    solvedTimes = runs.loc[isSolved, "tsolve"]

    # Censoring time of every unsolved run: per-seed maximum first, then the
    # overall maximum as fallback for seeds that never produced a solution.
    maxTSolvePerSeed = runs.groupby("seed")["tsolve"].max()
    censorTimes = runs.loc[~isSolved, "seed"].map(maxTSolvePerSeed)
    if not solvedTimes.empty:
        censorTimes = censorTimes.fillna(solvedTimes.max())
    censorTimes = censorTimes.dropna()

    # Time axis: every observed solve time plus the origin 0. The union keeps
    # an observed time of 0 (and its events) instead of overwriting it.
    times = sorted(set(solvedTimes) | {0})

    # Counter returns 0 for absent keys, so this also works when there are no
    # censored runs at all (no special column has to exist beforehand).
    mCounts = Counter(solvedTimes)
    qCounts = Counter(censorTimes)
    dfOverall = pd.DataFrame(
        {
            "m Died": [mCounts[t] for t in times],
            "q Censored": [qCounts[t] for t in times],
        },
        index=times,
    )

    # Runs at risk at a time = all runs minus everything that died or was
    # censored at strictly earlier times (hence the shift by one row).
    removed = dfOverall["m Died"] + dfOverall["q Censored"]
    dfOverall["n at Risk"] = totalRuns - removed.cumsum().shift(fill_value=0)

    return dfOverall

# Cap rendered tables at 20 rows, then build and show the life table for UEDA at nvars = 10.
pd.set_option("display.max_rows", 20)
MNQDf = makeMNQDf(nvars=10, alg="UEDA")
MNQDf

Unnamed: 0,m Died,q Censored,n at Risk
0.0,0,0,100
3.0,1,0,100
7.0,1,0,99
12.0,1,0,98
16.0,1,0,97
...,...,...,...
328.0,1,0,7
407.0,1,0,6
411.0,1,0,5
416.0,1,0,4


In [31]:

# Kaplan-Meier estimate derived from the life table in MNQDf.
#   "S(t)"    : fraction of the runs at risk at a time step that survive it, (n - m) / n
#   "p(S(t))" : running product of those fractions from the first row onwards
stepSurvival = (MNQDf["n at Risk"] - MNQDf["m Died"]) / MNQDf["n at Risk"]

# cumprod multiplies row by row in index order, which is what the product needs.
pDf = stepSurvival.cumprod().to_frame("p(S(t))")
pDict = pDf["p(S(t))"].to_dict()

# assign() overwrites columns that already exist, so re-running this cell is
# idempotent; merging pDf again would create "p(S(t))_x" / "p(S(t))_y" duplicates.
MNQDf = MNQDf.assign(**{"S(t)": stepSurvival, "p(S(t))": pDf["p(S(t))"]})


MNQDf

Unnamed: 0,m Died,q Censored,n at Risk,S(t),p(S(t))
0.0,0,0,100,1.000000,1.000000
3.0,1,0,100,0.990000,0.990000
7.0,1,0,99,0.989899,0.980000
12.0,1,0,98,0.989796,0.970000
16.0,1,0,97,0.989691,0.960000
...,...,...,...,...,...
328.0,1,0,7,0.857143,0.106178
407.0,1,0,6,0.833333,0.088481
411.0,1,0,5,0.800000,0.070785
416.0,1,0,4,0.750000,0.053089
