In [14]:
import numpy as np
import pandas as pd
import copy
import uuid
from scipy import stats
from sklearn.linear_model import LinearRegression
from scipy.optimize import curve_fit
import plotly.express as px
import plotly.graph_objects as go
from math import log10, floor
import os
import glob

if not os.path.exists('./shr'):
    os.makedirs('./shr')

if os.path.exists('./shr/log.txt'):
    os.remove('./shr/log.txt')


def showdfres(dataf, folder='./shr/', calclin=True, calclog=True):
    try:
        return showdfres_int(dataf, folder=folder, calclin=calclin, calclog=calclog)
    except Exception as e:
        print(e)
        return None
    
def cs(f_obs, f_exp):
    return stats.chisquare(f_obs=f_obs, f_exp=np.sum(f_obs)/np.sum(f_exp) * f_exp)

def printfile(text, filename='./shr/log.txt'):
    with open(filename, 'a') as file:
        file.write(text+'\n')

def onestring(*args):
    return " ".join([str(x) for x in args])


def round_to_n(x, n=3):
    if x == 0 or np.isclose(x, 0):
        return 0
    #print(x)
    return round(x, -int(floor(log10(abs(x))))+n)


def showdfres_int(dataf, folder='./shr/', calclin=True, calclog=True):
    dataframe_ = copy.deepcopy(dataf)
    #print(dataframe_.shape, dataframe_.info())

    if dataframe_.empty:
        print("Empty dataframe")
        return
    

    cols = ['Theta', 'convSanity1', 'convSanity2', 'gconfm1A', 'gconfm1B', 'gconfm1Aminmax', 'EntropyConvexityDist']
    colcombinations = [
        ('Theta', 'convSanity1', False),
        ('convSanity1', 'NumberOfComponents', False), 
        ('convSanity1', 'NumberOfPoints', False), 
        ('Theta', 'NumberOfComponents', False),
        ('Theta', 'NumberOfPoints', False),
        ('convSanity1', 'convSanity2', True), 
        ('convSanity1', 'gconfm1A', True), 
        ('convSanity1', 'gconfm1B', True), 
        ('convSanity1', 'gconfm1Aminmax', True), 
        ('convSanity1', 'EntropyConvexityDist', True),
    ]
    sizecols = ['NumberOfPoints', 'NumberOfComponents', 'n', 'NumberOfOrigins']
    sizecol = [sizecols[2]]

    dictcols = {
        'NumberOfPoints': 'Number of Points',
        'NumberOfComponents': 'Number of Components',
        'n': 'n',
        'NumberOfOrigins': 'Number of Origins',
        'Theta': 'Theta',
        'convSanity1': 'CM_1(A)',
        'convSanity2': 'CM_2(A)',
        'gconfm1A': 'CM_3a(A)',
        'gconfm1B': 'CM_3b(A)',
        'gconfm1Aminmax': 'CM_3c(A)',
        'EntropyConvexityDist': 'CM_4(A)'
    }

    fnamedict = {
        'convSanity1Theta': 'San1Theta',
        'ThetaNumberOfComponents': 'ThetaNoComp',
        'ThetaNumberOfPoints': 'ThetaNoPoints',
        'convSanity1convSanity2sampled': 'San1San2',
        'convSanity1gconfm1A': 'San1San3a',
        'convSanity1gconfm1B': 'San1San3c',
        'convSanity1gconfm1Aminmax': 'San1San3b',
        'convSanity1EntropyConvexityDist': 'San1San4'
    }

    def getfname(colx, coly):
        col = colx+coly
        if col in fnamedict:
            return fnamedict[col]
        return col


    def getcolname(col):
        if col in dictcols:
            return dictcols[col]
        return col

    allcols = dataframe_.columns.tolist()
    print(allcols)

    for col in allcols:
        printfile(onestring(col, " min: ", round_to_n(dataframe_[col].min()), " max: ", round_to_n(dataframe_[col].max()), " mean: ", round_to_n(dataframe_[col].mean()), " std: ", round_to_n(dataframe_[col].std())), filename=folder+'log.txt')
        #print(col, dataframe_[col].min(), dataframe_[col].max(), dataframe_[col].mean(), dataframe_[col].std())
    
    npmaxval = np.power(np.finfo(np.float64).max, 2**-7)
    
    def func(x, a, b, c, d):
        res = np.array((a * np.exp(-b * (x - c)) + d), dtype=np.float64)
        for i in range(len(res)):
            if np.isnan(res[i]) or np.isinf(res[i]):
                res[i] = +npmaxval
        return res

    


    LinModel = LinearRegression()

    indicies = dataframe_.index.to_numpy()
    #print(indicies)

    sizecolres = dataframe_[sizecol[0]].to_numpy()

    figs = {}

    for colx, coly, calcregression in colcombinations: # itertools.product(cols, cols):
        nm = colx+coly
        if colx == coly:
             continue
        print(colx, coly)

        xold = dataframe_[colx].to_numpy()
        yold = dataframe_[coly].to_numpy()

        x = []
        y = []

        for i in range(0,max(indicies),1):
            ind2 = np.where(indicies == i)[0]
            x.append([np.sum(xold[ind2]*sizecolres[ind2])/np.sum(sizecolres[ind2])])
            y.append([np.sum(yold[ind2]*sizecolres[ind2])/np.sum(sizecolres[ind2])])


        x = np.array(x, dtype=np.float64)
        y = np.array(y, dtype=np.float64)

        x = x.ravel().reshape(-1)
        y = y.ravel().reshape(-1)

        if calcregression:

            maxy = np.max(y)+0.1

            l1 = np.repeat(-maxy*3, 1).tolist()+np.repeat(-maxy*2, 2).tolist()+np.repeat(-maxy*3, 1).tolist()
            l2 = (np.array(l1)*-1).tolist()

            bd = (l1, l2)
            p00 = np.random.rand(4)*maxy*1.2

        
            logpopt, logpcov = None, None
            try:
                if calclog:
                    logpopt, logpcov = curve_fit(func, x, y, bounds=bd, p0=p00, maxfev=100000)
                    print("y:", y, "^y log:", func(x, *logpopt))
                    gfr1 = cs(f_obs=y, f_exp=func(x, *logpopt))
                    print("gfr1", gfr1)
                    #goodness_of_fit(y, func(x, *logpopt))
                    print("Logistic fit values", colx, coly, logpopt, logpcov)
                    print("logistic p-Values", gfr1.pvalue, gfr1.statistic)

                    printfile(onestring("logistic", colx, coly, tuple([round_to_n(tx) for tx in logpopt.tolist()]), gfr1.pvalue), filename=folder+'log.txt')
            except Exception as e:
                print(e)
                continue

            if calclin:
                x = x.ravel().reshape(-1, 1)
                y = y.ravel()

                linres = LinModel.fit(x, y)

                gfr2 = cs(y, linres.predict(x))
                print("y:", y, "^y lin:", linres.predict(x))
                print("gfr2", gfr2)
                print("linear p-Values", gfr2.pvalue, gfr2.statistic)

                print("Linear", colx, coly, linres.score(x,y), linres.coef_, linres.intercept_)

                printfile(onestring("linear", colx, coly, (round_to_n(linres.coef_[0]), round_to_n(linres.intercept_)), gfr2.pvalue), filename=folder+'log.txt')

        figs[nm] = px.scatter(dataframe_, x=colx, y=coly, color=sizecols[2], size=sizecols[2])

        ux = np.unique(dataframe_[colx].to_numpy())
        mmin = 0 if np.min(ux) > 0 else np.min(ux)
        mmax = 1 if np.max(ux) < 1 else np.max(ux)

        miny = np.min(dataframe_[coly].to_numpy())
        miny = miny*1.25 if miny < 0 else miny*0.75
        miny = 0 if miny > 0 else miny - 0.25*np.mean(dataframe_[coly].to_numpy())
        maxy = np.max(dataframe_[coly].to_numpy())
        maxy = maxy*1.25

        uxls = np.linspace(mmin, mmax, 100)

        if calcregression:
            if logpopt is not None and calclog:
                figs[nm].add_trace(go.Scatter(x=uxls, y=func(uxls, *logpopt), mode='lines', name='Logistic'))

            if linres is not None and calclin:
                figs[nm].add_trace(go.Scatter(x=uxls, y=linres.predict(uxls.reshape(-1,1)), mode='lines', name='Linear'))
        
        figs[nm].update_layout(
            xaxis_title=getcolname(colx),
            yaxis_title=getcolname(coly),
            font=dict(
                family="Courier New, monospace",
                size=15,
                color="#7f7f7f"
            ),
            yaxis_range=[miny,maxy]
        )
        #figs[nm].write_image("shr/_{colx}_{coly}_{uuid}_.png".format(colx=colx, coly=coly, uuid=uuid.uuid4()))
        figs[nm].write_image("{folder}{nm}.png".format(folder=folder,nm=getfname(colx, coly)))
        #figs[nm].show()

In [15]:
fdone = []#['./shr/line/', './shr/grid/']

for file in glob.glob('./shr/**/*_df*.pkl', recursive=True):
    print(file)
    bn = os.path.dirname(file)+"/"
    print(bn)
    if bn in fdone:
        continue
    df = pd.read_pickle(file)
    showdfres(df, folder=bn, calclin=True, calclog=False)

./shr/line/line_df_9aee79d2-d79b-4d7c-8a15-e3785fb5bb9e.pkl
./shr/line/
['Theta', 'convSanity1', 'convSanity1time', 'convSanity2', 'convSanity2time', 'gconfm1A', 'gconfm1Atime', 'gconfm1B', 'gconfm1Btime', 'gconfm1Aminmax', 'gconfm1Aminmaxtime', 'EntropyConvexityDist', 'EntropyConvexityDisttime', 'NumberOfPoints', 'NumberOfComponents', 'n', 'NumberOfOrigins']
Theta convSanity1
convSanity1 NumberOfComponents
convSanity1 NumberOfPoints
Theta NumberOfComponents
Theta NumberOfPoints
convSanity1 convSanity2
y: [0.98924731 0.98924731 0.98924731 0.98924731 1.         0.98924731
 1.         1.         0.98924731 0.98924731 1.         1.
 1.         1.        ] ^y lin: [0.98924731 0.98924731 0.98924731 0.98924731 1.         0.98924731
 1.         1.         0.98924731 0.98924731 1.         1.
 1.         1.        ]
gfr2 Power_divergenceResult(statistic=0.0, pvalue=1.0)
linear p-Values 1.0 0.0
Linear convSanity1 convSanity2 1.0 [0.02150538] 0.9784946236559141
convSanity1 gconfm1A
y: [0.45652174

In [16]:
clusters = ["*","3","4"]
dledges = ["*", "0.0", "0.2", "0.4"]

def combinedfs(cluster = "*", dledge = "*"):
    df = pd.DataFrame()

    for file in glob.glob('./shr/delauney/*/{cluster}/{dledge}/*_df*.pkl'.format(cluster=cluster, dledge=dledge), recursive=True):
        print(file)
        bn = os.path.dirname(file)+"/"
        print(bn)
        df = pd.concat([df, pd.read_pickle(file)])
    
    return df

def chpath(cluster = "*", dledge = "*"):
    path = './shr/delauney/combined/'
    if cluster != "*":
        path += cluster
    if dledge != "*":
        path += "_"+dledge
    if cluster == "*" and dledge == "*":
        path += "all"
    if not os.path.exists(path):
        os.makedirs(path)
    return path+"/"

for cluster in clusters:
    for dledge in dledges:
        showdfres(combinedfs(cluster=cluster, dledge=dledge), folder=chpath(cluster, dledge), calclin=True, calclog=False)


./shr/delauney/150/1/0.2/delauney_df_d8029488-af50-4bab-bcd4-aae9d48b4e49.pkl
./shr/delauney/150/1/0.2/
./shr/delauney/150/1/0.0/delauney_df_d8029488-af50-4bab-bcd4-aae9d48b4e49.pkl
./shr/delauney/150/1/0.0/
./shr/delauney/150/1/0.4/delauney_df_d8029488-af50-4bab-bcd4-aae9d48b4e49.pkl
./shr/delauney/150/1/0.4/
./shr/delauney/150/3/0.2/delauney_df_d8029488-af50-4bab-bcd4-aae9d48b4e49.pkl
./shr/delauney/150/3/0.2/
./shr/delauney/150/3/0.0/delauney_df_d8029488-af50-4bab-bcd4-aae9d48b4e49.pkl
./shr/delauney/150/3/0.0/
./shr/delauney/150/3/0.4/delauney_df_d8029488-af50-4bab-bcd4-aae9d48b4e49.pkl
./shr/delauney/150/3/0.4/
./shr/delauney/300/1/0.2/delauney_df_d8029488-af50-4bab-bcd4-aae9d48b4e49.pkl
./shr/delauney/300/1/0.2/
./shr/delauney/300/1/0.0/delauney_df_d8029488-af50-4bab-bcd4-aae9d48b4e49.pkl
./shr/delauney/300/1/0.0/
./shr/delauney/300/1/0.4/delauney_df_d8029488-af50-4bab-bcd4-aae9d48b4e49.pkl
./shr/delauney/300/1/0.4/
./shr/delauney/300/3/0.2/delauney_df_d8029488-af50-4bab-bcd4-aae