In [236]:
import math 
import numpy as np
import pandas as pd
from scipy.special import gamma, digamma
from scipy.misc import derivative
from scipy.stats import multivariate_normal
from sklearn import datasets
import random 
from sklearn.cluster import KMeans


In [237]:
def trigamma(element):
    return derivative(digamma, element, dx=1e-6)
    

In [238]:
def alphabar(uj, vj):
    alphabar = []
    for i in range(len(uj)):
        alphabar.append(uj[i]/vj[i])
    return alphabar

def expected_lnalpha(uj, vj):
    lnalpha = []
    for i in range(len(uj)):
        lnalpha.append(digamma(uj[i]) - np.log(vj[i]))
    return lnalpha

def expected_lnlanda(g, h):
    exp = []
    for i in range(len(g)):
        exp.append(digamma(g[i]) - digamma(g[i] + h[i]))
    return exp

def exp_ln1landa(g, h):
    exp = []
    for i in range(len(g)):
        exp.append(digamma(h[i]) - digamma(g[i] + h[i]))
    return exp

def exp_lnalpha_lnalphabar2(uj):
    exp = []
    for i in range(len(uj)):
        exp.append(pow(digamma(uj[i]) - np.log(uj[i]), 2) + trigamma(uj[i]))
    return exp



In [258]:
class kd_tree:

    def __init__(self):
        self.leave_number = 0
        
    def add_leaves(self, arr):
        if self.leave_number == 0:
            self.leaves = [arr]
        elif self.leave_number > 0:
            self.leaves.append(arr)

        self.leave_number = self.leave_number + 1
    
    def return_leaves(self):
        return self.leaves
    
    def bubbleSort(self, arr, column): 
        n = len(arr.index) 
        arr1 = arr.copy()
        if n <= 1:
            return arr1
        for i in range(n-1): 
            for j in range(0, n-i-1): 
                if arr1.loc[j, column] > arr1.loc[j+1, column]:
                    tempjplus, tempj = arr1.loc[j+1, :].copy(), arr1.loc[j, :].copy()
                    arr1.loc[j+1, :], arr1.loc[j, :] = tempj, tempjplus
            
        return arr1
    
    def build_kdtree(self, arr, depth, start, current=0):
        n = len(arr)
        if n <= 1:
            self.add_leaves(arr)
    
        elif current < depth-1:
            arr1 = self.bubbleSort(arr, start % (len(arr.columns)-3))
            
            arr2 = arr1.loc[0:math.floor((n-1)/2), :].copy()
            arr2.reset_index(drop=True, inplace=True)
            self.build_kdtree(arr2, depth, (start+1), current + 1)
            
            arr3 = arr1.loc[math.floor((n-1)/2+1):, :].copy()
            arr3.reset_index(drop=True, inplace=True)
            self.build_kdtree(arr3, depth, (start+1), current + 1)
            
        elif current == depth-1:
             self.add_leaves(arr)
        

In [240]:
def Rj(uj, vj, gj, hj):
    
    alphabarj = alphabar(uj, vj)
    betabarj = alphabar(gj, hj)
    lnalphabar = expected_lnalpha(uj, vj)
    lnbetabar = expected_lnalpha(uj, vj)
    lnalpha_lnalphabar2 = exp_lnalpha_lnalphabar2(uj)
    lnbeta_lnbetabar2 = exp_lnalpha_lnalphabar2(gj)
    rbar = 0
    for l in range(len(uj)):
        rbar += np.log(gamma(alphabarj[l] + betabarj[l])/(gamma(alphabarj[l])*gamma(betabarj[l])))
        rbar += alphabarj[l] * (digamma(alphabarj[l] + betabarj[l]) - digamma(alphabarj[l])) * (lnalphabar[l] - np.log(alphabarj[l]))
        rbar += betabarj[l] * (digamma(alphabarj[l] + betabarj[l]) - digamma(betabarj[l])) * (lnbetabar[l] - np.log(betabarj[l]))
        rbar += 0.5 * pow(alphabarj[l], 2) * (trigamma(alphabarj[l] + betabarj[l]) - trigamma(alphabarj[l])) * lnalpha_lnalphabar2[l]
        rbar += 0.5 * pow(betabarj[l], 2) * (trigamma(alphabarj[l] + betabarj[l]) - trigamma(betabarj[l])) * lnbeta_lnbetabar2[l]
        rbar += alphabarj[l] * betabarj[l] * trigamma(alphabarj[l] + betabarj[l]) * (lnalphabar[l] - np.log(alphabarj[l])) * (lnbetabar[l] - np.log(betabarj[l]))
    return rbar
    
def Rtotal(u, v):
    r = []
    for i in range(len(u.index)):
        r.append(Rj(u.loc[i, :], v.loc[i, :]))
    return r

In [241]:
def RouTJ(j, xtt, uj, vj, gj, hj, a, b):
    routj = Rj(uj, vj, gj, hj)
    alphabarj = alphabar(uj, vj)
    betabarj = alphabar(gj, hj)
    xt = xtt.mean()
    for l in range(len(alphabarj)):
        routj += (alphabarj[l] - l) * np.log(xt[l])
        routj -= (alphabarj[l] + betabarj[l]) * np.log(1 + xt[l])
        
    routj += expected_lnlanda(a, b)[j]
    exp_landa_ab = exp_ln1landa(a, b)
    
    for i in range(j):
        routj += exp_landa_ab[i]
    return routj

def RouT(xt, u, v, g, h, a, b, m):
    rout = []
    for j in range(m):
        rout.append(RouTJ(j, xt, u.loc[j, :], v.loc[j, :], g.loc[j, :], h.loc[j, :], a, b))
        
    return rout
    
def Rou(x, u, v, g, h, a, b, m):
    rou = []
    for xt in x:
        rou.append(RouT(xt, u, v, g, h, a, b, m))
    df = pd.DataFrame(rou)
    return df

In [242]:
def zt_equalsto_j(rout, j):
    sum_exp_rout = 0
    for i in range(len(rout)):
        sum_exp_rout += math.exp(rout[i])
    
    return math.exp(rout[j]) / sum_exp_rout

In [243]:
def ustarjl(uj, vj, gj, hj, rou, l, j, x): #Same for g
    ustar = uj[l]
    
    alphabarj = alphabar(uj, vj)
    betabarj = alphabar(uj, vj)
    lnalphabarj = expected_lnalpha(uj, vj)
    lnbetabarj = expected_lnalpha(gj, hj)
    
    theBracket = digamma(alphabarj[l]+betabarj[l]) - digamma(alphabarj[l]) + betabarj[l]*trigamma(alphabarj[l]+betabarj[l])*(lnbetabarj[l]-np.log(betabarj[l]))    
    for i in range(len(x)):
        ustar += len(x[i]) * zt_equalsto_j(rou.loc[i, :], j) * alphabarj[l] * theBracket
    return ustar

def vstarjl(vj, rou, l, j, x): #Same for h
    vstar = vj[l]
    for i in range(len(x)):
        mean = np.log(1 / (1+x[i].mean()))
        vstar -= len(x[i]) * zt_equalsto_j(rou.loc[i, :], j) * mean[l]
    return vstar
        
def make_uvstar(u, v, g, h, rou, x): #Same for gh
    
    ustar = pd.DataFrame().reindex_like(u)
    vstar = pd.DataFrame().reindex_like(v)
    
    for j in range(len(u.index)):
        for l in range(len(u.columns)):
            ustar.loc[j, l] = ustarjl(u.loc[j, :], v.loc[j, :], g.loc[j, :], h.loc[j, :], rou, l, j, x)
            vstar.loc[j, l] = vstarjl(v.loc[j, :], rou, l, j, x)
    
    return ustar, vstar

def aj(x, rou, j):
    a = 1
    for i in range(len(x)):
        a += len(x[i]) * zt_equalsto_j(rou.loc[i, :], j)
    return a
    
def bj(phi, x, rou, j, m):
    b = phi[j]
    zequaltot = 0

    for i in range(len(x)):
        for k in range(j, m):
            zequaltot += zt_equalsto_j(rou.loc[i, :], k)
        b += len(x[i]) * zequaltot
    return b

In [244]:
def generalized_inverted_dir(yi, aj, bj):
    sum_j = 0 
    for ajl in aj:
        sum_j += ajl
    p = 1
    for d in range(len(aj)):
        p *= (gamma(aj[d]+bj[d]) * pow(yi[d], aj[d]-1)) / (gamma(aj[d]) * gamma(bj[d]) * pow(1+sum_j, aj[d]+bj[d]-bj[(d+1)%len(aj)]))
    return p
        
def gamma_distribution(ajl, ujl, vjl):
    return pow(vjl, ujl) * pow(ajl, ujl-1) * math.exp(-vjl * ajl) / gamma(ujl)

def p_xi_zai(x, z, alpha, beta, i):
    result = 1
    j = z[i]
    return generalized_inverted_dir(x.loc[i, :], alpha.loc[j, :], beta.loc[j, :])
        

def p_z_landa(z, landa, i):
    result = 1
    for j in range(len(landa)):
        if z[i] == j:
            result *= landa[j] 
            for s in range(j):
                result *= 1 - landa[s]
    
    return result

def p_lambdaj(phi, landa, j):
    return phi[j] * pow(1 - landa[j], phi[j]-1)


def qp_alpha(alpha, u, v, j, l):
    return gamma_distribution(alpha.loc[j, l], u.loc[j, l], v.loc[j, l])

def q_lambdaj(landa, a, b, j):
    return (gamma(a[j] + b[j]) / (gamma(a[j]) * gamma(b[j]))) * pow(landa[j], a[j]-1) * pow(1-landa[j], b[j]-1)

def q_z(x, u, v, g, h, a, b, i, z):
    result = 1
    for j in range(len(u)):
        if j == z[i]:
            result *= rij(x, u, v, g, h, a, b, i, j)
    return result
    
def rij(x, u, v, g, h, a, b, i, j):
    rou = roui(x, u, v, g, h, a, b, i)
    sum = 0
    for k in range(len(rou)):
        sum += math.exp(rou[k])
        
    return math.exp(rou[j]) / sum  
    
def rouij(x, uj, vj, gj, hj, a, b, i, j):
    alphabarj = alphabar(uj, vj)
    betabarj = alphabar(gj, hj)
    result = Rj(uj, vj, gj, hj)
    for l in range(len(alphabarj)):
        result += (alphabarj[l] - 1) * np.log(x.loc[i, l])
        result -= (alphabarj[l] + betabarj[l]) * np.log(1 + x.loc[i, l])
    temp = expected_lnlanda(a, b)
    result += temp[j]
    
    ln1landa = exp_ln1landa(a, b)
    for s in range(j):
        result += ln1landa[s]
        
    return result

def roui(x, u, v, g, h, a, b, i):
    roui = []
    for j in range(len(u)):
        roui.append(rouij(x, u.loc[j, :], v.loc[j, :], g.loc[j, :], h.loc[j, :], a, b, i, j))
    
    return roui

def F(x, ustar, vstar, gstar, hstar, a, b, landa, phi, z, u, v, g, h): # They are expected value. I should change them later.
    f = 0
    alpha = u.div(v)
    beta = g.div(h)
    for i in range(len(x.columns)):
        inside_log = q_z(x, ustar, vstar, gstar, hstar, a, b, i, z)/(p_z_landa(z, landa, i)*p_xi_zai(x, z, alpha, beta, i))
        f += np.log(inside_log)
    for j in range(len(landa)):
        f += np.log(q_lambdaj(landa, a, b, j) / p_lambdaj(phi, landa, j))
        for l in range(len(alpha.index)):
            f += qp_alpha(alpha, ustar, vstar, j, l) / qp_alpha(alpha, u, v, j, l)
#             print(pow(v.loc[j,l], u.loc[j,l]))
        for l in range(len(beta.index)):
            f += qp_alpha(beta, gstar, hstar, j, l) / qp_alpha(beta, g, h, j, l)
    
    return f
    
            

In [245]:
def set_z(dataset, rou):
    max_index = rou.idxmax(axis = 1) 
    max_index = [y - min(max_index) for y in max_index]
    for i in range(len(dataset)):
        for j in range(len(dataset[i])):
            dataset[i].loc[j, 'result'] = max_index[i]
    
    return dataset

def set_precision(dataset, rou):
    newdata = set_z(dataset, rou)
    length = 0
    true_positive = 0
    for i in range(len(dataset)):
        for j in range(len(dataset[i])):
            length = length + 1 
            if(newdata[i].loc[j, 'result'] == newdata[i].loc[j, 'target']):
                true_positive = true_positive + 1
    
    return (true_positive / length)
        

In [266]:
data = datasets.load_iris()
xx = pd.DataFrame(np.array(data.data))
xx['target'] = data.target
kmeans = KMeans(n_clusters=3, random_state=0).fit(xx)
xx['label'] = kmeans.labels_
iris = []
iris.append(xx[xx['label'] == 0])
iris.append(xx[xx['label'] == 1])
iris.append(xx[xx['label'] == 2])

for i in iris:
    i.reset_index(drop=True, inplace=True)

f_iris = iris.copy()

ui = np.array([[6000, 5000, 4000, 5000], [5000, 4000, 5000, 4000], [5000, 2000, 4000, 5000]])
u = pd.DataFrame(ui)
vi = np.array([[4000, 6000, 6000, 4000], [6000, 4000, 3500, 5000], [2000, 5000, 6000, 4000]])
v = pd.DataFrame(vi)
gi = np.array([[5000, 6000, 5000, 5000], [3000, 5000, 8000, 5000], [4000, 7000, 3000, 5000]])
g = pd.DataFrame(gi)
hi = np.array([[5000, 7000, 6000, 3000], [5000, 7000, 6000, 3000], [7000, 5000, 6000, 4000]])
h = pd.DataFrame(hi)

phi = [1, 2, 3]
a = [300, 200, 600]
b = [400, 500, 500]
landa = [0.1, 0.2, 0.3]

# kd = kd_tree()
# kd.build_kdtree(iris, 2, 0)
# iris = kd.return_leaves()

M = 3
start = 0
iris_rou = Rou(iris, u, v, g, h, a, b, M)
print(iris)
for kasra in range(3):
    print("\n222222222222222222\n")
    rand = iris_rou.idxmax(axis = 1)
    c = rand[random.randint(0, len(rand)-1)]

    length = len(iris)
    start += 1
    for l in range(length):
        xt = iris.pop(0)
        rout = iris_rou.loc[l, :]
        flag = 0
        for i in range(M):
            if(zt_equalsto_j(rout, i) > zt_equalsto_j(rout, c)):
                flag = 1
                break
        
        if(flag == 0):
            kd1 = kd_tree()
            kd1.build_kdtree(xt, 2, start)
            x_two_part = kd1.return_leaves()
            iris.append(x_two_part[0])
            iris.append(x_two_part[1])
            del xt
        else:
            iris.append(xt)
            
    uu = u.copy()
    uu.loc[M, :] = u.loc[c, :].copy()
    vv = v.copy()
    vv.loc[M, :] = v.loc[c, :].copy()
    gg = g.copy()
    gg.loc[M, :] = g.loc[c, :].copy() 
    hh = h.copy()
    hh.loc[M, :] = h.loc[c, :].copy() 
    
    a.append(a[c])
    b.append(b[c])
    
    iris_rou = Rou(iris, uu, vv, gg, hh, a, b, M+1)
    
    for l in range(len(u.columns)):
        uu.loc[c, l] = ustarjl(uu.loc[c, :], vv.loc[c, :], gg.loc[c, :], hh.loc[c, :], iris_rou, l, c, iris)
        vv.loc[c, l] = vstarjl(vv.loc[c, :], iris_rou, l, c, iris)
        gg.loc[c, l] = ustarjl(gg.loc[c, :], hh.loc[c, :], uu.loc[c, :], vv.loc[c, :], iris_rou, l, c, iris)
        hh.loc[c, l] = vstarjl(hh.loc[c, :], iris_rou, l, c, iris)

        uu.loc[M, l] = ustarjl(uu.loc[M, :], vv.loc[M, :], gg.loc[M, :], hh.loc[M, :], iris_rou, l, M, iris)
        vv.loc[M, l] = vstarjl(vv.loc[M, :], iris_rou, l, M, iris)
        gg.loc[M, l] = ustarjl(gg.loc[M, :], hh.loc[M, :], uu.loc[M, :], vv.loc[M, :], iris_rou, l, M, iris)
        hh.loc[M, l] = vstarjl(hh.loc[M, :], iris_rou, l, M, iris)
    
    u, v = make_uvstar(uu, vv, gg, hh, iris_rou, iris)
    g, h = make_uvstar(gg, hh, uu, vv, iris_rou, iris)

    
    a[c] = aj(iris, iris_rou, c)
    a[M] = aj(iris, iris_rou, M)
    
    phi.append(random.random())
    landa.append(random.random())
    
    b[c] = bj(phi, iris, iris_rou, c, M+1)
    b[M] = bj(phi, iris, iris_rou, M, M+1)
    
    iris_rou = Rou(iris, u, v, g, h, a, b, M+1)
    
    temp_iris = set_z(iris, iris_rou)
    
    z = []
    for xt in temp_iris:
        z.extend(xt.loc[:, 'result'])
    print(z)
    
#     print(F(f_iris, u, v, g, h, a, b, landa, phi, z, uu, vv, gg, hh))
    M += 1

    print(c)
    print("\n")
    print(iris_rou)
    print("\n")
#     print(u)
#     print("\n")
#     print(v)
#     print("\n")
#     print(g)
#     print("\n")
#     print(h)
#     print("\n")
#     print(a)
#     print("\n")
#     print(b)
#     print("\n")

# print(iris_rou)
    
print(set_precision(iris, iris_rou))


    
    

[      0    1    2    3  target  label
0   6.3  3.3  6.0  2.5       2      0
1   5.8  2.7  5.1  1.9       2      0
2   7.1  3.0  5.9  2.1       2      0
3   6.3  2.9  5.6  1.8       2      0
4   6.5  3.0  5.8  2.2       2      0
5   7.6  3.0  6.6  2.1       2      0
6   7.3  2.9  6.3  1.8       2      0
7   6.7  2.5  5.8  1.8       2      0
8   7.2  3.6  6.1  2.5       2      0
9   6.5  3.2  5.1  2.0       2      0
10  6.4  2.7  5.3  1.9       2      0
11  6.8  3.0  5.5  2.1       2      0
12  5.7  2.5  5.0  2.0       2      0
13  5.8  2.8  5.1  2.4       2      0
14  6.4  3.2  5.3  2.3       2      0
15  6.5  3.0  5.5  1.8       2      0
16  7.7  3.8  6.7  2.2       2      0
17  7.7  2.6  6.9  2.3       2      0
18  6.0  2.2  5.0  1.5       2      0
19  6.9  3.2  5.7  2.3       2      0
20  5.6  2.8  4.9  2.0       2      0
21  7.7  2.8  6.7  2.0       2      0
22  6.3  2.7  4.9  1.8       2      0
23  6.7  3.3  5.7  2.1       2      0
24  7.2  3.2  6.0  1.8       2      0
25  6.2  2.

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
2


           0          1          2          3
0 -14.540740 -15.625414 -15.039478 -14.807112
1 -15.197163 -16.267743 -15.723483 -15.493149
2  -5.357076  -5.708696  -6.756365  -6.534776
3 -11.957124 -12.923148 -12.673913 -12.443532
4 -13.027284 -1

In [262]:
yi = np.array([[4, 6, 9], [45, 20, 12], [5, 12, 7], [11, 20, 3], [40, 4, 4], [5, 5, 5], [6, 6, 6], [7, 7, 7], [8, 8, 8], [9, 9, 9]])
y = pd.DataFrame(yi)

ui = np.array([[6000, 5000, 4000, 5000], [5000, 4000, 5000, 4000], [5000, 2000, 4000, 5000]])
u = pd.DataFrame(ui)

vi = np.array([[4000, 6000, 6000, 4000], [6000, 4000, 3500, 5000], [2000, 5000, 6000, 4000]])
v = pd.DataFrame(vi)

gi = np.array([[5000, 6000, 5000, 5000], [3000, 5000, 8000, 5000], [4000, 7000, 3000, 5000]])
g = pd.DataFrame(gi)

hi = np.array([[5000, 7000, 6000, 3000], [5000, 7000, 6000, 3000], [7000, 5000, 6000, 4000]])
h = pd.DataFrame(hi)

a = np.array([300, 200, 600])

b = np.array([400, 500, 500])

data = datasets.load_iris()
iris = pd.DataFrame(np.array(data.data))
xx = iris.copy()
kmeans = KMeans(n_clusters=3, random_state=0).fit(xx)
xx['label'] = kmeans.labels_
xxx = []
xxx.append(xx[xx['label'] == 0])
xxx.append(xx[xx['label'] == 1])
xxx.append(xx[xx['label'] == 2])
print(Rou(xxx, u, v, g, h, a, b, 3))
print(xx)

           0          1          2
0 -15.074449 -16.154062 -14.227888
1  -5.340111  -5.698338  -5.422237
2 -12.803415 -13.799205 -12.145992
       0    1    2    3  label
0    5.1  3.5  1.4  0.2      1
1    4.9  3.0  1.4  0.2      1
2    4.7  3.2  1.3  0.2      1
3    4.6  3.1  1.5  0.2      1
4    5.0  3.6  1.4  0.2      1
..   ...  ...  ...  ...    ...
145  6.7  3.0  5.2  2.3      0
146  6.3  2.5  5.0  1.9      2
147  6.5  3.0  5.2  2.0      0
148  6.2  3.4  5.4  2.3      0
149  5.9  3.0  5.1  1.8      2

[150 rows x 5 columns]
