In [1]:
import torch
import numpy as np
import random
import pandas as pd
from scipy.optimize import minimize
from numpy import array

In [2]:
def calc_prob(X, K, pMu, pSigma):
    N = X.shape[0]
    D = X.shape[1]
    Px = np.zeros((N, K))
    for i in range(K):
        Xshift = X-np.tile(pMu[i], (N, 1))
        lambda_flag = np.e**(-5)
        conv = pSigma[i]+lambda_flag*np.eye(D)
        inv_pSigma = np.linalg.inv(conv)
        tmp = np.sum(np.dot(Xshift, inv_pSigma)*Xshift, axis=1)
        coef = (2*np.pi)**(-D/2)*np.sqrt(np.linalg.det(inv_pSigma))
        Px[:, i] = coef*np.e**(-1/2*tmp)
    return Px

def gmm(X, K):       
    threshold = np.e**(-15)
    N = X.shape[0]
    D = X.shape[1]
    pMu = centroids
    pPi = np.zeros((1, K))
    pSigma = np.zeros((K, D, D))
    dist = np.tile(np.sum(X*X, axis=1).reshape(N,1), (1, K))+np.tile(np.sum(pMu*pMu, axis=1), (N, 1))-2*np.dot(X, pMu.T)
    labels = np.argmin(dist,axis=1)
    for i in range(K):
        index = labels == i
        Xk = X[index,:]
        pPi[:,i] = (Xk.shape[0])/N
        pSigma[i] = np.cov(Xk.T)
    Loss = -float("inf")
    while True:
        Px = calc_prob(X, K, pMu, pSigma)
        pGamma = Px*np.tile(pPi, (N, 1))
        pGamma = pGamma/np.tile(np.sum(pGamma, axis=1).reshape(N,1), (1, K))
        Nk = np.sum(pGamma, axis=0)
        pMu = np.dot(np.dot(np.diag(1/Nk), pGamma.T), X)
        pPi = Nk/N
        for i in range(K):
            Xshift = X-np.tile(pMu[i], (N, 1))
            pSigma[i] = np.dot(Xshift.T, np.dot(np.diag(pGamma[:, i]), Xshift))/Nk[i]
        L = np.sum(np.log(np.dot(Px, pPi.T)), axis=0)
        if L-Loss < threshold:
            break
        Loss = L
    return Px,pMu,pSigma,pPi

In [3]:
if __name__ == "__main__":        
    data = pd.read_csv('E:/pythonjupyter/03 COVID-19&air pollution/data/ft_data/wuhan_2018-2020_1.csv', encoding = "gbk")
    data = pd.DataFrame(data)
    data.iloc[:,2:] = data.iloc[:,2:].apply(lambda x: (x - np.min(x)) / (np.max(x) - np.min(x))) 
    tar_domain = data.iloc[6016:,:]
    tar_domain_ori = tar_domain.iloc[:,3:]
    Data_tar = tar_domain_ori.values
    N = Data_tar.shape[0]
    K = 5
    rndp = random.sample(np.arange(N).tolist(),K)
    centroids = Data_tar[rndp,:]
    Px_tar,pMu_tar,pSigma_tar,pPi_tar = gmm(Data_tar, 5)  

In [4]:
def target_function(source):
    temp1 = []
    for i in range(0, 5):
        temp = source[source['cluster'] == i].iloc[:,2:25] * pPi_tar[i]
        temp1.append(temp)
    temp2 = pd.concat([temp1[0], temp1[1], temp1[2], temp1[3], temp1[4]])
    source = pd.concat([source.iloc[:,:2],temp2],axis = 1)
    return source

tar_domain = tar_domain.reset_index()
tar_domain = tar_domain.iloc[:,1:]

cluster_tar = pd.DataFrame(Px_tar)
cluster_tar['cluster'] = cluster_tar.idxmax(1)
tar_domain['cluster'] = cluster_tar['cluster']
tar_domain = target_function(tar_domain)

In [5]:
def source_function(source):
    temp1 = []
    for i in range(0, 5):
        temp = source[source['cluster'] == i].iloc[:,2:25] * pPi[i]
        temp1.append(temp)
    temp2 = pd.concat([temp1[0], temp1[1], temp1[2], temp1[3], temp1[4]])
    source = pd.concat([source.iloc[:,:2],temp2],axis = 1)
    return source

def guassian_kernel(source, target, kernel_mul=2.0, kernel_num=5, fix_sigma=None):
    n_samples = int(source.size()[0])+int(target.size()[0])    
    total = torch.cat([source, target], dim=0)
    total0 = total.unsqueeze(0).expand(int(total.size(0)), int(total.size(0)), int(total.size(1)))
    total1 = total.unsqueeze(1).expand(int(total.size(0)), int(total.size(0)), int(total.size(1)))
    L2_distance = ((total0-total1)**2).sum(2)
    if fix_sigma:
        bandwidth = fix_sigma
    else:
        bandwidth = torch.sum(L2_distance.data) / (n_samples**2-n_samples)
    bandwidth /= kernel_mul ** (kernel_num // 2)
    bandwidth_list = [bandwidth * (kernel_mul**i) for i in range(kernel_num)]
    kernel_val = [torch.exp(-L2_distance / bandwidth_temp) for bandwidth_temp in bandwidth_list]
    return sum(kernel_val)
 
def mmd(source, target, kernel_mul=2.0, kernel_num=5, fix_sigma=None):
    n = int(source.size()[0])
    m = int(target.size()[0])
    kernels = guassian_kernel(source, target,kernel_mul=kernel_mul, kernel_num=kernel_num, fix_sigma=fix_sigma)
    XX = kernels[:n, :n] 
    YY = kernels[n:, n:]
    XY = kernels[:n, n:]
    YX = kernels[n:, :n]
    XX = torch.div(XX, n * n).sum(dim=1).view(1,-1)  
    XY = torch.div(XY, -n * m).sum(dim=1).view(1,-1) 
    YX = torch.div(YX, -m * n).sum(dim=1).view(1,-1) 
    YY = torch.div(YY, m * m).sum(dim=1).view(1,-1)  
    loss = (XX + XY).sum() + (YX + YY).sum()
    return loss

def min_mmd (pPi, sign=1.0):
    return loss

def func_deriv(pPi, sign=1):
    jac_x0 = sign * (2 * pPi[0])
    jac_x1 = sign * (2 * pPi[1])
    jac_x2 = sign * (2 * pPi[2])
    jac_x3 = sign * (2 * pPi[3])
    jac_x4 = sign * (2 * pPi[4])
    return np.array([jac_x0, jac_x1, jac_x2, jac_x3, jac_x4])

In [7]:
if __name__ == "__main__":
    src_domain = data.head(6016)
    data_1_ori = src_domain.iloc[:,3:]
    Data = data_1_ori.values
    N = Data.shape[0]
    D = Data.shape[1]
    K = 5
    loss_array1 = []
    src_domain = data.head(6016)
    data_1_ori = src_domain.iloc[:,3:] 
    Data = data_1_ori.values
    Px,pMu,pSigma,pPi = gmm(Data, 5) 
    cluster_1 = pd.DataFrame(Px)
    cluster_1['cluster'] = cluster_1.idxmax(1)
    src_domain['cluster'] = cluster_1['cluster']
    src_domain = source_function(src_domain)
    src_domain_1 = src_domain.iloc[:,3:] 
    tar_domain_1 = tar_domain.iloc[:,3:] 
    data_1 = torch.from_numpy(src_domain_1.values)
    data_2 = torch.from_numpy(tar_domain_1.values)
    loss = mmd(data_1,data_2)
    cons = ({'type': 'eq','fun': lambda x: np.array([x[0] + x[1] + x[2] + x[3] + x[4] - 1]),
                 'jac': lambda x: np.array([1, 1, 1, 1, 1])})
    bnds = ((0, 1), (0, 1),(0, 1),(0, 1),(0, 1))
    x0 = np.array([0.0, 1.0, 0.0, 0.0, 0.0])
    res = minimize(min_mmd, x0, bounds=bnds,constraints=cons)  
    loss_array = loss.numpy()
    loss_array1.append(loss_array) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
